Skip to content
Snippets Groups Projects
Commit 9eb457d6 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'dev' into f-authorization

parents e1923b9f 98b1878f
No related branches found
No related tags found
2 merge requests!53Release 0.1,!32ENH: add security levels that may prevent updates or inserts
Pipeline #26269 failed
Showing with 449 additions and 19 deletions
......@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
- Added new converters for tables: CSVTableConverter and XLSXTableConverter
- FIX: Variables are now also replaced when the value is given as a list.
### Added
* Everything
......
......@@ -24,6 +24,8 @@ install_requires =
caosadvancedtools
yaml-header-tools
pyyaml
odfpy
pandas
[options.packages.find]
where = src
......
......@@ -23,6 +23,8 @@ cfood:
- Definitions
- Dict
- JSONFile
- CSVTableConverter
- XLSXTableConverter
description: Type of this converter node.
match:
description: typically a regexp which is matched to a structure element name
......
......@@ -40,6 +40,8 @@ from abc import abstractmethod
from string import Template
import yaml_header_tools
import pandas as pd
import yaml
# These are special properties which are (currently) treated differently
......@@ -48,6 +50,15 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
"file", "checksum", "size")
def str_to_bool(x):
    """
    Translate the textual form of a boolean into a ``bool``.

    The argument is converted with ``str`` and compared case-insensitively,
    so "True", "FALSE" and actual booleans are all accepted.

    Raises a RuntimeError if the text is neither "true" nor "false".
    """
    known = {"true": True, "false": False}
    text = str(x).lower()
    if text not in known:
        raise RuntimeError("Should be 'true' or 'false'.")
    return known[text]
class ConverterValidationError(Exception):
"""To be raised if contents of an element to be converted are invalid."""
......@@ -55,13 +66,37 @@ class ConverterValidationError(Exception):
self.message = msg
def handle_value(value: Union[dict, str], values: GeneralStore):
"""
Function to generically handle values for properties defined in the
yaml structure.
def replace_variables(propvalue, values: GeneralStore):
    """
    Substitute variable references in `propvalue` using the `values` store.

    If `propvalue` consists of exactly one variable reference (e.g. "$name"
    or "${name}") and that variable is contained in `values`, then
    - None is returned if the stored value is None,
    - the stored object itself is returned if it is a db.Entity.

    In all other cases `propvalue` is used as a string.Template and is
    substituted (safe_substitute) with the content of `values.get_storage()`.
    """
    # Check if the replacement is a single variable containing a record:
    single_var = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue)
    if single_var is not None and single_var.group("varname") in values:
        stored = values[single_var.group("varname")]
        if stored is None:
            return None
        if isinstance(stored, db.Entity):
            return stored

    # Generic case: plain template substitution on the string.
    return Template(propvalue).safe_substitute(**values.get_storage())
def handle_value(value: Union[dict, str, list], values: GeneralStore):
"""
Determine whether the given value needs to set a property, be added to an existing value (create a list) or
be added as an additional property (multiproperty).
Variable names (starting with a "$") are replaced by the corresponding value stored in the
`values` GeneralStore.
Parameters:
- value: if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc.
if dict, must have keys "value" and "collection_mode". The returned tuple is directly
created from the corresponding values.
if list, each element is checked for replacement and the resulting list will be used
as (list) value for the property
Returns a tuple:
- the final value of the property
- the final value of the property; variable names contained in `values` are replaced.
- the collection mode (can be single, list or multiproperty)
"""
# @review Florian Spreckelsen 2022-05-13
......@@ -90,22 +125,19 @@ def handle_value(value: Union[dict, str], values: GeneralStore):
# different from the two cases above.
collection_mode = "single"
propvalue = value
return (propvalue, collection_mode)
# Check if the replacement is a single variable containing a record:
match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue)
if match is not None:
varname = match.group("varname")
if varname in values:
if values[varname] is None:
propvalue = None
return (propvalue, collection_mode)
if isinstance(values[varname], db.Entity):
propvalue = values[varname]
# variables replacement:
propvalue = [replace_variables(i, values) for i in propvalue]
return (propvalue, collection_mode)
else:
# value is another simple type
# collection_mode = "single"
# propvalue = value["value"]
# return (propvalue, collection_mode)
raise RuntimeError()
propvalue_template = Template(propvalue)
propvalue = propvalue_template.safe_substitute(**values.get_storage())
propvalue = replace_variables(propvalue, values)
return (propvalue, collection_mode)
......@@ -143,6 +175,11 @@ def create_records(values: GeneralStore,
for key, value in record.items():
if key == "parents" or key == "role":
continue
# Allow replacing variables in keys / names of properties:
key_template = Template(key)
key = key_template.safe_substitute(**values.get_storage())
keys_modified.append((name, key))
propvalue, collection_mode = handle_value(value, values)
......@@ -151,6 +188,9 @@ def create_records(values: GeneralStore,
# list mode does not work for them
if key == "path" and not propvalue.startswith(os.path.sep):
propvalue = os.path.sep + propvalue
# Convert relative to absolute paths:
propvalue = os.path.normpath(propvalue)
setattr(c_record, key, propvalue)
else:
......@@ -607,3 +647,102 @@ class TextElementConverter(Converter):
if m is None:
return None
return m.groupdict()
class TableConverter(Converter):
    """
    Base class for converters that read tables in different formats line by
    line and allow matching the corresponding rows.

    The subtree generated by a table converter consists of DictDictElements,
    one per row. The header elements of the table become the dictionary keys.
    The rows can be matched using a DictDictElementConverter.
    """

    @abstractmethod
    def get_options(self):
        """
        Provide information about the possible options.

        This method needs to be overwritten by the specific table converter.
        """

    def _get_options(self, possible_options):
        """
        Collect the options that are set in the definition of this converter.

        Parameters:
        - possible_options: iterable of tuples (option name, conversion
                            function).

        Returns a dict that contains an entry for each option name that is
        present in `self.definition`; the value is the converted definition
        value.
        """
        collected = {}
        for opt_name, opt_conversion in possible_options:
            if opt_name not in self.definition:
                continue
            raw = self.definition[opt_name]
            # The option can often either be a single value or a list of values.
            # In the latter case each element of the list is converted to the defined type.
            if isinstance(raw, list):
                collected[opt_name] = [opt_conversion(item) for item in raw]
            else:
                collected[opt_name] = opt_conversion(raw)
        return collected

    def typecheck(self, element: StructureElement):
        """Return True if `element` is a File."""
        return isinstance(element, File)

    def match(self, element: StructureElement):
        """
        Match the name of `element` against the regexp given as "match" in the
        definition.

        Returns the named groups of the match as dict, or None if the name
        does not match. Raises a RuntimeError if `element` is not a File.
        """
        if not isinstance(element, File):
            raise RuntimeError("Element must be a File.")
        found = re.match(self.definition["match"], element.name)
        return None if found is None else found.groupdict()
class XLSXTableConverter(TableConverter):
    """
    Table converter that reads the file with pandas.read_excel and creates one
    DictDictElement per row.
    """

    def get_options(self):
        """
        Return the options from the definition which are passed as keyword
        arguments to pandas.read_excel, converted to the listed types.
        """
        possible_options = [
            ("sheet_name", str),
            ("header", int),
            ("names", str),
            ("index_col", int),
            ("usecols", int),
            ("true_values", str),
            ("false_values", str),
            ("na_values", str),
            ("skiprows", int),
            ("nrows", int),
            ("keep_default_na", str_to_bool),
        ]
        return self._get_options(possible_options)

    def create_children(self, generalStore: GeneralStore,
                        element: StructureElement):
        """
        Read the table stored at `element.path` and return a list with one
        DictDictElement per row. The name of each child is the string
        representation of the row index, its value the row as dict.

        Raises a RuntimeError if `element` is not a File.
        """
        if not isinstance(element, File):
            raise RuntimeError("Element must be a File.")
        table = pd.read_excel(element.path, **self.get_options())
        return [DictDictElement(str(index), row.to_dict())
                for index, row in table.iterrows()]
class CSVTableConverter(TableConverter):
    """
    Table converter that reads the file with pandas.read_csv and creates one
    DictDictElement per row.
    """

    def get_options(self):
        """
        Return the options from the definition which are passed as keyword
        arguments to pandas.read_csv, converted to the listed types.
        """
        possible_options = [
            ("sep", str),
            ("delimiter", str),
            ("header", int),
            ("names", str),
            ("index_col", int),
            ("usecols", int),
            ("true_values", str),
            ("false_values", str),
            ("na_values", str),
            ("skiprows", int),
            ("nrows", int),
            ("keep_default_na", str_to_bool),
        ]
        return self._get_options(possible_options)

    def create_children(self, generalStore: GeneralStore,
                        element: StructureElement):
        """
        Read the table stored at `element.path` and return a list with one
        DictDictElement per row. The name of each child is the string
        representation of the row index, its value the row as dict.

        Raises a RuntimeError if `element` is not a File.
        """
        if not isinstance(element, File):
            raise RuntimeError("Element must be a File.")
        table = pd.read_csv(element.path, **self.get_options())
        return [DictDictElement(str(index), row.to_dict())
                for index, row in table.iterrows()]
......@@ -300,6 +300,12 @@ class Crawler(object):
"JSONFile": {
"converter": "JSONFileConverter",
"package": "caoscrawler.converters"},
"CSVTableConverter": {
"converter": "CSVTableConverter",
"package": "caoscrawler.converters"},
"XLSXTableConverter": {
"converter": "XLSXTableConverter",
"package": "caoscrawler.converters"},
"Dict": {
"converter": "DictConverter",
"package": "caoscrawler.converters"},
......
......@@ -38,6 +38,8 @@ from caoscrawler.structure_elements import (File, DictTextElement,
DictBooleanElement, DictDictElement,
DictIntegerElement, DictFloatElement)
from caoscrawler.converters import handle_value
from test_tool import rfp
import pytest
......@@ -244,3 +246,33 @@ def test_json_converter(converter_registry):
children = jsonconverter.create_children(None, broken_json)
assert err.value.message.startswith("Couldn't validate")
def test_variable_replacement():
    """
    Check the value and the collection mode returned by handle_value for
    strings (with and without "+" / "*" prefix), dicts and lists.
    """
    values = GeneralStore()
    values["a"] = 4
    values["b"] = "68"

    prefixed_modes = (("", "single"), ("+", "list"), ("*", "multiproperty"))

    # String values: first without, then with a variable to be replaced.
    for raw, replaced in (("b", "b"), ("$b", "68")):
        for prefix, mode in prefixed_modes:
            assert handle_value(prefix + raw, values) == (replaced, mode)

    # Dict values with explicit collection mode.
    for raw, replaced in (("b", "b"), ("$b", "68")):
        for _, mode in prefixed_modes:
            definition = {"value": raw,
                          "collection_mode": mode}
            assert handle_value(definition, values) == (replaced, mode)

    # List values: each element is checked for replacement.
    assert handle_value(["a", "b"], values) == (["a", "b"], "single")
    assert handle_value(["$a", "$b"], values) == (["4", "68"], "single")
Col_1,Col_2,Col_3,text
Index,description,,
,m,s,
0,12,1,jdsfkljadskf
1,14,3,jdkfljad
2,3,4,jadkfjdsk
3,4.5,6,
4,8,7,jadskfj
File added
ExperimentalData:
type: Directory
match: ExperimentalData
records:
Project:
name: project
subtree:
XLSXTable:
type: XLSXTableConverter
match: test1\.xlsx
skiprows: [1, 2]
header: 0
records:
Experiment: {}
subtree:
Row:
type: DictDictElement
match_name: .*
records:
Measurement: {}
Experiment:
Measurements: +$Measurement
subtree:
Col_1:
type: DictIntegerElement
match_name: Col_1
match_value: (?P<Value>[0-9]+)
records:
Measurement:
Col_1: $Value
CSVTable:
type: CSVTableConverter
match: test1\.csv
skiprows: [1, 2]
header: 0
records:
Experiment: {}
subtree:
Row:
type: DictDictElement
match_name: .*
records:
Measurement: {}
Experiment:
Measurements: +$Measurement
subtree:
Col_1:
type: DictIntegerElement
match_name: Col_1
match_value: (?P<Value>[0-9]+)
records:
Measurement:
Col_1: $Value
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
test the converters module
"""
from caoscrawler.converters import Converter
from caoscrawler.stores import GeneralStore
from caoscrawler.converters import (ConverterValidationError,
DictConverter, XLSXTableConverter, CSVTableConverter)
from caoscrawler.structure_elements import Directory
from caoscrawler.structure_elements import (File, DictTextElement,
DictListElement, DictElement,
DictBooleanElement, DictDictElement,
DictIntegerElement, DictFloatElement)
from os.path import join, dirname, basename
from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
import pytest
import os
import importlib
import math
from caoscrawler import Crawler
import caosdb as db
@pytest.fixture
def converter_registry():
    """
    Provide the converter registry used by the table converter tests.

    Returns a dict that maps the type names used in converter definitions to
    a dict with the name of the converter class ("converter") and the package
    that contains it ("package").
    """
    converter_registry: dict[str, dict[str, str]] = {
        "Directory": {
            "converter": "DirectoryConverter",
            "package": "caoscrawler.converters"},
        "CSVTableConverter": {
            "converter": "CSVTableConverter",
            "package": "caoscrawler.converters"},
        "XLSXTableConverter": {
            "converter": "XLSXTableConverter",
            "package": "caoscrawler.converters"},
        "DictDictElement": {
            "converter": "DictDictElementConverter",
            "package": "caoscrawler.converters"},
        "DictTextElement": {
            "converter": "DictTextElementConverter",
            "package": "caoscrawler.converters"},
        "DictIntegerElement": {
            "converter": "DictIntegerElementConverter",
            "package": "caoscrawler.converters"},
        "DictFloatElement": {
            "converter": "DictFloatElementConverter",
            "package": "caoscrawler.converters"},
    }
    # A fixture without a return statement provides None to the tests, so the
    # registry has to be returned explicitly.
    return converter_registry
def rfp(*pathcomponents):
    """
    Return full path.

    Shorthand convenience function: the given path components are joined to
    the directory that contains this file.
    """
    this_directory = dirname(__file__)
    return join(this_directory, *pathcomponents)
def dircheckstr(*pathcomponents):
    """
    Return the debug tree identifier for a given path.

    The identifier is built from the base name of the joined path components
    and the full path of the file below
    test_directories/examples_tables/ExperimentalData.
    """
    file_name = basename(join(*pathcomponents))
    full_path = rfp("test_directories", "examples_tables", "ExperimentalData",
                    *pathcomponents)
    return "caoscrawler.structure_elements.File: " + file_name + ", " + full_path
@pytest.fixture
def crawler():
    """
    Provide a Crawler (debug=True) that has crawled the ExperimentalData
    example directory using the crawler_for_tables.yml definition.
    """
    table_crawler = Crawler(debug=True)
    data_directory = rfp("test_directories", "examples_tables", "ExperimentalData")
    cfood_file = rfp("test_directories", "examples_tables", "crawler_for_tables.yml")
    table_crawler.crawl_directory(data_directory, cfood_file)
    return table_crawler
def test_convert_table(converter_registry):
    """
    Check the children that the table converters create for the test1.*
    example tables: number of rows, row names and the types of the cell
    values. Finally check that an index column determines the row names.
    """
    for file_ext in ["xlsx", "ods", "csv", "tsv"]:
        def_opt = {"skiprows": ["1", "2"], "header": 0}
        if file_ext == "tsv":
            def_opt["sep"] = "\t"

        converter_class = (CSVTableConverter if file_ext in ["csv", "tsv"]
                           else XLSXTableConverter)
        converter = converter_class(def_opt, "Tab", converter_registry)

        store = GeneralStore()
        table_path = rfp("test_tables", "test1." + file_ext)
        file_element = File("table." + file_ext, table_path)
        res = converter.create_children(store, file_element)

        assert len(res) == 5
        for i, child in enumerate(res):
            assert child.name == str(i)
            assert type(child.name) is str
            assert type(child.value) is dict
            assert len(child.value) == 4
            assert type(child.value["Col_1"]) is int
            assert child.value["Col_1"] == i
            assert type(child.value["Col_2"]) is float
            assert type(child.value["Col_3"]) is int
            if i == 3:
                assert type(child.value["text"]) is float  # the nan value
                assert math.isnan(child.value["text"])
            else:
                assert type(child.value["text"]) is str

    # Using an index col:
    index_definition = {"skiprows": ["1", "2"], "header": 0, "index_col": "3"}
    converter = XLSXTableConverter(index_definition,
                                   "XLSXTable",
                                   converter_registry)
    store = GeneralStore()
    file_element = File("table.xlsx", rfp("test_tables", "test1.xlsx"))
    res = converter.create_children(store, file_element)
    assert res[0].name == "jdsfkljadskf"
def test_crawl_csv_table(crawler):
    """
    Check the Experiment record in the debug tree of the crawled xlsx and csv
    tables: it must reference five Measurement records, and the third one
    must have the value "2" for Col_1.
    """
    for file_ext in ["xlsx", "csv"]:
        subd = crawler.debug_tree[dircheckstr("test1." + file_ext)]
        record_experiment = subd[1]["Experiment"]
        assert isinstance(record_experiment, db.Record)

        measurements = record_experiment.get_property("Measurements").value
        assert isinstance(measurements, list)
        assert len(measurements) == 5

        prop_measure = measurements[2]
        assert isinstance(prop_measure, db.Record)
        assert prop_measure.get_property("Col_1").value == "2"
Col_1,Col_2,Col_3,text
Index,description,,
,m,s,
0,12,1,jdsfkljadskf
1,14,3,jdkfljad
2,3,4,jadkfjdsk
3,4.5,6,
4,8,7,jadskfj
File added
Col_1 Col_2 Col_3 text
Index description
m s
0 12 1 jdsfkljadskf
1 14 3 jdkfljad
2 3 4 jadkfjdsk
3 4.5 6
4 8 7 jadskfj
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment