diff --git a/.gitignore b/.gitignore index adde15824bc54a7edd7cf6739befedc6222b53ba..208fe2c4b9e7e925e3235bbb46d80ba51a6bde4d 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ build/ # documentation _apidoc /dist/ +*~ diff --git a/.gitlab/merge_request_templates/Default.md b/.gitlab/merge_request_templates/Default.md index 35c6d01c5904289b77fc7f1de9419ef91a1510e9..3629e0ca3695000863d8c254516f64bf59a7bf60 100644 --- a/.gitlab/merge_request_templates/Default.md +++ b/.gitlab/merge_request_templates/Default.md @@ -28,6 +28,7 @@ guidelines](https://gitlab.com/caosdb/caosdb/-/blob/dev/REVIEW_GUIDELINES.md) - [ ] Up-to-date CHANGELOG.md (or not necessary) - [ ] Up-to-date JSON schema (or not necessary) - [ ] Appropriate user and developer documentation (or not necessary) + - Update / write published documentation (`make doc`). - How do I use the software? Assume "stupid" users. - How do I develop or debug the software? Assume novice developers. - [ ] Annotations in code (Gitlab comments) @@ -41,7 +42,8 @@ guidelines](https://gitlab.com/caosdb/caosdb/-/blob/dev/REVIEW_GUIDELINES.md) - [ ] I understand the intent of this MR - [ ] All automated tests pass - [ ] Up-to-date CHANGELOG.md (or not necessary) -- [ ] Appropriate user and developer documentation (or not necessary) +- [ ] Appropriate user and developer documentation (or not necessary), also in published + documentation. - [ ] The test environment setup works and the intended behavior is reproducible in the test environment - [ ] In-code documentation and comments are up-to-date. diff --git a/CHANGELOG.md b/CHANGELOG.md index e5498c05c7c663192919bd7b1a1ff27a6001c8de..e51f2b4aee24eb87e4b83729b3d2571cf772f099 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,24 +7,51 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ## ### Added ### -- TableImporter now accepts a `existing_columns` argument which demands that certain columns exist + +* Added support for passing callables as `find_func` to the `BaseTableExporter`. +* Added member `BaseTableExporter.all_keys` +* Parsing from YAML now allows to give an existing model to which the YAML data model shall be + added. ### Changed ### -- Name change: `caosdb` -> `linkahead` -- The converters and datatype arguments of TableImporter now may have keys for nonexisting columns +* Name change: `caosdb` -> `linkahead` +* The converters and datatype arguments of TableImporter now may have keys for nonexisting columns +* A bit better error handling in the yaml model parser. +* `TableImporter.check_datatypes` allows numeric values in string columns if + `strict=False` (default). ### Deprecated ### ### Removed ### ### Fixed ### -- refactored to work with the new default key word in FIND queries: RECORD + +* `TableImporter.check_missing` in case of array-valued fields in table ### Security ### ### Documentation ### +## [0.8.0] - 2023-05-30 ## +(Florian Spreckelsen) + +### Added ### + +- TableImporter now accepts a `existing_columns` argument which demands that certain columns exist +- The `JsonSchemaParser` class supports `patternProperties` +- The `JsonSchemaParser` calss supports json-schema references (`$ref`) + +### Changed ### + +- The converters and datatype arguments of TableImporter now may have keys for nonexisting columns +- The `JsonSchemaParser` class does not require the top-level entry of a json + schema definition to specify a RecordType. 
+ +### Fixed ### + +- refactored to work with the new default key word in FIND queries: RECORD + ## [0.7.0] - 2023-03-09 ## (Florian Spreckelsen) diff --git a/CITATION.cff b/CITATION.cff index 6aae9e3deb96898e814d7c0fa9068fc8bec74cd6..c0e278a6779f325da3f436bd6534a7b3cc6279f8 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,6 @@ authors: given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 title: LinkAhead - Advanced User Tools -version: 0.7.0 +version: 0.8.0 doi: 10.3390/data4020083 -date-released: 2023-01-20 \ No newline at end of file +date-released: 2023-05-30 diff --git a/manual_tests/test_labfolder_import.py b/manual_tests/test_labfolder_import.py index af8b7b53b0a9539d1a353b5cb972853d47ac1e14..bed1eda2c759439d2afb1483c150f32b39218e3b 100644 --- a/manual_tests/test_labfolder_import.py +++ b/manual_tests/test_labfolder_import.py @@ -32,7 +32,7 @@ from linkaheadadvancedtools.converter import labfolder_export as labfolder def main(args): """The main function.""" - model = parse_model_from_yaml("./model.yml") + model = parse_model_from_yaml("./models/model.yml") model.sync_data_model() labfolder.import_data(args.folder) diff --git a/manual_tests/test_labfolder_retrieve.py b/manual_tests/test_labfolder_retrieve.py index 6b140f6c5db7530a444ceac5329ceb55d1e77afd..ec2764ed8ba19108ec0698b41e118b20c8ca4081 100644 --- a/manual_tests/test_labfolder_retrieve.py +++ b/manual_tests/test_labfolder_retrieve.py @@ -31,7 +31,7 @@ from linkaheadadvancedtools.converter.labfolder_api import Importer def main(args): """The main function.""" - model = parse_model_from_yaml("./model.yml") + model = parse_model_from_yaml("./models/model.yml") # model.sync_data_model() importer = Importer() diff --git a/release.sh b/release.sh index 1af097f014de6cd9eb3d3e8ba5da34aea0fe1671..f6335ae20d0c29e760b508aac831a35460a59ef3 100755 --- a/release.sh +++ b/release.sh @@ -1,4 +1,4 @@ #!/bin/bash rm -rf dist/ build/ .eggs/ python setup.py sdist bdist_wheel -python -m twine upload -s dist/* +python -m twine upload dist/* diff --git a/setup.py b/setup.py index d3a12780e9246dcec6002edc097dfd99a3c94610..118f96e059fdadb190759ac9cf763f7b22c82c47 100755 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ from setuptools import find_packages, setup ######################################################################## MAJOR = 0 -MINOR = 7 +MINOR = 8 MICRO = 1 PRE = "" # e.g. rc0, alpha.1, 0.beta-23 ISRELEASED = False @@ -96,7 +96,8 @@ def get_version_info(): elif os.path.exists('src/linkaheadadvancedtools/version.py'): # must be a source distribution, use existing version file try: - from linkaheadadvancedtools.version import git_revision as GIT_REVISION + from linkaheadadvancedtools.version import \ + git_revision as GIT_REVISION except ImportError: raise ImportError("Unable to import git_revision. 
Try removing " "src/linkaheadadvancedtools/version.py and the build directory " @@ -156,6 +157,7 @@ def setup_package(): author_email='h.tomwoerden@indiscale.com', python_requires='>=3.7', install_requires=["linkahead>=0.11.0", + "jsonref", "jsonschema>=4.4.0", "numpy>=1.17.3", "openpyxl>=3.0.7", @@ -163,7 +165,7 @@ def setup_package(): "xlrd>=2.0", ], extras_require={"h5-crawler": ["h5py>=3.3.0", ], - "gitignore-parser ": ["gitignore-parser >=0.1.0", ], + "gitignore-parser": ["gitignore-parser >=0.1.0", ], }, packages=find_packages('src'), package_dir={'': 'src'}, diff --git a/src/doc/conf.py b/src/doc/conf.py index 622c37d4f2a2e909d2ac7a0f2b3dad6996306e92..09a7869b2ac2e8720c7d9b458c4dd4ecf30594b9 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -27,9 +27,9 @@ copyright = '2021, IndiScale GmbH' author = 'Daniel Hornung' # The short X.Y version -version = '0.7.1' +version = '0.8.1' # The full version, including alpha/beta/rc tags -release = '0.7.1-dev' +release = '0.8.1-dev' # -- General configuration --------------------------------------------------- diff --git a/src/doc/index.rst b/src/doc/index.rst index 62158834ce618a088b65f2fb92c786fbce2828ec..1e0c843851e03ad344d31a5ca22e96082f3522be 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -15,6 +15,7 @@ This documentation helps you to :doc:`get started<getting_started>`, explains th Concepts <concepts> The LinkAhead Crawler <crawler> YAML data model specification <yaml_interface> + Specifying a datamodel with JSON schema <json_schema_interface> _apidoc/modules diff --git a/src/doc/json_schema_interface.rst b/src/doc/json_schema_interface.rst new file mode 100644 index 0000000000000000000000000000000000000000..0e8aebd3a4204f29608212f7ed0c115fd1d4a134 --- /dev/null +++ b/src/doc/json_schema_interface.rst @@ -0,0 +1,75 @@ +Defining datamodels with a JSON schema specification +==================================================== + +TODO, see https://gitlab.com/caosdb/caosdb-advanced-user-tools/-/issues/42 + +Further information +################### + +Pattern Properties +%%%%%%%%%%%%%%%%%% + +The JSON-schema parser has rudimentary support for ``patternProperties``. Since +their names (only the pattern that their names will suffice) are not known a +priori, we create RecordTypes for all pattern properties. The names of these +RecordTypes are created from their parent element's name by appending the string +``"Entry"`` and possibly a number if there are more than one pattern properties +for one parent. + +All the RecordTypes created for pattern properties have at least an obligatory +``__matched_pattern`` property which will -- as the name suggests -- store the +matched pattern of an actual data entry. + +.. note:: + + The ``__matched_pattern`` property is added automatically to your datamodel + as soon as there is at least one pattern property in your JSON schema. So be + sure that you don't happen to have an entity with exactly this name in your + database. + +E.g., a json schema with + +.. code-block:: json + + "dataset": { + "patternProperties": { + "^[0-9]{4,4}": { + "type": "boolean" + }, + "^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}": { + "type": "object", + "properties": { + "date_id": { + "$ref": "#/definitions/uuid" + } + } + } + } + } + +Would result in a ``Dataset`` RecordType that has the two properties +``DatasetEntry_1`` and ``DatasetEntry_2`` (as always, name can be overwritten +explicitly by specifying the ``title`` property), referencing corresponding +``DatasetEntry_1`` and ``DatasetEntry_2`` Records. 
+ +Apart from the aforementioned ``__matched_pattern`` property, ``DatasetEntry_1`` +also has the ``DatasetEntry_1_value`` property with datatype ``BOOLEAN``, which +stores the actual value. In turn, ``DatasetEntry_2`` is of ``type: object`` and +is treated like any other RecordType. Consequently, it has, apart from the +``__matched_pattern`` property, a ``date_id`` property as specified in its +``properties``. + +Array entries without ``items`` specification +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +JSON schema allows for properties of ``type: array`` without the ``items`` +specification that consequently can be arrays of any (and of mixed) types. While +this is in general problematic when specifying a data model, sometimes these +properties cannot be specified further, e.g., when you're using an external +schema that you cannot change. + +These properties can still be added to your datamodel by specifying their types +explicitly in a dictionary or, alternatively, they can be ignored. See the +``types_for_missing_array_items`` and ``ignore_unspecified_array_items`` +parameters of ``models.parser.JsonSchemaParser``, respectively, for more +information. diff --git a/src/doc/yaml_interface.rst b/src/doc/yaml_interface.rst index 51eb44fc95a8c3de108eaf6c2d277288373da68f..85d63666cb396907556d3f3e69f7b05806fb8fc3 100644 --- a/src/doc/yaml_interface.rst +++ b/src/doc/yaml_interface.rst @@ -125,6 +125,13 @@ You can use the yaml parser directly in python as follows: This creates a DataModel object containing all entities defined in the yaml file. +If the parsed data model shall be appended to a pre-existing data model, the optional +``existing_model`` parameter can be used: + +.. code-block:: python + + new_model = parser.parse_model_from_yaml("model.yml", existing_model=old_model) + You can then use the functions from linkaheadadvancedtools.models.data_model.DataModel to synchronize the model with a LinkAhead instance, e.g.: diff --git a/src/linkaheadadvancedtools/models/data_model.py b/src/linkaheadadvancedtools/models/data_model.py index d207ff738d9fbf4d56b88d8d84a7f1e4e6225b39..1ed2ba4cd049a9b3abcdd9df1ba37b36e127b6b9 100644 --- a/src/linkaheadadvancedtools/models/data_model.py +++ b/src/linkaheadadvancedtools/models/data_model.py @@ -60,7 +60,8 @@ class DataModel(dict): different purpose (e.g. someone else's experiment). DataModel inherits from dict. The keys are always the names of the - entities. Thus you cannot have unnamed entities in your model. + entities. Thus you cannot have unnamed or ambiguously named entities in your + model. Example: diff --git a/src/linkaheadadvancedtools/models/parser.py b/src/linkaheadadvancedtools/models/parser.py index 803fc7facd0d6544b8ccf2d59b41e9e78177083e..d688b604c2082015a2b4653d6f5e2823249b0dd5 100644 --- a/src/linkaheadadvancedtools/models/parser.py +++ b/src/linkaheadadvancedtools/models/parser.py @@ -1,8 +1,8 @@ # This file is a part of the LinkAhead Project. # -# Copyright (C) 2022 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> # Copyright (C) 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> -# Copyright (C) 2022 Daniel Hornung <d.hornung@indiscale.com> +# Copyright (C) 2023 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -35,13 +35,14 @@ not defined, simply the name can be supplied with no value. Parents can be provided under the 'inherit_from_xxxx' keywords.
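For illustration, a minimal sketch of how the two array-item parameters might be used. It assumes the ``datamodel_missing_array_items.schema.json`` test schema added further below; the file path and the fall-back datatype are illustrative only.

.. code-block:: python

    import linkahead as db
    from linkaheadadvancedtools.models.parser import parse_model_from_json_schema

    # Option 1: drop `type: array` properties that have no `items` specification.
    model = parse_model_from_json_schema(
        "datamodel_missing_array_items.schema.json",
        ignore_unspecified_array_items=True)

    # Option 2: supply a fall-back datatype for the unspecified list property.
    model = parse_model_from_json_schema(
        "datamodel_missing_array_items.schema.json",
        types_for_missing_array_items={"missing": db.FILE})
    assert model["something_with_missing_array_items"].get_property(
        "missing").datatype == db.LIST(db.FILE)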
The value needs to be a list with the names. Here, NO NEW entities can be defined. """ -import json import argparse +import json +import jsonref import re import sys import yaml -from typing import List +from typing import List, Optional from warnings import warn import jsonschema @@ -76,7 +77,8 @@ JSON_SCHEMA_ATOMIC_TYPES = [ "string", "boolean", "integer", - "number" + "number", + "null" ] @@ -138,30 +140,69 @@ class JsonSchemaDefinitionError(RuntimeError): super().__init__(msg) -def parse_model_from_yaml(filename): - """Shortcut if the Parser object is not needed.""" +def parse_model_from_yaml(filename, existing_model: Optional[dict] = None): + """Shortcut if the Parser object is not needed. + +Parameters +---------- + +existing_model : dict, optional + An existing model to which the created model shall be added. + """ parser = Parser() - return parser.parse_model_from_yaml(filename) + return parser.parse_model_from_yaml(filename, existing_model=existing_model) + + +def parse_model_from_string(string, existing_model: Optional[dict] = None): + """Shortcut if the Parser object is not needed. +Parameters +---------- -def parse_model_from_string(string): - """Shortcut if the Parser object is not needed.""" +existing_model : dict, optional + An existing model to which the created model shall be added. + """ parser = Parser() - return parser.parse_model_from_string(string) + return parser.parse_model_from_string(string, existing_model=existing_model) -def parse_model_from_json_schema(filename: str): +def parse_model_from_json_schema( + filename: str, + top_level_recordtype: bool = True, + types_for_missing_array_items: dict = {}, + ignore_unspecified_array_items: bool = False, + existing_model: Optional[dict] = None +): """Return a datamodel parsed from a json schema definition. Parameters ---------- + filename : str The path of the json schema file that is to be parsed + top_level_recordtype : bool, optional + Whether there is a record type defined at the top level of the + schema. Default is true. + + types_for_missing_array_items : dict, optional + dictionary containing fall-back types for json entries with `type: + array` but without `items` specification. Default is an empty dict. + + ignore_unspecified_array_items : bool, optional + Whether to ignore `type: array` entries the type of which is not + specified by their `items` property or given in + `types_for_missing_array_items`. An error is raised if they are not + ignored. Default is False. + + existing_model : dict, optional + An existing model to which the created model shall be added. + Returns ------- + out : Datamodel The datamodel generated from the input schema which then can be used for synchronizing with LinkAhead. @@ -174,10 +215,10 @@ def parse_model_from_json_schema(filename: str): """ # @author Florian Spreckelsen # @date 2022-02-17 - # @review Daniel Hornung 2022-02-18 - parser = JsonSchemaParser() + # @review Timm Fitschen 2023-05-25 + parser = JsonSchemaParser(types_for_missing_array_items, ignore_unspecified_array_items) - return parser.parse_model_from_json_schema(filename) + return parser.parse_model_from_json_schema(filename, top_level_recordtype) class Parser(object): @@ -189,7 +230,7 @@ class Parser(object): self.model = {} self.treated = [] - def parse_model_from_yaml(self, filename): + def parse_model_from_yaml(self, filename, existing_model: Optional[dict] = None): """Create and return a data model from the given file. 
Parameters @@ -197,6 +238,9 @@ class Parser(object): filename : str The path to the YAML file. + existing_model : dict, optional + An existing model to which the created model shall be added. + Returns ------- out : DataModel @@ -205,9 +249,9 @@ class Parser(object): with open(filename, 'r') as outfile: ymlmodel = yaml.load(outfile, Loader=SafeLineLoader) - return self._create_model_from_dict(ymlmodel) + return self._create_model_from_dict(ymlmodel, existing_model=existing_model) - def parse_model_from_string(self, string): + def parse_model_from_string(self, string, existing_model: Optional[dict] = None): """Create and return a data model from the given YAML string. Parameters @@ -215,6 +259,9 @@ class Parser(object): string : str The YAML string. + existing_model : dict, optional + An existing model to which the created model shall be added. + Returns ------- out : DataModel @@ -222,9 +269,9 @@ class Parser(object): """ ymlmodel = yaml.load(string, Loader=SafeLineLoader) - return self._create_model_from_dict(ymlmodel) + return self._create_model_from_dict(ymlmodel, existing_model=existing_model) - def _create_model_from_dict(self, ymlmodel): + def _create_model_from_dict(self, ymlmodel, existing_model: Optional[dict] = None): """Create and return a data model out of the YAML dict `ymlmodel`. Parameters @@ -232,6 +279,9 @@ class Parser(object): ymlmodel : dict The dictionary parsed from a YAML file. + existing_model : dict, optional + An existing model to which the created model shall be added. + Returns ------- out : DataModel @@ -241,6 +291,9 @@ class Parser(object): if not isinstance(ymlmodel, dict): raise ValueError("Yaml file should only contain one dictionary!") + if existing_model is not None: + self.model.update(existing_model) + # Extern keyword: # The extern keyword can be used to include Properties and RecordTypes # from existing LinkAhead datamodels into the current model. @@ -258,9 +311,9 @@ class Parser(object): self.model[name] = db.Property(name=name).retrieve() continue for role in ("Property", "RecordType", "Record", "File"): - if db.execute_query("COUNT {} {}".format(role, name)) > 0: + if db.execute_query("COUNT {} \"{}\"".format(role, name)) > 0: self.model[name] = db.execute_query( - f"FIND {role} WITH name={name}", unique=True) + f"FIND {role} WITH name=\"{name}\"", unique=True) break else: raise Exception("Did not find {}".format(name)) @@ -276,7 +329,12 @@ class Parser(object): self._check_and_convert_datatypes() for name, entity in ymlmodel.items(): - self._treat_entity(name, entity, line=ymlmodel["__line__"]) + try: + self._treat_entity(name, entity, line=ymlmodel["__line__"]) + except ValueError as err: + err_str = err.args[0].replace("invalid keyword:", + f"invalid keyword in line {entity['__line__']}:", 1) + raise ValueError(err_str, *err.args[1:]) from err return DataModel(self.model.values()) @@ -327,8 +385,7 @@ class Parser(object): if definition is None: return - if (self.model[name] is None - and isinstance(definition, dict) + if (self.model[name] is None and isinstance(definition, dict) # is it a property and "datatype" in definition # but not simply an RT of the model @@ -600,14 +657,13 @@ class Parser(object): class JsonSchemaParser(Parser): """Extends the yaml parser to read in datamodels defined in a json schema. 
- **EXPERIMENTAL:** While this calss can already be used to create data models + **EXPERIMENTAL:** While this class can already be used to create data models from basic json schemas, there are the following limitations and missing features: * Due to limitations of json-schema itself, we currently do not support inheritance in the imported data models * The same goes for suggested properties of RecordTypes - * Currently, ``$defs`` and ``$ref`` in the input schema are not resolved. * Already defined RecordTypes and (scalar) Properties can't be re-used as list properties * Reference properties that are different from the referenced RT. (Although @@ -615,15 +671,18 @@ class JsonSchemaParser(Parser): * Values * Roles * The extern keyword from the yaml parser - * Currently, a json-schema cannot be transformed into a data model if its - root element isn't a RecordType (or Property) with ``title`` and ``type``. """ # @author Florian Spreckelsen # @date 2022-02-17 - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 + + def __init__(self, types_for_missing_array_items={}, ignore_unspecified_array_items=False): + super().__init__() + self.types_for_missing_array_items = types_for_missing_array_items + self.ignore_unspecified_array_items = ignore_unspecified_array_items - def parse_model_from_json_schema(self, filename: str): + def parse_model_from_json_schema(self, filename: str, top_level_recordtype: bool = True): """Return a datamodel created from the definition in the json schema in `filename`. @@ -631,6 +690,9 @@ class JsonSchemaParser(Parser): ---------- filename : str The path to the json-schema file containing the datamodel definition + top_level_recordtype : bool, optional + Whether there is a record type defined at the top level of the + schema. Default is true. Returns ------- @@ -639,13 +701,13 @@ class JsonSchemaParser(Parser): """ # @author Florian Spreckelsen # @date 2022-02-17 - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 with open(filename, 'r') as schema_file: - model_dict = json.load(schema_file) + model_dict = jsonref.load(schema_file) - return self._create_model_from_dict(model_dict) + return self._create_model_from_dict(model_dict, top_level_recordtype=top_level_recordtype) - def _create_model_from_dict(self, model_dict: [dict, List[dict]]): + def _create_model_from_dict(self, model_dict: [dict, List[dict]], top_level_recordtype: bool = True): """Parse a dictionary and return the Datamodel created from it. The dictionary was typically created from the model definition in a json schema file. @@ -654,36 +716,68 @@ class JsonSchemaParser(Parser): ---------- model_dict : dict or list[dict] One or several dictionaries read in from a json-schema file + top_level_recordtype : bool, optional + Whether there is a record type defined at the top level of the + schema. Default is true. 
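A sketch of the new ``top_level_recordtype`` switch, following the ``datamodel_no_toplevel_entity.schema.json`` unit test added below; the path is shown for illustration only.

.. code-block:: python

    from linkaheadadvancedtools.models.parser import parse_model_from_json_schema

    # The schema's top level only bundles definitions and properties, so no
    # RecordType is created for it; each top-level property becomes its own entity.
    model = parse_model_from_json_schema(
        "datamodel_no_toplevel_entity.schema.json", top_level_recordtype=False)

    dataset1 = model["Dataset1"]
    assert dataset1.get_property("number_prop") is not None
    # Top-level pattern properties get the generic "__Pattern" prefix:
    assert "__PatternEntry_1" in model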
Returns ------- our : DataModel The datamodel defined in `model_dict` """ - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 if isinstance(model_dict, dict): model_dict = [model_dict] for ii, elt in enumerate(model_dict): - if "title" not in elt: - raise JsonSchemaDefinitionError( - f"Object {ii+1} is lacking the `title` key word") - if "type" not in elt: - raise JsonSchemaDefinitionError( - f"Object {ii+1} is lacking the `type` key word") - # Check if this is a valid Json Schema try: jsonschema.Draft202012Validator.check_schema(elt) except jsonschema.SchemaError as err: + key = elt["title"] if "title" in elt else f"element {ii}" raise JsonSchemaDefinitionError( - f"Json Schema error in {elt['title']}:\n{str(err)}") from err - name = self._stringify(elt["title"], context=elt) - self._treat_element(elt, name) + f"Json Schema error in {key}:\n{str(err)}") from err + + if top_level_recordtype: + if "title" not in elt: + raise JsonSchemaDefinitionError( + f"Object {ii+1} is lacking the `title` key word") + if "type" not in elt: + raise JsonSchemaDefinitionError( + f"Object {ii+1} is lacking the `type` key word") + # Check if this is a valid Json Schema + name = self._stringify(elt["title"], context=elt) + self._treat_element(elt, name) + elif "properties" in elt or "patternProperties" in elt: + # No top-level type but there are entities + if "properties" in elt: + for key, prop in elt["properties"].items(): + name = self._get_name_from_property(key, prop) + self._treat_element(prop, name) + if "patternProperties" in elt: + # See also treatment in ``_treat_record_type``. Since here, + # there is no top-level RT we use the prefix `__Pattern`, + # i.e., the resulting Record Types will be called + # `__PatternElement`. + self._treat_pattern_properties( + elt["patternProperties"], name_prefix="__Pattern") + else: + # Neither RecordType itself, nor further properties in schema, + # so nothing to do here. Maybe add something in the future. + continue return DataModel(self.model.values()) + def _get_name_from_property(self, key: str, prop: dict): + # @review Timm Fitschen 2023-05-25 + if "title" in prop: + name = self._stringify(prop["title"]) + else: + name = self._stringify(key) + + return name + def _get_atomic_datatype(self, elt): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 if elt["type"] == "string": if "format" in elt and elt["format"] in ["date", "date-time"]: return db.DATETIME @@ -695,11 +789,15 @@ class JsonSchemaParser(Parser): return db.DOUBLE elif elt["type"] == "boolean": return db.BOOLEAN + elif elt["type"] == "null": + # This could be any datatype since a valid json will never have a + # value in a null property. We use TEXT for convenience. + return db.TEXT else: raise JsonSchemaDefinitionError(f"Unkown atomic type in {elt}.") def _treat_element(self, elt: dict, name: str): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 force_list = False if name in self.model: return self.model[name], force_list @@ -710,12 +808,17 @@ class JsonSchemaParser(Parser): if name == "name": # This is identified with the LinkAhead name property as long as the # type is correct. - if not elt["type"] == "string": + if not elt["type"] == "string" and "string" not in elt["type"]: raise JsonSchemaDefinitionError( "The 'name' property must be string-typed, otherwise it cannot " "be identified with LinkAhead's name property." 
) return None, force_list + # LinkAhead suports null for all types, so in the very special case of + # `"type": ["null", "<other_type>"]`, only consider the other type: + if isinstance(elt["type"], list) and len(elt["type"]) == 2 and "null" in elt["type"]: + elt["type"].remove("null") + elt["type"] = elt["type"][0] if "enum" in elt: ent = self._treat_enum(elt, name) elif elt["type"] in JSON_SCHEMA_ATOMIC_TYPES: @@ -733,11 +836,12 @@ class JsonSchemaParser(Parser): # treat_something function ent.description = elt["description"] - self.model[name] = ent + if ent is not None: + self.model[name] = ent return ent, force_list def _treat_record_type(self, elt: dict, name: str): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 rt = db.RecordType(name=name) if "required" in elt: required = elt["required"] @@ -745,10 +849,7 @@ class JsonSchemaParser(Parser): required = [] if "properties" in elt: for key, prop in elt["properties"].items(): - if "title" in prop: - name = self._stringify(prop["title"]) - else: - name = self._stringify(key) + name = self._get_name_from_property(key, prop) prop_ent, force_list = self._treat_element(prop, name) if prop_ent is None: # Nothing to be appended since the property has to be @@ -762,6 +863,17 @@ class JsonSchemaParser(Parser): rt.add_property(prop_ent, importance=importance, datatype=db.LIST(prop_ent)) + if "patternProperties" in elt: + + pattern_property_rts = self._treat_pattern_properties( + elt["patternProperties"], name_prefix=name) + for ppr in pattern_property_rts: + # add reference to pattern property type. These can never be + # obligatory since pattern properties cannot be required in the + # original schema (since their actual names are not known a + # priori). + rt.add_property(ppr) + if "description" in elt: rt.description = elt["description"] return rt @@ -783,28 +895,96 @@ class JsonSchemaParser(Parser): return rt def _treat_list(self, elt: dict, name: str): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 - if "items" not in elt: + if "items" not in elt and name not in self.types_for_missing_array_items: + if self.ignore_unspecified_array_items: + return None, False raise JsonSchemaDefinitionError( f"The definition of the list items is missing in {elt}.") - items = elt["items"] - if "enum" in items: - return self._treat_enum(items, name), True - if items["type"] in JSON_SCHEMA_ATOMIC_TYPES: - datatype = db.LIST(self._get_atomic_datatype(items)) + if "items" in elt: + items = elt["items"] + if "enum" in items: + return self._treat_enum(items, name), True + if items["type"] in JSON_SCHEMA_ATOMIC_TYPES: + datatype = db.LIST(self._get_atomic_datatype(items)) + return db.Property(name=name, datatype=datatype), False + if items["type"] == "object": + if "title" not in items or self._stringify(items["title"]) == name: + # Property is RecordType + return self._treat_record_type(items, name), True + else: + # List property will be an entity of its own with a name + # different from the referenced RT + ref_rt = self._treat_record_type( + items, self._stringify(items["title"])) + self.model[ref_rt.name] = ref_rt + return db.Property(name=name, datatype=db.LIST(ref_rt)), False + else: + # Use predefined type: + datatype = db.LIST(self.types_for_missing_array_items[name]) return db.Property(name=name, datatype=datatype), False - if items["type"] == "object": - if "title" not in items or self._stringify(items["title"]) == name: - # Property is RecordType - return self._treat_record_type(items, name), True + + 
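The intended mapping of ``null`` types can be illustrated with a small, hypothetical schema (not one of the shipped test files); writing it to a temporary file is only a convenience for a self-contained sketch.

.. code-block:: python

    import json
    from tempfile import NamedTemporaryFile

    import linkahead as db
    from linkaheadadvancedtools.models.parser import parse_model_from_json_schema

    schema = {
        "title": "Dataset",
        "type": "object",
        "properties": {
            "null_prop": {"type": "null"},             # plain null falls back to TEXT
            "user_id": {"type": ["string", "null"]},   # union with null: use the other type
        },
    }
    with NamedTemporaryFile("w", suffix=".schema.json", delete=False) as tmp:
        json.dump(schema, tmp)

    model = parse_model_from_json_schema(tmp.name)
    assert model["Dataset"].get_property("null_prop").datatype == db.TEXT
    assert model["Dataset"].get_property("user_id").datatype == db.TEXT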
def _get_pattern_prop(self): + # @review Timm Fitschen 2023-05-25 + if "__pattern_property_pattern_property" in self.model: + return self.model["__pattern_property_pattern_property"] + pp = db.Property(name="__matched_pattern", datatype=db.TEXT) + self.model["__pattern_property_pattern_property"] = pp + return pp + + def _treat_pattern_properties(self, pattern_elements, name_prefix=""): + """Special Treatment for pattern properties: A RecordType is created for + each pattern property. In case of a `type: object` PatternProperty, the + remaining properties of the JSON entry are appended to the new + RecordType; in case of an atomic type PatternProperty, a single value + Property is added to the RecordType. + + Raises + ------ + NotImplementedError + In case of patternProperties with non-object, non-atomic type, e.g., + array. + + """ + # @review Timm Fitschen 2023-05-25 + num_patterns = len(pattern_elements) + pattern_prop = self._get_pattern_prop() + returns = [] + for ii, (key, element) in enumerate(pattern_elements.items()): + if "title" not in element: + name_suffix = f"_{ii+1}" if num_patterns > 1 else "" + name = name_prefix + "Entry" + name_suffix else: - # List property will be an entity of its own with a name - # different from the referenced RT - ref_rt = self._treat_record_type( - items, self._stringify(items["title"])) - self.model[ref_rt.name] = ref_rt - return db.Property(name=name, datatype=db.LIST(ref_rt)), False + name = element["title"] + if element["type"] == "object": + # simple, is already an object, so can be treated like any other + # record type. + pattern_type = self._treat_record_type(element, name) + elif element["type"] in JSON_SCHEMA_ATOMIC_TYPES: + # create a property that stores the actual value of the pattern + # property. + propname = f"{name}_value" + prop = db.Property(name=propname, datatype=self._get_atomic_datatype(element)) + self.model[propname] = prop + pattern_type = db.RecordType(name=name) + pattern_type.add_property(prop) + else: + raise NotImplementedError( + "Pattern properties are currently only supported for types " + + ", ".join(JSON_SCHEMA_ATOMIC_TYPES) + ", and object.") + + # Add pattern property and description + pattern_type.add_property(pattern_prop, importance=db.OBLIGATORY) + if pattern_type.description: + pattern_type.description += f"\n\npattern: {key}" + else: + pattern_type.description = f"pattern: {key}" + + self.model[name] = pattern_type + returns.append(pattern_type) + + return returns if __name__ == "__main__": diff --git a/src/linkaheadadvancedtools/table_export.py b/src/linkaheadadvancedtools/table_export.py index be544b4862611ceb8e59bf73becb3df52572e22a..42e83daf7025a1409fc2a8d9cd3a61cf76f18cd1 100644 --- a/src/linkaheadadvancedtools/table_export.py +++ b/src/linkaheadadvancedtools/table_export.py @@ -27,6 +27,7 @@ them for an export as a table, e.g., for the export to metadata repositories. """ +from inspect import signature import json import logging @@ -83,7 +84,7 @@ class BaseTableExporter(object): ``` {"entry_to_be_exported: { "optional": True/False - "find_func": name of member function + "find_func": callable or name of member function "query": query string "selector": selector for the query "error": error explanation @@ -97,8 +98,8 @@ class BaseTableExporter(object): - optional: True or False, if not present, the entry is assumed to be mandatory. - find_func: name of the member function that returns the - value for this entry. Must not exist together with - `query` + value for this entry or callable object. 
Must not exist + together with `query` - query: Query string for finding the value for this entry. If this is given, a record must be given to the constructor of this class. The query is then executed as @@ -132,6 +133,7 @@ class BaseTableExporter(object): self._check_sanity_of_export_dict() self.raise_error_if_missing = raise_error_if_missing self.info = {} + self.all_keys = [key for key in self.export_dict] def collect_information(self): """Use the items of `export_dict` to collect the information for the @@ -139,7 +141,8 @@ class BaseTableExporter(object): """ - for e, d in self.export_dict.items(): + for e in self.all_keys: + d = self.export_dict[e] if QUERY in d: # TODO: How do we make this more general? There might # be queries that don't need the record or work with @@ -163,12 +166,15 @@ class BaseTableExporter(object): else: self._append_missing(e, d) elif FIND_FUNCTION in d: - find_fun = getattr(self, d[FIND_FUNCTION]) try: - self.info[e] = find_fun() + val = self._call_find_function(d[FIND_FUNCTION], e) + if val is not None: + self.info[e] = val + else: + self._append_missing(e, d) except Exception as exc: self._append_missing(e, d) - logger.debug(exc) + logger.error(exc) # last resort: check if record has e as property: else: try: @@ -200,6 +206,22 @@ class BaseTableExporter(object): else: logger.error(errmssg) + def _call_find_function(self, find_function, e): + account_for_self = 0 + if callable(find_function): + find_fun = find_function + else: + find_fun = getattr(self, find_function) + account_for_self = 1 + + sig = signature(find_fun) + params = sig.parameters + if len(params) > (account_for_self + 1): + return find_fun(self.record, e) + elif len(params) > account_for_self: + return find_fun(self.record) + return find_fun() + def prepare_csv_export(self, delimiter=',', print_header=False, skip_empty_optionals=False): """Return the values in self.info as a single-line string, separated @@ -238,7 +260,8 @@ class BaseTableExporter(object): if print_header: header = "" - for e, d in self.export_dict.items(): + for e in self.all_keys: + d = self.export_dict[e] if e in self.info: body += str(self.info[e]) + delimiter @@ -287,7 +310,9 @@ class BaseTableExporter(object): # check find function if present if FIND_FUNCTION in d: - if not hasattr(self, d[FIND_FUNCTION]): + if callable(d[FIND_FUNCTION]): + pass + elif not hasattr(self, d[FIND_FUNCTION]): raise TableExportError( "Find function " + d[FIND_FUNCTION] + " was specified for entry " + e + diff --git a/src/linkaheadadvancedtools/table_importer.py b/src/linkaheadadvancedtools/table_importer.py index ea063cb5e7e5adfc37923c6f7637cb428d439b4d..d46920086e63479d41a3c3ab9f9184e22025f4a2 100755 --- a/src/linkaheadadvancedtools/table_importer.py +++ b/src/linkaheadadvancedtools/table_importer.py @@ -322,7 +322,7 @@ class TableImporter(): .. note:: If columns are integer, but should be float, this method converts the respective columns - in place. + in place. The same for columns that should have string value but have numeric value. Parameters ---------- @@ -338,9 +338,11 @@ class TableImporter(): # float, because LinkAhead does not have different sizes anyway. col_dtype = df.dtypes[key] if not strict and not np.issubdtype(col_dtype, datatype): - issub = np.issubdtype # These special cases should be fine. 
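A sketch of how a callable ``find_func`` might be passed. The record, the entry names, and the helper functions are made up for illustration, and the constructor is assumed to accept the export dictionary plus an optional ``record``.

.. code-block:: python

    import linkahead as db
    from linkaheadadvancedtools.table_export import BaseTableExporter

    def find_title(record):
        # One-parameter callables receive the record.
        return record.get_property("title").value

    def find_note(record, entry):
        # Two-parameter callables additionally receive the entry name.
        return f"value for {entry}"

    record = db.Record(name="demo_dataset")
    record.add_property(name="title", value="My dataset")

    export_dict = {
        "title": {"find_func": find_title},
        "note": {"find_func": find_note, "optional": True},
    }
    exporter = BaseTableExporter(export_dict, record=record)
    exporter.collect_information()
    print(exporter.prepare_csv_export(print_header=True))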
- if issub(col_dtype, np.integer) and issub(datatype, np.floating): + if ((datatype == str) + or (np.issubdtype(col_dtype, np.integer) + and np.issubdtype(datatype, np.floating)) + ): # NOQA df[key] = df[key].astype(datatype) # Now check each element @@ -388,7 +390,8 @@ class TableImporter(): if key not in df.columns: continue - if pd.isnull(row.loc[key]): + null_check = pd.isnull(row.loc[key]) + if (isinstance(null_check, np.ndarray) and null_check.any()) or (not isinstance(null_check, np.ndarray) and null_check): errmsg = ( "Required information is missing ({}) in {}. row" " (without header) of " diff --git a/unittests/json-schema-models/datamodel_atomic_properties.schema.json b/unittests/json-schema-models/datamodel_atomic_properties.schema.json index 3828f131180a839d5c9b8bc5aa1a1285717da723..7b4a23e5bb48b995d07a261bcae0a8a486b7969a 100644 --- a/unittests/json-schema-models/datamodel_atomic_properties.schema.json +++ b/unittests/json-schema-models/datamodel_atomic_properties.schema.json @@ -18,7 +18,8 @@ "date": { "type": "string", "format": "date" }, "integer": { "type": "integer", "description": "Some integer property" }, "boolean": { "type": "boolean" }, - "number_prop": { "type": "number", "description": "Some float property" } + "number_prop": { "type": "number", "description": "Some float property" }, + "null_prop": { "type": "null", "description": "This property will never have a value." } } } ] diff --git a/unittests/json-schema-models/datamodel_missing_array_items.schema.json b/unittests/json-schema-models/datamodel_missing_array_items.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..8ac17ac3162def3dbf070d7027fd318366bb4682 --- /dev/null +++ b/unittests/json-schema-models/datamodel_missing_array_items.schema.json @@ -0,0 +1,9 @@ +{ + "title": "something_with_missing_array_items", + "type": "object", + "properties": { + "missing": { + "type": "array" + } + } +} diff --git a/unittests/json-schema-models/datamodel_no_toplevel_entity.schema.json b/unittests/json-schema-models/datamodel_no_toplevel_entity.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..35240d765479b719576e6ee67e387790d3d6d160 --- /dev/null +++ b/unittests/json-schema-models/datamodel_no_toplevel_entity.schema.json @@ -0,0 +1,56 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://my-schema-id.net", + "type": "object", + "definitions": { + "uuid": { + "type": [ + "string", + "null" + ], + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + }, + "datetime": { + "type": "string", + "format": "date-time" + } + }, + "properties": { + "Dataset1": { + "title": "Dataset1", + "description": "Some description", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "full dataset title" + }, + "campaign": { + "type": "string", + "description": "FIXME" + }, + "number_prop": { + "type": "number", + "description": "Some float property" + }, + "user_id": { + "$ref": "#/definitions/uuid" + } + }, + "required": ["title", "number_prop"] + } + }, + "patternProperties": { + "^[0-9]{4,4}": { + "type": "boolean" + }, + "^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}": { + "type": "object", + "properties": { + "date_id": { + "$ref": "#/definitions/uuid" + } + } + } + } +} diff --git a/unittests/json-schema-models/datamodel_pattern_properties.schema.json b/unittests/json-schema-models/datamodel_pattern_properties.schema.json new file mode 100644 index 
0000000000000000000000000000000000000000..9b85c7b80cf0990713f8f130050c21751e311b42 --- /dev/null +++ b/unittests/json-schema-models/datamodel_pattern_properties.schema.json @@ -0,0 +1,39 @@ +[ + { + "title": "Dataset", + "type": "object", + "patternProperties": { + "^[0-9]{4,4}": { + "type": "boolean" + }, + "^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}": { + "type": "object", + "properties": { + "date_id": { + "type": [ + "string", + "null" + ], + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + } + } + } + } + }, + { + "title": "Dataset2", + "type": "object", + "properties": { + "datetime": { + "type": "string", + "format": "date-time" + } + }, + "patternProperties": { + ".*": { + "title": "Literally anything", + "type": "object" + } + } + } +] diff --git a/unittests/model.yml b/unittests/models/model.yml similarity index 100% rename from unittests/model.yml rename to unittests/models/model.yml diff --git a/unittests/models/model_invalid.yml b/unittests/models/model_invalid.yml new file mode 100644 index 0000000000000000000000000000000000000000..c8368b9701db9b3461b7e0f1f3514c2411f56b56 --- /dev/null +++ b/unittests/models/model_invalid.yml @@ -0,0 +1,2 @@ +Project: + ObligatoryProperties: diff --git a/unittests/test_json_schema_model_parser.py b/unittests/test_json_schema_model_parser.py index ea52701deb0ae04c0be93693d160a9eec36b98ee..00ba20552741ca823af76b571cd9f8361d3a814b 100644 --- a/unittests/test_json_schema_model_parser.py +++ b/unittests/test_json_schema_model_parser.py @@ -22,11 +22,11 @@ # @review Daniel Hornung 2022-02-18 import os -import pytest import linkahead as db -from linkaheadadvancedtools.models.parser import (parse_model_from_json_schema, - JsonSchemaDefinitionError) +import pytest +from linkaheadadvancedtools.models.parser import (JsonSchemaDefinitionError, + parse_model_from_json_schema) FILEPATH = os.path.join(os.path.dirname( os.path.abspath(__file__)), 'json-schema-models') @@ -103,7 +103,7 @@ def test_datamodel_with_atomic_properties(): assert isinstance(rt2, db.RecordType) assert rt2.name == "Dataset2" assert not rt2.description - assert len(rt2.get_properties()) == 5 + assert len(rt2.get_properties()) == 6 date_prop = rt2.get_property("date") assert date_prop.datatype == db.DATETIME @@ -121,6 +121,9 @@ def test_datamodel_with_atomic_properties(): float_prop2 = rt2.get_property("number_prop") assert float_prop.datatype == float_prop2.datatype + null_prop = rt2.get_property("null_prop") + assert null_prop.datatype == db.TEXT + def test_required_no_list(): """Exception must be raised when "required" is not a list.""" @@ -356,3 +359,130 @@ def test_name_property(): assert str(err.value).startswith( "The 'name' property must be string-typed, otherwise it cannot be identified with " "LinkAhead's name property.") + + +def test_no_toplevel_entity(): + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_no_toplevel_entity.schema.json"), top_level_recordtype=False) + + assert "Dataset1" in model + rt1 = model["Dataset1"] + + assert rt1.name == "Dataset1" + assert rt1.description == "Some description" + assert len(rt1.get_properties()) == 4 + + assert rt1.get_property("title") is not None + assert rt1.get_property("campaign") is not None + assert rt1.get_property("number_prop") is not None + assert rt1.get_property("user_id") is not None + + title_prop = rt1.get_property("title") + assert title_prop.datatype == db.TEXT + assert rt1.get_importance(title_prop.name) == db.OBLIGATORY + + campaign_prop = rt1.get_property("campaign") + 
assert campaign_prop.datatype == db.TEXT + assert rt1.get_importance(campaign_prop.name) == db.RECOMMENDED + + float_prop = rt1.get_property("number_prop") + assert float_prop.datatype == db.DOUBLE + assert rt1.get_importance(float_prop.name) == db.OBLIGATORY + + uid_prop = rt1.get_property("user_id") + assert uid_prop.datatype == db.TEXT + assert rt1.get_importance(uid_prop.name) == db.RECOMMENDED + + # pattern properties without top-level entity: + assert "__PatternEntry_1" in model + assert "__PatternEntry_2" in model + + pattern_boolean_rt = model["__PatternEntry_1"] + assert "pattern: " in pattern_boolean_rt.description + assert len(pattern_boolean_rt.properties) == 2 + pp = pattern_boolean_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_boolean_rt.get_importance(pp.name) == db.OBLIGATORY + value_prop = pattern_boolean_rt.get_property("__PatternEntry_1_value") + assert value_prop.datatype == db.BOOLEAN + + pattern_object_rt = model["__PatternEntry_2"] + assert "pattern: " in pattern_object_rt.description + assert len(pattern_object_rt.properties) == 2 + pp = pattern_object_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_object_rt.get_importance(pp.name) == db.OBLIGATORY + date_id_prop = pattern_object_rt.get_property("date_id") + assert date_id_prop.datatype == db.TEXT + + +def test_missing_array_items(): + + # strict behavior + with pytest.raises(JsonSchemaDefinitionError) as err: + parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_missing_array_items.schema.json")) + + assert "{'type': 'array'}" in str(err) + + # ignore all problems, so a RT is created that does not have the property + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_missing_array_items.schema.json"), ignore_unspecified_array_items=True) + assert "something_with_missing_array_items" in model + rt = model["something_with_missing_array_items"] + assert isinstance(rt, db.RecordType) + assert rt.get_property("missing") is None + + # specify the type: + type_dict = {"missing": db.FILE} + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_missing_array_items.schema.json"), types_for_missing_array_items=type_dict) + assert "something_with_missing_array_items" in model + rt = model["something_with_missing_array_items"] + assert rt.get_property("missing") is not None + assert rt.get_property("missing").datatype == db.LIST(db.FILE) + + +def test_pattern_properties(): + + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_pattern_properties.schema.json")) + + assert "Dataset" in model + rt1 = model["Dataset"] + assert len(rt1.properties) == 2 + for name in ["DatasetEntry_1", "DatasetEntry_2"]: + assert rt1.get_property(name) is not None + assert rt1.get_property(name).is_reference() + + pattern_boolean_rt = model["DatasetEntry_1"] + assert "pattern: " in pattern_boolean_rt.description + assert len(pattern_boolean_rt.properties) == 2 + pp = pattern_boolean_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_boolean_rt.get_importance(pp.name) == db.OBLIGATORY + value_prop = pattern_boolean_rt.get_property("DatasetEntry_1_value") + assert value_prop.datatype == db.BOOLEAN + + pattern_object_rt = model["DatasetEntry_2"] + assert "pattern: " in pattern_object_rt.description + assert len(pattern_object_rt.properties) == 2 + pp = pattern_object_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert 
pattern_object_rt.get_importance(pp.name) == db.OBLIGATORY + date_id_prop = pattern_object_rt.get_property("date_id") + assert date_id_prop.datatype == db.TEXT + + assert "Dataset2" in model + rt2 = model["Dataset2"] + assert len(rt2.properties) == 2 + # This has been tested elsewhere, just make sure that it is properly created + # in the presence of pattern properties, too. + assert rt2.get_property("datetime") is not None + + assert rt2.get_property("Literally anything") is not None + assert rt2.get_property("Literally anything").is_reference() + + pattern_named_rt = model["Literally anything"] + assert len(pattern_named_rt.properties) == 1 + assert pattern_named_rt.get_property("__matched_pattern") is not None diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 384da22da2938934459da0903a68636fc62f8dce..4ac3d45651fd473ab42eb9aa1f99d238b88cb826 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -44,7 +44,7 @@ from test_utils import BaseMockUpTest # For testing the table importer IMPORTER_KWARGS = dict( converters={'c': float, 'd': yes_no_converter, 'x': float}, # x does not exist - datatypes={'a': str, 'b': int, 'x': int}, # x does not exist + datatypes={'a': str, 'b': int, 'float': float, 'x': int}, # x does not exist obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')], existing_columns=['e'], ) @@ -181,12 +181,47 @@ class TableImporterTest(unittest.TestCase): self.assertEqual(df_new.shape[1], 4) self.assertEqual(df_new.iloc[0].b, 5) + # check that missing array-valued fields are detected correctly: + df = pd.DataFrame([[[None, None], 4, 2.0, 'yes'], + ['b', 5, 3.0, 'no']], + columns=['a', 'b', 'c', 'd']) + df_new = importer.check_missing(df) + self.assertEqual(df_new.shape[0], 1) + self.assertEqual(df_new.shape[1], 4) + self.assertEqual(df_new.iloc[0].b, 5) + def test_wrong_datatype(self): importer = TableImporter(**self.importer_kwargs) - df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], + df = pd.DataFrame([[1234, 0, 2.0, 3, 'yes'], + [5678, 1, 2.0, 3, 'yes']], + columns=['a', 'b', 'c', 'float', 'd']) + # wrong datatypes before + assert df["a"].dtype == int + assert df["float"].dtype == int + # strict = False by default, so this shouldn't raise an error + importer.check_datatype(df) + # The types should be correct now. 
+ assert df["a"].dtype == pd.StringDtype + assert df["float"].dtype == float + + # Resetting `df` since check_datatype may change datatypes + df = pd.DataFrame([[None, 0, 2.0, 'yes'], [5, 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) - self.assertRaises(DataInconsistencyError, importer.check_datatype, df) + # strict=True, so number in str column raises an error + self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, True) + + df = pd.DataFrame([[0], + [1]], + columns=['float']) + # strict=True, so int in float column raises an error + self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, True) + + # This is always wrong (float in int column) + df = pd.DataFrame([[None, np.nan, 2.0, 'yes'], + [5, 1.7, 2.0, 'yes']], + columns=['a', 'b', 'c', 'd']) + self.assertRaises(DataInconsistencyError, importer.check_datatype, df, None, False) def test_unique(self): importer = TableImporter(**self.importer_kwargs) @@ -266,6 +301,30 @@ class CSVImporterTest(TableImporterTest): importer = CSVImporter(**self.importer_kwargs) importer.read_file(tmp.name) + def test_with_generous_datatypes(self): + """Same as above but check that values are converted as expected.""" + tmp = NamedTemporaryFile(delete=False, suffix=".csv") + tmp.close() + self.valid_df.to_csv(tmp.name) + # Copy and use float for columns with integer values, string for columns + # with numeric values + kwargs = self.importer_kwargs.copy() + kwargs["datatypes"] = { + 'a': str, + 'b': float, + 'c': str + } + importer = CSVImporter(**kwargs) + importer.read_file(tmp.name) + + kwargs["datatypes"] = { + 'a': str, + 'b': str, + 'c': str + } + importer = CSVImporter(**kwargs) + importer.read_file(tmp.name) + class TSVImporterTest(TableImporterTest): def test_full(self): diff --git a/unittests/test_yaml_model_parser.py b/unittests/test_yaml_model_parser.py index b75114bf671785f204954b07ab65bbd74c381622..8fc7b146719a51c3d1fb45fc690ad3bc7f77594b 100644 --- a/unittests/test_yaml_model_parser.py +++ b/unittests/test_yaml_model_parser.py @@ -1,3 +1,21 @@ +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ import unittest from datetime import date from tempfile import NamedTemporaryFile @@ -340,6 +358,35 @@ A: assert "line {}".format(line) in yde.exception.args[0] +def test_existing_model(): + """Parsing more than one model may require to append to existing models.""" + model_str_1 = """ +A: + obligatory_properties: + number: + datatype: INTEGER + """ + model_str_2 = """ +B: + obligatory_properties: + A: + """ + model_1 = parse_model_from_string(model_str_1) + model_2 = parse_model_from_string(model_str_2, existing_model=model_1) + for ent in ["A", "B", "number"]: + assert ent in model_2 + + model_str_redefine = """ +number: + datatype: DOUBLE + description: Hello number! + """ + model_redefine = parse_model_from_string(model_str_redefine, existing_model=model_1) + print(model_redefine) + assert model_redefine["number"].description == "Hello number!" + assert model_redefine["number"].datatype == db.INTEGER # FIXME Shouldn't this be DOUBLE? + + def test_define_role(): model = """ A: @@ -511,3 +558,11 @@ R3: # Until removal, both do the same assert has_parent(r3, par) assert r3.get_parent(par)._flags["inheritance"] == db.OBLIGATORY + + +def test_yaml_error(): + """Testing error while parsing a yaml. + """ + + with raises(ValueError, match=r"line 2: .*"): + parse_model_from_yaml("unittests/models/model_invalid.yml")
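Taken together, a brief sketch of how the extended YAML parser interface might be used end to end; the file names are placeholders and the final call requires a configured LinkAhead connection.

.. code-block:: python

    from linkaheadadvancedtools.models.parser import parse_model_from_yaml

    # Parse a base model, let a second YAML file extend it via the new
    # `existing_model` argument, and push the combined model to the server.
    base_model = parse_model_from_yaml("base_model.yml")
    full_model = parse_model_from_yaml("project_model.yml",
                                       existing_model=base_model)
    full_model.sync_data_model()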