diff --git a/.gitignore b/.gitignore index 4402aa11bc399c03400c4427c669b93ebb2637ce..182ed05e1404483ecb553c8a4e469a86a77ba27c 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ src/doc/_apidoc/ start_caosdb_docker.sh src/doc/_apidoc /dist/ +*.egg-info diff --git a/CHANGELOG.md b/CHANGELOG.md index 02dcb340a1055881a12c31d90e079c2a5859a041..efb3c04c853ac8b03c7b72fbd5fe27d4317d75f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added some StructureElements: BooleanElement, FloatElement, IntegerElement, ListElement, DictElement - String representation for Identifiables +- JSON schema validation can also be used in the DictElementConverter ### Changed ### @@ -20,6 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Dict, DictElement and DictDictElement were merged into DictElement. - DictTextElement and TextElement were merged into TextElement. The "match" keyword is now invalid for TextElements. +- JSONFileConverter creates another level of StructureElements (see "How to upgrade" in the docs) ### Deprecated ### diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml index 69cb53d4ffb86a3353fbccc2cae3dc3fbea25009..7a64d708667182b80b739812e5fdf3369fc5b462 100644 --- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -32,107 +32,111 @@ Data: match: .dataspace.json validate: schema/dataspace.schema.json subtree: - dataspace_id_element: - type: IntegerElement - match_name: "dataspace_id" - match_value: "(?P<id>[0-9]+)" - records: - Dataspace: - dataspace_id: $id - archived_element: - type: BooleanElement - match_name: "archived" - match_value: "(?P<archived>.*)" - records: - Dataspace: - archived: $archived - url_element: - type: TextElement - match_name: "url" - match_value: "(?P<url>.*)" - records: - Dataspace: - url: $url - coordinator_element: + jsondict: type: DictElement - match_name: "coordinator" - records: - Person: - parents: - - Person - Dataspace: - Person: $Person - subtree: &person_subtree - full_name_element: - type: TextElement - match_name: "full_name" - match_value: "(?P<full_name>.*)" + match: .* + subtree: + dataspace_id_element: + type: IntegerElement + match_name: "dataspace_id" + match_value: "(?P<id>[0-9]+)" records: - Person: - full_name: $full_name - full_name_nonlatin_element: - type: TextElement - match_name: "full_name_nonlatin" - match_value: "(?P<full_name_nonlatin>.*)" + Dataspace: + dataspace_id: $id + archived_element: + type: BooleanElement + match_name: "archived" + match_value: "(?P<archived>.*)" records: - Person: - full_name_nonlatin: $full_name_nonlatin - family_name_element: + Dataspace: + archived: $archived + url_element: type: TextElement - match_name: "family_name" - match_value: "(?P<family_name>.*)" + match_name: "url" + match_value: "(?P<url>.*)" records: - Person: - family_name: $family_name - given_name_element: - type: TextElement - match_name: "given_name" - match_value: "(?P<given_name>.*)" + Dataspace: + url: $url + coordinator_element: + type: DictElement + match_name: "coordinator" records: Person: - given_name: $given_name - email_element: + parents: + - Person + Dataspace: + Person: $Person + subtree: &person_subtree + full_name_element: + type: TextElement + match_name: "full_name" + match_value: "(?P<full_name>.*)" + records: + Person: + full_name: $full_name + full_name_nonlatin_element: + type: TextElement + match_name: "full_name_nonlatin" + match_value: "(?P<full_name_nonlatin>.*)" + records: + Person: + full_name_nonlatin: $full_name_nonlatin + family_name_element: + type: TextElement + match_name: "family_name" + match_value: "(?P<family_name>.*)" + records: + Person: + family_name: $family_name + given_name_element: + type: TextElement + match_name: "given_name" + match_value: "(?P<given_name>.*)" + records: + Person: + given_name: $given_name + email_element: + type: TextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email + affiliation_element: + type: TextElement + match_name: "affiliation" + match_value: "(?P<affiliation>.*)" + records: + Person: + affiliation: $affiliation + ORCID_element: + type: TextElement + match_name: "ORCID" + match_value: "(?P<ORCID>.*)" + records: + Person: + ORCID: $ORCID + start_date_element: type: TextElement - match_name: "email" - match_value: "(?P<email>.*)" + match_name: "start_date" + match_value: "(?P<start_date>.*)" records: - Person: - email: $email - affiliation_element: + Dataspace: + start_date: $start_date + end_date_element: type: TextElement - match_name: "affiliation" - match_value: "(?P<affiliation>.*)" + match_name: "end_date" + match_value: "(?P<end_date>.*)" records: - Person: - affiliation: $affiliation - ORCID_element: + Dataspace: + end_date: $end_date + comment: type: TextElement - match_name: "ORCID" - match_value: "(?P<ORCID>.*)" + match_name: "comment" + match_value: "(?P<comment>.*)" records: - Person: - ORCID: $ORCID - start_date_element: - type: TextElement - match_name: "start_date" - match_value: "(?P<start_date>.*)" - records: - Dataspace: - start_date: $start_date - end_date_element: - type: TextElement - match_name: "end_date" - match_value: "(?P<end_date>.*)" - records: - Dataspace: - end_date: $end_date - comment: - type: TextElement - match_name: "comment" - match_value: "(?P<comment>.*)" - records: - Dataspace: - comment: $comment + Dataspace: + comment: $comment raw_data_dir: type: Directory match: 03_raw_data @@ -151,370 +155,374 @@ Data: match: metadata.json validate: schema/dataset.schema.json subtree: - title_element: - type: TextElement - match_name: "title" - match_value: "(?P<title>.*)" - records: - Dataset: - title: $title - authors_element: - type: ListElement - match_name: "authors" - subtree: - author_element: - type: DictElement - records: - Person: - parents: - - Person - Dataset: - authors: +$Person - subtree: *person_subtree - abstract_element: - type: TextElement - match_name: "abstract" - match_value: "(?P<abstract>.*)" - records: - Dataset: - abstract: $abstract - comment_element: - type: TextElement - match_name: "comment" - match_value: "(?P<comment>.*)" - records: - Dataset: - comment: $comment - license_element: - type: TextElement - match_name: "license" - match_value: "(?P<license_name>.*)" - records: - license: - # TODO: As soon as such things can be validated, a - # creation of a new license has to be forbidden here - # (although this is effectively done already by - # validating against the above schema.) - name: $license_name - Dataset: - license: $license - dataset_doi_element: - type: TextElement - match_name: "dataset_doi" - match_value: "(?P<dataset_doi>.*)" - records: - Dataset: - dataset_doi: $dataset_doi - related_to_dois_element: - type: ListElement - match_name: "related_to_dois" + jsondict: + type: DictElement + match: .* subtree: - related_to_doi_element: + title_element: type: TextElement - match_value: "(?P<related_to_doi>).*" - records: - Dataset: - related_to_dois: +$related_to_doi - Keywords_element: - type: ListElement - match_name: "Keyword" - Events_element: - type: ListElement - match_name: "Event" - subtree: - Event_element: - type: DictElement + match_name: "title" + match_value: "(?P<title>.*)" records: - Event: - parents: - - Event Dataset: - Event: +$Event + title: $title + authors_element: + type: ListElement + match_name: "authors" subtree: - label_element: - type: TextElement - match_name: "label" - match_value: "(?P<label>.*)" - records: - Event: - label: $label - comment_element: - type: TextElement - match_name: "comment" - match_value: "(?P<comment>.*)" - records: - Event: - comment: $comment - start_datetime_element: - type: TextElement - match_name: start_datetime - match_value: "(?P<start_datetime>.*)" - records: - Event: - start_datetime: $start_datetime - end_datetime_element: - type: TextElement - match_name: end_datetime - match_value: "(?P<end_datetime>.*)" - records: - Event: - end_datetime: $end_datetime - longitude_element: - type: FloatElement - match_name: "longitude" - match_value: "(?P<longitude>.*)" - records: - Event: - longitude: $longitude - latitude_element: - type: FloatElement - match_name: "latitude" - match_value: "(?P<latitude>.*)" - records: - Event: - latitude: $latitude - elevation_element: - type: FloatElement - match_name: "elevation" - match_value: "(?P<elevation>.*)" - records: - Event: - elevation: $elevation - location_element: - type: TextElement - match_name: location - match_value: "(?P<location>.*)" - records: - Event: - location: $location - igsn_element: - type: TextElement - match_name: igsn - match_value: "(?P<igsn>.*)" + author_element: + type: DictElement records: - Event: - igsn: $igsn - events_in_data_element: - type: BooleanElement - match_name: "events_in_data" - match_value: "(?P<events_in_data>.*)" - records: - Dataset: - events_in_data: $events_in_data - geojson_element: - type: TextElement - match_name: "geojson" - match_value: "(?P<geojson>.*)" - records: - Dataset: - geojson: $geojson - project_element: - type: DictElement - match_name: "project" - records: - Project: - parents: - - Project - Dataset: - Project: $Project - subtree: - name_element: - type: TextElement - match_name: "name" - match_value: "(?P<name>.*)" - records: - Project: - name: $name - full_name_element: - type: TextElement - match_name: "full_name" - match_value: "(?P<full_name>.*)" - records: - Project: - full_name: $full_name - project_id_element: - type: TextElement - match_name: "project_id" - match_value: "(?P<project_id>.*)" - records: - Project: - project_id: $project_id - project_type_element: - type: TextElement - match_name: "project_type" - match_value: "(?P<project_type_name>.*)" - records: - project_type: - name: $project_type_name - Project: - project_type: $project_type - institute_element: + Person: + parents: + - Person + Dataset: + authors: +$Person + subtree: *person_subtree + abstract_element: type: TextElement - match_name: "institute" - match_value: "(?P<institute>.*)" + match_name: "abstract" + match_value: "(?P<abstract>.*)" records: - Project: - institute: $institute - start_date_element: + Dataset: + abstract: $abstract + comment_element: type: TextElement - match_name: "start_date" - match_value: "(?P<start_date>.*)" + match_name: "comment" + match_value: "(?P<comment>.*)" records: - Project: - start_date: $start_date - end_date_element: + Dataset: + comment: $comment + license_element: type: TextElement - match_name: "end_date" - match_value: "(?P<end_date>.*)" + match_name: "license" + match_value: "(?P<license_name>.*)" records: - Project: - end_date: $end_date - url_element: + license: + # TODO: As soon as such things can be validated, a + # creation of a new license has to be forbidden here + # (although this is effectively done already by + # validating against the above schema.) + name: $license_name + Dataset: + license: $license + dataset_doi_element: type: TextElement - match_name: "url" - match_value: "(?P<url>.*)" + match_name: "dataset_doi" + match_value: "(?P<dataset_doi>.*)" records: - Project: - url: $url - coordinators_element: + Dataset: + dataset_doi: $dataset_doi + related_to_dois_element: type: ListElement - match_name: "coordinators" + match_name: "related_to_dois" subtree: - coordinator_element: + related_to_doi_element: + type: TextElement + match_value: "(?P<related_to_doi>).*" + records: + Dataset: + related_to_dois: +$related_to_doi + Keywords_element: + type: ListElement + match_name: "Keyword" + Events_element: + type: ListElement + match_name: "Event" + subtree: + Event_element: type: DictElement records: - Person: + Event: parents: - - Person - Project: - coordinators: +$Person - subtree: *person_subtree - campaign_element: + - Event + Dataset: + Event: +$Event + subtree: + label_element: + type: TextElement + match_name: "label" + match_value: "(?P<label>.*)" + records: + Event: + label: $label + comment_element: + type: TextElement + match_name: "comment" + match_value: "(?P<comment>.*)" + records: + Event: + comment: $comment + start_datetime_element: + type: TextElement + match_name: start_datetime + match_value: "(?P<start_datetime>.*)" + records: + Event: + start_datetime: $start_datetime + end_datetime_element: + type: TextElement + match_name: end_datetime + match_value: "(?P<end_datetime>.*)" + records: + Event: + end_datetime: $end_datetime + longitude_element: + type: FloatElement + match_name: "longitude" + match_value: "(?P<longitude>.*)" + records: + Event: + longitude: $longitude + latitude_element: + type: FloatElement + match_name: "latitude" + match_value: "(?P<latitude>.*)" + records: + Event: + latitude: $latitude + elevation_element: + type: FloatElement + match_name: "elevation" + match_value: "(?P<elevation>.*)" + records: + Event: + elevation: $elevation + location_element: + type: TextElement + match_name: location + match_value: "(?P<location>.*)" + records: + Event: + location: $location + igsn_element: + type: TextElement + match_name: igsn + match_value: "(?P<igsn>.*)" + records: + Event: + igsn: $igsn + events_in_data_element: + type: BooleanElement + match_name: "events_in_data" + match_value: "(?P<events_in_data>.*)" + records: + Dataset: + events_in_data: $events_in_data + geojson_element: + type: TextElement + match_name: "geojson" + match_value: "(?P<geojson>.*)" + records: + Dataset: + geojson: $geojson + project_element: type: DictElement - match_name: "campaign" + match_name: "project" records: - Campaign: + Project: parents: - - Campaign + - Project Dataset: - Campaign: $Campaign + Project: $Project subtree: - label_element: + name_element: type: TextElement - match_name: "label" - match_value: "(?P<label>.*)" + match_name: "name" + match_value: "(?P<name>.*)" records: - Campaign: - label: $label - optional_label_element: + Project: + name: $name + full_name_element: type: TextElement - match_name: "optional_label" - match_value: "(?P<optional_label>.*)" + match_name: "full_name" + match_value: "(?P<full_name>.*)" records: - Campaign: - optional_label: $optional_label + Project: + full_name: $full_name + project_id_element: + type: TextElement + match_name: "project_id" + match_value: "(?P<project_id>.*)" + records: + Project: + project_id: $project_id + project_type_element: + type: TextElement + match_name: "project_type" + match_value: "(?P<project_type_name>.*)" + records: + project_type: + name: $project_type_name + Project: + project_type: $project_type + institute_element: + type: TextElement + match_name: "institute" + match_value: "(?P<institute>.*)" + records: + Project: + institute: $institute start_date_element: type: TextElement match_name: "start_date" match_value: "(?P<start_date>.*)" records: - Campaign: + Project: start_date: $start_date end_date_element: type: TextElement match_name: "end_date" match_value: "(?P<end_date>.*)" records: - Campaign: + Project: end_date: $end_date - responsible_scientists_element: + url_element: + type: TextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Project: + url: $url + coordinators_element: type: ListElement - match_name: "responsible_scientists" + match_name: "coordinators" subtree: - responsible_scientist_element: + coordinator_element: type: DictElement records: Person: parents: - Person - Campaign: - responsible_scientists: +$Person + Project: + coordinators: +$Person subtree: *person_subtree - Methods_element: - type: ListElement - match_name: "Method" - subtree: - Method_element: + campaign_element: type: DictElement + match_name: "campaign" records: - Method: + Campaign: parents: - - Method + - Campaign Dataset: - Method: +$Method + Campaign: $Campaign subtree: - method_name_element: + label_element: type: TextElement - match_name: "method_name" - match_value: "(?P<method_name>.*)" + match_name: "label" + match_value: "(?P<label>.*)" records: - Method: - name: $method_name - abbreviation_element: + Campaign: + label: $label + optional_label_element: type: TextElement - match_name: "abbreviation" - match_value: "(?P<abbreviation>.*)" + match_name: "optional_label" + match_value: "(?P<optional_label>.*)" records: - Method: - abbreviation: $abbreviation - url_element: + Campaign: + optional_label: $optional_label + start_date_element: + type: TextElement + match_name: "start_date" + match_value: "(?P<start_date>.*)" + records: + Campaign: + start_date: $start_date + end_date_element: type: TextElement - match_name: "url" - match_value: "(?P<url>.*)" + match_name: "end_date" + match_value: "(?P<end_date>.*)" + records: + Campaign: + end_date: $end_date + responsible_scientists_element: + type: ListElement + match_name: "responsible_scientists" + subtree: + responsible_scientist_element: + type: DictElement + records: + Person: + parents: + - Person + Campaign: + responsible_scientists: +$Person + subtree: *person_subtree + Methods_element: + type: ListElement + match_name: "Method" + subtree: + Method_element: + type: DictElement records: Method: - url: $url - Taxa_element: - type: ListElement - match_name: "Taxon" - subtree: - Taxon_element: - type: DictElement - records: - Taxon: - parents: - - Taxon - Dataset: - Taxon: +$Taxon + parents: + - Method + Dataset: + Method: +$Method + subtree: + method_name_element: + type: TextElement + match_name: "method_name" + match_value: "(?P<method_name>.*)" + records: + Method: + name: $method_name + abbreviation_element: + type: TextElement + match_name: "abbreviation" + match_value: "(?P<abbreviation>.*)" + records: + Method: + abbreviation: $abbreviation + url_element: + type: TextElement + match_name: "url" + match_value: "(?P<url>.*)" + records: + Method: + url: $url + Taxa_element: + type: ListElement + match_name: "Taxon" subtree: - taxon_name_element: - type: TextElement - match_name: "taxon_name" - match_value: "(?P<taxon_name>.*)" + Taxon_element: + type: DictElement records: Taxon: - name: $taxon_name - archived_element: - type: BooleanElement - match_name: "archived" - match_value: "(P<archived>.*)" - records: - Dataset: - archived: $archived - publication_date_element: - type: TextElement - match_name: "publication_date" - match_value: "(P<publication_date>.*)" - records: - Dataset: - publication_date: $publication_date - max_files_element: - type: IntegerElement - match_name: "max_files" - match_value: "(P<max_files>.*)" - records: - Dataset: - max_files: $max_files + parents: + - Taxon + Dataset: + Taxon: +$Taxon + subtree: + taxon_name_element: + type: TextElement + match_name: "taxon_name" + match_value: "(?P<taxon_name>.*)" + records: + Taxon: + name: $taxon_name + archived_element: + type: BooleanElement + match_name: "archived" + match_value: "(P<archived>.*)" + records: + Dataset: + archived: $archived + publication_date_element: + type: TextElement + match_name: "publication_date" + match_value: "(P<publication_date>.*)" + records: + Dataset: + publication_date: $publication_date + max_files_element: + type: IntegerElement + match_name: "max_files" + match_value: "(P<max_files>.*)" + records: + Dataset: + max_files: $max_files auxiliary_file: &aux_file_template type: File match: "(?P<aux_file_name>(?!metadata.json).*)" diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index e79fbcabdb4fe48bc791287fa5551c0ade69e842..1bb47af3da398dcf3b93f246c8fd4d2c8e70849a 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -32,7 +32,7 @@ import warnings from .utils import has_parent from .stores import GeneralStore, RecordStore from .structure_elements import (StructureElement, Directory, File, DictElement, JSONFile, - IntegerElement, BooleanElement, FloatElement, + IntegerElement, BooleanElement, FloatElement, NoneElement, TextElement, TextElement, ListElement) from typing import List, Optional, Tuple, Union from abc import ABCMeta, abstractmethod @@ -74,8 +74,28 @@ def str_to_bool(x): else: raise RuntimeError("Should be 'true' or 'false'.") +# TODO: Comment on types and inheritance +# Currently, we often check the type of StructureElements, because serveral converters assume that +# they are called only with the appropriate class. +# Raising an Error if the type is not sufficient (e.g. TextElement instead of DictElement) means +# that the generic parent class StructureElement is actually NOT a valid type for the argument and +# type hints should reflect this. +# However, we should not narrow down the type of the arguments compared to the function definitions +# in the parent Converter class. See +# - https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides +# - https://stackoverflow.com/questions/56860/what-is-an-example-of-the-liskov-substitution-principle +# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-the-ultimate-python-guide-8fabc0c24278 +# Thus, the problem lies in the following design: +# Converter instances are supposed to be used by the Crawler in a generic way (The crawler calls +# `match` and `typecheck` etc) but the functions are not supposed to be called with generic +# StructureElements. One direction out of this would be a refactoring that makes the crawler class +# expose a generic function like `treat_element`, which can be called with any StructureElement and +# the Converter decides what to do (e.g. do nothing if the type is one that it does not care +# about). + class ConverterValidationError(Exception): + """To be raised if contents of an element to be converted are invalid.""" def __init__(self, msg): @@ -383,6 +403,7 @@ class Converter(object, metaclass=ABCMeta): class DirectoryConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, Directory): raise RuntimeError( "Directory converters can only create children from directories.") @@ -403,6 +424,7 @@ class DirectoryConverter(Converter): # TODO basically all converters implement such a match function. Shouldn't this be the one # of the parent class and subclasses can overwrite if needed? def match(self, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, Directory): raise RuntimeError("Element must be a directory.") m = re.match(self.definition["match"], element.name) @@ -444,6 +466,7 @@ class SimpleFileConverter(Converter): return list() def match(self, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, File): raise RuntimeError("Element must be a file.") m = re.match(self.definition["match"], element.name) @@ -466,7 +489,7 @@ class MarkdownFileConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): - # TODO: isn't the type check sufficient? + # TODO: See comment on types and inheritance if not isinstance(element, File): raise RuntimeError("A markdown file is needed to create children.") @@ -488,7 +511,7 @@ class MarkdownFileConverter(Converter): return isinstance(element, File) def match(self, element: StructureElement): - # TODO: isn't the type check sufficient? + # TODO: See comment on types and inheritance if not isinstance(element, File): raise RuntimeError("Element must be a file.") m = re.match(self.definition["match"], element.name) @@ -503,40 +526,74 @@ class MarkdownFileConverter(Converter): return m.groupdict() +def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, + msg_prefix=""): + """converts basic Python objects to the corresponding StructureElements """ + if isinstance(element, list): + return ListElement(name, element) + elif isinstance(element, dict): + return DictElement(name, element) + elif isinstance(element, bool): + return BooleanElement(name, element) + elif isinstance(element, int): + return IntegerElement(name, element) + elif isinstance(element, float): + return FloatElement(name, element) + elif isinstance(element, str): + return TextElement(name, element) + elif element is None: + return NoneElement(name) + else: + raise NotImplementedError( + msg_prefix + f"The object that has an unexpected type: {type(element)}\n" + f"The object is:\n{str(element)}") + + +def validate_against_json_schema(instance, schema_resource: Union[dict, str]): + """validates given ``instance`` against given ``schema_resource``. + + Args: + instance: instance to be validated, typically ``dict`` but can be ``list``, ``str``, etc. + schema_resource: Either a path to the JSON file containing the schema or a ``dict`` with + the schema + """ + if isinstance(schema_resource, dict): + schema = schema_resource + elif isinstance(schema_resource, str): + with open(schema_resource, 'r') as json_file: + schema = json.load(json_file) + else: + raise ValueError("The value of 'validate' has to be a string describing the path " + "to the json schema file (relative to the cfood yml) " + "or a dict containing the schema.") + # validate instance (e.g. JSON content) against schema + try: + validate(instance=instance, schema=schema) + except ValidationError as err: + raise ConverterValidationError( + f"Couldn't validate {instance}:\n{err.message}") + + class DictElementConverter(Converter): - # TODO use Dict as typecheck? def create_children(self, generalStore: GeneralStore, element: StructureElement): - if not self.typecheck(element): - raise RuntimeError("A dict is needed to create children") + # TODO: See comment on types and inheritance + if not isinstance(element, DictElement): + raise ValueError("create_children was called with wrong type of StructureElement") return self._create_children_from_dict(element.value) def _create_children_from_dict(self, data): + if "validate" in self.definition and self.definition["validate"]: + validate_against_json_schema(data, self.definition["validate"]) + children = [] for name, value in data.items(): - if type(value) == list: - children.append(ListElement(name, value)) - elif type(value) == str: - children.append(TextElement(name, value)) - elif type(value) == dict: - children.append(DictElement(name, value)) - elif type(value) == int: - children.append(IntegerElement(name, value)) - elif type(value) == bool: - children.append(BooleanElement(name, value)) - elif type(value) == float: - children.append(FloatElement(name, value)) - elif type(value) == type(None): - continue - else: - children.append(DictElement(name, value)) - warnings.warn(f"The value in the dict for key:{name} has an unknown type. " - "The fallback type DictElement is used.") + children.append(convert_basic_element( + value, name, f"The value in the dict for key:{name} has an unknown type.")) return children - # TODO use Dict as typecheck? def typecheck(self, element: StructureElement): return isinstance(element, DictElement) @@ -544,6 +601,7 @@ class DictElementConverter(Converter): """ Allways matches if the element has the right type. """ + # TODO: See comment on types and inheritance if not isinstance(element, DictElement): raise RuntimeError("Element must be a DictElement.") return match_name_and_value(self.definition, element.name, element.value) @@ -563,11 +621,12 @@ class DictDictElementConverter(DictElementConverter): super().__init__(*args, **kwargs) -class JSONFileConverter(DictElementConverter): +class JSONFileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) def match(self, element: StructureElement): + # TODO: See comment on types and inheritance if not self.typecheck(element): raise RuntimeError("Element must be a file") m = re.match(self.definition["match"], element.name) @@ -576,33 +635,17 @@ class JSONFileConverter(DictElementConverter): return m.groupdict() def create_children(self, generalStore: GeneralStore, element: StructureElement): - if not self.typecheck(element): - raise RuntimeError("A JSON file is needed to create children") - # TODO: either add explicit type check for File structure element here, - # or add a comment to suppress mypy type warning. + # TODO: See comment on types and inheritance + if not isinstance(element, File): + raise ValueError("create_children was called with wrong type of StructureElement") with open(element.path, 'r') as json_file: json_data = json.load(json_file) - if not isinstance(json_data, dict): - raise NotImplementedError("JSON file must contain a dict") if "validate" in self.definition and self.definition["validate"]: - if isinstance(self.definition["validate"], dict): - schema = self.definition["validate"] - elif isinstance(self.definition["validate"], str): - - with open(self.definition["validate"], 'r') as json_file: - schema = json.load(json_file) - else: - raise ValueError("The value of 'validate' has to be a string describing the path " - "to the json schema file (relative to the cfood yml) " - "or a dict containing the schema.") - # Validate the json content - try: - validate(instance=json_data, schema=schema) - except ValidationError as err: - raise ConverterValidationError( - f"Couldn't validate {json_data}:\n{err.message}") - - return self._create_children_from_dict(json_data) + validate_against_json_schema(json_data, self.definition["validate"]) + structure_element = convert_basic_element( + json_data, "The JSON File contained content that was parsed to a Python object" + " with an unexpected type.") + return [structure_element] def match_name_and_value(definition, name, value): @@ -684,9 +727,12 @@ class _AbstractScalarValueElementConverter(Converter): Else return a dictionary containing the variables from the matched regexp as key value pairs. """ - if not self.typecheck(element): - raise RuntimeError( - f"Element has an invalid type: {type(element)}.") + # TODO: See comment on types and inheritance + if (not isinstance(element, TextElement) + and not isinstance(element, BooleanElement) + and not isinstance(element, IntegerElement) + and not isinstance(element, FloatElement)): + raise ValueError("create_children was called with wrong type of StructureElement") return match_name_and_value(self.definition, element.name, element.value) def _typecheck(self, element: StructureElement, allowed_matches: dict): @@ -802,10 +848,11 @@ class DictIntegerElementConverter(IntegerElementConverter): class ListElementConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, ListElement): raise RuntimeError( "This converter can only process DictListElements.") - children = [] + children: list[StructureElement] = [] for index, list_element in enumerate(element.value): # TODO(fspreck): Refactor this and merge with DictXXXElements maybe? if isinstance(list_element, str): @@ -823,6 +870,7 @@ class ListElementConverter(Converter): return isinstance(element, ListElement) def match(self, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, ListElement): raise RuntimeError("Element must be a ListElement.") m = re.match(self.definition["match_name"], element.name) @@ -878,6 +926,7 @@ class TableConverter(Converter): return isinstance(element, File) def match(self, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, File): raise RuntimeError("Element must be a File.") m = re.match(self.definition["match"], element.name) @@ -904,6 +953,7 @@ class XLSXTableConverter(TableConverter): def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_excel(element.path, **self.get_options()) @@ -932,6 +982,7 @@ class CSVTableConverter(TableConverter): def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_csv(element.path, **self.get_options()) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index cccbbdb040c556da8f904f27dd9d03aafb6d4872..945050720f7a08e3e59145efd9350dbbf0595405 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -63,7 +63,7 @@ from .identifiable_adapters import (IdentifiableAdapter, from .identified_cache import IdentifiedCache from .macros import defmacro_constructor, macro_constructor from .stores import GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory +from .structure_elements import StructureElement, Directory, NoneElement logger = logging.getLogger(__name__) diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py index cb5fad211b5e3b1b766ee95fd6f0a31c965d032b..952f29d012f8373062ed9dfe8a830bd18c4b0baa 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements.py @@ -56,6 +56,10 @@ class FileSystemStructureElement(StructureElement): return "{}: {}, {}".format(class_name_short, self.name, self.path) +class NoneElement(StructureElement): + pass + + class Directory(FileSystemStructureElement): pass diff --git a/src/doc/how-to-upgrade.md b/src/doc/how-to-upgrade.md index 56298e695bc4aa7ee83e407fffd39a5d0d8c21f5..931fa0cd2f2d621c89c35046d6df4ba6ac9b7a1e 100644 --- a/src/doc/how-to-upgrade.md +++ b/src/doc/how-to-upgrade.md @@ -5,6 +5,7 @@ DictElementConverter (old: DictConverter) now can use "match" keywords. If none are in the definition, the behavior is as before. If you had "match", "match_name" or "match_value" in the definition of a + DictConverter (StructureElement: Dict) before, you probably want to remove those. They were ignored before and are now used. @@ -14,3 +15,21 @@ forbidden to used. If you used the 'match' keyword in the definition of TextElementConverter (StructureElement: TextElement) before, you need to change the key from "match" to "match_name" in order to preserve the behavior. + +The JSONFileConverter was changed such that it creates StructureElements as children corresponding +to the content. I.e. if there is an Object (dict in Python) in the file, the children will be a list +with one DictElement. Before, only JSON files with one Object were accepted and the key value pairs +of the dict were directly transformed to children. This means, that all previously used +JSONFileConverters need to introduce a new level for the DictElement: + +``` + json: + type: JSONFile + match: metadata.json + validate: schema/dataset.schema.json + subtree: + jsondict: # new + type: DictElement # new + match: .* # new + subtree: # new +``` diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 7ddf0c9e07049f5f12cafeae4f3ef25ddab28c56..1178f187ad6ec28afbdeb200b071d8b9694b951a 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -193,43 +193,47 @@ def test_json_converter(converter_registry): assert m is not None assert len(m) == 0 - children = jsonconverter.create_children(None, test_json) - assert len(children) == 8 - assert children[0].__class__ == TextElement - assert children[0].name == "name" - assert children[0].value.__class__ == str - assert children[0].value == "DEMO" - - assert children[1].__class__ == IntegerElement - assert children[1].name == "projectId" - assert children[1].value.__class__ == int - assert children[1].value == 10002 - - assert children[2].__class__ == BooleanElement - assert children[2].name == "archived" - assert children[2].value.__class__ == bool - - assert children[3].__class__ == ListElement - assert children[3].name == "Person" - assert children[3].value.__class__ == list - assert len(children[3].value) == 2 + dict_el = jsonconverter.create_children(None, test_json) + assert len(dict_el) == 1 - assert children[4].__class__ == TextElement - assert children[4].name == "start_date" - assert children[4].value.__class__ == str - - assert children[5].__class__ == ListElement - assert children[5].name == "candidates" - assert children[5].value.__class__ == list - assert children[5].value == ["Mouse", "Penguine"] - - assert children[6].__class__ == FloatElement - assert children[6].name == "rvalue" - assert children[6].value.__class__ == float - - assert children[7].__class__ == TextElement - assert children[7].name == "url" - assert children[7].value.__class__ == str + dictconverter = DictElementConverter( + definition={"match_name": "(.*)"}, + name="dictconv", + converter_registry=converter_registry) + children = dictconverter.create_children(None, dict_el[0]) + for child in children: + if child.name == "name": + assert isinstance(child, TextElement) + assert isinstance(child.value, str) + assert child.value == "DEMO" + elif child.name == "projectId": + assert isinstance(child, IntegerElement) + assert isinstance(child.value, int) + assert child.value == 10002 + elif child.name == "archived": + assert isinstance(child, BooleanElement) + assert isinstance(child.value, bool) + assert child.value is False + elif child.name == "Person": + assert isinstance(child, ListElement) + assert isinstance(child.value, list) + assert len(child.value) == 2 + elif child.name == "start_date": + assert isinstance(child, TextElement) + assert isinstance(child.value, str) + assert child.value == '2022-03-01' + elif child.name == "candidates": + assert isinstance(child, ListElement) + assert isinstance(child.value, list) + assert child.value == ["Mouse", "Penguine"] + elif child.name == "rvalue": + assert isinstance(child, FloatElement) + assert isinstance(child.value, float) + elif child.name == "url": + assert isinstance(child, TextElement) + assert isinstance(child.value, str) + else: + raise ValueError() broken_json = File( "brokenjson.json", @@ -239,7 +243,7 @@ def test_json_converter(converter_registry): # Doesn't validate because of missing required 'name' property with pytest.raises(ConverterValidationError) as err: - children = jsonconverter.create_children(None, broken_json) + jsonconverter.create_children(None, broken_json) assert err.value.message.startswith("Couldn't validate") @@ -429,8 +433,7 @@ def test_converter_value_match(converter_registry): name="Test", converter_registry=converter_registry ) - with pytest.raises(RuntimeError) as err: - m = dc.match(IntegerElement(name="a", value=4)) + assert dc.typecheck(IntegerElement(name="a", value=4)) is False # overwrite default with match for float dc = IntegerElementConverter( diff --git a/unittests/test_directories/examples_json/jsontest_cfood.yml b/unittests/test_directories/examples_json/jsontest_cfood.yml index 875773e6bf523500dba46abffda25c0edcb3abc4..2dcef5c7da6d2bee18ae7807210ad297915ea397 100644 --- a/unittests/test_directories/examples_json/jsontest_cfood.yml +++ b/unittests/test_directories/examples_json/jsontest_cfood.yml @@ -8,51 +8,55 @@ JSONTest: # name of the converter parents: - Project # not needed as the name is equivalent subtree: - name_element: - type: TextElement - match_name: "name" - match_value: "(?P<name>.*)" - records: - Project: - name: $name - url_element: # name of the first subtree element which is a converter - type: TextElement - match_value: "(?P<url>.*)" - match_name: "url" - records: - Project: - url: $url - persons_element: - type: ListElement - match_name: "Person" + dictel: + type: DictElement + match: '(.*)' subtree: - person_element: - type: DictElement + name_element: + type: TextElement + match_name: "name" + match_value: "(?P<name>.*)" records: - Person: - parents: - - Person Project: - Person: +$Person + name: $name + url_element: # name of the first subtree element which is a converter + type: TextElement + match_value: "(?P<url>.*)" + match_name: "url" + records: + Project: + url: $url + persons_element: + type: ListElement + match_name: "Person" subtree: - firstname_element: - type: TextElement - match_name: "firstname" - match_value: "(?P<firstname>.*)" - records: - Person: - firstname: $firstname - lastname_element: - type: TextElement - match_name: "lastname" - match_value: "(?P<lastname>.*)" - records: - Person: - lastname: $lastname - email_element: - type: TextElement - match_name: "email" - match_value: "(?P<email>.*)" + person_element: + type: DictElement records: Person: - email: $email + parents: + - Person + Project: + Person: +$Person + subtree: + firstname_element: + type: TextElement + match_name: "firstname" + match_value: "(?P<firstname>.*)" + records: + Person: + firstname: $firstname + lastname_element: + type: TextElement + match_name: "lastname" + match_value: "(?P<lastname>.*)" + records: + Person: + lastname: $lastname + email_element: + type: TextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email