diff --git a/.gitignore b/.gitignore index 905a0ea8ea6816a80ce75de95045b2f12946e2d0..6df7e28419776d5976ed34c11a69b39a3cbd3dec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ src/newcrawler.egg-info/ .coverage +__pycache__ +.tox TAGS src/.coverage build/ @@ -10,4 +12,4 @@ provenance.yml *.pem *.jks *.tar.gz -*.sql \ No newline at end of file +*.sql diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index f386df4b0a349d790b322021d191f7e7083f439a..ffdaed6318a98b9e1ce2da9a0d7390332f8ebcaa 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -187,7 +187,7 @@ class Crawler(object): # from the crawler definition and add them to the yaml schema that will be # tested in the next lines of code: - # Load and validate the cfood schema: + # Load the cfood schema: with open(files('newcrawler').joinpath('cfood-schema.yml'), "r") as f: schema = yaml.safe_load(f) @@ -197,6 +197,7 @@ class Crawler(object): schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) + # Validate the cfood schema: validate(instance=crawler_definition, schema=schema["cfood"]) return crawler_definition @@ -247,7 +248,6 @@ class Crawler(object): # Load modules and associate classes: for key, value in converter_registry.items(): module = importlib.import_module(value["package"]) - print(value) value["class"] = getattr(module, value["converter"]) return converter_registry @@ -302,9 +302,6 @@ class Crawler(object): # This function builds the tree of converters out of the crawler definition. - if not isinstance(item, Directory): - raise NotImplementedError("Currently only directories are supported as items.") - if self.generalStore is None: raise RuntimeError("Should not happen.") @@ -312,7 +309,7 @@ class Crawler(object): converter_registry) # This recursive crawling procedure generates the update list: self.updateList: list[db.Record] = [] - self._crawl(DirectoryConverter.create_children_from_directory(item), + self._crawl([item], self.global_converters, local_converters, self.generalStore, self.recordStore, [], []) diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 7cc4d11f090f068098f30cf841dc42aede0194f0..1fd7c98d57b35fa651e36bee2c529a46e3a96cde 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -6,73 +6,77 @@ Definitions: type: Definitions #include "description.yml" -DataAnalysis: # name of the converter +Data: # name of the converter type: Directory - match: DataAnalysis - subtree: &template - project_dir: # name of the first subtree element which is a converter + match: (.*) + subtree: + DataAnalysis: # name of the converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) - records: - Project: # this is an identifiable in this case - parents: - - Project # not needed as the name is equivalent - date: $date - identifier: $identifier - - subtree: - measurement: # new name for folders on the 3rd level + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + match: (?P<date>.*?)_(?P<identifier>.*) records: - Measurement: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent date: $date identifier: $identifier - project: $Project + subtree: - README: - type: MarkdownFile # this is a subclass of converter File - # function signature: GeneralStore, StructureElement - # preprocessors: custom.caosdb.convert_values - match: ^README\.md$ - # how to make match case insensitive? + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: + Measurement: + date: $date + identifier: $identifier + project: $Project subtree: - description: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description - records: - Measurement: - description: $description - responsible_single: - type: DictTextElement - match_name: responsible - match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) - records: &responsible_records - Person: - first_name: $first_name - last_name: $last_name - Measurement: # this uses the reference to the above defined record - responsible: +$Person # each record also implicitely creates a variable - # with the same name. The "+" indicates, that - # this will become a list entry in list property - # "responsible" belonging to Measurement. - - responsible_list: - type: DictListElement - match_name: responsible + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? subtree: - Person: - type: TextElement - match: *person_regexp - records: *responsible_records + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. -ExperimentalData: # name of the converter - type: Directory - match: ExperimentalData - subtree: *template + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records -SimulationData: # name of the converter - type: Directory - match: SimulationData - subtree: *template + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/scifolder_extended.yml b/unittests/scifolder_extended.yml index eea2ba43ec8fca2465dd5e898ceb157ec4b4cf57..2a1416b778e96ba57fc216d9763572568703ab75 100644 --- a/unittests/scifolder_extended.yml +++ b/unittests/scifolder_extended.yml @@ -6,94 +6,98 @@ Definitions: type: Definitions #include "description.yml" -DataAnalysis: # name of the converter +Data: # name of the converter type: Directory - match: DataAnalysis - subtree: &template - project_dir: # name of the first subtree element which is a converter + match: (.*) + subtree: + DataAnalysis: # name of the converter type: Directory - match: (?P<date>.*?)_(?P<identifier>.*) - records: - Project: # this is an identifiable in this case - parents: - - Project # not needed as the name is equivalent - date: $date - identifier: $identifier - - subtree: - measurement: # new name for folders on the 3rd level + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter type: Directory - match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + match: (?P<date>.*?)_(?P<identifier>.*) records: - Measurement: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent date: $date identifier: $identifier - project: $Project + subtree: - README: - type: MarkdownFile # this is a subclass of converter File - # function signature: GeneralStore, StructureElement - # preprocessors: custom.caosdb.convert_values - match: ^README\.md$ - # how to make match case insensitive? - records: # this block is very verbose and intended to make sure that this - # file is inserted correctly (and can be supplemented with properties - # and / or parents), TODO: maybe there should be a shorthand - ReadmeFile: - parents: [] - role: File - path: $README - file: $README # this is automatically the relative path - # starting from the top level structure element - # of this element + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: Measurement: - ReadmeFile: $ReadmeFile - + date: $date + identifier: $identifier + project: $Project subtree: - description: - type: DictTextElement - match_value: (?P<description>.*) - match_name: description - records: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + records: # this block is very verbose and intended to make sure that this + # file is inserted correctly (and can be supplemented with properties + # and / or parents), TODO: maybe there should be a shorthand + ReadmeFile: + parents: [] + role: File + path: $README + file: $README # this is automatically the relative path + # starting from the top level structure element + # of this element Measurement: - description: $description - responsible_single: - type: DictTextElement - match_name: responsible - match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) - records: &responsible_records - Person: - first_name: $first_name - last_name: $last_name - Measurement: # this uses the reference to the above defined record - responsible: +$Person # each record also implicitely creates a variable - # with the same name. The "+" indicates, that - # this will become a list entry in list property - # "responsible" belonging to Measurement. - - responsible_list: - type: DictListElement - match_name: responsible + ReadmeFile: $ReadmeFile + subtree: - Person: - type: TextElement - match: *person_regexp - records: *responsible_records + description: + type: DictTextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: DictTextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. - # sources_list: - # type: DictListElement - # match_name: sources - # subtree: - # Source: - # type: TextElement - # match: &path ... ??? + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match: *person_regexp + records: *responsible_records -ExperimentalData: # name of the converter - type: Directory - match: ExperimentalData - subtree: *template + # sources_list: + # type: DictListElement + # match_name: sources + # subtree: + # Source: + # type: TextElement + # match: &path ... ??? -SimulationData: # name of the converter - type: Directory - match: SimulationData - subtree: *template + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/unittests/test_tool.py b/unittests/test_tool.py index ac57a8f32be649f47e8038a37f9edfc35df58605..dd9fb83d772496cc6b3729f2893997360d318f18 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -74,14 +74,14 @@ def test_record_structure_generation(crawler): subd = crawler.debug_tree[dircheckstr("DataAnalysis")] subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")] assert len(subd) == 2 - assert len(subd[0]) == 1 # variables store on Data Analysis node of debug tree + assert len(subd[0]) == 2 # variables store on Data Analysis node of debug tree assert len(subd[1]) == 0 # record store on Data Analysis node of debug tree assert len(subc) == 2 - assert len(subc[0]) == 1 + assert len(subc[0]) == 2 assert len(subc[1]) == 0 # The data analysis node creates one variable for the node itself: - assert subd[0]["DataAnalysis"] == "DataAnalysis" + assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" assert subc[0]["DataAnalysis"] == False subd = crawler.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] @@ -94,18 +94,18 @@ def test_record_structure_generation(crawler): assert subd[1]["Project"].get_property("date").value == "2020" assert subd[1]["Project"].get_property("identifier").value == "climate-model-predict" - assert len(subd[0]) == 5 + assert len(subd[0]) == 6 assert subd[0]["date"] == "2020" assert subd[0]["identifier"] == "climate-model-predict" assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["DataAnalysis"] == "DataAnalysis" + assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" assert subc[0]["DataAnalysis"] == True - assert subd[0]["project_dir"] == "DataAnalysis/2020_climate-model-predict" + assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" assert subc[0]["project_dir"] == False # Check the copy flags for the first level in the hierarchy: - assert len(subc[0]) == 5 + assert len(subc[0]) == 6 assert len(subc[1]) == 1 assert subc[1]["Project"] is False assert subc[0]["Project"] is False @@ -118,7 +118,7 @@ def test_record_structure_generation(crawler): subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis", "2020_climate-model-predict", "2020-02-08_prediction-errors")] - assert len(subd[0]) == 7 + assert len(subd[0]) == 8 assert subd[0]["date"] == "2020-02-08" assert subd[0]["identifier"] == "prediction-errors" assert subd[0]["Project"].__class__ == db.Record @@ -260,6 +260,7 @@ def test_identifiable_adapter(): .add_property("last_name", value="B")) assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " + def test_remove_unnecessary_updates(): # test trvial case upl = [db.Record().add_parent("A")] @@ -495,5 +496,3 @@ def test_replace_entities_by_ids(crawler): assert a.get_property("A").value == 12345 assert a.get_property("B").value == 12345 assert a.get_property("C").value == [12345, 233324] - -