diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 605f1463d9853a100443ea8ed698e4169266fa13..2a32f084b129d2ffe7ea4b1428752a317e095f83 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -122,6 +122,23 @@ def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): return True +def _resolve_datatype(prop: db.Property, remote_entity: db.Entity): + + if remote_entity.role == "Property": + datatype = remote_entity.datatype + elif remote_entity.role == "RecordType": + datatype = remote_entity.name + else: + raise RuntimeError("Cannot set datatype.") + + # Treat lists separately + if isinstance(prop.value, list) and not datatype.startswith("LIST"): + datatype = db.LIST(datatype) + + prop.datatype = datatype + return prop + + class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -465,8 +482,8 @@ class Crawler(object): """ for p in record.properties: if (isinstance(p.value, list)): + lst = [] for el in p.value: - lst = [] if (isinstance(el, db.Entity) and el.id is None): cached = self.get_identified_record_from_local_cache( el) @@ -481,7 +498,7 @@ class Crawler(object): lst.append(cached) else: lst.append(el) - p.value = lst + p.value = lst if (isinstance(p.value, db.Entity) and p.value.id is None): cached = self.get_identified_record_from_local_cache(p.value) if cached is None: @@ -703,6 +720,10 @@ class Crawler(object): @staticmethod def execute_inserts_in_list(to_be_inserted): + for record in to_be_inserted: + for prop in record.properties: + entity = db.Entity(name=prop.name).retrieve() + prop = _resolve_datatype(prop, entity) print("INSERT") print(to_be_inserted) if len(to_be_inserted) > 0: @@ -719,12 +740,7 @@ class Crawler(object): if prop.id is None: entity = db.Entity(name=prop.name).retrieve() prop.id = entity.id - if entity.role == "Property": - prop.datatype = entity.datatype - elif entity.role == "RecordType": - prop.datatype = entity.name - else: - raise RuntimeError("Cannot set datatype.") + prop = _resolve_datatype(prop, entity) print("UPDATE") print(to_be_updated) if len(to_be_updated) > 0: @@ -996,6 +1012,7 @@ def parse_args(): return parser.parse_args() + def main(): args = parse_args() return crawler_main( @@ -1009,5 +1026,6 @@ def main(): args.prefix ) + if __name__ == "__main__": sys.exit(main()) diff --git a/unittests/records.xml b/unittests/records.xml index 0ae34124a8875a723d7f0879687d8f0bdec51de0..f7455ec6b8995db8cd205f69729c32358beee8c0 100644 --- a/unittests/records.xml +++ b/unittests/records.xml @@ -78,6 +78,7 @@ <Property id="248" name="identifier" description="identifier of the experiment" datatype="TEXT" importance="FIX" flag="inheritance:FIX">TimeOfFlight</Property> <Property id="250" name="project" datatype="Project" importance="FIX" flag="inheritance:FIX">287</Property> <Property id="249" name="responsible" datatype="LIST<Person>" importance="FIX" flag="inheritance:FIX"> + <Value>289</Value> <Value>288</Value> </Property> </Record> diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 100b10062916fb992d2bb19241d1cf8ea543e44c..fcc89f2e1467f23accfe47bd52a51d934ccecf91 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -203,9 +203,10 @@ def test_json_converter(converter_registry): assert children[2].name == "archived" assert children[2].value.__class__ == bool - assert children[3].__class__ == DictDictElement - assert children[3].name == "coordinator" - assert children[3].value.__class__ == dict + assert children[3].__class__ == DictListElement + assert children[3].name == "Person" + assert children[3].value.__class__ == list + assert len(children[3].value) == 2 assert children[4].__class__ == DictTextElement assert children[4].name == "start_date" diff --git a/unittests/test_directories/examples_json/jsontest_cfood.yml b/unittests/test_directories/examples_json/jsontest_cfood.yml index bcf79a2d5183ebb496f8e180e9c264bb3ac05e48..f1eb6a9fa186c07f551bd12a84050f544abfdabc 100644 --- a/unittests/test_directories/examples_json/jsontest_cfood.yml +++ b/unittests/test_directories/examples_json/jsontest_cfood.yml @@ -3,13 +3,56 @@ JSONTest: # name of the converter type: JSONFile match: '(.*)' validate: ./testjson.schema.json - subtree: - element: # name of the first subtree element which is a converter + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + subtree: + name_element: + type: DictTextElement + match_name: "name" + match_value: "(?P<name>.*)" + records: + Project: + name: $name + url_element: # name of the first subtree element which is a converter type: DictTextElement match_value: "(?P<url>.*)" match_name: "url" records: - Project: # this is an identifiable in this case - parents: - - Project # not needed as the name is equivalent - url: $url + Project: + url: $url + persons_element: + type: DictListElement + match_name: "Person" + subtree: + person_element: + type: Dict + records: + Person: + parents: + - Person + Project: + Person: +$Person + subtree: + firstname_element: + type: DictTextElement + match_name: "firstname" + match_value: "(?P<firstname>.*)" + records: + Person: + firstname: $firstname + lastname_element: + type: DictTextElement + match_name: "lastname" + match_value: "(?P<lastname>.*)" + records: + Person: + lastname: $lastname + email_element: + type: DictTextElement + match_name: "email" + match_value: "(?P<email>.*)" + records: + Person: + email: $email diff --git a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json index cd26c9c3295d6a2a8a6110f0876fffb62f60419e..b893b608a6a2119c5c3252cd9cff4c4100f404da 100644 --- a/unittests/test_directories/examples_json/testjson.json +++ b/unittests/test_directories/examples_json/testjson.json @@ -2,11 +2,18 @@ "name": "DEMO", "projectId": 10002, "archived": false, - "coordinator": { - "firstname": "Miri", - "lastname": "Mueller", - "email": "miri.mueller@science.de" - }, + "Person": [ + { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + { + "firstname": "Mara", + "lastname": "Mueller", + "email": "mara.mueller@science.de" + } + ], "start_date": "2022-03-01", "candidates": ["Mouse", "Penguine"], "rvalue": 0.4444, diff --git a/unittests/test_directories/examples_json/testjson.schema.json b/unittests/test_directories/examples_json/testjson.schema.json index a684e9b663d8cba1ba1931aae5615040b2797240..fc784a61079e4737f1a0176fe4240133f5d1b5d0 100644 --- a/unittests/test_directories/examples_json/testjson.schema.json +++ b/unittests/test_directories/examples_json/testjson.schema.json @@ -11,25 +11,28 @@ "archived": { "type": "boolean" }, - "coordinator": { - "type": "object", - "properties": { - "firstname": { - "type": "string" - }, - "lastname": { - "type": "string" - }, - "email": { - "type": "string" + "Person": { + "type": "array", + "items": { + "type": "object", + "properties": { + "firstname": { + "type": "string" + }, + "lastname": { + "type": "string" + }, + "email": { + "type": "string" + } + }, + "required": [ + "firstname", + "lastname", + "email" + ], + "additionalProperties": true } - }, - "required": [ - "firstname", - "lastname", - "email" - ], - "additionalProperties": true }, "start_date": { "type": "string", @@ -51,7 +54,7 @@ "required": [ "name", "projectId", - "coordinator" + "Person" ], "additionalProperties": false } diff --git a/unittests/test_json.py b/unittests/test_json.py index d4da1fe7f20d3b2ea8c623315542fce90fb18497..237eca741cfbc8502c6b516788aec889879a3055 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -31,6 +31,8 @@ import os from pytest import raises +import caosdb as db + from newcrawler.converters import JSONFileConverter, DictConverter from newcrawler.crawl import Crawler from newcrawler.structure_elements import File, JSONFile @@ -47,19 +49,27 @@ def test_json(): # Load and register converter packages: converter_registry = crawler.load_converters(crawler_definition) - crawler.start_crawling( + records = crawler.start_crawling( JSONFile(os.path.basename(json_file_path), json_file_path), crawler_definition, converter_registry ) - subd = crawler.debug_tree - subc = crawler.debug_metadata - #print(json.dumps(subd, indent=3)) - print(subd) - print(subc) + + rec = [r for r in records if r.name == "DEMO"] + assert len(rec) == 1 + rec = rec[0] + assert len(rec.parents) == 1 + assert rec.parents[0].name == "Project" + assert rec.get_property("url") is not None + assert rec.get_property("url").value == "https://site.de/index.php/" + assert rec.get_property("Person") is not None + assert isinstance(rec.get_property("Person").value, list) + assert len(rec.get_property("Person").value) == 2 + def test_broken_validation(): - crawler_definition_path = rfp("broken_cfoods", "broken_validation_path.yml") + crawler_definition_path = rfp( + "broken_cfoods", "broken_validation_path.yml") crawler = Crawler() with raises(FileNotFoundError) as err: crawler_definition = crawler.load_definition(crawler_definition_path)