diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 000d5394af127468b41807369c2030475c35e389..535a14745282016cd55acd4ca3fcf0ceb0ccd7ec 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -811,43 +811,148 @@ class DictElementConverter(Converter): return match_name_and_value(self.definition, element.name, element.value) -class HeuristicDictConverter(DictElementConverter): +class PropertiesFromDictConverter(DictElementConverter): """Extend the :py:class:`DictElementConverter` by a heuristic to set property values from the dictionary keys. """ - def _validate_definition(self, definition: dict, name: str): + def _validate_definition(self): - if "record_from_dict" not in definition or definition["record_from_dict"] is None: + if "record_from_dict" not in self.definition or self.definition["record_from_dict"] is None: raise ValueError( "You need to specify the (root) record, the properties of " - f"which will be set from the dict in converter {name}." + f"which will be set from the dict in converter {self.name}." ) - def __init__(self, definition: dict, name: str, converter_registry: dict): + if not "variable_name" in self.definition["record_from_dict"] or not self.definition["record_from_dict"]["variable_name"]: + + raise ValueError( + f"The root record in converter {self.name} needs to have a " + "`variable_name` by which it is accessed in the subtree." 
+ ) + + def __init__(self, definition: dict, name: str, converter_registry: dict, + referenced_record_callback: Optional[callable] = None): - _validate_definition(definition) super().__init__(definition, name, converter_registry) + self._validate_definition() + self.referenced_record_callback = referenced_record_callback + + def _recursively_create_records(self, subdict: dict, root_record: db.Record, + root_rec_name: str, + values: GeneralStore, records: RecordStore, + referenced_record_callback: callable, + keys_modified: list = [] + ): + """Create a record from the given `subdict` and recursively create referenced records.""" + + blacklisted_keys = self.definition["record_from_dict"][ + "properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else [] + special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [ + ] + + for key, value in subdict.items(): + + if key in blacklisted_keys: + # We ignore this in the automated property generation + continue + if isinstance(value, list): + if not any([isinstance(val, dict) for val in value]): + # no dict in list, i.e., no references, so this is simple + root_record.add_property(name=key, value=value) + else: + if not all([isinstance(val, dict) for val in value]): + # if this is not an error (most probably it is), this + # needs to be handled manually for now. 
+ raise ValueError( + f"{key} in {subdict} contains a mixed list of references and scalars.") + ref_recs = [] + for ii, ref_dict in enumerate(value): + ref_rec = db.Record() + ref_var_name = f"{root_rec_name}.{key}.{ii}" + if key in special_references: + for par in special_references[key]["parents"]: + ref_rec.add_parent(par) + else: + ref_rec.add_parent(key) + records[ref_var_name] = ref_rec + values[ref_var_name] = ref_rec + keys_modified, ref_rec = self._recursively_create_records( + subdict=ref_dict, + root_record=ref_rec, + root_rec_name=ref_var_name, + values=values, + records=records, + referenced_record_callback=referenced_record_callback, + keys_modified=keys_modified, + ) + ref_recs.append(ref_rec) + root_record.add_property(name=key, value=ref_recs) + + elif isinstance(value, dict): + ref_rec = db.Record() + ref_var_name = f"{root_rec_name}.{key}" + if key in special_references: + for par in special_references[key]["parents"]: + ref_rec.add_parent(par) + else: + ref_rec.add_parent(key) + records[ref_var_name] = ref_rec + values[ref_var_name] = ref_rec + keys_modified, ref_rec = self._recursively_create_records( + subdict=value, + root_record=ref_rec, + root_rec_name=ref_var_name, + values=values, + records=records, + referenced_record_callback=referenced_record_callback, + keys_modified=keys_modified + ) + root_record.add_property(key, ref_rec) + else: + if key.lower() in SPECIAL_PROPERTIES: + setattr(root_record, key.lower(), value) + else: + root_record.add_property(name=key, value=value) + keys_modified.append((root_rec_name, key)) - def create_records(self, values: GeneralStore, records: RecordStore, - element: StructureElement, referenced_record_callback: - Optional[callable] = None): + if referenced_record_callback: + root_record = referenced_record_callback(root_record) - keys_modified = [] + return keys_modified, root_record - def _insert_into_stores(rec: db.Record, rec_name: str): + def create_records(self, values: GeneralStore, records: 
RecordStore, + element: StructureElement): - records[rec_name] = rec - values[rec_name] = rec + keys_modified = [] - def _create_or_return_record(rec_name: str, parent_names: Optional[Union[str, List[str]]] = None): + rfd = self.definition["record_from_dict"] + if rfd["variable_name"] not in records: + rec = db.Record() + if "name" in rfd: + rec.name = rfd["name"] + if "parents" in rfd: + for par in rfd["parents"]: + rec.add_parent(par) + else: + rec.add_parent(rfd["variable_name"]) + records[rfd["variable_name"]] = rec + values[rfd["variable_name"]] = rec - if parent_names is None: - parent_names = [rec_name] - elif not isinstance(parent_names, list): - parent_names = [parent_names] + else: + rec = records[rfd["variable_name"]] + + keys_modified, rec = self._recursively_create_records( + subdict=element.value, + root_record=rec, + root_rec_name=rfd["variable_name"], + values=values, + records=records, + referenced_record_callback=self.referenced_record_callback, + keys_modified=keys_modified, + ) keys_modified.extend(super().create_records( values=values, records=records, element=element)) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 83ae702cdc6f32d437fe89f410b34e730add9ee3..05981e2a605065fb8c86b83f5196f726d1375944 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -28,12 +28,15 @@ import importlib import json import logging import os +import pytest import sys +import yaml + from itertools import product from pathlib import Path -import pytest -import yaml +import linkahead as db + from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, @@ -697,6 +700,7 @@ def test_properties_from_dict_basic(converter_registry): "a": 5, "b": ["a", "b", "c"], "scalar_ref": { + "name": "Scalar Ref", "a": 23, "blacklisted_int": 42 }, @@ -718,7 +722,7 @@ def test_properties_from_dict_basic(converter_registry): "blacklisted_ref": { 
"a": 25 }, - "author": { + "authors": { "full_name": "Some Author" } }) @@ -726,6 +730,9 @@ def test_properties_from_dict_basic(converter_registry): assert "MyRec" in records my_rec = records["MyRec"] assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 2 + assert "DictRT1" in [par.name for par in my_rec.parents] + assert "DictRT2" in [par.name for par in my_rec.parents] # scalar prop assert my_rec.get_property("a") is not None @@ -741,8 +748,9 @@ def test_properties_from_dict_basic(converter_registry): assert my_rec.get_property("scalar_ref") is not None referenced = my_rec.get_property("scalar_ref").value assert isinstance(referenced, db.Record) + assert referenced.name == "Scalar Ref" assert len(referenced.parents) == 1 - assert referenced.has_parent("scalar_ref") + assert "scalar_ref" in [par.name for par in referenced.parents] assert referenced.get_property("a") is not None assert referenced.get_property("a").value == 23 # blacklisted @@ -755,9 +763,9 @@ def test_properties_from_dict_basic(converter_registry): for rec in my_rec.get_property("list_ref").value: assert isinstance(rec, db.Record) assert len(rec.parents) == 1 - assert rec.has_parent("list_ref") + assert "list_ref" in [par.name for par in rec.parents] assert rec.get_property("c") is not None - assert type(rec.get_property("c")) is bool + assert type(rec.get_property("c").value) is bool assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] @@ -766,14 +774,14 @@ def test_properties_from_dict_basic(converter_registry): outer_rec = my_rec.get_property("ref_with_ref").value assert isinstance(outer_rec, db.Record) assert len(outer_rec.parents) == 1 - assert outer_rec.has_parent("ref_with_ref") + assert "ref_with_ref" in [par.name for par in outer_rec.parents] assert outer_rec.get_property("a") is not None assert outer_rec.get_property("a").value == 789 
assert outer_rec.get_property("ref_in_ref") is not None inner_rec = outer_rec.get_property("ref_in_ref").value assert isinstance(inner_rec, db.Record) assert len(inner_rec.parents) == 1 - assert inner_rec.has_parent("ref_in_ref") + assert "ref_in_ref" in [par.name for par in inner_rec.parents] assert inner_rec.get_property("b") is not None assert inner_rec.get_property("b").value == "something" @@ -782,11 +790,11 @@ def test_properties_from_dict_basic(converter_registry): assert my_rec.get_property("blacklisted_ref") is None # named reference property - assert my_rec.get_property("author") is not None - author_rec = my_rec.get_property("author").value + assert my_rec.get_property("authors") is not None + author_rec = my_rec.get_property("authors").value assert isinstance(author_rec, db.Record) assert len(author_rec.parents) == 1 - assert author_rec.has_parent("Person") + assert "Person" in [par.name for par in author_rec.parents] assert author_rec.get_property("full_name") is not None assert author_rec.get_property("full_name").value == "Some Author" @@ -834,13 +842,13 @@ def test_properties_from_dict_callable(converter_registry): pdfc = PropertiesFromDictConverter( definition={ "record_from_dict": { - "variable_name": "MyRec" + "variable_name": "MyRec", "name": "My New Record" - }, - name = "TestConverter", - converter_registry = converter_registry, - referenced_record_callback = convert_some_values - } + } + }, + name="TestConverter", + converter_registry=converter_registry, + referenced_record_callback=convert_some_values ) values = GeneralStore() @@ -851,7 +859,7 @@ def test_properties_from_dict_callable(converter_registry): "url": "referenced" }, "referenced2": { - "nourl": "something else" + "nourl": "something else", "url": "https://indiscale.com" } }) @@ -860,7 +868,7 @@ def test_properties_from_dict_callable(converter_registry): my_rec = records["MyRec"] assert isinstance(my_rec, db.Record) assert len(my_rec.parents) == 1 - assert 
my_rec.has_parent("MyRec") + assert "MyRec" in [par.name for par in my_rec.parents] assert my_rec.name == "My New Record" # simple conversion