Skip to content
Snippets Groups Projects
Commit 2cbeb584 authored by Florian Spreckelsen
Browse files

ENH: Implement property from dict generation

parent 7e7eeddc
Branches
Tags
2 merge requests: !178 (FIX: #96 Better error output for crawl.py script.), !163 (F dict heuristic)
Pipeline #49210 failed
...@@ -811,43 +811,148 @@ class DictElementConverter(Converter): ...@@ -811,43 +811,148 @@ class DictElementConverter(Converter):
return match_name_and_value(self.definition, element.name, element.value) return match_name_and_value(self.definition, element.name, element.value)
class HeuristicDictConverter(DictElementConverter): class PropertiesFromDictConverter(DictElementConverter):
"""Extend the :py:class:`DictElementConverter` by a heuristic to set """Extend the :py:class:`DictElementConverter` by a heuristic to set
property values from the dictionary keys. property values from the dictionary keys.
""" """
def _validate_definition(self, definition: dict, name: str): def _validate_definition(self):
if "record_from_dict" not in definition or definition["record_from_dict"] is None: if "record_from_dict" not in self.definition or self.definition["record_from_dict"] is None:
raise ValueError( raise ValueError(
"You need to specify the (root) record, the properties of " "You need to specify the (root) record, the properties of "
f"which will be set from the dict in converter {name}." f"which will be set from the dict in converter {self.name}."
) )
def __init__(self, definition: dict, name: str, converter_registry: dict): if not "variable_name" in self.definition["record_from_dict"] or not self.definition["record_from_dict"]["variable_name"]:
raise ValueError(
f"The root record in converter {self.name} needs to have a "
"`variable_name` by which it is accessed in the subtree."
)
def __init__(self, definition: dict, name: str, converter_registry: dict,
referenced_record_callback: Optional[callable] = None):
_validate_definition(definition)
super().__init__(definition, name, converter_registry) super().__init__(definition, name, converter_registry)
self._validate_definition()
self.referenced_record_callback = referenced_record_callback
def _recursively_create_records(self, subdict: dict, root_record: db.Record,
root_rec_name: str,
values: GeneralStore, records: RecordStore,
referenced_record_callback: callable,
keys_modified: list = []
):
"""Create a record from the given `subdict` and recursively create referenced records."""
blacklisted_keys = self.definition["record_from_dict"][
"properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else []
special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [
]
for key, value in subdict.items():
if key in blacklisted_keys:
# We ignore this in the automated property generation
continue
if isinstance(value, list):
if not any([isinstance(val, dict) for val in value]):
# no dict in list, i.e., no references, so this is simple
root_record.add_property(name=key, value=value)
else:
if not all([isinstance(val, dict) for val in value]):
# if this is not an error (most probably it is), this
# needs to be handled manually for now.
raise ValueError(
f"{key} in {subdict} contains a mixed list of references and scalars.")
ref_recs = []
for ii, ref_dict in enumerate(value):
ref_rec = db.Record()
ref_var_name = f"{root_rec_name}.{key}.{ii}"
if key in special_references:
for par in special_references[key]["parents"]:
ref_rec.add_parent(par)
else:
ref_rec.add_parent(key)
records[ref_var_name] = ref_rec
values[ref_var_name] = ref_rec
keys_modified, ref_rec = self._recursively_create_records(
subdict=ref_dict,
root_record=ref_rec,
root_rec_name=ref_var_name,
values=values,
records=records,
referenced_record_callback=referenced_record_callback,
keys_modified=keys_modified,
)
ref_recs.append(ref_rec)
root_record.add_property(name=key, value=ref_recs)
elif isinstance(value, dict):
ref_rec = db.Record()
ref_var_name = f"{root_rec_name}.{key}"
if key in special_references:
for par in special_references[key]["parents"]:
ref_rec.add_parent(par)
else:
ref_rec.add_parent(key)
records[ref_var_name] = ref_rec
values[ref_var_name] = ref_rec
keys_modified, ref_rec = self._recursively_create_records(
subdict=value,
root_record=ref_rec,
root_rec_name=ref_var_name,
values=values,
records=records,
referenced_record_callback=referenced_record_callback,
keys_modified=keys_modified
)
root_record.add_property(key, ref_rec)
else:
if key.lower() in SPECIAL_PROPERTIES:
setattr(root_record, key.lower(), value)
else:
root_record.add_property(name=key, value=value)
keys_modified.append((root_rec_name, key))
def create_records(self, values: GeneralStore, records: RecordStore, if referenced_record_callback:
element: StructureElement, referenced_record_callback: root_record = referenced_record_callback(root_record)
Optional[callable] = None):
keys_modified = [] return keys_modified, root_record
def _insert_into_stores(rec: db.Record, rec_name: str): def create_records(self, values: GeneralStore, records: RecordStore,
element: StructureElement):
records[rec_name] = rec keys_modified = []
values[rec_name] = rec
def _create_or_return_record(rec_name: str, parent_names: Optional[Union[str, List[str]]] = None): rfd = self.definition["record_from_dict"]
if rfd["variable_name"] not in records:
rec = db.Record()
if "name" in rfd:
rec.name = rfd["name"]
if "parents" in rfd:
for par in rfd["parents"]:
rec.add_parent(par)
else:
rec.add_parent(rfd["variable_name"])
records[rfd["variable_name"]] = rec
values[rfd["variable_name"]] = rec
if parent_names is None: else:
parent_names = [rec_name] rec = records[rfd["variable_name"]]
elif not isinstance(parent_names, list):
parent_names = [parent_names] keys_modified, rec = self._recursively_create_records(
subdict=element.value,
root_record=rec,
root_rec_name=rfd["variable_name"],
values=values,
records=records,
referenced_record_callback=self.referenced_record_callback,
keys_modified=keys_modified,
)
keys_modified.extend(super().create_records( keys_modified.extend(super().create_records(
values=values, records=records, element=element)) values=values, records=records, element=element))
......
...@@ -28,12 +28,15 @@ import importlib ...@@ -28,12 +28,15 @@ import importlib
import json import json
import logging import logging
import os import os
import pytest
import sys import sys
import yaml
from itertools import product from itertools import product
from pathlib import Path from pathlib import Path
import pytest import linkahead as db
import yaml
from caoscrawler.converters import (Converter, ConverterValidationError, from caoscrawler.converters import (Converter, ConverterValidationError,
DateElementConverter, DictElementConverter, DateElementConverter, DictElementConverter,
DictIntegerElementConverter, DictIntegerElementConverter,
...@@ -697,6 +700,7 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -697,6 +700,7 @@ def test_properties_from_dict_basic(converter_registry):
"a": 5, "a": 5,
"b": ["a", "b", "c"], "b": ["a", "b", "c"],
"scalar_ref": { "scalar_ref": {
"name": "Scalar Ref",
"a": 23, "a": 23,
"blacklisted_int": 42 "blacklisted_int": 42
}, },
...@@ -718,7 +722,7 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -718,7 +722,7 @@ def test_properties_from_dict_basic(converter_registry):
"blacklisted_ref": { "blacklisted_ref": {
"a": 25 "a": 25
}, },
"author": { "authors": {
"full_name": "Some Author" "full_name": "Some Author"
} }
}) })
...@@ -726,6 +730,9 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -726,6 +730,9 @@ def test_properties_from_dict_basic(converter_registry):
assert "MyRec" in records assert "MyRec" in records
my_rec = records["MyRec"] my_rec = records["MyRec"]
assert isinstance(my_rec, db.Record) assert isinstance(my_rec, db.Record)
assert len(my_rec.parents) == 2
assert "DictRT1" in [par.name for par in my_rec.parents]
assert "DictRT2" in [par.name for par in my_rec.parents]
# scalar prop # scalar prop
assert my_rec.get_property("a") is not None assert my_rec.get_property("a") is not None
...@@ -741,8 +748,9 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -741,8 +748,9 @@ def test_properties_from_dict_basic(converter_registry):
assert my_rec.get_property("scalar_ref") is not None assert my_rec.get_property("scalar_ref") is not None
referenced = my_rec.get_property("scalar_ref").value referenced = my_rec.get_property("scalar_ref").value
assert isinstance(referenced, db.Record) assert isinstance(referenced, db.Record)
assert referenced.name == "Scalar Ref"
assert len(referenced.parents) == 1 assert len(referenced.parents) == 1
assert referenced.has_parent("scalar_ref") assert "scalar_ref" in [par.name for par in referenced.parents]
assert referenced.get_property("a") is not None assert referenced.get_property("a") is not None
assert referenced.get_property("a").value == 23 assert referenced.get_property("a").value == 23
# blacklisted # blacklisted
...@@ -755,9 +763,9 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -755,9 +763,9 @@ def test_properties_from_dict_basic(converter_registry):
for rec in my_rec.get_property("list_ref").value: for rec in my_rec.get_property("list_ref").value:
assert isinstance(rec, db.Record) assert isinstance(rec, db.Record)
assert len(rec.parents) == 1 assert len(rec.parents) == 1
assert rec.has_parent("list_ref") assert "list_ref" in [par.name for par in rec.parents]
assert rec.get_property("c") is not None assert rec.get_property("c") is not None
assert type(rec.get_property("c")) is bool assert type(rec.get_property("c").value) is bool
assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value]
assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value]
...@@ -766,14 +774,14 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -766,14 +774,14 @@ def test_properties_from_dict_basic(converter_registry):
outer_rec = my_rec.get_property("ref_with_ref").value outer_rec = my_rec.get_property("ref_with_ref").value
assert isinstance(outer_rec, db.Record) assert isinstance(outer_rec, db.Record)
assert len(outer_rec.parents) == 1 assert len(outer_rec.parents) == 1
assert outer_rec.has_parent("ref_with_ref") assert "ref_with_ref" in [par.name for par in outer_rec.parents]
assert outer_rec.get_property("a") is not None assert outer_rec.get_property("a") is not None
assert outer_rec.get_property("a").value == 789 assert outer_rec.get_property("a").value == 789
assert outer_rec.get_property("ref_in_ref") is not None assert outer_rec.get_property("ref_in_ref") is not None
inner_rec = outer_rec.get_property("ref_in_ref").value inner_rec = outer_rec.get_property("ref_in_ref").value
assert isinstance(inner_rec, db.Record) assert isinstance(inner_rec, db.Record)
assert len(inner_rec.parents) == 1 assert len(inner_rec.parents) == 1
assert inner_rec.has_parent("ref_in_ref") assert "ref_in_ref" in [par.name for par in inner_rec.parents]
assert inner_rec.get_property("b") is not None assert inner_rec.get_property("b") is not None
assert inner_rec.get_property("b").value == "something" assert inner_rec.get_property("b").value == "something"
...@@ -782,11 +790,11 @@ def test_properties_from_dict_basic(converter_registry): ...@@ -782,11 +790,11 @@ def test_properties_from_dict_basic(converter_registry):
assert my_rec.get_property("blacklisted_ref") is None assert my_rec.get_property("blacklisted_ref") is None
# named reference property # named reference property
assert my_rec.get_property("author") is not None assert my_rec.get_property("authors") is not None
author_rec = my_rec.get_property("author").value author_rec = my_rec.get_property("authors").value
assert isinstance(author_rec, db.Record) assert isinstance(author_rec, db.Record)
assert len(author_rec.parents) == 1 assert len(author_rec.parents) == 1
assert author_rec.has_parent("Person") assert "Person" in [par.name for par in author_rec.parents]
assert author_rec.get_property("full_name") is not None assert author_rec.get_property("full_name") is not None
assert author_rec.get_property("full_name").value == "Some Author" assert author_rec.get_property("full_name").value == "Some Author"
...@@ -834,13 +842,13 @@ def test_properties_from_dict_callable(converter_registry): ...@@ -834,13 +842,13 @@ def test_properties_from_dict_callable(converter_registry):
pdfc = PropertiesFromDictConverter( pdfc = PropertiesFromDictConverter(
definition={ definition={
"record_from_dict": { "record_from_dict": {
"variable_name": "MyRec" "variable_name": "MyRec",
"name": "My New Record" "name": "My New Record"
}, }
name = "TestConverter", },
converter_registry = converter_registry, name="TestConverter",
referenced_record_callback = convert_some_values converter_registry=converter_registry,
} referenced_record_callback=convert_some_values
) )
values = GeneralStore() values = GeneralStore()
...@@ -851,7 +859,7 @@ def test_properties_from_dict_callable(converter_registry): ...@@ -851,7 +859,7 @@ def test_properties_from_dict_callable(converter_registry):
"url": "referenced" "url": "referenced"
}, },
"referenced2": { "referenced2": {
"nourl": "something else" "nourl": "something else",
"url": "https://indiscale.com" "url": "https://indiscale.com"
} }
}) })
...@@ -860,7 +868,7 @@ def test_properties_from_dict_callable(converter_registry): ...@@ -860,7 +868,7 @@ def test_properties_from_dict_callable(converter_registry):
my_rec = records["MyRec"] my_rec = records["MyRec"]
assert isinstance(my_rec, db.Record) assert isinstance(my_rec, db.Record)
assert len(my_rec.parents) == 1 assert len(my_rec.parents) == 1
assert my_rec.has_parent("MyRec") assert "MyRec" in [par.name for par in my_rec.parents]
assert my_rec.name == "My New Record" assert my_rec.name == "My New Record"
# simple conversion # simple conversion
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment