Skip to content
Snippets Groups Projects

F dict heuristic

Merged Florian Spreckelsen requested to merge f-dict-heuristic into dev
1 file
+ 98
5
Compare changes
  • Side-by-side
  • Inline
+ 356
7
@@ -29,26 +29,32 @@ import importlib
import json
import logging
import os
import pytest
import sys
import yaml
from itertools import product
from pathlib import Path
import pytest
import yaml
import linkahead as db
from caoscrawler.converters import (Converter, ConverterValidationError,
DateElementConverter, DictElementConverter,
DictIntegerElementConverter,
DirectoryConverter, FloatElementConverter,
IntegerElementConverter, JSONFileConverter,
ListElementConverter,
MarkdownFileConverter, YAMLFileConverter,
ListElementConverter, MarkdownFileConverter,
PropertiesFromDictConverter,
YAMLFileConverter,
_AbstractScalarValueElementConverter,
handle_value, replace_variables)
from caoscrawler.crawl import Crawler
from caoscrawler.scanner import (_load_definition_from_yaml_dict,
create_converter_registry,
create_transformer_registry, load_definition)
from caoscrawler.stores import GeneralStore
create_transformer_registry,
load_definition,
scan_structure_elements)
from caoscrawler.stores import GeneralStore, RecordStore
from caoscrawler.structure_elements import (BooleanElement, DictElement,
Directory, File, FloatElement,
IntegerElement, ListElement,
@@ -73,6 +79,10 @@ def converter_registry():
"DictElement": {
"converter": "DictElementConverter",
"package": "caoscrawler.converters"},
"PropertiesFromDictElement": {
"converter": "PropertiesFromDictConverter",
"package": "caoscrawler.converters"
},
"TextElement": {
"converter": "TextElementConverter",
"package": "caoscrawler.converters"},
@@ -633,7 +643,7 @@ def test_load_converters():
# converter classes can be loaded from their respective packages.
# Please adapt, if defaults change!
assert len(converter_registry) == 24
assert len(converter_registry) == 25
# All of them are contained in caoscrawler.converters
for conv_key, conv in converter_registry.items():
@@ -660,3 +670,342 @@ def test_create_path_value(converter_registry):
dc.create_values(values, Directory("a", "/a"))
assert "Test.path" in values
assert values["Test.path"] == "/a"
def test_properties_from_dict_basic(converter_registry):
    """Check the record a ``PropertiesFromDictConverter`` builds from a dict.

    The input dict covers scalar values, a list of scalars, a nested dict
    (scalar reference), a list of dicts (list of references), a dict nested
    in a dict (reference with reference), blacklisted keys and a named
    reference with explicitly configured parents.  The child records are
    expected to be created alongside the top-level one.
    """

    def parent_names(record):
        """Return the names of all parents of ``record``."""
        return [par.name for par in record.parents]

    # Definition with a blacklist and one named reference ("authors").
    record_definition = {
        "variable_name": "MyRec",
        "parents": ["DictRT1", "DictRT2"],
        "properties_blacklist": ["blacklisted_int", "blacklisted_ref"],
        "references": {
            "authors": {
                "parents": ["Person"]
            }
        }
    }
    converter = PropertiesFromDictConverter(
        definition={
            "type": "PropertiesFromDictElement",
            "match": ".*",
            "record_from_dict": record_definition
        },
        name="Test", converter_registry=converter_registry)

    general_store = GeneralStore()
    record_store = RecordStore()
    dict_element = DictElement("TestDictElement", {
        "a": 5,
        "b": ["a", "b", "c"],
        "scalar_ref": {
            "name": "Scalar Ref",
            "a": 23,
            "blacklisted_int": 42
        },
        "list_ref": [
            {
                "c": True
            },
            {
                "c": False
            }
        ],
        "ref_with_ref": {
            "a": 789,
            "ref_in_ref": {
                "b": "something"
            }
        },
        "blacklisted_int": -123,
        "blacklisted_ref": {
            "a": 25
        },
        "authors": {
            "full_name": "Some Author"
        }
    })
    converter.create_records(values=general_store, records=record_store,
                             element=dict_element)

    # The top-level record is stored under the configured variable name and
    # carries both configured parents.
    assert "MyRec" in record_store
    top_level = record_store["MyRec"]
    assert isinstance(top_level, db.Record)
    assert len(top_level.parents) == 2
    top_level_parents = parent_names(top_level)
    assert "DictRT1" in top_level_parents
    assert "DictRT2" in top_level_parents

    # Scalar property
    scalar_prop = top_level.get_property("a")
    assert scalar_prop is not None
    assert scalar_prop.value == 5

    # List property
    list_prop = top_level.get_property("b")
    assert list_prop is not None
    assert len(list_prop.value) == 3
    assert all(elt in list_prop.value for elt in ["a", "b", "c"])

    # Scalar reference: expected to be a Record whose single parent is named
    # after the dict key.
    assert top_level.get_property("scalar_ref") is not None
    scalar_ref = top_level.get_property("scalar_ref").value
    assert isinstance(scalar_ref, db.Record)
    assert scalar_ref.name == "Scalar Ref"
    assert len(scalar_ref.parents) == 1
    assert "scalar_ref" in parent_names(scalar_ref)
    assert scalar_ref.get_property("a") is not None
    assert scalar_ref.get_property("a").value == 23
    # The blacklisted key must not appear on the referenced record either.
    assert scalar_ref.get_property("blacklisted_int") is None

    # List of references
    assert top_level.get_property("list_ref") is not None
    list_refs = top_level.get_property("list_ref").value
    assert isinstance(list_refs, list)
    assert len(list_refs) == 2
    for list_entry in list_refs:
        assert isinstance(list_entry, db.Record)
        assert len(list_entry.parents) == 1
        assert "list_ref" in parent_names(list_entry)
        assert list_entry.get_property("c") is not None
        assert type(list_entry.get_property("c").value) is bool
    # Both boolean values from the input must be present.
    bool_values = [list_entry.get_property("c").value for list_entry in list_refs]
    assert True in bool_values
    assert False in bool_values

    # Reference that itself contains a reference
    assert top_level.get_property("ref_with_ref") is not None
    outer_ref = top_level.get_property("ref_with_ref").value
    assert isinstance(outer_ref, db.Record)
    assert len(outer_ref.parents) == 1
    assert "ref_with_ref" in parent_names(outer_ref)
    assert outer_ref.get_property("a") is not None
    assert outer_ref.get_property("a").value == 789
    assert outer_ref.get_property("ref_in_ref") is not None

    inner_ref = outer_ref.get_property("ref_in_ref").value
    assert isinstance(inner_ref, db.Record)
    assert len(inner_ref.parents) == 1
    assert "ref_in_ref" in parent_names(inner_ref)
    assert inner_ref.get_property("b") is not None
    assert inner_ref.get_property("b").value == "something"

    # Blacklisted keys are not turned into properties of the top-level record.
    assert top_level.get_property("blacklisted_int") is None
    assert top_level.get_property("blacklisted_ref") is None

    # Named reference: the parent comes from the "references" definition
    # instead of the dict key.
    assert top_level.get_property("authors") is not None
    author = top_level.get_property("authors").value
    assert isinstance(author, db.Record)
    assert len(author.parents) == 1
    assert "Person" in parent_names(author)
    assert author.get_property("full_name") is not None
    assert author.get_property("full_name").value == "Some Author"
def test_properties_from_dict_callable(converter_registry):
    """Check that ``referenced_record_callback`` is applied to created records.

    The callback prefixes "url" property values that do not start with
    "http".  The test expects this for the top-level record as well as for
    the records referenced by it, and expects other values to stay untouched.
    """

    def convert_some_values(rec: db.Record, records: RecordStore, values: GeneralStore):
        """Add a URL prefix to the "url" property value if applicable."""
        if rec.get_property("url") is None:
            # Nothing to convert on this record.
            return rec

        current = rec.get_property("url").value
        if current is None or current.startswith("http"):
            # No value, or the value already looks like a URL.
            return rec

        rec.get_property("url").value = f"https://test.com/{current}"
        return rec

    record_definition = {
        "variable_name": "MyRec",
        "name": "My New Record"
    }
    converter = PropertiesFromDictConverter(
        definition={
            "record_from_dict": record_definition
        },
        name="TestConverter",
        converter_registry=converter_registry,
        referenced_record_callback=convert_some_values
    )

    general_store = GeneralStore()
    record_store = RecordStore()
    dict_element = DictElement("TestDictElement", {
        "url": "something",
        "referenced1": {
            "url": "referenced"
        },
        "referenced2": {
            "nourl": "something else",
            "url": "https://indiscale.com"
        }
    })
    converter.create_records(values=general_store, records=record_store,
                             element=dict_element)

    assert "MyRec" in record_store
    top_level = record_store["MyRec"]
    assert isinstance(top_level, db.Record)
    # No parents are configured; the test expects a single parent named like
    # the variable.
    assert len(top_level.parents) == 1
    assert "MyRec" in [par.name for par in top_level.parents]
    assert top_level.name == "My New Record"

    # Simple conversion on the top-level record
    assert top_level.get_property("url") is not None
    assert top_level.get_property("url").value == "https://test.com/something"

    # The conversion also applies to a referenced record ...
    assert top_level.get_property("referenced1") is not None
    first_ref = top_level.get_property("referenced1").value
    assert isinstance(first_ref, db.Record)
    assert first_ref.get_property("url") is not None
    assert first_ref.get_property("url").value == "https://test.com/referenced"

    # ... and leaves non-"url" properties and already complete URLs alone.
    assert top_level.get_property("referenced2") is not None
    second_ref = top_level.get_property("referenced2").value
    assert isinstance(second_ref, db.Record)
    assert second_ref.get_property("nourl") is not None
    assert second_ref.get_property("nourl").value == "something else"
    assert second_ref.get_property("url") is not None
    assert second_ref.get_property("url").value == "https://indiscale.com"
def test_properties_from_dict_nested(converter_registry):
    """Scan a nested dict using the PropertiesFromDictConverter in a subtree.

    The crawler definition combines the regular DictElementConverter with a
    PropertiesFromDictElement; the record "MyRec" is defined at the root
    level and used again on deeper subtree levels.
    """
    root_element = DictElement("RootDict", {
        "TopLevelRec": "MyRec",
        "propertiesDict": {
            "a": 5,
            "blacklisted": {
                "bl_name": "BlackList",
                "date": "2023-12-31"
            }
        },
        "otherDict": {
            "additional_from_other": "other"
        }
    })

    # Top-level text element that provides the Record name
    name_element = {
        "type": "TextElement",
        "match_name": "^TopLevelRec$",
        "match_value": "(?P<name>.*)",
        "records": {
            "MyRec": {
                "name": "$name"
            }
        }
    }

    # Handles the blacklisted dict explicitly and creates a second record
    # which references MyRec.
    blacklisted_element = {
        "type": "DictElement",
        "match_name": "^blacklisted$",
        "records": {
            "BLRec": {
                "parents": ["BlackListedType"],
                "MyRec": "$MyRec"
            }
        },
        "subtree": {
            "BLNameElt": {
                "type": "TextElement",
                "match_name": "^bl_name$",
                "match_value": "(?P<name>.*)",
                "records": {
                    "BLRec": {
                        "name": "$name"
                    }
                }
            },
            "BLDateElt": {
                "type": "TextElement",
                "match_name": "^date$",
                "match_value": "(?P<date>.*)",
                "records": {
                    "BLRec": {
                        "creation_date": "$date"
                    }
                }
            }
        }
    }

    # Fills MyRec from the dict, except for the blacklisted key
    properties_element = {
        "type": "PropertiesFromDictElement",
        "match_name": "^propertiesDict$",
        "record_from_dict": {
            "variable_name": "MyRec",
            "properties_blacklist": ["blacklisted"]
        },
        "subtree": {
            "BLElement": blacklisted_element
        }
    }

    # Other dict which uses the DictElementConverter
    other_dict_element = {
        "type": "DictElement",
        "match_name": "^otherDict$",
        "subtree": {
            "additionalElt": {
                "type": "TextElement",
                "match_name": "^additional_from_other$",
                "match_value": "(?P<val>.*)",
                "records": {
                    "MyRec": {
                        "additional_from_other": "$val"
                    }
                }
            }
        }
    }

    crawler_definition = {
        "RootElt": {
            # Root dictionary
            "type": "DictElement",
            "match": ".*",
            "records": {
                # Define top-level, use below in subtrees
                "MyRec": {
                    "parents": ["MyType"]
                }
            },
            "subtree": {
                "NameElt": name_element,
                "PFDElement": properties_element,
                "OtherDictElement": other_dict_element
            }
        }
    }

    records = scan_structure_elements(root_element, crawler_definition, converter_registry)

    # All records need to be there
    assert len(records) == 2
    records_by_name = {rec.name: rec for rec in records}
    top_level = records_by_name.get("MyRec")
    blacklisted = records_by_name.get("BlackList")
    assert top_level is not None
    assert blacklisted is not None

    # Parent is set from top level
    assert len(top_level.parents) == 1
    assert "MyType" in [par.name for par in top_level.parents]

    # Set automatically, with blacklist
    assert top_level.get_property("a") is not None
    assert top_level.get_property("a").value == 5
    assert top_level.get_property("blacklisted") is None

    # Now check the blacklisted record from the subtree
    assert len(blacklisted.parents) == 1
    assert "BlackListedType" in [par.name for par in blacklisted.parents]
    assert blacklisted.get_property("MyRec") is not None
    assert blacklisted.get_property("MyRec").value == top_level
    assert blacklisted.get_property("creation_date") is not None
    assert blacklisted.get_property("creation_date").value == "2023-12-31"

    # The "old" DictConverter should have added the additional property:
    assert top_level.get_property("additional_from_other") is not None
    assert top_level.get_property("additional_from_other").value == "other"
Loading