Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
test_converters.py 36.50 KiB
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

"""
test the converters module
"""
import datetime
import importlib
import json
import logging
import os
import pytest
import sys
import yaml

from itertools import product
from pathlib import Path

import linkahead as db

from caoscrawler.converters import (Converter, ConverterValidationError,
                                    DateElementConverter, DictElementConverter,
                                    DictIntegerElementConverter,
                                    DirectoryConverter, FloatElementConverter,
                                    IntegerElementConverter, JSONFileConverter,
                                    ListElementConverter, MarkdownFileConverter,
                                    PropertiesFromDictConverter,
                                    YAMLFileConverter,
                                    handle_value, replace_variables)
from caoscrawler.converters.converters import _AbstractScalarValueElementConverter
from caoscrawler.crawl import Crawler
from caoscrawler.scanner import (_load_definition_from_yaml_dict,
                                 create_converter_registry,
                                 create_transformer_registry,
                                 load_definition,
                                 scan_structure_elements)
from caoscrawler.stores import GeneralStore, RecordStore
from caoscrawler.structure_elements import (BooleanElement, DictElement,
                                            Directory, File, FloatElement,
                                            IntegerElement, ListElement,
                                            TextElement)
from caoscrawler.transformer_functions import replace, split

# Directory that contains this test module; the test data paths used below are
# built relative to it.
UNITTESTDIR = Path(__file__).parent


@pytest.fixture
def converter_registry():
    """Provide a small converter registry for the tests in this module.

    Every entry holds the converter class name (``"converter"``), the package
    it is imported from (``"package"``) and the resolved class object
    (``"class"``).
    """
    package_name = "caoscrawler.converters"
    class_names = {
        "Directory": "DirectoryConverter",
        "MarkdownFile": "MarkdownFileConverter",
        "Date": "DateElementConverter",
        "DictElement": "DictElementConverter",
        "PropertiesFromDictElement": "PropertiesFromDictConverter",
        "TextElement": "TextElementConverter",
        "ListElement": "ListElementConverter",
        "JSONFile": "JSONFileConverter",
    }
    registry: dict[str, dict[str, str]] = {
        type_name: {"converter": class_name, "package": package_name}
        for type_name, class_name in class_names.items()
    }

    # Resolve each class name to the actual class object.
    for entry in registry.values():
        package = importlib.import_module(entry["package"])
        entry["class"] = getattr(package, entry["converter"])
    return registry


def testConverterTrivial(converter_registry):
    """Check that the factory builds a converter from a bare type name."""
    # The list of types (including the repeated "TextElement") is kept as is.
    type_names = ("Directory", "MarkdownFile", "TextElement", "ListElement", "TextElement")

    for type_name in type_names:
        Converter.converter_factory(definition={"type": type_name},
                                    name="Test",
                                    converter_registry=converter_registry)


def testDirectoryConverter(converter_registry):
    """Create the children of the "test_directories" folder with a Directory converter."""
    directory_converter = Converter.converter_factory(
        definition={"type": "Directory"},
        name="Test",
        converter_registry=converter_registry)
    root = Directory("test_directories", UNITTESTDIR / "test_directories")
    children = directory_converter.create_children(GeneralStore(), root)

    # All children must be directories. Only some known names are checked
    # instead of the total number of children.
    names = []
    for child in children:
        assert isinstance(child, Directory)
        names.append(child.name)
    for expected in ("examples_article", "example_overwrite_1", "example_insert"):
        assert expected in names


def test_markdown_converter(converter_registry):
    """Match README files with the MarkdownFileConverter and check the created children."""

    def check_child(child, element_cls, name, value_cls):
        """Assert the exact element class, the name and the exact value class."""
        assert type(child) is element_cls
        assert child.name == name
        assert type(child.value) is value_cls

    article_dir = UNITTESTDIR / "test_directories" / "examples_article"
    first_readme = File(
        "README.md",
        article_dir / "DataAnalysis" / "2020_climate-model-predict"
        / "2020-02-08_prediction-errors" / "README.md")

    catch_all = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter",
                                      converter_registry)

    # Creating children from a Python source file must fail validation.
    with pytest.raises(ConverterValidationError):
        catch_all.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py"))

    # The pattern has no named groups, so the match result is an empty dict.
    match = catch_all.match(first_readme)
    assert match is not None
    assert type(match) is dict
    assert len(match) == 0

    by_name = MarkdownFileConverter({"match": "README.md"}, "TestMarkdownFileConverter",
                                    converter_registry)

    match = by_name.match(first_readme)
    assert match is not None
    assert len(match) == 0

    created = by_name.create_children(None, first_readme)
    assert len(created) == 5
    check_child(created[1], TextElement, "description", str)
    check_child(created[0], TextElement, "responsible", str)

    second_readme = File(
        "README.md",
        article_dir / "ExperimentalData" / "2020_SpeedOfLight"
        / "2020-01-01_TimeOfFlight" / "README.md")

    match = by_name.match(second_readme)
    assert match is not None
    assert len(match) == 0

    # In this file "responsible" is expected to be a list.
    created = by_name.create_children(None, second_readme)
    assert len(created) == 2
    check_child(created[1], TextElement, "description", str)
    check_child(created[0], ListElement, "responsible", list)


def test_json_converter(converter_registry):
    """Test the JSONFileConverter with a valid, an invalid and a broken JSON file.

    The valid file is validated against a JSON schema and its top-level
    entries are checked for element type, value type and (partly) value.  The
    invalid file must raise a ``ConverterValidationError``, the broken one a
    ``json.decoder.JSONDecodeError``.
    """
    test_json = File("testjson.json", UNITTESTDIR /
                     "test_directories" / "examples_json" / "testjson.json")

    schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "test_directories", "examples_json", "testjson.schema.json")
    jsonconverter = JSONFileConverter(
        definition={"match": "(.*)", "validate": schema_path},
        name="TestJSONFileConverter",
        converter_registry=converter_registry)

    m = jsonconverter.match(test_json)
    assert m is not None
    assert len(m) == 0

    dict_el = jsonconverter.create_children(None, test_json)
    assert len(dict_el) == 1

    dictconverter = DictElementConverter(
        definition={"match_name": "(.*)"},
        name="dictconv",
        converter_registry=converter_registry)
    children = dictconverter.create_children(None, dict_el[0])
    for child in children:
        if child.name == "name":
            assert isinstance(child, TextElement)
            assert isinstance(child.value, str)
            assert child.value == "DEMO"
        elif child.name == "projectId":
            assert isinstance(child, IntegerElement)
            assert isinstance(child.value, int)
            assert child.value == 10002
        elif child.name == "archived":
            assert isinstance(child, BooleanElement)
            assert isinstance(child.value, bool)
            assert child.value is False
        elif child.name == "Person":
            assert isinstance(child, ListElement)
            assert isinstance(child.value, list)
            assert len(child.value) == 2
        elif child.name == "start_date":
            assert isinstance(child, TextElement)
            assert isinstance(child.value, str)
            assert child.value == '2022-03-01'
        elif child.name == "candidates":
            assert isinstance(child, ListElement)
            assert isinstance(child.value, list)
            assert child.value == ["Mouse", "Penguine"]
        elif child.name == "rvalue":
            assert isinstance(child, FloatElement)
            assert isinstance(child.value, float)
        elif child.name == "url":
            assert isinstance(child, TextElement)
            assert isinstance(child.value, str)
        else:
            # Any child that is not listed above is unexpected.
            raise ValueError()

    invalid_json = File(
        "invalidjson.json",
        UNITTESTDIR / "test_directories" / "examples_json" / "invalidjson.json"
    )
    # Doesn't validate because of missing required 'name' property.
    # The message is checked via ``match`` because statements placed after the
    # raising call inside the ``with`` block would never be executed.
    with pytest.raises(ConverterValidationError, match="Couldn't validate"):
        jsonconverter.create_children(None, invalid_json)

    broken_json = File(
        "brokenjson.json",
        UNITTESTDIR / "test_directories" / "examples_json" / "brokenjson.json"
    )
    with pytest.raises(json.decoder.JSONDecodeError):
        jsonconverter.create_children(None, broken_json)


def test_yaml_converter(converter_registry):
    """Test the YAMLFileConverter with a valid, an invalid and a broken YAML file.

    The valid file is validated against a JSON schema and its top-level
    entries are checked for element type, value type and (partly) value.  The
    invalid file must raise a ``ConverterValidationError``, the broken one a
    ``yaml.parser.ParserError``.
    """
    test_yaml = File("testyaml.yml", UNITTESTDIR /
                     "test_directories" / "test_yamls" / "testyaml.yml")

    schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "test_directories", "test_yamls", "testyaml.schema.json")
    yamlconverter = YAMLFileConverter(
        definition={"match": "(.*)", "validate": schema_path},
        name="TestYAMLFileConverter",
        converter_registry=converter_registry)

    m = yamlconverter.match(test_yaml)
    assert m is not None
    assert len(m) == 0

    dict_el = yamlconverter.create_children(None, test_yaml)
    assert len(dict_el) == 1

    dictconverter = DictElementConverter(
        definition={"match_name": "(.*)"},
        name="dictconv",
        converter_registry=converter_registry)
    children = dictconverter.create_children(None, dict_el[0])
    for child in children:
        if child.name == "name":
            assert isinstance(child, TextElement)
            assert isinstance(child.value, str)
            assert child.value == "DEMO"
        elif child.name == "projectId":
            assert isinstance(child, IntegerElement)
            assert isinstance(child.value, int)
            assert child.value == 10002
        elif child.name == "archived":
            assert isinstance(child, BooleanElement)
            assert isinstance(child.value, bool)
            assert child.value is False
        elif child.name == "Person":
            assert isinstance(child, ListElement)
            assert isinstance(child.value, list)
            assert len(child.value) == 2
        elif child.name == "start_date":
            assert isinstance(child, TextElement)
            assert isinstance(child.value, str)
            assert child.value == '2022-03-01'
        elif child.name == "candidates":
            assert isinstance(child, ListElement)
            assert isinstance(child.value, list)
            assert child.value == ["Mouse", "Penguine"]
        elif child.name == "rvalue":
            assert isinstance(child, FloatElement)
            assert isinstance(child.value, float)
        elif child.name == "url":
            assert isinstance(child, TextElement)
            assert isinstance(child.value, str)
        else:
            # Any child that is not listed above is unexpected.
            raise ValueError()

    invalid_yaml = File(
        "invalidyaml.yml",
        UNITTESTDIR / "test_directories" / "test_yamls" / "invalidyaml.yml"
    )

    # Doesn't validate because of missing required 'name' property.
    # The message is checked via ``match`` because statements placed after the
    # raising call inside the ``with`` block would never be executed.
    with pytest.raises(ConverterValidationError, match="Couldn't validate"):
        yamlconverter.create_children(None, invalid_yaml)

    broken_yaml = File(
        "brokenyaml.yml",
        UNITTESTDIR / "test_directories" / "test_yamls" / "brokenyaml.yml"
    )
    with pytest.raises(yaml.parser.ParserError):
        yamlconverter.create_children(None, broken_yaml)


def test_variable_replacement():
    """Test ``replace_variables`` and ``handle_value`` against a small store.

    ``handle_value`` is expected to return a ``(value, unit, collection_mode)``
    tuple for plain strings, dict definitions and lists.
    """
    values = GeneralStore()
    values["a"] = 4
    values["b"] = "68"
    values["my_unit"] = "m"

    # basic values stay unchanged
    # (numbers are compared with ``==``: identity checks against int literals
    # emit a SyntaxWarning and depend on interpreter caching)
    assert replace_variables(5, values) == 5
    assert replace_variables(True, values) is True
    assert replace_variables("$a", values) == 4
    assert replace_variables("${b}", values) == "68"

    # values given as simple strings never have units
    assert handle_value("b", values) == ("b", None, "single")
    assert handle_value("+b", values) == ("b", None, "list")
    assert handle_value("*b", values) == ("b", None, "multiproperty")
    assert handle_value("$b", values) == ("68", None, "single")
    assert handle_value("+$b", values) == ("68", None, "list")
    assert handle_value("*$b", values) == ("68", None, "multiproperty")

    # No units in dicts
    assert handle_value({"value": "b",
                         "collection_mode": "single"}, values) == ("b", None, "single")
    assert handle_value({"value": "b",
                         "collection_mode": "list"}, values) == ("b", None, "list")
    assert handle_value({"value": "b",
                         "collection_mode": "multiproperty"}, values) == ("b", None, "multiproperty")
    assert handle_value({"value": "$b",
                         "collection_mode": "single"}, values) == ("68", None, "single")
    assert handle_value({"value": "$b",
                         "collection_mode": "list"}, values) == ("68", None, "list")
    assert handle_value({"value": "$b",
                         "collection_mode": "multiproperty"}, values) == ("68", None, "multiproperty")

    # Unit specified in the same way as value:
    assert handle_value({"value": 5, "unit": "m"}, values) == (5, "m", "single")

    # Lists are handled element-wise and keep the "single" collection mode.
    assert handle_value(["a", "b"], values) == (["a", "b"], None, "single")
    assert handle_value(["$a", "$b"], values) == ([4, "68"], None, "single")


def test_apply_transformers(converter_registry):
    """Test that ``apply_transformers`` writes the transformed value to the store.

    First the result of ``split`` goes into a new variable and the input
    variable stays untouched, then the output overwrites the input variable.
    """
    cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*",
                 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{
                     'split': {'marker': '|'}}]}}}
    values = GeneralStore()
    values["a"] = "a|b|c"

    # A hand-built registry that only contains the function used here, instead
    # of one created with create_transformer_registry.
    transformer_functions = {"split": split}

    conv = ListElementConverter(definition=cfood_def, name='test',
                                converter_registry=converter_registry)

    # Strings are compared with ``==``: identity checks against str literals
    # emit a SyntaxWarning and depend on string interning.
    assert values['a'] == "a|b|c"
    conv.apply_transformers(values, transformer_functions)
    assert values['a'] == "a|b|c"
    assert values['b'] == ["a", "b", "c"]

    # Check replacing of existing variable
    cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*",
                 'transform': {'test': {'in': '$a', 'out': '$a', 'functions': [{
                     'split': {'marker': '|'}}]}}}
    conv = ListElementConverter(definition=cfood_def, name='test',
                                converter_registry=converter_registry)

    conv.apply_transformers(values, transformer_functions)
    assert values['a'] == ["a", "b", "c"]


def test_filter_children_of_directory(converter_registry, capsys):
    """Check the ``filter`` option of the DirectoryConverter.

    The rules ``only_max`` and ``only_min`` are expected to keep exactly one
    of the dated JSON files plus the file that does not match the filter
    expression; an unknown rule must raise a ``RuntimeError``.
    """
    date_expr = "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json"
    test_dir = Directory("examples_filter_children", UNITTESTDIR /
                         "test_directories" / "examples_filter_children")

    def make_converter(rule, name):
        """Build a DirectoryConverter that filters on the date group with ``rule``."""
        return DirectoryConverter(
            definition={
                "match": "(.*)",
                "filter": {"expr": date_expr, "group": "date", "rule": rule},
            },
            name=name,
            converter_registry=converter_registry,
        )

    # only_max keeps the JSON file with the latest date, only_min the one with
    # the earliest date.  The csv does not match the expression and stays.
    expectations = (
        ("only_max", "TestOnlyMaxDirectoryConverter", "test_2022-02-02.json"),
        ("only_min", "TestOnlyMinDirectoryConverter", "test_2022-01-01.json"),
    )
    for rule, converter_name, expected_json in expectations:
        converter = make_converter(rule, converter_name)
        assert converter.match(test_dir) is not None

        kept = converter.create_children(None, test_dir)
        assert len(kept) == 2
        assert type(kept[0]) is File
        assert kept[0].name == expected_json
        assert type(kept[1]) is File
        assert kept[1].name == "some_other_file.csv"

    # An unknown rule still matches the directory but fails on child creation.
    broken = make_converter("does_not_exist", "TestBrokenDirectoryConverter")
    assert broken.match(test_dir) is not None

    with pytest.raises(RuntimeError):
        broken.create_children(None, test_dir)


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_validate_custom_converters():
    """Load a custom converter declaration from a one- and a two-document cfood.

    Both variants must yield a definition for ``MyElement`` with the same
    custom type.
    """
    single_doc_text = """
Converters:
  MyNewType:
    converter: MyNewTypeConverter
    package: some_package.my_converters
MyElement:
  type: MyNewType
  match: something
    """
    single_doc = _load_definition_from_yaml_dict([yaml.safe_load(single_doc_text)])
    assert "MyElement" in single_doc
    assert single_doc["MyElement"]["type"] == "MyNewType"

    # The same definition, with the converters declared in a separate
    # metadata document.
    double_doc_text = """
---
metadata:
  crawler-version: 0.9.0
  Converters:
    MyNewType:
      converter: MyNewTypeConverter
      package: some_package.my_converters
---
MyElement:
  type: MyNewType
  match: something
    """
    documents = list(yaml.safe_load_all(double_doc_text))
    double_doc = _load_definition_from_yaml_dict(documents)
    assert "MyElement" in double_doc
    assert double_doc["MyElement"]["type"] == single_doc["MyElement"]["type"]


def test_abstract_dict_element_converter():
    """Match a multi-line text value and extract the part between the markers."""
    conv_definition = yaml.safe_load("""
match_name: text
match_value: .*begin(?P<text>.*)end
accept_text: True
    """)
    # Passing None as the registry is possible when "subtree" is not used.
    scalar_converter = _AbstractScalarValueElementConverter(
        conv_definition, "test_converter", None)

    multiline_element = TextElement("text", "\nbegin\nbla\nend")
    result = scalar_converter.match(multiline_element)
    assert result is not None
    assert result["text"] == "\nbla\n"


def test_converter_value_match(converter_registry):
    """Check the ``accept_int`` and ``accept_float`` options of the numeric converters."""
    base_definition = {"match_name": "(.*)", "match_value": "(.*)"}
    int_four = IntegerElement(name="a", value=4)

    # With default options the float converter matches an integer element.
    float_conv = FloatElementConverter(
        definition=dict(base_definition),
        name="Test",
        converter_registry=converter_registry)
    assert float_conv.match(int_four) is not None

    # With accept_int switched off the type check rejects integer elements.
    strict_float_conv = FloatElementConverter(
        definition={**base_definition, "accept_int": False},
        name="Test",
        converter_registry=converter_registry)
    assert strict_float_conv.typecheck(IntegerElement(name="a", value=4)) is False

    # With accept_float switched on the integer converter matches a float element.
    lenient_int_conv = IntegerElementConverter(
        definition={**base_definition, "accept_float": True},
        name="Test",
        converter_registry=converter_registry)
    assert lenient_int_conv.match(FloatElement(name="a", value=4.0)) is not None


def test_match_debug(converter_registry, caplog):
    """Check the debug log output of ``match`` for all combinations of match options.

    Defining both ``match`` and ``match_name`` must raise a ``RuntimeError``.
    """
    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
    patterns = [".*", None]
    for match_expr, name_expr, value_expr in product(patterns, patterns, patterns):
        options = (("match", match_expr),
                   ("match_name", name_expr),
                   ("match_value", value_expr))
        definition = {"debug_match": True}
        definition.update({key: expr for key, expr in options if expr})
        converter = FloatElementConverter(
            definition=definition,
            name="Test",
            converter_registry=converter_registry
        )

        if match_expr and name_expr:
            with pytest.raises(RuntimeError):
                converter.match(IntegerElement(name="a", value=4))
            continue

        result = converter.match(IntegerElement(name="a", value=4))
        if any(expr is not None for expr in (match_expr, name_expr, value_expr)):
            assert result is not None
            # The log must contain the element name, the regular expression
            # and the empty result set.
            for fragment in ("a", ".*", "{}"):
                assert fragment in caplog.text
            caplog.clear()


def test_date_converter(converter_registry):
    """Test that the DateElementConverter turns matched text into ``datetime.date``.

    Covers the default date format, a custom ``date_format`` and a value that
    does not match.

    The ``converter_registry`` fixture is requested explicitly; without the
    parameter the name would refer to the module-level fixture function
    instead of the registry dict.
    """
    dictconverter = DateElementConverter(
        definition={"match_value": "(?P<date>.*)"},
        name="conv",
        converter_registry=converter_registry)
    matches = dictconverter.match(TextElement("text", "2022-11-11"))
    assert "date" in matches
    assert isinstance(matches["date"], datetime.date)
    assert matches["date"].year == 2022

    # Two-digit year, parsed with a custom format.
    dictconverter = DateElementConverter(
        definition={"match_value": r"(?P<date>(\d|-)+)",
                    "date_format": "%y-%m-%d"},
        name="conv",
        converter_registry=converter_registry)
    matches = dictconverter.match(TextElement("text", "22-11-11"))
    assert "date" in matches
    assert isinstance(matches["date"], datetime.date)
    assert matches["date"].year == 2022

    # A value that does not fit the expression yields no match.
    matches = dictconverter.match(TextElement("text", "alve"))
    assert matches is None


def test_load_converters():
    """Check the default converter registry created from an empty definition."""
    # Creating the registry already asserts that every default converter class
    # can be loaded from its package.
    registry = create_converter_registry({})

    # Please adapt, if defaults change!
    assert len(registry) == 29

    # Every entry is checked: it must come from caoscrawler.converters and its
    # class name must end in "Converter".
    for entry in registry.values():
        assert entry["package"] == "caoscrawler.converters"
        assert entry["converter"].endswith("Converter")

    # Some spot checks for known type names:
    for type_name in ("CSVTableConverter", "SimpleFile", "Directory", "ListElement"):
        assert type_name in registry


def test_create_path_value(converter_registry):
    """Check that ``create_values`` stores the element's path as ``<name>.path``."""
    directory_converter = Converter.converter_factory(
        definition={"type": "Directory", "match": ".*"},
        name="Test",
        converter_registry=converter_registry)

    store = GeneralStore()
    directory_converter.create_values(store, Directory("a", "/a"))

    assert "Test.path" in store
    assert store["Test.path"] == "/a"


def test_properties_from_dict_basic(converter_registry):
    """Create a record from a dict and inspect every kind of generated property.

    The definition uses a property blacklist and a named reference
    (``authors``) with explicit parents.  The input dict contains scalars, a
    list, a referenced dict, a list of referenced dicts, a reference that
    contains another reference, blacklisted entries and the named reference.

    """

    def parent_names(record):
        """Return the names of all parents of ``record``."""
        return [par.name for par in record.parents]

    converter = PropertiesFromDictConverter(
        definition={
            "type": "PropertiesFromDictElement",
            "match": ".*",
            "record_from_dict": {
                "variable_name": "MyRec",
                "parents": ["DictRT1", "DictRT2"],
                "properties_blacklist": ["blacklisted_int", "blacklisted_ref"],
                "references": {
                    "authors": {
                        "parents": ["Person"]
                    }
                }
            }
        },
        name="Test", converter_registry=converter_registry)

    general_store = GeneralStore()
    record_store = RecordStore()
    dict_element = DictElement("TestDictElement", {
        "a": 5,
        "b": ["a", "b", "c"],
        "scalar_ref": {
            "name": "Scalar Ref",
            "a": 23,
            "blacklisted_int": 42
        },
        "list_ref": [
            {"c": True},
            {"c": False}
        ],
        "ref_with_ref": {
            "a": 789,
            "ref_in_ref": {
                "b": "something"
            }
        },
        "blacklisted_int": -123,
        "blacklisted_ref": {
            "a": 25
        },
        "authors": {
            "full_name": "Some Author"
        }
    })
    converter.create_records(values=general_store, records=record_store,
                             element=dict_element)

    # The record itself, with both configured parents.
    assert "MyRec" in record_store
    my_rec = record_store["MyRec"]
    assert isinstance(my_rec, db.Record)
    assert len(my_rec.parents) == 2
    top_level_parents = parent_names(my_rec)
    assert "DictRT1" in top_level_parents
    assert "DictRT2" in top_level_parents

    # Scalar property.
    scalar_prop = my_rec.get_property("a")
    assert scalar_prop is not None
    assert scalar_prop.value == 5

    # List property.
    list_prop = my_rec.get_property("b")
    assert list_prop is not None
    assert len(list_prop.value) == 3
    for letter in ["a", "b", "c"]:
        assert letter in list_prop.value

    # Scalar reference: the referenced record has the property name as parent.
    scalar_ref_prop = my_rec.get_property("scalar_ref")
    assert scalar_ref_prop is not None
    referenced = scalar_ref_prop.value
    assert isinstance(referenced, db.Record)
    assert referenced.name == "Scalar Ref"
    assert len(referenced.parents) == 1
    assert "scalar_ref" in parent_names(referenced)
    assert referenced.get_property("a") is not None
    assert referenced.get_property("a").value == 23
    # The blacklist is also expected to apply inside the referenced record.
    assert referenced.get_property("blacklisted_int") is None

    # List of references.
    list_ref_prop = my_rec.get_property("list_ref")
    assert list_ref_prop is not None
    assert isinstance(list_ref_prop.value, list)
    assert len(list_ref_prop.value) == 2
    for item in list_ref_prop.value:
        assert isinstance(item, db.Record)
        assert len(item.parents) == 1
        assert "list_ref" in parent_names(item)
        assert item.get_property("c") is not None
        assert type(item.get_property("c").value) is bool
    flags = [item.get_property("c").value for item in list_ref_prop.value]
    assert True in flags
    assert False in flags

    # Reference that itself contains a reference.
    nested_prop = my_rec.get_property("ref_with_ref")
    assert nested_prop is not None
    outer_rec = nested_prop.value
    assert isinstance(outer_rec, db.Record)
    assert len(outer_rec.parents) == 1
    assert "ref_with_ref" in parent_names(outer_rec)
    assert outer_rec.get_property("a") is not None
    assert outer_rec.get_property("a").value == 789
    assert outer_rec.get_property("ref_in_ref") is not None
    inner_rec = outer_rec.get_property("ref_in_ref").value
    assert isinstance(inner_rec, db.Record)
    assert len(inner_rec.parents) == 1
    assert "ref_in_ref" in parent_names(inner_rec)
    assert inner_rec.get_property("b") is not None
    assert inner_rec.get_property("b").value == "something"

    # Blacklisted entries must not show up as properties.
    for blacklisted in ("blacklisted_int", "blacklisted_ref"):
        assert my_rec.get_property(blacklisted) is None

    # Named reference property: the parent is taken from the definition.
    authors_prop = my_rec.get_property("authors")
    assert authors_prop is not None
    author_rec = authors_prop.value
    assert isinstance(author_rec, db.Record)
    assert len(author_rec.parents) == 1
    assert "Person" in parent_names(author_rec)
    assert author_rec.get_property("full_name") is not None
    assert author_rec.get_property("full_name").value == "Some Author"


def test_properties_from_dict_callable(converter_registry):
    """Check that the ``referenced_record_callback`` is applied to the created records.

    The callback is expected to rewrite the ``url`` property of the top-level
    record and of a referenced record, and to leave a value that already
    starts with "http" untouched.
    """

    def convert_some_values(rec: db.Record, records: RecordStore, values: GeneralStore):
        """Prefix the value of the "url" property unless it starts with "http"."""
        url_prop = rec.get_property("url")
        if url_prop is None:
            return rec

        old_val = url_prop.value
        if old_val is None or old_val.startswith("http"):
            # Nothing to convert: no value, or it already looks like an URL.
            return rec

        url_prop.value = f"https://test.com/{old_val}"
        return rec

    converter = PropertiesFromDictConverter(
        definition={
            "record_from_dict": {
                "variable_name": "MyRec",
                "name": "My New Record"
            }
        },
        name="TestConverter",
        converter_registry=converter_registry,
        referenced_record_callback=convert_some_values
    )

    general_store = GeneralStore()
    record_store = RecordStore()
    dict_element = DictElement("TestDictElement", {
        "url": "something",
        "referenced1": {
            "url": "referenced"
        },
        "referenced2": {
            "nourl": "something else",
            "url": "https://indiscale.com"
        }
    })
    converter.create_records(values=general_store, records=record_store,
                             element=dict_element)

    # No parents are configured, so the variable name is the only parent.
    assert "MyRec" in record_store
    my_rec = record_store["MyRec"]
    assert isinstance(my_rec, db.Record)
    assert len(my_rec.parents) == 1
    assert "MyRec" in [par.name for par in my_rec.parents]
    assert my_rec.name == "My New Record"

    # Conversion on the top-level record.
    top_url = my_rec.get_property("url")
    assert top_url is not None
    assert top_url.value == "https://test.com/something"

    # Conversion inside a referenced record.
    first_ref_prop = my_rec.get_property("referenced1")
    assert first_ref_prop is not None
    referenced1 = first_ref_prop.value
    assert isinstance(referenced1, db.Record)
    assert referenced1.get_property("url") is not None
    assert referenced1.get_property("url").value == "https://test.com/referenced"

    # Values that already look like an URL, and other properties, stay as they are.
    second_ref_prop = my_rec.get_property("referenced2")
    assert second_ref_prop is not None
    referenced2 = second_ref_prop.value
    assert isinstance(referenced2, db.Record)
    assert referenced2.get_property("nourl") is not None
    assert referenced2.get_property("nourl").value == "something else"
    assert referenced2.get_property("url") is not None
    assert referenced2.get_property("url").value == "https://indiscale.com"


def test_properties_from_dict_nested(converter_registry):
    """Scan a nested dict with a PropertiesFromDictElement and plain
    DictElement converters that act on the same Record from different
    levels of the definition tree.

    The Record variable ``MyRec`` is declared at the root, named by a
    top-level text element, filled from ``propertiesDict`` (minus the
    blacklisted key) and extended from ``otherDict``.  The blacklisted
    sub-dict is handled by its own subtree and becomes ``BLRec``.

    """
    # ---- Input data -------------------------------------------------------
    scanned_data = {
        "TopLevelRec": "MyRec",
        "propertiesDict": {
            "a": 5,
            "blacklisted": {
                "bl_name": "BlackList",
                "date": "2023-12-31",
            },
        },
        "otherDict": {
            "additional_from_other": "other",
        },
    }
    root_dict_element = DictElement("RootDict", scanned_data)

    # ---- Crawler definition, assembled from the leaves upwards ------------
    # Top-level text element which provides the name of MyRec
    name_elt = {
        "type": "TextElement",
        "match_name": "^TopLevelRec$",
        "match_value": "(?P<name>.*)",
        "records": {"MyRec": {"name": "$name"}},
    }

    # Text elements inside the blacklisted dict, both feeding BLRec
    bl_name_elt = {
        "type": "TextElement",
        "match_name": "^bl_name$",
        "match_value": "(?P<name>.*)",
        "records": {"BLRec": {"name": "$name"}},
    }
    bl_date_elt = {
        "type": "TextElement",
        "match_name": "^date$",
        "match_value": "(?P<date>.*)",
        "records": {"BLRec": {"creation_date": "$date"}},
    }

    # The blacklisted dict gets its own Record which references MyRec
    bl_element = {
        "type": "DictElement",
        "match_name": "^blacklisted$",
        "records": {
            "BLRec": {
                "parents": ["BlackListedType"],
                "MyRec": "$MyRec",
            },
        },
        "subtree": {
            "BLNameElt": bl_name_elt,
            "BLDateElt": bl_date_elt,
        },
    }

    # Properties of MyRec are taken from the dict, except the blacklisted key
    pfd_element = {
        "type": "PropertiesFromDictElement",
        "match_name": "^propertiesDict$",
        "record_from_dict": {
            "variable_name": "MyRec",
            "properties_blacklist": ["blacklisted"],
        },
        "subtree": {"BLElement": bl_element},
    }

    # Other dict which uses the DictElementConverter
    other_dict_element = {
        "type": "DictElement",
        "match_name": "^otherDict$",
        "subtree": {
            "additionalElt": {
                "type": "TextElement",
                "match_name": "^additional_from_other$",
                "match_value": "(?P<val>.*)",
                "records": {"MyRec": {"additional_from_other": "$val"}},
            },
        },
    }

    def_dict = {
        "RootElt": {
            # Root dictionary
            "type": "DictElement",
            "match": ".*",
            # Define top-level, use below in subtrees
            "records": {"MyRec": {"parents": ["MyType"]}},
            "subtree": {
                "NameElt": name_elt,
                "PFDElement": pfd_element,
                "OtherDictElement": other_dict_element,
            },
        },
    }

    # ---- Scan -------------------------------------------------------------
    records = scan_structure_elements(root_dict_element, def_dict, converter_registry)

    # Exactly the two expected records must have been created
    assert len(records) == 2
    records_by_name = {rec.name: rec for rec in records}
    myrec = records_by_name.get("MyRec")
    blrec = records_by_name.get("BlackList")
    assert myrec is not None
    assert blrec is not None

    # The parent comes from the root-level definition
    assert [par.name for par in myrec.parents] == ["MyType"]

    # Properties set automatically from the dict, respecting the blacklist
    prop_a = myrec.get_property("a")
    assert prop_a is not None
    assert prop_a.value == 5
    assert myrec.get_property("blacklisted") is None

    # The blacklisted dict became its own record via the subtree
    assert [par.name for par in blrec.parents] == ["BlackListedType"]
    ref_to_myrec = blrec.get_property("MyRec")
    assert ref_to_myrec is not None
    assert ref_to_myrec.value == myrec
    creation_date = blrec.get_property("creation_date")
    assert creation_date is not None
    assert creation_date.value == "2023-12-31"

    # The "old" DictConverter should have added the additional property:
    additional = myrec.get_property("additional_from_other")
    assert additional is not None
    assert additional.value == "other"