From 3c32b1f0222621b7bfd173b5a6f4c7b195974bea Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Mon, 24 Mar 2025 14:58:49 +0100 Subject: [PATCH 1/2] FEAT(scanner): Auto-generate converter_registry and registered_transformer_functions --- CHANGELOG.md | 6 ++++++ src/caoscrawler/scanner.py | 15 ++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33fdff70..6d15568e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### +- `scanner.scan_structure_elements` now auto-generates the + `converter_registry` and the `registered_transformer_functions` from + the `crawler_definition` if none are given. Therefore, the + `converter_registry` argument is now optional. + ### Deprecated ### ### Removed ### ### Fixed ### + - A RecordType with multiple Parents no longer causes an error during collection of identifiables diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index af1f4173..cb8dfdf2 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -486,7 +486,7 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, def scan_structure_elements(items: Union[list[StructureElement], StructureElement], crawler_definition: dict, - converter_registry: dict, + converter_registry: Optional[dict] = None, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None, registered_transformer_functions: Optional[dict] = None) -> ( @@ -508,6 +508,15 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen Traverse the data tree only along the given path. When the end of the given path is reached, traverse the full tree as normal. See docstring of 'scanner' for more details. + converter_registry: dict, optional + Optional dictionary containing the converter definitions + needed for the crawler definition. If none is given, it will + be generated from the `crawler_definition`. Default is None. + registered_transformer_functions: dict, optional + Optional dictionary containing the transformer function + definitions needed for the crawler definition. If none is + given, it will be generated from the + `crawler_definition`. Default is None. Returns ------- @@ -519,6 +528,10 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen if not isinstance(items, list): items = [items] + if converter_registry is None: + converter_registry = create_converter_registry(crawler_definition) + if registered_transformer_functions is None: + registered_transformer_functions = create_transformer_registry(crawler_definition) # TODO: needs to be covered somewhere else # self.run_id = uuid.uuid1() converters = initialize_converters(crawler_definition, converter_registry) -- GitLab From e9e6bb4523dfa2e61de87fb68a58813a4c5cf107 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Tue, 25 Mar 2025 09:46:18 +0100 Subject: [PATCH 2/2] TEST(scanner): Extend unit test for new behavior of scan_structure_elements --- unittests/test_scanner.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index c531f66f..80c7b3e9 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -61,6 +61,12 @@ def test_scan_structure_elements(): converter_registry) assert len(recs) == 4 + # Converter registry is generated automatically from the crawler + # definition in scan_structure_elements. + recs_without_converter_registry = scan_structure_elements(DictElement(name="", value=data), + crawler_definition) + assert len(recs) == len(recs_without_converter_registry) + def test_provenance_debug_data(): # TODO rewrite the test to use a smaller example setup -- GitLab