diff --git a/CHANGELOG.md b/CHANGELOG.md index 33fdff70f9af8d1c2174dc0ec297b08762fdeb63..6d15568e4ecff8c9028d063b47419ab2a8adbd4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,11 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### +- `scanner.scan_structure_elements` now auto-generates the + `converter_registry` and the `registered_transformer_functions` from + the `crawler_definition` if none are given. Therefore, the + `converter_registry` argument is now optional. + ### Deprecated ### ### Removed ### ### Fixed ### + - A RecordType with multiple Parents no longer causes an error during collection of identifiables diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index af1f4173e95827606a02979ddd6d7fcd9f133271..cb8dfdf2bf65dba9fd206366722ad69ecfc251e4 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -486,7 +486,7 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, def scan_structure_elements(items: Union[list[StructureElement], StructureElement], crawler_definition: dict, - converter_registry: dict, + converter_registry: Optional[dict] = None, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None, registered_transformer_functions: Optional[dict] = None) -> ( @@ -508,6 +508,15 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen Traverse the data tree only along the given path. When the end of the given path is reached, traverse the full tree as normal. See docstring of 'scanner' for more details. + converter_registry: dict, optional + Optional dictionary containing the converter definitions + needed for the crawler definition. If none is given, it will + be generated from the `crawler_definition`. Default is None. + registered_transformer_functions: dict, optional + Optional dictionary containing the transformer function + definitions needed for the crawler definition. If none is + given, it will be generated from the + `crawler_definition`. Default is None. Returns ------- @@ -519,6 +528,10 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen if not isinstance(items, list): items = [items] + if converter_registry is None: + converter_registry = create_converter_registry(crawler_definition) + if registered_transformer_functions is None: + registered_transformer_functions = create_transformer_registry(crawler_definition) # TODO: needs to be covered somewhere else # self.run_id = uuid.uuid1() converters = initialize_converters(crawler_definition, converter_registry) diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index c531f66fd38a714ba4f6f538d41c9fbaeb364d44..80c7b3e937b586d5e1e0c847876e8328e03a7f77 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -61,6 +61,12 @@ def test_scan_structure_elements(): converter_registry) assert len(recs) == 4 + # Converter registry is generated automatically from the crawler + # definition in scan_structure_elements. + recs_without_converter_registry = scan_structure_elements(DictElement(name="", value=data), + crawler_definition) + assert len(recs) == len(recs_without_converter_registry) + def test_provenance_debug_data(): # TODO rewrite the test to use a smaller example setup