From 2eb36c7080e52619709b1a0b8276ad18191cb3cf Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Mon, 6 Feb 2023 17:48:26 +0100 Subject: [PATCH] MAIN: Moved the definitions for the default converters to a separate file. --- CHANGELOG.md | 4 ++ src/caoscrawler/crawl.py | 72 ++---------------------------------- unittests/test_converters.py | 22 +++++++++++ 3 files changed, 30 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f3c0cf5..7df6439d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### +- The definitions for the default converters were removed from `crawl.py` and placed into + a separate YAML file called `default_converters.yml`. A new test checks that this file + is loaded correctly. + ### Deprecated ### ### Removed ### diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 7ad81e7b..9cb2ceb4 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -328,77 +328,13 @@ class Crawler(object): directory: schema.yml file README.md documentation + + TODO: this function does not make use of self, so it could become static. 
""" # Defaults for the converter registry: - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "SimpleFile": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "File": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - "YAMLFile": { - "converter": "YAMLFileConverter", - "package": "caoscrawler.converters"}, - "CSVTableConverter": { - "converter": "CSVTableConverter", - "package": "caoscrawler.converters"}, - "XLSXTableConverter": { - "converter": "XLSXTableConverter", - "package": "caoscrawler.converters"}, - "DictBooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "BooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "DictFloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "FloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "DictTextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "Date": { - "converter": "DateElementConverter", - "package": "caoscrawler.converters"}, - "DictIntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "IntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "DictListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - 
"DictDictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "Dict": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - } + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) # More converters from definition file: if "Converters" in definition: diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 25251f62..9390e65c 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -595,3 +595,25 @@ def test_date_converter(): matches = dictconverter.match(TextElement("text", "alve")) assert matches is None + + +def test_load_converters(): + c = Crawler() + converter_registry = c.load_converters({}) + # The previous function call actually already asserts that all defined + # converter classes can be loaded from their respective packages. + + # Please adapt, if defaults change! + assert len(converter_registry) == 22 + + # All of them are contained in caoscrawler.converters + for conv_key, conv in converter_registry.items(): + assert conv["package"] == "caoscrawler.converters" + # ... and their names all end in "Converter" + assert conv["converter"].endswith("Converter") + + # Some checks: + assert "CSVTableConverter" in converter_registry + assert "SimpleFile" in converter_registry + assert "Directory" in converter_registry + assert "ListElement" in converter_registry -- GitLab