diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f3c0cf58115f760cf248d4e87b2c1b27fed5d5c..7df6439d5ee38b236a0731cf5ca09b82c7fcf002 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### +- The definitions for the default converters were removed from crawl.py and placed into + a separate yaml file called `default_converters.yml`. There is a new test testing for + the correct loading behavior of that file. + ### Deprecated ### ### Removed ### diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 7ad81e7bc7b00c5e6fc412051c7497154ce097ae..9cb2ceb4c5b2100388c632ea95a57b08069e0066 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -328,77 +328,13 @@ class Crawler(object): directory: schema.yml file README.md documentation + + TODO: this function does not make use of self, so it could become static. """ # Defaults for the converter registry: - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "SimpleFile": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "File": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - "YAMLFile": { - "converter": "YAMLFileConverter", - "package": "caoscrawler.converters"}, - "CSVTableConverter": { - "converter": "CSVTableConverter", - "package": "caoscrawler.converters"}, - "XLSXTableConverter": { - "converter": "XLSXTableConverter", - "package": "caoscrawler.converters"}, - "DictBooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "BooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "DictFloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "FloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "DictTextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "Date": { - "converter": "DateElementConverter", - "package": "caoscrawler.converters"}, - "DictIntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "IntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "DictListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "DictDictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "Dict": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - } + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) # More converters from definition file: if "Converters" in definition: diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 25251f62f4e8c0cb1c9e4551a3b1fa4e1ce1fe35..9390e65c08da6b19e335424b8021a88528b93e19 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -595,3 +595,25 @@ def test_date_converter(): matches = dictconverter.match(TextElement("text", "alve")) assert matches is None + + +def test_load_converters(): + c = Crawler() + converter_registry = c.load_converters({}) + # The previous function call actually already asserts that all defined + # converter classes can be loaded from their respective packages. + + # Please adapt, if defaults change! + assert len(converter_registry) == 22 + + # All of them are contained in caoscrawler.converters + for conv_key, conv in converter_registry.items(): + assert conv["package"] == "caoscrawler.converters" + # ... and their names all end in "Converter" + assert conv["converter"].endswith("Converter") + + # Some checks: + assert "CSVTableConverter" in converter_registry + assert "SimpleFile" in converter_registry + assert "Directory" in converter_registry + assert "ListElement" in converter_registry