Skip to content
Snippets Groups Projects
Commit e73be6e5 authored by Alexander Schlemmer's avatar Alexander Schlemmer
Browse files

Merge branch 'f-refactor-default-converters' into 'dev'

F refactor default converters

See merge request !95
parents 7df30e6f 9cc69a95
No related branches found
No related tags found
2 merge requests!105REL: v0.4.0,!95F refactor default converters
Pipeline #33655 passed with warnings
...@@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed ### ### Changed ###
- The definitions of the default converters were moved from `crawl.py` into a
separate YAML file called `default_converters.yml`. A new test checks that
this file is loaded correctly.
### Deprecated ### ### Deprecated ###
### Removed ### ### Removed ###
......
...@@ -328,77 +328,13 @@ class Crawler(object): ...@@ -328,77 +328,13 @@ class Crawler(object):
directory: directory:
schema.yml file schema.yml file
README.md documentation README.md documentation
TODO: this function does not make use of self, so it could become static.
""" """
# Defaults for the converter registry: # Defaults for the converter registry:
converter_registry: dict[str, dict[str, str]] = { with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
"Directory": { converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
"converter": "DirectoryConverter",
"package": "caoscrawler.converters"},
"SimpleFile": {
"converter": "SimpleFileConverter",
"package": "caoscrawler.converters"},
"MarkdownFile": {
"converter": "MarkdownFileConverter",
"package": "caoscrawler.converters"},
"File": {
"converter": "SimpleFileConverter",
"package": "caoscrawler.converters"},
"JSONFile": {
"converter": "JSONFileConverter",
"package": "caoscrawler.converters"},
"YAMLFile": {
"converter": "YAMLFileConverter",
"package": "caoscrawler.converters"},
"CSVTableConverter": {
"converter": "CSVTableConverter",
"package": "caoscrawler.converters"},
"XLSXTableConverter": {
"converter": "XLSXTableConverter",
"package": "caoscrawler.converters"},
"DictBooleanElement": {
"converter": "BooleanElementConverter",
"package": "caoscrawler.converters"},
"BooleanElement": {
"converter": "BooleanElementConverter",
"package": "caoscrawler.converters"},
"DictFloatElement": {
"converter": "FloatElementConverter",
"package": "caoscrawler.converters"},
"FloatElement": {
"converter": "FloatElementConverter",
"package": "caoscrawler.converters"},
"DictTextElement": {
"converter": "TextElementConverter",
"package": "caoscrawler.converters"},
"TextElement": {
"converter": "TextElementConverter",
"package": "caoscrawler.converters"},
"Date": {
"converter": "DateElementConverter",
"package": "caoscrawler.converters"},
"DictIntegerElement": {
"converter": "IntegerElementConverter",
"package": "caoscrawler.converters"},
"IntegerElement": {
"converter": "IntegerElementConverter",
"package": "caoscrawler.converters"},
"DictListElement": {
"converter": "ListElementConverter",
"package": "caoscrawler.converters"},
"ListElement": {
"converter": "ListElementConverter",
"package": "caoscrawler.converters"},
"DictDictElement": {
"converter": "DictElementConverter",
"package": "caoscrawler.converters"},
"DictElement": {
"converter": "DictElementConverter",
"package": "caoscrawler.converters"},
"Dict": {
"converter": "DictElementConverter",
"package": "caoscrawler.converters"},
}
# More converters from definition file: # More converters from definition file:
if "Converters" in definition: if "Converters" in definition:
...@@ -1234,7 +1170,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) ...@@ -1234,7 +1170,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
children = converter.create_children(generalStore_copy, element) children = converter.create_children(generalStore_copy, element)
if self.debug: if self.debug:
# add provenance information for each varaible # add provenance information for each variable
self.debug_tree[str(element)] = ( self.debug_tree[str(element)] = (
generalStore_copy.get_storage(), recordStore_copy.get_storage()) generalStore_copy.get_storage(), recordStore_copy.get_storage())
self.debug_metadata["copied"][str(element)] = ( self.debug_metadata["copied"][str(element)] = (
......
# Default converter registry.
#
# Each entry maps a registry key to the converter class that implements it:
#   converter: name of the converter class
#   package:   Python package from which that class is loaded
#
# Entries marked "deprecated" point to the same converter class as another,
# non-deprecated entry.
#
# NOTE: test_load_converters asserts the total number of entries (currently
# 22); adapt that test when entries are added or removed here.
# -------------------------
# Base Types
# -------------------------
BooleanElement:
converter: BooleanElementConverter
package: caoscrawler.converters
Date:
converter: DateElementConverter
package: caoscrawler.converters
Dict:
converter: DictElementConverter
package: caoscrawler.converters
FloatElement:
converter: FloatElementConverter
package: caoscrawler.converters
IntegerElement:
converter: IntegerElementConverter
package: caoscrawler.converters
ListElement:
converter: ListElementConverter
package: caoscrawler.converters
TextElement:
converter: TextElementConverter
package: caoscrawler.converters
DictDictElement: # deprecated
converter: DictElementConverter
package: caoscrawler.converters
DictElement: # deprecated
converter: DictElementConverter
package: caoscrawler.converters
DictBooleanElement: # deprecated
converter: BooleanElementConverter
package: caoscrawler.converters
DictFloatElement: # deprecated
converter: FloatElementConverter
package: caoscrawler.converters
DictIntegerElement: # deprecated
converter: IntegerElementConverter
package: caoscrawler.converters
DictListElement: # deprecated
converter: ListElementConverter
package: caoscrawler.converters
DictTextElement: # deprecated
converter: TextElementConverter
package: caoscrawler.converters
# -------------------------
# Directories and Files
# -------------------------
Directory:
converter: DirectoryConverter
package: caoscrawler.converters
File: # deprecated
converter: SimpleFileConverter
package: caoscrawler.converters
SimpleFile:
converter: SimpleFileConverter
package: caoscrawler.converters
MarkdownFile:
converter: MarkdownFileConverter
package: caoscrawler.converters
YAMLFile:
converter: YAMLFileConverter
package: caoscrawler.converters
JSONFile:
converter: JSONFileConverter
package: caoscrawler.converters
CSVTableConverter:
converter: CSVTableConverter
package: caoscrawler.converters
XLSXTableConverter:
converter: XLSXTableConverter
package: caoscrawler.converters
...@@ -595,3 +595,25 @@ def test_date_converter(): ...@@ -595,3 +595,25 @@ def test_date_converter():
matches = dictconverter.match(TextElement("text", "alve")) matches = dictconverter.match(TextElement("text", "alve"))
assert matches is None assert matches is None
def test_load_converters():
    """Check that the default converter registry is loaded as expected.

    ``load_converters`` is called with an empty definition, so the returned
    registry is expected to contain the default converters only.
    """
    crawler = Crawler()
    registry = crawler.load_converters({})

    # Loading the registry already asserts that every defined converter class
    # can be loaded from its package, so only the registry contents are
    # checked below.
    # Adapt this number whenever the set of default converters changes.
    assert len(registry) == 22

    for entry in registry.values():
        # Every default converter is provided by caoscrawler.converters ...
        assert entry["package"] == "caoscrawler.converters"
        # ... and the name of its class ends in "Converter".
        assert entry["converter"].endswith("Converter")

    # Spot checks for a few of the registered keys.
    for expected_key in ("CSVTableConverter", "SimpleFile", "Directory", "ListElement"):
        assert expected_key in registry
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment