diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py
index 14a4020c744ad69ffe6884ad8a317214dd2ad02e..9b1c945078180e6a4710cbe15cfd05fd66f4251f 100644
--- a/src/newcrawler/converters.py
+++ b/src/newcrawler/converters.py
@@ -364,44 +364,18 @@ class MarkdownFileConverter(Converter):
         return m.groupdict()
 
 
-class JSONFileConverter(Converter):
-    def typecheck(self, element: StructureElement):
-        print(type(element))
-        return isinstance(element, File)
-
-    def match(self, element: StructureElement):
-        if not self.typecheck(element):
-            raise RuntimeError("Element must be a file")
-        m = re.match(self.definition["match"], element.name)
-        if m is None:
-            return None
-        if "validate" in self.definition and self.definition["validate"]:
-            # TODO(fspreck) validate against json schema, raise a Validation
-            # error if not valid.
-            pass
-        return m.groupdict()
-
-    def create_children(self, generalStore: GeneralStore, element: StructureElement):
-        if not self.typecheck(element):
-            raise RuntimeError("A JSON file is needed to create children")
-        with open(element.path, 'r') as json_file:
-            json_data = json.load(json_file)
-        if not isinstance(json_data, dict):
-            raise NotImplementedError("JSON file must contain a dict")
-
-        children = []
-        return [Dict(name="json_dict", value=json_data)]
-
-
 class DictConverter(Converter):
 
     # TODO use Dict as typecheck?
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
         if not self.typecheck(element):
             raise RuntimeError("A dict is needed to create children")
 
+        return self._create_children_from_dict(element.value)
+
+    def _create_children_from_dict(self, data):
         children = []
-        for name, value in element.value.items():
+        for name, value in data.items():
             # TODO(fspreck): Treat similar to yaml header and introduce more
             # DictXXXElements for numbers, booleans, enums, ...
             if type(value) == list:
@@ -433,6 +407,33 @@ class DictConverter(Converter):
         return {}
 
 
+class JSONFileConverter(DictConverter):
+    def typecheck(self, element: StructureElement):
+        return isinstance(element, File)
+
+    def match(self, element: StructureElement):
+        if not self.typecheck(element):
+            raise RuntimeError("Element must be a file")
+        m = re.match(self.definition["match"], element.name)
+        if m is None:
+            return None
+        if "validate" in self.definition and self.definition["validate"]:
+            # TODO(fspreck) validate against json schema, raise a Validation
+            # error if not valid.
+            pass
+        return m.groupdict()
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        if not self.typecheck(element):
+            raise RuntimeError("A JSON file is needed to create children")
+        with open(element.path, 'r') as json_file:
+            json_data = json.load(json_file)
+        if not isinstance(json_data, dict):
+            raise NotImplementedError("JSON file must contain a dict")
+
+        return self._create_children_from_dict(json_data)
+
+
 class DictTextElementConverter(Converter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
         return []
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 278a1f2eb297ca1327b349b0955b69d83ca7a48d..935a70caa94f2d8c794d714817547d7ea2d39fb2 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -181,12 +181,6 @@ def test_json_converter(converter_registry):
     assert len(m) == 0
 
     children = jsonconverter.create_children(None, test_json)
-    dictconverter = DictConverter(
-        definition={"match": "(.*)"},
-        name="TestDictConverter",
-        converter_registry=converter_registry)
-
-    children = dictconverter.create_children(None, children[0])
     assert len(children) == 8
     assert children[0].__class__ == DictTextElement
     assert children[0].name == "name"