From 4c0d720d639e58eaa94f6810925bc181aa192402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org> Date: Wed, 13 Apr 2022 18:05:07 +0200 Subject: [PATCH] ENH: add dict converter --- src/newcrawler/cfood-schema.yml | 1 + src/newcrawler/converters.py | 64 +++++++++++++++++++++------- src/newcrawler/crawl.py | 21 ++++++--- src/newcrawler/structure_elements.py | 10 +++++ unittests/test_converters.py | 17 ++++++-- 5 files changed, 87 insertions(+), 26 deletions(-) diff --git a/src/newcrawler/cfood-schema.yml b/src/newcrawler/cfood-schema.yml index 61f39440..c990dc5c 100644 --- a/src/newcrawler/cfood-schema.yml +++ b/src/newcrawler/cfood-schema.yml @@ -16,6 +16,7 @@ cfood: - MarkdownFile - DictListElement - Definitions + - Dict - JSONFile description: Type of this converter node. match: diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index 75d08198..f7a49166 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -29,7 +29,7 @@ import caosdb as db import json from .utils import has_parent from .stores import GeneralStore, RecordStore -from .structure_elements import (StructureElement, Directory, File, +from .structure_elements import (StructureElement, Directory, File, Dict, JSONFile, TextElement, DictTextElement, DictElement, DictListElement) from typing import Optional, Union from abc import abstractmethod @@ -369,11 +369,12 @@ class MarkdownFileConverter(Converter): class JSONFileConverter(Converter): - def typecheck(self, element: StructureElement): - return isinstance(element, File) + @staticmethod + def typecheck(element: StructureElement): + return isinstance(element, JSONFile) def match(self, element: StructureElement): - if not self.typecheck(element): + if not JSONFileConverter.typecheck(element): # TODO(salexan) Should we be more precise than just raising runtime # errors here? raise RuntimeError("Element must be a file") @@ -386,27 +387,60 @@ class JSONFileConverter(Converter): pass return m.groupdict() - def create_children(self, generalStore: GeneralStore, element: StructureElement): - if not self.typecheck(element): + @staticmethod + def create_children(generalStore: GeneralStore, element: StructureElement): + if not JSONFileConverter.typecheck(element): raise RuntimeError("A JSON file is needed to create children") with open(element.path, 'r') as json_file: - json_dict = json.load(json_file) + json_data = json.load(json_file) + if not isinstance(json_data, dict): + raise NotImplementedError("JSON file must contain a dict") + + children = [] + return [Dict(name="json_dict", value=json_data)] + + +class DictConverter(Converter): + # TODO use Dict as typecheck? + @staticmethod + def create_children(generalStore: GeneralStore, element: StructureElement): + if not DictConverter.typecheck(element): + raise RuntimeError("A dict is needed to create children") + children = [] - for name, entry in json_dict.items(): + for name, value in element.value.items(): # TODO(fspreck): Treat similar to yaml header and introduce more # DictXXXElements for numbers, booleans, enums, ... - if type(entry) == list: - children.append(DictListElement(name, entry)) - elif type(entry) == str: - children.append(DictTextElement(name, entry)) + if type(value) == list: + children.append(DictListElement(name, value)) + elif type(value) == str: + children.append(DictTextElement(name, value)) else: - children.append(DictElement(name, entry)) - print(f"JSON entry {name} has incompatible type.") - # raise RuntimeError(f"JSON entry {name} has incompatible type.") + children.append(DictElement(name, value)) + print(f"JSON value {name} has incompatible type.") + # raise RuntimeError(f"JSON value {name} has incompatible type.") return children + @staticmethod + def typecheck(element: StructureElement): + return isinstance(element, Dict) + + # TODO use Dict as typecheck? + def match(self, element: StructureElement): + """ + Try to match the given structure element. + + If it does not match, return None. + + Else return a dictionary containing the variables from the matched regexp + as key value pairs. + """ + if not isinstance(element, Dict): + raise RuntimeError("Element must be a DictElement.") + return {} + class DictTextElementConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index f386df4b..b1418673 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -187,7 +187,7 @@ class Crawler(object): # from the crawler definition and add them to the yaml schema that will be # tested in the next lines of code: - # Load and validate the cfood schema: + # Load the cfood schema: with open(files('newcrawler').joinpath('cfood-schema.yml'), "r") as f: schema = yaml.safe_load(f) @@ -197,6 +197,7 @@ class Crawler(object): schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( key) + # Validate the cfood schema: validate(instance=crawler_definition, schema=schema["cfood"]) return crawler_definition @@ -225,6 +226,12 @@ class Crawler(object): "MarkdownFile": { "converter": "MarkdownFileConverter", "package": "newcrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "newcrawler.converters"}, + "Dict": { + "converter": "DictConverter", + "package": "newcrawler.converters"}, "DictTextElement": { "converter": "DictTextElementConverter", "package": "newcrawler.converters"}, @@ -247,7 +254,6 @@ class Crawler(object): # Load modules and associate classes: for key, value in converter_registry.items(): module = importlib.import_module(value["package"]) - print(value) value["class"] = getattr(module, value["converter"]) return converter_registry @@ -288,7 +294,8 @@ class Crawler(object): def start_crawling(self, item: StructureElement, crawler_definition: dict, - converter_registry: dict): + converter_registry: dict, + converter_class=DirectoryConverter): """ Start point of the crawler recursion. @@ -302,17 +309,17 @@ class Crawler(object): # This function builds the tree of converters out of the crawler definition. - if not isinstance(item, Directory): - raise NotImplementedError("Currently only directories are supported as items.") - if self.generalStore is None: raise RuntimeError("Should not happen.") + if converter_class is None: + converter_class = DirectoryConverter + local_converters = Crawler.create_local_converters(crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: self.updateList: list[db.Record] = [] - self._crawl(DirectoryConverter.create_children_from_directory(item), + self._crawl([item], self.global_converters, local_converters, self.generalStore, self.recordStore, [], []) diff --git a/src/newcrawler/structure_elements.py b/src/newcrawler/structure_elements.py index 4a5dfaad..61a519f2 100644 --- a/src/newcrawler/structure_elements.py +++ b/src/newcrawler/structure_elements.py @@ -60,6 +60,16 @@ class File(FileSystemStructureElement): pass +class JSONFile(File): + pass + + +class Dict(StructureElement): + def __init__(self, name: str, value: dict): + super().__init__(name) + self.value = value + + class DictElement(StructureElement): def __init__(self, name: str, value: str): super().__init__(name) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 37ee92df..278a1f2e 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -29,7 +29,7 @@ test the converters module from newcrawler.converters import Converter from newcrawler.stores import GeneralStore -from newcrawler.converters import MarkdownFileConverter, JSONFileConverter +from newcrawler.converters import MarkdownFileConverter, JSONFileConverter, DictConverter from newcrawler.structure_elements import Directory from newcrawler.structure_elements import File, DictTextElement, DictListElement, DictElement @@ -48,6 +48,9 @@ def converter_registry(): "MarkdownFile": { "converter": "MarkdownFileConverter", "package": "newcrawler.converters"}, + "Dict": { + "converter": "DictConverter", + "package": "newcrawler.converters"}, "DictTextElement": { "converter": "DictTextElementConverter", "package": "newcrawler.converters"}, @@ -168,16 +171,22 @@ def test_json_converter(converter_registry): test_json = File("testjson.json", rfp( "test_directories", "single_file_test_data", "testjson.json")) - converter = JSONFileConverter( + jsonconverter = JSONFileConverter( definition={"match": "(.*)"}, name="TestJSONFileConverter", converter_registry=converter_registry) - m = converter.match(test_json) + m = jsonconverter.match(test_json) assert m is not None assert len(m) == 0 - children = converter.create_children(None, test_json) + children = jsonconverter.create_children(None, test_json) + dictconverter = DictConverter( + definition={"match": "(.*)"}, + name="TestDictConverter", + converter_registry=converter_registry) + + children = dictconverter.create_children(None, children[0]) assert len(children) == 8 assert children[0].__class__ == DictTextElement assert children[0].name == "name" -- GitLab