From 4c0d720d639e58eaa94f6810925bc181aa192402 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org>
Date: Wed, 13 Apr 2022 18:05:07 +0200
Subject: [PATCH] ENH: add dict converter

---
 src/newcrawler/cfood-schema.yml      |  1 +
 src/newcrawler/converters.py         | 64 +++++++++++++++++++++-------
 src/newcrawler/crawl.py              | 21 ++++++---
 src/newcrawler/structure_elements.py | 10 +++++
 unittests/test_converters.py         | 17 ++++++--
 5 files changed, 87 insertions(+), 26 deletions(-)

diff --git a/src/newcrawler/cfood-schema.yml b/src/newcrawler/cfood-schema.yml
index 61f39440..c990dc5c 100644
--- a/src/newcrawler/cfood-schema.yml
+++ b/src/newcrawler/cfood-schema.yml
@@ -16,6 +16,7 @@ cfood:
           - MarkdownFile
           - DictListElement
           - Definitions
+          - Dict
           - JSONFile
           description: Type of this converter node.
         match:
diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py
index 75d08198..f7a49166 100644
--- a/src/newcrawler/converters.py
+++ b/src/newcrawler/converters.py
@@ -29,7 +29,7 @@ import caosdb as db
 import json
 from .utils import has_parent
 from .stores import GeneralStore, RecordStore
-from .structure_elements import (StructureElement, Directory, File,
+from .structure_elements import (StructureElement, Directory, File, Dict, JSONFile,
                                  TextElement, DictTextElement, DictElement, DictListElement)
 from typing import Optional, Union
 from abc import abstractmethod
@@ -369,11 +369,12 @@ class MarkdownFileConverter(Converter):
 
 
 class JSONFileConverter(Converter):
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, File)
+    @staticmethod
+    def typecheck(element: StructureElement):
+        return isinstance(element, JSONFile)
 
     def match(self, element: StructureElement):
-        if not self.typecheck(element):
+        if not JSONFileConverter.typecheck(element):
             # TODO(salexan) Should we be more precise than just raising runtime
             # errors here?
             raise RuntimeError("Element must be a file")
@@ -386,27 +387,60 @@ class JSONFileConverter(Converter):
             pass
         return m.groupdict()
 
-    def create_children(self, generalStore: GeneralStore, element: StructureElement):
-        if not self.typecheck(element):
+    @staticmethod
+    def create_children(generalStore: GeneralStore, element: StructureElement):
+        if not JSONFileConverter.typecheck(element):
             raise RuntimeError("A JSON file is needed to create children")
         with open(element.path, 'r') as json_file:
-            json_dict = json.load(json_file)
+            json_data = json.load(json_file)
+        if not isinstance(json_data, dict):
+            raise NotImplementedError("JSON file must contain a dict")
+
+        children = []
+        return [Dict(name="json_dict", value=json_data)]
+
+
+class DictConverter(Converter):
+    # TODO use Dict as typecheck?
+    @staticmethod
+    def create_children(generalStore: GeneralStore, element: StructureElement):
+        if not DictConverter.typecheck(element):
+            raise RuntimeError("A dict is needed to create children")
+
         children = []
 
-        for name, entry in json_dict.items():
+        for name, value in element.value.items():
             # TODO(fspreck): Treat similar to yaml header and introduce more
             # DictXXXElements for numbers, booleans, enums, ...
-            if type(entry) == list:
-                children.append(DictListElement(name, entry))
-            elif type(entry) == str:
-                children.append(DictTextElement(name, entry))
+            if type(value) == list:
+                children.append(DictListElement(name, value))
+            elif type(value) == str:
+                children.append(DictTextElement(name, value))
             else:
-                children.append(DictElement(name, entry))
-                print(f"JSON entry {name} has incompatible type.")
-                # raise RuntimeError(f"JSON entry {name} has incompatible type.")
+                children.append(DictElement(name, value))
+                print(f"JSON value {name} has incompatible type.")
+                # raise RuntimeError(f"JSON value {name} has incompatible type.")
 
         return children
 
+    @staticmethod
+    def typecheck(element: StructureElement):
+        return isinstance(element, Dict)
+
+    # TODO use Dict as typecheck?
+    def match(self, element: StructureElement):
+        """
+        Match the given structure element.
+
+        A Dict element always matches; no regular expression is evaluated.
+
+        Return an empty dictionary, since no variables are extracted.
+        Raise a RuntimeError if the element is not a Dict.
+        """
+        if not isinstance(element, Dict):
+            raise RuntimeError("Element must be a DictElement.")
+        return {}
+
 
 class DictTextElementConverter(Converter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
index f386df4b..b1418673 100644
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -187,7 +187,7 @@ class Crawler(object):
         #       from the crawler definition and add them to the yaml schema that will be
         #       tested in the next lines of code:
 
-        # Load and validate the cfood schema:
+        # Load the cfood schema:
         with open(files('newcrawler').joinpath('cfood-schema.yml'), "r") as f:
             schema = yaml.safe_load(f)
 
@@ -197,6 +197,7 @@ class Crawler(object):
                 schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
                     key)
 
+        # Validate the crawler definition against the cfood schema:
         validate(instance=crawler_definition, schema=schema["cfood"])
 
         return crawler_definition
@@ -225,6 +226,12 @@ class Crawler(object):
             "MarkdownFile": {
                 "converter": "MarkdownFileConverter",
                 "package": "newcrawler.converters"},
+            "JSONFile": {
+                "converter": "JSONFileConverter",
+                "package": "newcrawler.converters"},
+            "Dict": {
+                "converter": "DictConverter",
+                "package": "newcrawler.converters"},
             "DictTextElement": {
                 "converter": "DictTextElementConverter",
                 "package": "newcrawler.converters"},
@@ -247,7 +254,6 @@ class Crawler(object):
         # Load modules and associate classes:
         for key, value in converter_registry.items():
             module = importlib.import_module(value["package"])
-            print(value)
             value["class"] = getattr(module, value["converter"])
         return converter_registry
 
@@ -288,7 +294,8 @@ class Crawler(object):
 
     def start_crawling(self, item: StructureElement,
                        crawler_definition: dict,
-                       converter_registry: dict):
+                       converter_registry: dict,
+                       converter_class=DirectoryConverter):
         """
         Start point of the crawler recursion.
 
@@ -302,17 +309,17 @@ class Crawler(object):
 
         # This function builds the tree of converters out of the crawler definition.
 
-        if not isinstance(item, Directory):
-            raise NotImplementedError("Currently only directories are supported as items.")
-
         if self.generalStore is None:
             raise RuntimeError("Should not happen.")
 
+        if converter_class is None:
+            converter_class = DirectoryConverter
+
         local_converters = Crawler.create_local_converters(crawler_definition,
                                                            converter_registry)
         # This recursive crawling procedure generates the update list:
         self.updateList: list[db.Record] = []
-        self._crawl(DirectoryConverter.create_children_from_directory(item),
+        self._crawl([item],
                     self.global_converters, local_converters, self.generalStore, self.recordStore,
                     [], [])
 
diff --git a/src/newcrawler/structure_elements.py b/src/newcrawler/structure_elements.py
index 4a5dfaad..61a519f2 100644
--- a/src/newcrawler/structure_elements.py
+++ b/src/newcrawler/structure_elements.py
@@ -60,6 +60,16 @@ class File(FileSystemStructureElement):
     pass
 
 
+class JSONFile(File):
+    pass
+
+
+class Dict(StructureElement):
+    def __init__(self, name: str, value: dict):
+        super().__init__(name)
+        self.value = value
+
+
 class DictElement(StructureElement):
     def __init__(self, name: str, value: str):
         super().__init__(name)
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 37ee92df..278a1f2e 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -29,7 +29,7 @@ test the converters module
 
 from newcrawler.converters import Converter
 from newcrawler.stores import GeneralStore
-from newcrawler.converters import MarkdownFileConverter, JSONFileConverter
+from newcrawler.converters import MarkdownFileConverter, JSONFileConverter, DictConverter
 from newcrawler.structure_elements import Directory
 from newcrawler.structure_elements import File, DictTextElement, DictListElement, DictElement
 
@@ -48,6 +48,9 @@ def converter_registry():
         "MarkdownFile": {
             "converter": "MarkdownFileConverter",
             "package": "newcrawler.converters"},
+        "Dict": {
+            "converter": "DictConverter",
+            "package": "newcrawler.converters"},
         "DictTextElement": {
             "converter": "DictTextElementConverter",
             "package": "newcrawler.converters"},
@@ -168,16 +171,22 @@ def test_json_converter(converter_registry):
     test_json = File("testjson.json", rfp(
         "test_directories", "single_file_test_data", "testjson.json"))
 
-    converter = JSONFileConverter(
+    jsonconverter = JSONFileConverter(
         definition={"match": "(.*)"},
         name="TestJSONFileConverter",
         converter_registry=converter_registry)
 
-    m = converter.match(test_json)
+    m = jsonconverter.match(test_json)
     assert m is not None
     assert len(m) == 0
 
-    children = converter.create_children(None, test_json)
+    children = jsonconverter.create_children(None, test_json)
+    dictconverter = DictConverter(
+        definition={"match": "(.*)"},
+        name="TestDictConverter",
+        converter_registry=converter_registry)
+
+    children = dictconverter.create_children(None, children[0])
     assert len(children) == 8
     assert children[0].__class__ == DictTextElement
     assert children[0].name == "name"
-- 
GitLab