From f4b751e222938bd0cd98f652c9e251c3cf369639 Mon Sep 17 00:00:00 2001
From: Alexander Schlemmer <alexander@mail-schlemmer.de>
Date: Fri, 19 Nov 2021 11:54:59 +0100
Subject: [PATCH] ENH: sketch of the identifiable adapters

---
 src/newcrawler/converters.py            |   2 +-
 src/newcrawler/crawl.py                 |  37 ++++++--
 src/newcrawler/identifiable_adapters.py | 114 ++++++++++++++++++++++++
 tests/scifolder_cfood.yml               |  34 ++++---
 4 files changed, 165 insertions(+), 22 deletions(-)
 create mode 100644 src/newcrawler/identifiable_adapters.py

diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py
index ab3bd412..1701f1a0 100644
--- a/src/newcrawler/converters.py
+++ b/src/newcrawler/converters.py
@@ -192,6 +192,7 @@ class Converter(object):
         """
         m = self.match(element)
         if m is None:
+            # this should never happen as the condition was checked before already
             raise RuntimeError("Condition does not match.")
         values.update(m)
 
@@ -210,7 +211,6 @@ class Converter(object):
         for name, record in self.definition["records"].items():
             # whether the record already exists in the store or not are actually really
             # different distinct cases for treating the setting and updating of variables:
-            print(name)
             if name not in records:
                 c_record = db.Record()
                 # add the new record to the record store:
diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
index 3fd7ab18..2f9eba4d 100644
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -68,6 +68,7 @@ import caosdb as db
 from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, File
 from .converters import Converter, DirectoryConverter
+from .identifiable_adapters import TestingIdentifiableAdapter
 
 
 class Crawler(object):
@@ -98,8 +99,7 @@ class Crawler(object):
         self.generalStore = generalStore
         self.recordStore = RecordStore()
 
-        self.insertList: db.Record = []
-        self.updateList: db.Record = []
+        self.identifiableAdapter = TestingIdentifiableAdapter()
 
         self.debug = debug
         if self.debug:
@@ -135,17 +135,37 @@ class Crawler(object):
         # This function builds the tree of converters out of the crawler definition.
         
         for key, value in crawler_definition.items():
+            if key == "Definitions":
+                continue
             self.converters.append(Converter.converter_factory(
                 value, key))
 
         if not isinstance(item, Directory):
             raise NotImplementedError("Currently only directories are supported as items.")
 
-        self.crawl(DirectoryConverter.create_children_from_directory(item),
-                   self.converters, [], self.generalStore, self.recordStore)
+        # This recursive crawling procedure generates the update list:
+        updateList = self._crawl(DirectoryConverter.create_children_from_directory(item),
+                                 self.converters, [], self.generalStore, self.recordStore)
 
+        # After the crawling, the actual synchronization with the database, based on the
+        # update list, is carried out:
+        self._synchronize(updateList)
 
-    def crawl(self, items: list[StructureElement],
+    def _synchronize(self, updateList):
+        """
+        This function applies several stages:
+        1) Retrieve identifiables for all records in updateList.
+        2) Compare updateList with existing records.
+        3) Insert and update records based on the set of identified differences.
+
+        This function makes use of an IdentifiableAdapter which is used to
+        register and retrieve identifiables.
+        """
+
+        pass
+
+
+    def _crawl(self, items: list[StructureElement],
               global_converters: list[Converter],
               local_converters: list[Converter],
               generalStore: GeneralStore,
@@ -180,18 +200,21 @@ class Crawler(object):
 
                     children = converter.create_children(generalStore_copy, element)
                     if self.debug:
+                        # add provenance information for each variable
                         self.debug_tree[str(element)] = (
                             generalStore_copy.get_storage(), recordStore_copy.get_storage())
                         self.debug_copied[str(element)] = (
                             generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
 
-                    self.crawl(children, global_converters, converter.converters,
+                    self._crawl(children, global_converters, converter.converters,
                                generalStore_copy, recordStore_copy)
         # if the crawler is running out of scope, copy all records in the recordStore, that were created in this scope
         # to the general update container.
+        updateList = []
         scoped_records = recordStore.get_records_current_scope()
         for record in scoped_records:
-            self.updateList.append(record)
+            updateList.append(record)
+        return updateList
 
 
 def main(*args):
diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py
new file mode 100644
index 00000000..3251b585
--- /dev/null
+++ b/src/newcrawler/identifiable_adapters.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2021 Henrik tom Wörden
+#               2021 Alexander Schlemmer
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+import caosdb as db
+from abc import abstractmethod
+
+class IdentifiableAdapter(object):
+    """
+    Base class for identifiable adapters.
+
+    Some terms:
+    - Registered identifiable is the definition of an identifiable which is:
+      - A record type as the parent
+      - A list of properties
+      - A list of referenced by statements
+
+    - Identifiable is the concrete identifiable, e.g. the Record based on
+      the registered identifiable with all the values filled in.
+
+    - Identified record is the result of retrieving a record based on the identifiable
+      from the database.
+    """
+
+    @abstractmethod
+    def get_registered_identifiable(self, record: db.Record):
+        """
+        Check whether an identifiable is registered for this record and return its definition.
+        If there is no identifiable registered, return None.
+        """
+        pass
+
+    
+    def get_identifiable(self, record: db.Record):
+        registered_identifiable = self.get_registered_identifiable(record)
+
+        if registered_identifiable is None:
+            return None
+
+        identifiable = db.Record()
+        if len(registered_identifiable.parents) != 1:
+            raise RuntimeError("Multiple parents for identifiables not supported.")
+        identifiable.add_parent(registered_identifiable.parents[0])
+        property_name_list_A = []
+        property_name_list_B = []
+
+        # fill the values:
+        for prop in registered_identifiable.properties:
+            # problem: what happens with multi properties?
+            # case A: in the registered identifiable
+            # case B: in the identifiable
+
+            identifiable.add_property(record.get_property(prop.name))
+            property_name_list_A.append(prop.name)
+                
+        # check for multi properties in the record:
+        for prop in property_name_list_A:
+            property_name_list_B.append(prop)
+        if (len(set(property_name_list_B)) != len(property_name_list_B) or
+            len(set(property_name_list_A)) != len(property_name_list_A)):
+            raise RuntimeError("Multi properties used in identifiables can cause unpredictable results.")
+
+        return identifiable
+
+    @abstractmethod
+    def retrieve_identified_record(self, identifiable: db.Record):
+        """
+        Retrieve the identified record for a given identifiable.
+
+        This function will return None if there is either no identifiable registered
+        or no corresponding identified record in the database for a given record.
+        """
+        pass
+
+    def retrieve_identifiable(self, record: db.Record):
+        identifiable = self.get_identifiable(record)
+
+        if identifiable is None:
+            return None
+
+        identified_record = self.retrieve_identified_record(identifiable)
+        return identified_record
+
+
+
+
+class TestingIdentifiableAdapter(IdentifiableAdapter):
+    """
+    Identifiable adapter which can be used for unit tests.
+    """
+
+    def get_identifiable(self, record: db.Record):
+        registered_identifiable = self.get_identifiable
diff --git a/tests/scifolder_cfood.yml b/tests/scifolder_cfood.yml
index c982be5b..dbcb40bc 100644
--- a/tests/scifolder_cfood.yml
+++ b/tests/scifolder_cfood.yml
@@ -1,3 +1,12 @@
+Definitions:
+  type: Definitions
+  #include "description.yml"
+
+# Converter-Provenance
+# DataAnalysis/project_dir/measurement/match/identifier
+# Structure-Element-Provenance
+# DataAnalysis/2020_SpeedOflight/2020-11-10_kram
+
 DataAnalysis:  # name of the converter
   type: Directory
   match: DataAnalysis
@@ -24,6 +33,8 @@ DataAnalysis:  # name of the converter
           subtree:
             README:
               type: MarkdownFile  # this is a subclass of converter File
+              # function signature: GeneralStore, StructureElement
+              # preprocessors: custom.caosdb.convert_values
               match: README\.md
               # how to make match case insensitive?
               subtree:
@@ -37,13 +48,16 @@ DataAnalysis:  # name of the converter
                 responsible_single:
                     type: DictTextElement
                     match_name: responsible
-                    match_value: ((?P<first_name>.+) )?(?P<last_name>.+)
-                    records:
-                      Person:
+                    match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+)
+                    records: &responsible_records
+                      Person: 
                         first_name: $first_name
                         last_name: $last_name
                       Measurement:  # this uses the reference to the above defined record
-                        responsible: +$Person
+                        responsible: +$Person    # each record also implicitly creates a variable
+                                                 # with the same name. The "+" indicates, that
+                                                 # this will become a list entry in list property
+                                                 # "responsible" belonging to Measurement.
 
                 responsible_list:
                   type: DictListElement
@@ -51,16 +65,8 @@ DataAnalysis:  # name of the converter
                   subtree:
                     Person:
                       type: TextElement
-                      match: ((?P<first_name>.+) )?(?P<last_name>.+)
-                      records:
-                        Person:
-                          first_name: $first_name
-                          last_name: $last_name
-                        Measurement:  # this uses the reference to the above defined record
-                          responsible: +$Person  # each record also implicitely creates a variable
-                                                 # with the same name. The "+" indicates, that
-                                                 # this will become a list entry in list property
-                                                 # "responsible" belonging to Measurement.
+                      match: *person_regexp
+                      records: *responsible_records
 
 ExperimentalData:  # name of the converter
   type: Directory
-- 
GitLab