From f4b751e222938bd0cd98f652c9e251c3cf369639 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Fri, 19 Nov 2021 11:54:59 +0100 Subject: [PATCH] ENH: sketch of the identifiable adapters --- src/newcrawler/converters.py | 2 +- src/newcrawler/crawl.py | 37 ++++++-- src/newcrawler/identifiable_adapters.py | 114 ++++++++++++++++++++++++ tests/scifolder_cfood.yml | 34 ++++--- 4 files changed, 165 insertions(+), 22 deletions(-) create mode 100644 src/newcrawler/identifiable_adapters.py diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index ab3bd412..1701f1a0 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -192,6 +192,7 @@ class Converter(object): """ m = self.match(element) if m is None: + # this should never happen as the condition was checked before already raise RuntimeError("Condition does not match.") values.update(m) @@ -210,7 +211,6 @@ class Converter(object): for name, record in self.definition["records"].items(): # whether the record already exists in the store or not are actually really # different distinct cases for treating the setting and updating of variables: - print(name) if name not in records: c_record = db.Record() # add the new record to the record store: diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 3fd7ab18..2f9eba4d 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -68,6 +68,7 @@ import caosdb as db from .stores import GeneralStore, RecordStore from .structure_elements import StructureElement, Directory, File from .converters import Converter, DirectoryConverter +from .identifiable_adapters import TestingIdentifiableAdapter class Crawler(object): @@ -98,8 +99,7 @@ class Crawler(object): self.generalStore = generalStore self.recordStore = RecordStore() - self.insertList: db.Record = [] - self.updateList: db.Record = [] + self.identifiableAdapter = TestingIdentifiableAdapter() self.debug = debug if self.debug: @@ 
-135,17 +135,37 @@ class Crawler(object): # This function builds the tree of converters out of the crawler definition. for key, value in crawler_definition.items(): + if key == "Definitions": + continue self.converters.append(Converter.converter_factory( value, key)) if not isinstance(item, Directory): raise NotImplementedError("Currently only directories are supported as items.") - self.crawl(DirectoryConverter.create_children_from_directory(item), - self.converters, [], self.generalStore, self.recordStore) + # This recursive crawling procedure generates the update list: + updateList = self._crawl(DirectoryConverter.create_children_from_directory(item), + self.converters, [], self.generalStore, self.recordStore) + # After the crawling, the actual synchronization with the database, based on the + # update list is carried out: + self._synchronize(updateList) - def crawl(self, items: list[StructureElement], + def _synchronize(self, updateList): + """ + This function applies several stages: + 1) Retrieve identifiables for all records in updateList. + 2) Compare updateList with existing records. + 3) Insert and update records based on the set of identified differences. + + This function makes use of an IdentifiableAdapter which is used to + register and retrieve identifiables. 
+ """ + + pass + + + def _crawl(self, items: list[StructureElement], global_converters: list[Converter], local_converters: list[Converter], generalStore: GeneralStore, @@ -180,18 +200,21 @@ class Crawler(object): children = converter.create_children(generalStore_copy, element) if self.debug: + # add provenance information for each variable self.debug_tree[str(element)] = ( generalStore_copy.get_storage(), recordStore_copy.get_storage()) self.debug_copied[str(element)] = ( generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied()) - self.crawl(children, global_converters, converter.converters, + self._crawl(children, global_converters, converter.converters, generalStore_copy, recordStore_copy) # if the crawler is running out of scope, copy all records in the recordStore, that were created in this scope # to the general update container. + updateList = [] scoped_records = recordStore.get_records_current_scope() for record in scoped_records: - self.updateList.append(record) + updateList.append(record) + return updateList def main(*args): diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py new file mode 100644 index 00000000..3251b585 --- /dev/null +++ b/src/newcrawler/identifiable_adapters.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +import caosdb as db +from abc import abstractmethod + +class IdentifiableAdapter(object): + """ + Base class for identifiable adapters. + + Some terms: + - Registered identifiable is the definition of an identifiable which is: + - A record type as the parent + - A list of properties + - A list of referenced by statements + + - Identifiable is the concrete identifiable, e.g. the Record based on + the registered identifiable with all the values filled in. + + - Identified record is the result of retrieving a record based on the identifiable + from the database. + """ + + @abstractmethod + def get_registered_identifiable(self, record: db.Record): + """ + Check whether an identifiable is registered for this record and return its definition. + If there is no identifiable registered, return None. + """ + pass + + + def get_identifiable(self, record: db.Record): + registered_identifiable = self.get_registered_identifiable(record) + + if registered_identifiable is None: + return None + + identifiable = db.Record() + if len(registered_identifiable.parents) != 1: + raise RuntimeError("Multiple parents for identifiables not supported.") + identifiable.add_parent(registered_identifiable.parents[0]) + property_name_list_A = [] + property_name_list_B = [] + + # fill the values: + for prop in registered_identifiable.properties: + # problem: what happens with multi properties? 
+ # case A: in the registered identifiable + # case B: in the identifiable + + identifiable.add_property(record.get_property(prop.name)) + property_name_list_A.append(prop.name) + + # check for multi properties in the record: + for prop in property_name_list_A: + property_name_list_B.append(prop) + if (len(set(property_name_list_B)) != len(property_name_list_B) or + len(set(property_name_list_A)) != len(property_name_list_A)): + raise RuntimeError("Multi properties used in identifiables can cause unpredictable results.") + + return identifiable + + @abstractmethod + def retrieve_identified_record(self, identifiable: db.Record): + """ + Retrieve the identified record for a given identifiable. + + This function will return None if there is either no identifiable registered + or no corresponding identified record in the database for a given record. + """ + pass + + def retrieve_identifiable(self, record: db.Record): + identifiable = self.get_identifiable(record) + + if identifiable is None: + return None + + identified_record = self.retrieve_identified_record(identifiable) + return identified_record + + + + +class TestingIdentifiableAdapter(IdentifiableAdapter): + """ + Identifiable adapter which can be used for unit tests. 
+ """ + + def get_identifiable(self, record: db.Record): + registered_identifiable = self.get_identifiable diff --git a/tests/scifolder_cfood.yml b/tests/scifolder_cfood.yml index c982be5b..dbcb40bc 100644 --- a/tests/scifolder_cfood.yml +++ b/tests/scifolder_cfood.yml @@ -1,3 +1,12 @@ +Definitions: + type: Definitions + #include "description.yml" + +# Converter-Provenance +# DataAnalysis/project_dir/measurement/match/identifier +# Structure-Element-Provenance +# DataAnalysis/2020_SpeedOflight/2020-11-10_kram + DataAnalysis: # name of the converter type: Directory match: DataAnalysis @@ -24,6 +33,8 @@ DataAnalysis: # name of the converter subtree: README: type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values match: README\.md # how to make match case insensitive? subtree: @@ -37,13 +48,16 @@ DataAnalysis: # name of the converter responsible_single: type: DictTextElement match_name: responsible - match_value: ((?P<first_name>.+) )?(?P<last_name>.+) - records: - Person: + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: first_name: $first_name last_name: $last_name Measurement: # this uses the reference to the above defined record - responsible: +$Person + responsible: +$Person # each record also implicitly creates a variable + # with the same name. The "+" indicates that + # this will become a list entry in list property + # "responsible" belonging to Measurement. responsible_list: type: DictListElement @@ -51,16 +65,8 @@ DataAnalysis: # name of the converter subtree: Person: type: TextElement - match: ((?P<first_name>.+) )?(?P<last_name>.+) - records: - Person: - first_name: $first_name - last_name: $last_name - Measurement: # this uses the reference to the above defined record - responsible: +$Person # each record also implicitely creates a variable - # with the same name. 
The "+" indicates, that - # this will become a list entry in list property - # "responsible" belonging to Measurement. + match: *person_regexp + records: *responsible_records ExperimentalData: # name of the converter type: Directory -- GitLab