diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py new file mode 100644 index 0000000000000000000000000000000000000000..d251707cd4a91834007e04f11fcc66e337828be6 --- /dev/null +++ b/src/newcrawler/converters.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +import os +import re +import caosdb as db +from .stores import GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, File +from typing import Type +from abc import abstractmethod + +class Converter(object): + """ + Converters treat StructureElements contained in the hierarchical sturcture. + + A converter is defined via a yml file or part of it. The definition states + what kind of StructureElement it treats (typically one). + Also, it defines how children of the current StructureElement are + created and what Converters shall be used to treat those. 
+ + The yaml definition looks like the following: + + converter-name: + type: <StructureElement Type> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + <...> + Experiment2: + parents: + - Experiment + valuegenerators: + datepattern: + <...> + childrengenerators: + create_children_from_directory: + sort-by-date: true + subtree: + + + records: + Measurement: <- automatically becomes a value in the valueStore + run_number: 25 + Experiment1: + Measurement: +Measurement <- Element in List (list is cleared before run) + *Measurement <- Multi Property (properties are removed before run) + Measurement <- Overwrite + + The UPDATE stage checks whether, e.g., two lists are equal (which can be defined as + all elements being present, but not necessarily in the same order). + We may not need this, because the crawler is already deterministic. + + The converter-name is a description of what it represents (e.g. + 'experiment-folder') and is used as an identifier. + + The type restricts what kind of StructureElements are treated. + The match is by default a regular expression that is matched against the + name of StructureElements. Discussion: StructureElements might not have a + name (e.g. a dict) or should a name be created artificially if necessary + (e.g. "root-dict")? It might make sense to allow keywords like "always" and + other kinds of checks. For example a dictionary could be checked against a + json-schema definition. + + recordtypes is a list of definitions that define the semantic structure + (see details below). + + valuegenerators allow providing additional functionality that creates + data values in addition to the ones given by default via the + StructureElement. This can be for example a match group of a regular + expression applied to the filename. + It should be possible to access the values of parent nodes. 
For example, + the name of a parent node could be accessed with $converter-name.name. + Discussion: This can introduce conflicts, if the key <converver-name> + already exists. An alternative would be to identify those lookups. E.g. + $$converter-name.name (2x$). + + childrengenerators denotes how StructureElements shall be created that are + children of the current one. + + subtree contains a list of Converter defnitions that look like the one + described here. + + those keywords should be allowed but not required. I.e. if no + valuegenerators shall be defined, the keyword may be omitted. + """ + + def __init__(self, definition: dict, + name: str): + self.definition = definition + self.name = name + + self.converters = [] + + if "subtree" in definition: + for converter_name in definition['subtree']: + converter_definition = definition["subtree"][converter_name] + self.converters.append(Converter.converter_factory( + converter_definition, converter_name)) + + @staticmethod + def converter_factory(definition: dict, + name: str): + # The following dict is a mapping from types (as written in the yaml file) + # to classes implementing the specific converters: + converter_registry: dict[str, Type[Converter]] = { + "Directory": DirectoryConverter, + "MarkdownFile": MarkdownFileConverter, + "DictTextElement": DictTextElementConverter, + "DictListElement": DictListElementConverter, + "TextElement": TextElementConverter + } + + if "type" not in definition: + raise RuntimeError("Type is mandatory for converter entries in CFood definition.") + + if definition["type"] not in converter_registry: + raise RuntimeError("Unknown Type: {}".format(definition["type"])) + + converter = converter_registry[definition["type"]](definition, name) + return converter + + @abstractmethod + def create_values(self, + values: GeneralStore, + element: StructureElement): + """ + Extract information from the structure element and store them as values in the + general store. 
+ + values: The GeneralStore to store values in. + element: The StructureElement to extract values from. + """ + pass + + @abstractmethod + def create_children(self, values: GeneralStore, + element: StructureElement): + pass + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement): + + if "records" not in self.definition: + return + + for name, record in self.definition["records"].items(): + if name not in records.storage: + c_record = db.Record() + records.storage[name] = c_record + c_record = records.storage[name] + values.storage[name] = c_record + + if "parents" in record: + for parent in record["parents"]: + c_record.add_parent(name=parent) + else: + c_record.add_parent(name=name) + + for key, value in record.items(): + if key == "parents": + continue + propvalue = value + + if propvalue.startswith("$"): + propvalue = values[propvalue[1:]] + c_record.add_property(name=key, value=propvalue) + + @abstractmethod + def typecheck(self, element: StructureElement): + pass + + @abstractmethod + def match(self, element: StructureElement): + pass + +class DirectoryConverter(Converter): + + def __init__(self, definition: dict, name: str): + """ + Initialize a new directory converter. 
+ """ + super().__init__(definition, name) + + def create_values(self, + values: GeneralStore, + element: StructureElement): + if not isinstance(element, Directory): + raise RuntimeError("Element must be a directory.") + m = re.match(self.definition["match"], element.name) + if m is None: + raise RuntimeError("Condition does not match.") + values.update(m.groupdict()) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, Directory): + raise RuntimeError("Directory converters can only create children from directories.") + + return self.create_children_from_directory(element) + + def typecheck(self, element: StructureElement): + return isinstance(element, Directory) + + def match(self, element: StructureElement): + if not isinstance(element, Directory): + raise RuntimeError("Element must be a directory.") + m = re.match(self.definition["match"], element.name) + return m is not None + + @staticmethod + def create_children_from_directory(element: Directory): + """ + Creates a list of files (of type File) and directories (of type Directory) for a + given directory. No recursion. + + element: A directory (of type Directory) which will be traversed. + """ + children: list[StructureElement] = [] + + for name in os.listdir(element.path): + path = os.path.join(element.path, name) + + if os.path.isdir(path): + children.append(Directory(name, path)) + elif os.path.isfile(path): + children.append(File(name, path)) + + return children + +class MarkdownFileConverter(Converter): + pass + +class DictTextElementConverter(Converter): + pass + +class DictListElementConverter(Converter): + pass + +class TextElementConverter(Converter): + pass diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 0a3eaafa02bfd58326cfab6aadabc94e7329cff5..a20f0d880530765eb676f6c994cfecb127822f81 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -5,6 +5,7 @@ # This file is a part of the CaosDB Project. 
# # Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -58,308 +59,20 @@ example, a regular expresion could be used to get a date from a file name. """ -from typing import Type import sys import os import yaml -import re import argparse -from abc import abstractmethod from argparse import RawTextHelpFormatter import caosdb as db +from .stores import GeneralStore, RecordStore +from .structure_elements import StructureElement, Directory, File +from .converters import Converter, DirectoryConverter -class StructureElement(object): - """ base class for elements in the hierarchical data structure """ - pass - -class FileSystemStructureElement(StructureElement): - def __init__(self, name, path): - self.name = name - self.path = path - - def __str__(self): - class_name_short = str(self.__class__).replace( - "<class \'", "")[:-2] - return "{}: {}, {}".format(class_name_short, self.name, self.path) - -class Directory(FileSystemStructureElement): - pass - -class File(FileSystemStructureElement): - pass - - -class Store(object): - """ - Base class for record store and general store which act as storages for - records and values used during crawling. 
- """ - - def __init__(self): - self.storage = dict() - - def __getitem__(self, key): - if key not in self.storage: - self.storage[key] = db.Record() - - return self.storage[key] - - def update(self, other): - self.storage.update(other) - - def create_scoped_copy(self): - s_copy = Store() - s_copy.storage = dict(self.storage) - return s_copy - -class GeneralStore(Store): - def create_scoped_copy(self): - s_copy = GeneralStore() - s_copy.storage = dict(self.storage) - return s_copy - -class RecordStore(Store): - def create_scoped_copy(self): - s_copy = RecordStore() - s_copy.storage = dict(self.storage) - return s_copy - - -class Converter(object): - """ - Converters treat StructureElements contained in the hierarchical sturcture. - - A converter is defined via a yml file or part of it. The definition states - what kind of StructureElement it treats (typically one). - Also, it defines how children of the current StructureElement are - created and what Converters shall be used to treat those. - - The yaml definition looks like the following: - - converter-name: - type: <StructureElement Type> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - <...> - Experiment2: - parents: - - Experiment - valuegenerators: - datepattern: - <...> - childrengenerators: - create_children_from_directory: - sort-by-date: true - subtree: - - - records: - Measurement: <- wird automatisch ein value im valueStore - run_number: 25 - Experiment1: - Measurement: +Measurement <- Element in List (list is cleared before run) - *Measurement <- Multi Property (properties are removed before run) - Measurement <- Overwrite - - UPDATE-Stage prüft ob es z.B. Gleichheit zwischen Listen gibt (die dadurch definiert sein - kann, dass alle Elemente vorhanden, aber nicht zwingend in der richtigen Reihenfolge sind) - evtl. brauchen wir das nicht, weil crawler eh schon deterministisch ist. - - The converter-name is a description of what it represents (e.g. 
- 'experiment-folder') and is used as identifier. - - The type restricts what kind of StructureElements are treated. - The match is by default a regular expression, that is matche against the - name of StructureElements. Discussion: StructureElements might not have a - name (e.g. a dict) or should a name be created artificially if necessary - (e.g. "root-dict")? It might make sense to allow keywords like "always" and - other kinds of checks. For example a dictionary could be checked against a - json-schema definition. - - recordtypes is a list of definitions that define the semantic structure - (see details below). - - valuegenerators allow to provide additional functionality that creates - data values in addition to the ones given by default via the - StructureElement. This can be for example a match group of a regular - expression applied to the filename. - It should be possible to access the values of parent nodes. For example, - the name of a parent node could be accessed with $converter-name.name. - Discussion: This can introduce conflicts, if the key <converver-name> - already exists. An alternative would be to identify those lookups. E.g. - $$converter-name.name (2x$). - - childrengenerators denotes how StructureElements shall be created that are - children of the current one. - - subtree contains a list of Converter defnitions that look like the one - described here. - - those keywords should be allowed but not required. I.e. if no - valuegenerators shall be defined, the keyword may be omitted. 
- """ - - def __init__(self, definition: dict, - name: str): - self.definition = definition - self.name = name - - self.converters = [] - - if "subtree" in definition: - for converter_name in definition['subtree']: - converter_definition = definition["subtree"][converter_name] - self.converters.append(Converter.converter_factory( - converter_definition, converter_name)) - - @staticmethod - def converter_factory(definition: dict, - name: str): - # The following dict is a mapping from types (as written in the yaml file) - # to classes implementing the specific converters: - converter_registry: dict[str, Type[Converter]] = { - "Directory": DirectoryConverter, - "MarkdownFile": MarkdownFileConverter, - "DictTextElement": DictTextElementConverter, - "DictListElement": DictListElementConverter, - "TextElement": TextElementConverter - } - - if "type" not in definition: - raise RuntimeError("Type is mandatory for converter entries in CFood definition.") - - if definition["type"] not in converter_registry: - raise RuntimeError("Unknown Type: {}".format(definition["type"])) - - converter = converter_registry[definition["type"]](definition, name) - return converter - - @abstractmethod - def create_values(self, - values: GeneralStore, - element: StructureElement): - """ - Extract information from the structure element and store them as values in the - general store. - - values: The GeneralStore to store values in. - element: The StructureElement to extract values from. 
- """ - pass - @abstractmethod - def create_children(self, values: GeneralStore, - element: StructureElement): - pass - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement): - if "records" not in self.definition: - return - - for name, record in self.definition["records"].items(): - if name not in records.storage: - c_record = db.Record() - records.storage[name] = c_record - c_record = records.storage[name] - values.storage[name] = c_record - - if "parents" in record: - for parent in record["parents"]: - c_record.add_parent(name=parent) - else: - c_record.add_parent(name=name) - - for key, value in record.items(): - if key == "parents": - continue - propvalue = value - - if propvalue.startswith("$"): - propvalue = values[propvalue[1:]] - c_record.add_property(name=key, value=propvalue) - - @abstractmethod - def typecheck(self, element: StructureElement): - pass - - @abstractmethod - def match(self, element: StructureElement): - pass - -class DirectoryConverter(Converter): - - def __init__(self, definition: dict, name: str): - """ - Initialize a new directory converter. 
- """ - super().__init__(definition, name) - - def create_values(self, - values: GeneralStore, - element: StructureElement): - if not isinstance(element, Directory): - raise RuntimeError("Element must be a directory.") - m = re.match(self.definition["match"], element.name) - if m is None: - raise RuntimeError("Condition does not match.") - values.update(m.groupdict()) - - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - if not isinstance(element, Directory): - raise RuntimeError("Directory converters can only create children from directories.") - - return self.create_children_from_directory(element) - - def typecheck(self, element: StructureElement): - return isinstance(element, Directory) - - def match(self, element: StructureElement): - if not isinstance(element, Directory): - raise RuntimeError("Element must be a directory.") - m = re.match(self.definition["match"], element.name) - return m is not None - - @staticmethod - def create_children_from_directory(element: Directory): - """ - Creates a list of files (of type File) and directories (of type Directory) for a - given directory. No recursion. - - element: A directory (of type Directory) which will be traversed. 
- """ - children: list[StructureElement] = [] - - for name in os.listdir(element.path): - path = os.path.join(element.path, name) - - if os.path.isdir(path): - children.append(Directory(name, path)) - elif os.path.isfile(path): - children.append(File(name, path)) - - return children - -class MarkdownFileConverter(Converter): - pass - -class DictTextElementConverter(Converter): - pass - -class DictListElementConverter(Converter): - pass - -class TextElementConverter(Converter): - pass class Crawler(object): diff --git a/src/newcrawler/stores.py b/src/newcrawler/stores.py new file mode 100644 index 0000000000000000000000000000000000000000..84ceac41aeaaea2dea568a335ad570e713d7da30 --- /dev/null +++ b/src/newcrawler/stores.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +import caosdb as db + + +class Store(object): + """ + Base class for record store and general store which act as storages for + records and values used during crawling. 
+ """ + + def __init__(self): + self.storage = dict() + + def __getitem__(self, key): + if key not in self.storage: + self.storage[key] = db.Record() + + return self.storage[key] + + def update(self, other): + self.storage.update(other) + + def create_scoped_copy(self): + s_copy = Store() + s_copy.storage = dict(self.storage) + return s_copy + +class GeneralStore(Store): + def create_scoped_copy(self): + s_copy = GeneralStore() + s_copy.storage = dict(self.storage) + return s_copy + +class RecordStore(Store): + def create_scoped_copy(self): + s_copy = RecordStore() + s_copy.storage = dict(self.storage) + return s_copy diff --git a/src/newcrawler/structure_elements.py b/src/newcrawler/structure_elements.py new file mode 100644 index 0000000000000000000000000000000000000000..cb3cbac885d3707df2a4f8312cc684afef324267 --- /dev/null +++ b/src/newcrawler/structure_elements.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden +# 2021 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +# ** end header +# + +class StructureElement(object): + """ base class for elements in the hierarchical data structure """ + pass + +class FileSystemStructureElement(StructureElement): + def __init__(self, name, path): + self.name = name + self.path = path + + def __str__(self): + class_name_short = str(self.__class__).replace( + "<class \'", "")[:-2] + return "{}: {}, {}".format(class_name_short, self.name, self.path) + +class Directory(FileSystemStructureElement): + pass + +class File(FileSystemStructureElement): + pass diff --git a/tests/test_tool.py b/tests/test_tool.py index 67269731e71143b9ecc0aba95888cdbf0ef5c30a..d1f0f211260ba7ded3beb76701484a5235a9966e 100755 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -18,7 +18,7 @@ def dircheckstr(*pathcomponents): """ Return the debug tree identifier for a given path. """ - return "newcrawler.crawl.Directory: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_article", *pathcomponents) + return "newcrawler.structure_elements.Directory: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_article", *pathcomponents) def test_crawler(): crawler = Crawler(debug=True)