diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index 76afe6d4ed0117def850d03bafedf3ccdd7d26c9..4a659331e55d165d22a0f93eed33564202768b33 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -34,6 +34,7 @@ from typing import Type, Optional, Union from abc import abstractmethod import yaml_header_tools + def handle_value(value: Union[dict, str], values: GeneralStore): """ Function to generically handle values for properties defined in the @@ -66,6 +67,7 @@ def handle_value(value: Union[dict, str], values: GeneralStore): return (propvalue, collection_mode) + class Converter(object): """ Converters treat StructureElements contained in the hierarchical sturcture. @@ -238,7 +240,7 @@ class Converter(object): propvalue, collection_mode = handle_value(value, values) if c_record.get_property(key) is None: - + if collection_mode == "list": c_record.add_property(name=key, value=[propvalue]) elif (collection_mode == "multiproperty" or @@ -251,8 +253,6 @@ class Converter(object): c_record.add_property(name=key, value=propvalue) elif collection_mode == "single": c_record.get_property(key).value = propvalue - - # no matter whether the record existed in the record store or not, # parents will be added when they aren't present in the record yet: @@ -265,8 +265,6 @@ class Converter(object): c_record.add_parent(name) return keys_modified - - @abstractmethod def typecheck(self, element: StructureElement): pass @@ -275,6 +273,7 @@ class Converter(object): def match(self, element: StructureElement) -> Optional[dict]: pass + class DirectoryConverter(Converter): def __init__(self, definition: dict, name: str): @@ -306,7 +305,7 @@ class DirectoryConverter(Converter): """ Creates a list of files (of type File) and directories (of type Directory) for a given directory. No recursion. - + element: A directory (of type Directory) which will be traversed. """ children: list[StructureElement] = [] @@ -321,6 +320,7 @@ class DirectoryConverter(Converter): return children + class MarkdownFileConverter(Converter): def __init__(self, definition: dict, name: str): """ @@ -344,7 +344,6 @@ class MarkdownFileConverter(Converter): else: raise RuntimeError("Header entry {} has incompatible type.".format(name)) return children - def typecheck(self, element: StructureElement): return isinstance(element, File) @@ -361,11 +360,11 @@ class MarkdownFileConverter(Converter): return None return m.groupdict() + class DictTextElementConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): return [] - def typecheck(self, element: StructureElement): return isinstance(element, DictTextElement) @@ -384,6 +383,7 @@ class DictTextElementConverter(Converter): values.update(m2.groupdict()) return values + class DictListElementConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): @@ -404,6 +404,7 @@ class DictListElementConverter(Converter): raise NotImplementedError("Match is not implemented for DictListElement.") return m.groupdict() + class TextElementConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index c9e851878b91dc5fc22db4e0e95aa91bb7b339ac..7097c34fabd7f85ca73c938f5e3d87e37e390a69 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -100,7 +100,7 @@ class Crawler(object): of the structureelements. """ self.global_converters = converters - + self.recordStore = RecordStore() self.generalStore = generalStore @@ -136,8 +136,6 @@ class Crawler(object): dirname), crawler_definition) - - def start_crawling(self, item: StructureElement, crawler_definition: dict): """ @@ -242,7 +240,7 @@ class Crawler(object): if ("datatype" in comp[0]["properties"][key] and comp[0]["properties"][key]["datatype"] is not None and comp[1]["properties"][key]["datatype"] != - comp[1]["properties"][key]["datatype"]): + comp[1]["properties"][key]["datatype"]): identical = False break @@ -280,7 +278,7 @@ class Crawler(object): def save_debug_data(self, filename: str): paths: dict[str, Union[dict, list]] = dict() - + def flatten_debug_info(key): mod_info = self.debug_metadata[key] paths[key] = dict() @@ -298,8 +296,9 @@ class Crawler(object): for key in ("provenance", "usage"): flatten_debug_info(key) - paths["converters_usage"] = [self.debug_build_usage_tree(cv) for cv in self.debug_converters] - + paths["converters_usage"] = [self.debug_build_usage_tree( + cv) for cv in self.debug_converters] + with open(filename, "w") as f: f.write(yaml.dump(paths, sort_keys=False)) @@ -311,7 +310,7 @@ class Crawler(object): structure_elements_path: list[str], converters_path: list[str]): """ Crawl a list of StructureElements and apply any matching converters. - + items: structure_elements (e.g. files and folders on one level on the hierarchy) global_converters and local_converters: globally or locally defined converters for treating structure elements. A locally defined converter could be @@ -345,7 +344,8 @@ class Crawler(object): generalStore_copy.get_storage(), recordStore_copy.get_storage()) self.debug_metadata["copied"][str(element)] = ( generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied()) - self.debug_metadata["usage"][str(element)].add("/".join(converters_path + [converter.name])) + self.debug_metadata["usage"][str(element)].add( + "/".join(converters_path + [converter.name])) mod_info = self.debug_metadata["provenance"] for record_name, prop_name in keys_modified: # TODO: check @@ -366,6 +366,7 @@ class Crawler(object): self.updateList.append(record) return self.updateList + def main(*args): pass diff --git a/src/newcrawler/identifiable_adapters.py b/src/newcrawler/identifiable_adapters.py index 5f4ad70a0a7cdb10357c7fb6e8276f59d12b1414..914e02a5c0bdb9b0550765e0ce6f03d7ffac5ca0 100644 --- a/src/newcrawler/identifiable_adapters.py +++ b/src/newcrawler/identifiable_adapters.py @@ -126,7 +126,8 @@ class IdentifiableAdapter(object): property_name_list_B.append(prop) if (len(set(property_name_list_B)) != len(property_name_list_B) or len( set(property_name_list_A)) != len(property_name_list_A)): - raise RuntimeError("Multi properties used in identifiables can cause unpredictable results.") + raise RuntimeError( + "Multi properties used in identifiables can cause unpredictable results.") return identifiable @@ -194,7 +195,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ if len(registered_identifiable.parents) != 1: raise RuntimeError("Multiple parents for identifiables not supported.") - + if not has_parent(record, registered_identifiable.parents[0].name): return False @@ -219,7 +220,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Check for a record from the local storage (named "record") if it is the identified record for an identifiable which was created by a run of the crawler. - + Naming of the parameters could be confusing: record is the record from the local database to check against. identifiable is the record that was created during the crawler run. @@ -246,7 +247,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): raise RuntimeError("The identifiable which is used as property" " here has to be inserted first.") - + if prop.value != prop_record.value: return False return True diff --git a/src/newcrawler/stores.py b/src/newcrawler/stores.py index b85b14fe60abe5a24657aad687848a0159bb0368..452061e6cf26bcb69f95c20f109b1d15a53f4a04 100644 --- a/src/newcrawler/stores.py +++ b/src/newcrawler/stores.py @@ -80,9 +80,11 @@ class Store(object): """ return self._ids[key] + class GeneralStore(Store): pass + class RecordStore(Store): def get_records_current_scope(self): @@ -90,7 +92,7 @@ class RecordStore(Store): Return all records that were created in the current scope. """ lst = [] - + for key in self._storage: if not self._copied[key]: lst.append(self[key]) diff --git a/src/newcrawler/structure_elements.py b/src/newcrawler/structure_elements.py index df0d85f9181ff918ac2e80e355c9d18799e0781d..6562d03c16b3c4f380077e05268a370370b2c725 100644 --- a/src/newcrawler/structure_elements.py +++ b/src/newcrawler/structure_elements.py @@ -40,6 +40,7 @@ class StructureElement(object): def get_name(self): return self.name + class FileSystemStructureElement(StructureElement): def __init__(self, name: str, path: str): super().__init__(name) @@ -50,22 +51,27 @@ class FileSystemStructureElement(StructureElement): "<class \'", "")[:-2] return "{}: {}, {}".format(class_name_short, self.name, self.path) + class Directory(FileSystemStructureElement): pass + class File(FileSystemStructureElement): pass + class DictTextElement(StructureElement): def __init__(self, name: str, value: str): super().__init__(name) self.value = value + class DictListElement(StructureElement): def __init__(self, name: str, value: list): super().__init__(name) self.value = value + class TextElement(StructureElement): def __init__(self, name: str, value: str): super().__init__(name) diff --git a/src/newcrawler/utils.py b/src/newcrawler/utils.py index e0a44be3c3859823ba6cbd63e4cd694c2974cca0..c60b7f871db32d66c12781e5f0cfb246bc41c8fe 100644 --- a/src/newcrawler/utils.py +++ b/src/newcrawler/utils.py @@ -28,6 +28,7 @@ from datetime import datetime # Some utility functions, e.g. for extending pylib. + def has_parent(entity: db.Entity, name: str): """ A simple check, whether a parent with the given name exists. @@ -40,12 +41,13 @@ def has_parent(entity: db.Entity, name: str): return True return False + def get_value(prop): """ Returns the value of a Property This function is taken from the old crawler: caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py - + Parameters ---------- prop : The property of which the value shall be returned.