diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index f76320829d382f917e2b6e9f171e7a625af30a51..6d4ae23f178e53c6f90c6d96c480d79ed4f178b0 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -39,6 +39,8 @@ from typing import Optional, Union from abc import abstractmethod import yaml_header_tools +from caosdb.high_level_api import (CaosDBPythonEntity, + create_entity_container) import yaml # These are special properties which are (currently) treated differently @@ -329,10 +331,61 @@ class SimpleFileConverter(Converter): return None return m.groupdict() +class YamlFileCaosDBRecord(Converter): + """ + Load a file using pylib high level API and convert the contained + record into caosdb records. + """ + + def typecheck(self, element: StructureElement): + return isinstance(element, File) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return list() + + def match(self, element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a file.") + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("A yaml file is needed to create children.") + + keys_modified = [] + + with open(element.path, "r") as f: + entries = yaml.safe_load(f) + + entity = CaosDBPythonEntity.deserialize(entries) + entities = create_entity_container(entity) + + for n, ent in enumerate(entities): + name = ent.name + if name is None: + name = "YamlRecord_{}".format(n + 1) + records[name] = ent + values[name] = ent + + for propname in ent.properties: + keys_modified.append((name, propname.name)) + + # Process the records section of the yaml definition: + keys_modified.extend( + super().create_records(values, records, element)) + + return keys_modified + class MarkdownFileConverter(Converter): def __init__(self, definition: dict, name: str, - converter_registry: dict): + converter_registry: dict): """ Initialize a new directory converter. """ diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index f502527fd9f1f8f45693a73ac5497eaf4401be71..3a117b19ea5f31575891214151f0cc1ef28fffdd 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -49,13 +49,13 @@ from caosdb.apiutils import compare_entities, merge_entities from copy import deepcopy from jsonschema import validate +from caosdb.high_level_api import convert_to_python_object import importlib SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path") SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size") - def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): """ This function uses compare_entities to check whether to entities are identical @@ -247,6 +247,9 @@ class Crawler(object): "SimpleFile": { "converter": "SimpleFileConverter", "package": "newcrawler.converters"}, + "YamlFileCaosDBRecord": { + "converter": "YamlFileCaosDBRecord", + "package": "newcrawler.converters"}, "MarkdownFile": { "converter": "MarkdownFileConverter", "package": "newcrawler.converters"}, @@ -913,11 +916,11 @@ def main(): for pn in v: rt.add_property(name=pn) ident.register_identifiable(k, rt) - + if args.dry_sync: ins, upd = crawler.synchronize(commit_changes=False) - inserts = [str(i) for i in ins] - updates = [str(i) for i in upd] + inserts = [convert_to_python_object(i).serialize() for i in ins] + updates = [convert_to_python_object(i).serialize() for i in upd] with open("dry.yml", "w") as f: f.write(yaml.dump({ "insert": inserts,