diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index c2ac05df0de3a1a9568894c83c0888613fab2c33..dcc3a5e2134a146871da34c21ea908fee0bec263 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -45,6 +45,7 @@ from caosadvancedtools.crawler import Crawler as OldCrawler from caosdb.common.datatype import is_reference from .stores import GeneralStore, RecordStore from .identified_cache import IdentifiedCache +from .identifiable import Identifiable from .structure_elements import StructureElement, Directory from .converters import Converter, DirectoryConverter from .identifiable_adapters import (IdentifiableAdapter, @@ -484,16 +485,18 @@ class Crawler(object): return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names) - def has_reference_value_without_id(self, record: Identifiable): + def has_reference_value_without_id(self, ident: Identifiable): """ - Returns True if there is at least one property in `record` which: + Returns True if there is at least one property in `ident` which: a) is a reference property AND b) where the value is set to a db.Entity (instead of an ID) AND c) where the ID of the value (the db.Entity object in b)) is not set (to an integer) Returns False otherwise. """ - for pname, pvalue in record.properties.items(): + if ident is None: + return True + for pname, pvalue in ident.properties.items(): if isinstance(pvalue, list): for el in pvalue: if isinstance(el, db.Entity) and el.id is None: @@ -525,14 +528,16 @@ class Crawler(object): flat.append(p.value) Crawler.create_flat_list([p.value], flat) - def has_missing_object_in_references(self, record: Identifiable): + def has_missing_object_in_references(self, ident: Identifiable): """ returns False if any property value is a db.Entity object that - is contained in the `remote_missing_cache`. If the record has such an object in the + is contained in the `remote_missing_cache`. If ident has such an object in the reference properties, it means that it references another Entity, where we checked whether it exists remotely and it was not found. """ - for pname, pvalue in record.properties.items(): + if ident is None: + return True + for pname, pvalue in ident.properties.items(): # if (is_reference(p) # Entity instead of ID and not cached locally if (isinstance(pvalue, list)): @@ -540,7 +545,7 @@ class Crawler(object): if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache(el) is not None): return True - if (isinstance(p.value, db.Entity) + if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache(pvalue) is not None): # might be checked when reference is resolved return True @@ -611,9 +616,7 @@ class Crawler(object): raise RuntimeError("Should not happen.") identifiable = self.identifiableAdapter.get_identifiable(record) if identifiable is None: - # TODO: check whether the same idea as below works here - identifiable = record - # return None + return None if identifiable in self.remote_existing_cache: return self.remote_existing_cache[identifiable] @@ -670,7 +673,7 @@ class Crawler(object): # TODO: check whether that holds: # if there is no identifiable, for the cache that is the same # as if the complete entity is the identifiable: - identifiable = record + return cache.add(identifiable=identifiable, record=record) @staticmethod @@ -728,6 +731,7 @@ class Crawler(object): # can we check whether the record(identifiable) exists on the remote server? elif not self.has_reference_value_without_id( + # TODO move get_identifiable above if else? self.identifiableAdapter.get_identifiable(record)): # TODO: remove deepcopy? identified_record = ( diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 9fc3364e09d6476aba820e64c0f1b55bc6b3ec84..3d2fce0e0a79ad88ac4909d39342cf9671a612ee 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -20,6 +20,7 @@ # from __future__ import annotations +from typing import Union class Identifiable(): @@ -42,7 +43,7 @@ class Identifiable(): """ def __init__(self, record_type: str = None, name: str = None, properties: dict = None, - path: str = None, backrefs: list[int, str] = None): + path: str = None, backrefs: list[Union[int, str]] = None): self.record_type = record_type self.name = name self.properties: dict = {} diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 62b51dfa1d2a8a953eac871657c93d581ffc844b..4b2d2ffc5ff850174043d20c707e3fed4031a904 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -95,7 +95,9 @@ class IdentifiableAdapter(metaclass=ABCMeta): uses the properties of ident to create a query that can determine whether the required record already exists. """ - query_string = "FIND Record " + ident.record_type + query_string = "FIND Record " + if ident.record_type is not None: + query_string += ident.record_type query_string += " WITH " @@ -166,6 +168,8 @@ class IdentifiableAdapter(metaclass=ABCMeta): registered_identifiable = self.get_registered_identifiable(record) if registered_identifiable is None: + if path is not None: + return Identifiable(path=path) return None if len(registered_identifiable.parents) != 1: raise RuntimeError("Multiple parents for identifiables" @@ -326,7 +330,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): record is the record from the local database to check against. identifiable is the record that was created during the crawler run. """ - if not has_parent(record, identifiable.record_type): + if (identifiable.record_type is not None + and not has_parent(record, identifiable.record_type)): return False for propname, propvalue in identifiable.properties.items(): prop_record = record.get_property(propname) @@ -419,6 +424,8 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): It is assumed, that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored """ + if len(record.parents) == 0: + return None rt_name = record.parents[0].name for name, definition in self._registered_identifiables.items(): if definition.parents[0].name.lower() == rt_name.lower():