diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eeed54fc829649a58a14cf60fbf85a48b0ae48a..65547e6b8017e53d711955f9f9fbee00dc739b86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### * 'transform' sections can be added to a CFood to apply functions to values stored in variables. * default transform functions: submatch, split and replace. +* `*` can now be used as a wildcard in the identifiables parameter file to denote + that any Record may reference the identified one. ### Changed ### - If the `parents` key is used in a cfood at a lower level for a Record that @@ -19,8 +21,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - If a registered identifiable states, that a reference by a Record with parent RT1 is needed, then now also references from Records that have a child of RT1 as parent are accepted. +- More aggressive caching. ### Deprecated ### +- `IdentifiableAdapter.get_file` ### Removed ### diff --git a/CITATION.cff b/CITATION.cff index c6d41cc49d74b72056d825ca731fa79897fc537b..a41a45add0addbe657e6a95bd4e74f8c2365c742 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,25 +1,22 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." authors: - - family-names: Fitschen - given-names: Timm - orcid: https://orcid.org/0000-0002-4022-432X - - family-names: Schlemmer - given-names: Alexander - orcid: https://orcid.org/0000-0003-4124-9649 - - family-names: Hornung - given-names: Daniel - orcid: https://orcid.org/0000-0002-7846-6375 - family-names: tom Wörden given-names: Henrik orcid: https://orcid.org/0000-0002-5549-578X - - family-names: Parlitz - given-names: Ulrich - orcid: https://orcid.org/0000-0003-3058-1435 + - family-names: Spreckelsen + given-names: Florian + orcid: https://orcid.org/0000-0002-6856-2910 - family-names: Luther given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 + - family-names: Parlitz + given-names: Ulrich + orcid: https://orcid.org/0000-0003-3058-1435 +- family-names: Schlemmer + given-names: Alexander + orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler version: 0.6.0 -doi: 10.3390/data4020083 +doi: 10.3390/data9020024 date-released: 2023-06-23 \ No newline at end of file diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 853098566f01ad878bda8769e9f5db1f3c6ec2d1..dddb83a7c57d90a987de74f337668f3bae73ee1b 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -516,14 +516,14 @@ class Converter(object, metaclass=ABCMeta): """ Template for the debugging output for the match function """ msg = "\n--------" + name + "-----------\n" for re, ma in zip(regexp, matched): - msg += "matching against:\n" + re + "\n" - msg += "matching:\n" + ma + "\n" + msg += "matching reg:\t" + re + "\n" + msg += "matching val:\t" + ma + "\n" msg += "---------\n" if result is None: - msg += "No match" + msg += "No match\n" else: msg += "Matched groups:\n" - msg += str(result) + msg += str(result)+'\n' msg += "----------------------------------------\n" logger.debug(msg) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 1ea5f84b68981b873f072470c820d6e38e1d12c5..1c6968d76257109cc24458a048d6d41ea64d2e5f 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -26,7 +26,7 @@ """ Crawl a file structure using a yaml cfood definition and synchronize -the acuired data with CaosDB. +the acuired data with LinkAhead. """ from __future__ import annotations @@ -44,17 +44,18 @@ from datetime import datetime from enum import Enum from typing import Any, Optional, Union -import caosdb as db +import linkahead as db import yaml from caosadvancedtools.cache import UpdateCache from caosadvancedtools.crawler import Crawler as OldCrawler from caosadvancedtools.serverside.helper import send_mail from caosadvancedtools.utils import create_entity_link -from caosdb.apiutils import (EntityMergeConflictError, compare_entities, - merge_entities) -from caosdb.cached import cache_clear, cached_get_entity_by -from caosdb.exceptions import EmptyUniqueQueryError +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, + merge_entities) +from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, is_reference +from linkahead.exceptions import EmptyUniqueQueryError +from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting from .converters import Converter, ConverterValidationError @@ -658,9 +659,13 @@ class Crawler(object): try_to_merge_later.append(record) if newrecord.id is not None: record.id = newrecord.id + except NotImplementedError: + print(newrecord) + print(record) + raise Crawler.bend_references_to_new_object( old=record, new=newrecord, - entities=flat+to_be_updated+to_be_inserted+try_to_merge_later + entities=flat + to_be_updated + to_be_inserted + try_to_merge_later ) referencing_entities = self.create_reference_mapping( flat + to_be_updated + try_to_merge_later + to_be_inserted) @@ -892,6 +897,12 @@ class Crawler(object): "mode. This might lead to a failure of inserts that follow.") logger.info(parent_updates) + @staticmethod + def _get_property_id_for_datatype(rtname: str, name: str): + return cached_get_entity_by( + query=f"FIND Entity '{escape_squoted_text(rtname)}' " + f"with name='{escape_squoted_text(name)}'").id + @staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as @@ -909,8 +920,8 @@ class Crawler(object): prop.datatype != db.FILE and prop.datatype != db.REFERENCE): # datatype is a non-generic reference and value is a string try: # the get_entity function will raise an error if not unique - prop.value = cached_get_entity_by( - query=f"FIND Entity {prop.datatype} with name='{prop.value}'").id + prop.value = Crawler._get_property_id_for_datatype( + rtname=prop.datatype, name=prop.value) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): logger.error("The Property {prop.name} with datatype={prop.datatype} has the " "value {prop.value} and there is no appropriate Entity with such " @@ -925,8 +936,8 @@ class Crawler(object): if isinstance(el, str): try: # the get_entity function will raise an error if not unique - propval.append(cached_get_entity_by( - query=f"FIND Entity {prop.datatype} with name='{prop.value}'").id) + propval.append(Crawler._get_property_id_for_datatype(rtname=dt, + name=el)) except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): logger.error( "The Property {prop.name} with datatype={prop.datatype} has the " @@ -943,6 +954,8 @@ class Crawler(object): unique_names=True): for record in to_be_inserted: for prop in record.properties: + if prop.name == "name": + raise Exception('Cannot search for the property with name "name"') entity = cached_get_entity_by(name=prop.name) _resolve_datatype(prop, entity) Crawler.replace_name_with_referenced_entity_id(prop) diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py index 9983981c69e3df7c58ddfda4b6977944eac54999..0d57040f5c20aca236a3c11531e8b7c45bad89ab 100644 --- a/src/caoscrawler/debug_tree.py +++ b/src/caoscrawler/debug_tree.py @@ -45,13 +45,13 @@ from importlib_resources import files from jsonschema import validate from typing import Any, Optional, Type, Union -import caosdb as db +import linkahead as db from caosadvancedtools.cache import UpdateCache, Cache from caosadvancedtools.crawler import Crawler as OldCrawler -from caosdb.apiutils import (compare_entities, EntityMergeConflictError, - merge_entities) -from caosdb.common.datatype import is_reference +from linkahead.apiutils import (compare_entities, EntityMergeConflictError, + merge_entities) +from linkahead.common.datatype import is_reference from .converters import Converter, DirectoryConverter, ConverterValidationError diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index d9c9c00b22443121b4989bf988a40308b143dbf1..dd8c032041a74fa05b16d93abb06186e7e6fa569 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -26,13 +26,16 @@ from __future__ import annotations import logging +import warnings from abc import ABCMeta, abstractmethod from datetime import datetime +from functools import lru_cache from typing import Any -import caosdb as db +import linkahead as db import yaml -from caosdb.cached import cached_get_entity_by +from linkahead.cached import cached_get_entity_by, cached_query +from linkahead.utils.escape import escape_squoted_text from .identifiable import Identifiable from .utils import has_parent @@ -43,21 +46,24 @@ logger = logging.getLogger(__name__) def get_children_of_rt(rtname): """Supply the name of a recordtype. This name and the name of all children RTs are returned in a list""" - return [p.name for p in db.execute_query(f"FIND RECORDTYPE {rtname}")] + escaped = escape_squoted_text(rtname) + return [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] -def convert_value(value: Any): - """ Returns a string representation of the value that is suitable - to be used in the query - looking for the identified record. +def convert_value(value: Any) -> str: + """ Return a string representation of the value suitable for the search query. + + This is for search queries looking for the identified record. Parameters ---------- - value : Any type, the value that shall be returned and potentially converted. + value: Any + The value to be converted. Returns ------- - out : the string reprensentation of the value + out: str + the string reprensentation of the value. """ @@ -68,8 +74,7 @@ def convert_value(value: Any): elif isinstance(value, bool): return str(value).upper() elif isinstance(value, str): - # replace single quotes, otherwise they may break the queries - return value.replace("\'", "\\'") + return escape_squoted_text(value) else: return str(value) @@ -95,7 +100,7 @@ General question to clarify: The list of referenced by statements is currently not implemented. -The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred +The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered identifiabel, identifiable and identified record) for a Record. """ @@ -112,7 +117,8 @@ identifiabel, identifiable and identified record) for a Record. query_string = "FIND RECORD " if ident.record_type is not None: - query_string += f"'{ident.record_type}'" + escaped_rt = escape_squoted_text(ident.record_type) + query_string += f"'{escaped_rt}'" for ref in ident.backrefs: eid = ref if isinstance(ref, db.Entity): @@ -122,7 +128,7 @@ identifiabel, identifiable and identified record) for a Record. query_string += " WITH " if ident.name is not None: - query_string += "name='{}'".format(convert_value(ident.name)) + query_string += "name='{}'".format(escape_squoted_text(ident.name)) if len(ident.properties) > 0: query_string += " AND " @@ -138,10 +144,10 @@ identifiabel, identifiable and identified record) for a Record. query_string = "" for pname, pvalue in entity.properties.items(): if pvalue is None: - query_string += "'" + pname + "' IS NULL AND " + query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + pname + "'='" + + query_string += ("'" + escape_squoted_text(pname) + "'='" + convert_value(v) + "' AND ") # TODO: (for review) @@ -155,7 +161,7 @@ identifiabel, identifiable and identified record) for a Record. # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + pname + "'='" + + query_string += ("'" + escape_squoted_text(pname) + "'='" + convert_value(pvalue) + "' AND ") # remove the last AND return query_string[:-4] @@ -174,6 +180,7 @@ identifiabel, identifiable and identified record) for a Record. @abstractmethod def get_file(self, identifiable: db.File): + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) """ Retrieve the file object for a (File) identifiable. """ @@ -181,7 +188,7 @@ identifiabel, identifiable and identified record) for a Record. def get_identifiable(self, record: db.Record, referencing_entities=None): """ - retrieve the registred identifiable and fill the property values to create an + retrieve the registered identifiable and fill the property values to create an identifiable Args: @@ -215,22 +222,29 @@ identifiabel, identifiable and identified record) for a Record. # case A: in the registered identifiable # case B: in the identifiable - # TODO: similar to the Identifiable class, Registred Identifiable should be a + # TODO: similar to the Identifiable class, Registered Identifiable should be a # separate class too if prop.name.lower() == "is_referenced_by": for givenrt in prop.value: - rt_and_children = get_children_of_rt(givenrt) found = False - for rtname in rt_and_children: - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.extend( - referencing_entities[id(record)][rtname]) + if givenrt == "*": + if id(record) not in referencing_entities: + continue + for rt, rec in referencing_entities[id(record)].items(): + identifiable_backrefs.extend(rec) found = True + else: + rt_and_children = get_children_of_rt(givenrt) + for rtname in rt_and_children: + if (id(record) in referencing_entities + and (rtname in referencing_entities[id(record)])): + identifiable_backrefs.extend( + referencing_entities[id(record)][rtname]) + found = True if not found: # TODO: is this the appropriate error? raise NotImplementedError( - f"The following record is missing an identifying property:" + f"The following record is missing an identifying property:\n" f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" ) continue @@ -241,7 +255,7 @@ identifiabel, identifiable and identified record) for a Record. # raise an exception? # TODO: is this the appropriate error? raise NotImplementedError( - f"The following record is missing an identifying property:" + f"The following record is missing an identifying property:\n" f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" ) identifiable_props[record_prop.name] = record_prop.value @@ -256,17 +270,21 @@ identifiabel, identifiable and identified record) for a Record. "Multi properties used in identifiables could cause unpredictable results and " "are not allowed. You might want to consider a Property with a list as value.") - # use the RecordType of the registred Identifiable if it exists + # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple - return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name if name_is_identifying_property else None, - properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs - ) + try: + return Identifiable( + record_id=record.id, + record_type=(registered_identifiable.parents[0].name + if registered_identifiable else None), + name=record.name if name_is_identifying_property else None, + properties=identifiable_props, + path=record.path, + backrefs=identifiable_backrefs + ) + except Exception: + logger.error(f"Error while creating identifiable for this record:\n{record}") + raise @abstractmethod def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): @@ -305,6 +323,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): + warnings.warn(DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter.")) self._registered_identifiables = dict() self._records = [] @@ -319,6 +339,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) for record in self._records: if record.role == "File" and record.path == identifiable.path: candidates.append(record) @@ -466,14 +487,14 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): + warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) # TODO is this needed for Identifiable? # or can we get rid of this function? if isinstance(identifiable, db.Entity): return cached_get_entity_by(path=identifiable) if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") - candidates = db.execute_query("FIND File which is stored at '{}'".format( - identifiable.path)) + candidates = cached_get_entity_by(path=identifiable.path) if len(candidates) > 1: raise RuntimeError("Identifiable was not defined unambigiously.") if len(candidates) == 0: @@ -482,7 +503,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def get_registered_identifiable(self, record: db.Record): """ - returns the registred identifiable for the given Record + returns the registered identifiable for the given Record It is assumed, that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored @@ -506,7 +527,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = db.execute_query(query_string) + candidates = cached_query(query_string) if len(candidates) > 1: raise RuntimeError( f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index e8a3dc8024629a97d76e4528788e483f4887efd8..c4a8063cae5e74222e92f28a3898656bf4a97f6a 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -320,7 +320,7 @@ def scanner(items: list[StructureElement], converters_path = [] for element in items: - element_path = os.path.join(*(structure_elements_path + [element.get_name()])) + element_path = os.path.join(*(structure_elements_path + [str(element.get_name())])) logger.debug(f"Dealing with {element_path}") for converter in converters: diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 0b15f9b5d9aebefc2137b234ac4a9440b84906f5..b10deccdca1a2adf60ebcbac930bc797875724ff 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -85,7 +85,9 @@ we can check whether a Record with the parent "Project" is referencing the "Expe Record. If that is the case, this reference is part of the identifiable for the "Experiment" Record. Note, that if there are multiple Records with the appropriate parent (e.g. multiple "Project" Records in the above example) it will be required that all of them -reference the object to be identified. +reference the object to be identified. You can also use the wildcard "*" as +RecordType name in the configuration which will only require, that ANY Record +references the Record at hand. Identified Records diff --git a/src/doc/macros.rst b/src/doc/macros.rst index 560827e6fc4ff8b0238f16ca8d76b2c682bce505..5329ca6ddde49dbef439659d4904b07ed3f2bef9 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -192,6 +192,9 @@ The example will be expanded to: a: '5' +Note, that you need to make sure that subsequent macro calls do not +use the same top level key. Because later versions would overwrite previous +ones. Here we used ``$macro_name`` to prevent that. Limitation diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 268b9800ddf1ef1688386f394ec1c6c7eb3e3912..7759fa55ce37e1c30ff9092eac3260ca80348bca 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -133,6 +133,22 @@ def test_non_default_name(): assert identifiable.name is None +def test_wildcard_ref(): + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable( + "Person", db.RecordType() + .add_parent(name="Person") + .add_property(name="is_referenced_by", value=["*"])) + rec = (db.Record(name="don't touch it").add_parent("Person") + .add_property(name="last_name", value='Tom')) + identifiable = ident.get_identifiable(rec, + referencing_entities={ + id(rec): + {'A': [db.Record(id=1).add_parent("A")]}} + ) + assert identifiable.backrefs[0].id == 1 + + def test_convert_value(): # test that string representation of objects stay unchanged. No stripping or so. class A():