diff --git a/CHANGELOG.md b/CHANGELOG.md index 15a35a01473f02a55ef5d9f04aac6f2e13af4ca6..fd7168cd52294e15368a5ff17094dedd7bd5f2fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,44 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +### Changed ### + +### Deprecated ### + +### Removed ### + +### Fixed ### + +### Security ### + +### Documentation ### + +## [0.8.0] - 2024-08-23 ## + +### Added ### + * Support for Python 3.12 and experimental support for 3.13 -* `spss_to_datamodel` script. -* `SPSSConverter` class +* CFood macros now accept complex objects as values, not just strings. +* More options for the `CSVTableConverter` +* New converters: + * `DatetimeElementConverter` + * `SPSSConverter` +* New scripts: + * `spss_to_datamodel` + * `csv_to_datamodel` +* New transformer functions: + * `date_parse` + * `datetime_parse` +* New ``PropertiesFromDictConverter`` which allows to automatically + create property values from dictionary keys. ### Changed ### -### Deprecated ### +* CFood macros do not render everything into strings now. +* Better internal handling of identifiable/reference resolving and merging of entities. This also + includes more understandable output for users. +* Better handling of missing imports, with nice messages for users. +* No longer use configuration of advancedtools to set to and from email addresses ### Removed ### @@ -24,11 +55,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed ### * [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression +* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors ### Security ### ### Documentation ### +* General improvement of the documentation, in many small places. +* The API documentation should now also include documentation of the constructors.
+ ## [0.7.1] - 2024-03-21 ## ### Fixed ### @@ -170,6 +205,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ``add_prefix`` and ``remove_prefix`` arguments for the command line interface and the ``crawler_main`` function for the adding/removal of path prefixes when creating file entities. +- More strict checking of `identifiables.yaml`. +- Better error messages when server does not conform to expected data model. ### Changed ### @@ -218,7 +255,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Some StructureElements changed (see "How to upgrade" in the docs): - Dict, DictElement and DictDictElement were merged into DictElement. - DictTextElement and TextElement were merged into TextElement. The "match" - keyword is now invalid for TextElements. + keyword is now invalid for TextElements. - JSONFileConverter creates another level of StructureElements (see "How to upgrade" in the docs) - create_flat_list function now collects entities in a set and also adds the entities contained in the given list directly diff --git a/CITATION.cff b/CITATION.cff index abbd6b21e19c5a989c6d6d24f32d3946df070308..e219f1cb1868394a2997bf6ea93fc921d86163c4 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,6 +17,6 @@ authors: given-names: Alexander orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.7.1 +version: 0.8.0 doi: 10.3390/data9020024 -date-released: 2023-03-21 \ No newline at end of file +date-released: 2024-08-23 \ No newline at end of file diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index c906a81d86af56669f7c522169bceb3b5fcb3e01..6fd322e5f6425e9bce25b970d6de7d99892762a5 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -32,7 +32,7 @@ import sys from argparse import RawTextHelpFormatter from pathlib import Path -import caosdb as db +import linkahead as db 
import pytest import yaml from caosadvancedtools.crawler import Crawler as OldCrawler @@ -42,8 +42,8 @@ from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.scanner import scan_directory -from caosdb import EmptyUniqueQueryError -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead import EmptyUniqueQueryError +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index cf38e951b78534806c0ea76ef58051436aa22704..05b0a543deb03eb524d40d6a386876812e6b54e2 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -27,12 +27,12 @@ import os import pytest from subprocess import run -import caosdb as db +import linkahead as db from caosadvancedtools.loadFiles import loadpath -from caosdb.cached import cache_clear +from linkahead.cached import cache_clear from caosadvancedtools.models import parser as parser from caoscrawler.crawl import crawler_main -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") diff --git a/setup.cfg b/setup.cfg index 848150363c42776993029c54e777f4ff6ccf72ea..d272097e59ba5c256667a1fb9eef05bffff1fca3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.7.2 +version = 0.8.1 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for caosdb diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 
0e66f99affbbcfa3ed69b0f9cabdc7e59c50deec..acc3911f21d320146d0c35abc9d781541ee151ac 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -1,9 +1,44 @@ cfood: type: object + properties: + Converters: + description: Definition of custom converters + type: object + additionalProperties: + type: object + properties: + converter: + type: string + package: + type: string + required: + - converter + - package + macros: + description: Macro definitions + type: array + Transformers: + description: Variable transformer definition + type: object + additionalProperties: + type: object + properties: + function: + type: string + package: + type: string + required: + - package + - function additionalProperties: $ref: "#/$defs/converter" $defs: + parents: + description: Parents for this record are given here as a list of names. + type: array + items: + type: string converter: properties: type: @@ -28,7 +63,9 @@ cfood: - Definitions - Dict - Date + - Datetime - JSONFile + - YAMLFile - CSVTableConverter - XLSXTableConverter - SPSSFile @@ -39,6 +76,7 @@ cfood: - XMLFile - XMLTag - XMLTextNode + - PropertiesFromDictElement description: Type of this converter node. match: description: typically a regexp which is matched to a structure element name @@ -49,15 +87,46 @@ cfood: match_value: description: a regexp that is matched to the value of a key-value pair type: string - records: - description: This field is used to define new records or to modify records which have been defined on a higher level. + record_from_dict: + description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary. type: object + required: + - variable_name properties: - parents: - description: Parents for this record are given here as a list of names. + variable_name: + description: | + Name of the record by which it can be accessed in the + cfood definition.
Can also be the name of an existing + record in which case that record will be updated by + the PropertiesFromDictConverter. + type: string + properties_blacklist: + description: List of keys to be ignored in the automatic treatment. They will be ignored on all levels of the dictionary. type: array items: type: string + references: + description: List of keys that will be transformed into named reference properties. + type: object + additionalProperties: + type: object + properties: + parents: + $ref: + "#/$defs/parents" + name: + description: Name of this record. If none is given, variable_name is used. + type: string + parents: + $ref: + "#/$defs/parents" + records: + description: This field is used to define new records or to modify records which have been defined on a higher level. + type: object + properties: + parents: + $ref: + "#/$defs/parents" additionalProperties: oneOf: - type: object @@ -79,3 +148,15 @@ cfood: additionalProperties: $ref: "#/$defs/converter" + if: + properties: + type: + const: + "PropertiesFromDictElement" + then: + required: + - type + - record_from_dict + else: + required: + - type diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 2b4da3d94bb359814aae03a4de0ae20473aea8e5..9805d1103e380f688b40a9bfd4c3d03129dbd591 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -435,6 +435,7 @@ class Converter(object, metaclass=ABCMeta): return for transformer_key, transformer in self.definition["transform"].items(): in_value = replace_variables(transformer["in"], values) + out_value = in_value for tr_func_el in transformer["functions"]: if not isinstance(tr_func_el, dict): @@ -826,6 +827,180 @@ class DictElementConverter(Converter): return match_name_and_value(self.definition, element.name, element.value) +class PropertiesFromDictConverter(DictElementConverter): + """Extend the :py:class:`DictElementConverter` by a heuristic to set + property values from the dictionary keys. 
+ + """ + + def __init__(self, definition: dict, name: str, converter_registry: dict, + referenced_record_callback: Optional[callable] = None): + + super().__init__(definition, name, converter_registry) + self.referenced_record_callback = referenced_record_callback + + def _recursively_create_records(self, subdict: dict, root_record: db.Record, + root_rec_name: str, + values: GeneralStore, records: RecordStore, + referenced_record_callback: callable, + keys_modified: list = [] + ): + """Create a record form the given `subdict` and recursively create referenced records.""" + + blacklisted_keys = self.definition["record_from_dict"][ + "properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else [] + special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [ + ] + + for key, value in subdict.items(): + + if key in blacklisted_keys: + # We ignore this in the automated property generation + continue + if isinstance(value, list): + if not any([isinstance(val, dict) for val in value]): + # no dict in list, i.e., no references, so this is simple + root_record.add_property(name=key, value=value) + else: + if not all([isinstance(val, dict) for val in value]): + # if this is not an error (most probably it is), this + # needs to be handled manually for now. 
+ raise ValueError( + f"{key} in {subdict} contains a mixed list of references and scalars.") + ref_recs = [] + for ii, ref_dict in enumerate(value): + ref_var_name = f"{root_rec_name}.{key}.{ii+1}" + ref_rec, keys_modified = self._create_ref_rec( + ref_var_name, + key, + ref_dict, + special_references, + records, + values, + keys_modified, + referenced_record_callback + ) + ref_recs.append(ref_rec) + root_record.add_property(name=key, value=ref_recs) + + elif isinstance(value, dict): + # Treat scalar reference + ref_var_name = f"{root_rec_name}.{key}" + ref_rec, keys_modified = self._create_ref_rec( + ref_var_name, + key, + value, + special_references, + records, + values, + keys_modified, + referenced_record_callback + ) + root_record.add_property(key, ref_rec) + else: + # All that remains are scalar properties which may or + # may not be special attributes like name. + if key.lower() in SPECIAL_PROPERTIES: + setattr(root_record, key.lower(), value) + else: + root_record.add_property(name=key, value=value) + keys_modified.append((root_rec_name, key)) + + if referenced_record_callback: + root_record = referenced_record_callback(root_record, records, values) + + return keys_modified + + def _create_ref_rec( + self, + name: str, + key: str, + subdict: dict, + special_references: dict, + records: RecordStore, + values: GeneralStore, + keys_modified: list, + referenced_record_callback: callable + ): + """Create the referenced Record and forward the stores etc. to + ``_recursively_create_records``. + + Parameters: + ----------- + name : str + name of the referenced record to be created in RecordStore and Value Store. + key : str + name of the key this record's definition had in the original dict. + subdict : dict + subdict containing this record's definition from the original dict. + special_references : dict + special treatment of referenced records from the converter definition. 
+ records : RecordStore + RecordStore for entering new Records + values : GeneralStore + ValueStore for entering new Records + keys_modified : list + List for keeping track of changes + referenced_record_callback : callable + Advanced treatment of referenced records as given in the + converter initialization. + """ + ref_rec = db.Record() + if key in special_references: + for par in special_references[key]["parents"]: + ref_rec.add_parent(par) + else: + ref_rec.add_parent(key) + records[name] = ref_rec + values[name] = ref_rec + keys_modified = self._recursively_create_records( + subdict=subdict, + root_record=ref_rec, + root_rec_name=name, + values=values, + records=records, + referenced_record_callback=referenced_record_callback, + keys_modified=keys_modified + ) + return ref_rec, keys_modified + + def create_records(self, values: GeneralStore, records: RecordStore, + element: StructureElement): + + keys_modified = [] + + rfd = self.definition["record_from_dict"] + if rfd["variable_name"] not in records: + rec = db.Record() + if "name" in rfd: + rec.name = rfd["name"] + if "parents" in rfd: + for par in rfd["parents"]: + rec.add_parent(par) + else: + rec.add_parent(rfd["variable_name"]) + records[rfd["variable_name"]] = rec + values[rfd["variable_name"]] = rec + + else: + rec = records[rfd["variable_name"]] + + keys_modified = self._recursively_create_records( + subdict=element.value, + root_record=rec, + root_rec_name=rfd["variable_name"], + values=values, + records=records, + referenced_record_callback=self.referenced_record_callback, + keys_modified=keys_modified, + ) + + keys_modified.extend(super().create_records( + values=values, records=records, element=element)) + + return keys_modified + + class DictConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( @@ -1249,11 +1424,12 @@ class DateElementConverter(TextElementConverter): """allows to convert different text formats of dates to Python date objects. 
The text to be parsed must be contained in the "date" group. The format string can be supplied - under "dateformat" in the Converter definition. The library used is datetime so see its + under "date_format" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. """ + # TODO make `date` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "date" in matches: @@ -1262,3 +1438,24 @@ class DateElementConverter(TextElementConverter): self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" ).date()}) return matches + + +class DatetimeElementConverter(TextElementConverter): + """Convert text so that it is formatted in a way that LinkAhead can understand it. + +The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the +``datetime_format`` node. This class uses the ``datetime`` module, so ``datetime_format`` must +follow this specificaton: +https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + + """ + + # TODO make `val` parameter name configurable + def match(self, element: StructureElement): + matches = super().match(element) + if matches is not None and "val" in matches: + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = self.definition.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default) + matches.update({"val": dt_str}) + return matches diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 2ce5eae9afbd78cbf4b78db0b152fa7578258ee9..0f23acfdfde2a863a66f25901a85748b538f5d04 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -55,6 +55,9 @@ from linkahead.apiutils import (compare_entities, merge_entities) from linkahead.cached import cache_clear, cached_get_entity_by from linkahead.common.datatype import get_list_datatype, 
is_reference +from linkahead.exceptions import ( + TransactionError, +) from linkahead.utils.escape import escape_squoted_text from .config import get_config_setting @@ -746,9 +749,31 @@ one with the entities that need to be updated and the other with entities to be def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is if get_config_setting("send_crawler_notifications"): - filename = OldCrawler.save_form( - [el[3] for el in pending_changes], path, run_id) - OldCrawler.send_mail([el[3] for el in pending_changes], filename) + filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id) + text = """Dear Curator, + there where changes that need your authorization. Please check the following + carefully and if the changes are ok, click on the following link: + + {url}/Shared/{filename} + + {changes} + """.format(url=db.configuration.get_config()["Connection"]["url"], + filename=filename, + changes="\n".join([el[3] for el in pending_changes])) + try: + fro = get_config_setting("sendmail_from_address") + to = get_config_setting("sendmail_to_address") + except KeyError: + logger.error("Server Configuration is missing a setting for " + "sending mails. The administrator should check " + "'from_mail' and 'to_mail'.") + return + + send_mail( + from_addr=fro, + to=to, + subject="Crawler Update", + body=text) for i, el in enumerate(pending_changes): @@ -859,6 +884,7 @@ def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id): The email contains some basic information and a link to the log and the CrawlerRun Record. 
""" if not get_config_setting("send_crawler_notifications"): + logger.debug("Crawler email notifications are disabled.") return if n_inserts == 0 and n_updates == 0: return @@ -869,8 +895,8 @@ the CaosDB Crawler successfully crawled the data and """ + domain = get_config_setting("public_host_url") if get_config_setting("create_crawler_status_records"): - domain = get_config_setting("public_host_url") text += ("You can checkout the CrawlerRun Record for more information:\n" f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n") text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile) @@ -1056,6 +1082,10 @@ def crawler_main(crawled_directory_path: str, ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(identifiables_definition_file) crawler.identifiableAdapter = ident + else: + # TODO + # raise ValueError("An identifiable file is needed.") + pass remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix) @@ -1081,15 +1111,24 @@ def crawler_main(crawled_directory_path: str, logger.error(err) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 + except TransactionError as err: + logger.debug(traceback.format_exc()) + logger.error(err) + logger.error("Transaction error details:") + for suberr in err.errors: + logger.error("---") + logger.error(suberr.msg) + logger.error(suberr.entity) + return 1 except Exception as err: logger.debug(traceback.format_exc()) - logger.debug(err) + logger.error(err) if "SHARED_DIR" in os.environ: # pylint: disable=E0601 domain = get_config_setting("public_host_url") - logger.error("Unexpected Error: Please tell your administrator about this and provide the" - f" following path.\n{domain}/Shared/" + debuglog_public) + logger.error("Unexpected Error: Please tell your administrator about this and provide " + f"the following path.\n{domain}/Shared/" + debuglog_public) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 diff --git 
a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index c0af3e8698f33f7564ba5c0b7463d9d619dc332d..cb4a7d8c63489158c15dcf86b83fd940cd608460 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -8,9 +8,15 @@ BooleanElement: Date: converter: DateElementConverter package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters Dict: converter: DictElementConverter package: caoscrawler.converters +PropertiesFromDictElement: + converter: PropertiesFromDictConverter + package: caoscrawler.converters FloatElement: converter: FloatElementConverter package: caoscrawler.converters diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml index d0ad23912176bdfbf2446aa6e04bd7fa6b858777..ffcb1b15bd2bad71083cc8f0ba84172ee3daf2b0 100644 --- a/src/caoscrawler/default_transformers.yml +++ b/src/caoscrawler/default_transformers.yml @@ -1,4 +1,4 @@ - +# Lookup table for matching functions and cfood yaml node names. 
submatch: package: caoscrawler.transformer_functions @@ -9,3 +9,9 @@ split: replace: package: caoscrawler.transformer_functions function: replace +date_parse: + package: caoscrawler.transformer_functions + function: date_parse +datetime_parse: + package: caoscrawler.transformer_functions + function: datetime_parse diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py index 6d08cf76fc177407154e38f0eb6aaa47bc863866..e7c61c34e2abbebef4790bde42f50d4b5b29f957 100644 --- a/src/caoscrawler/exceptions.py +++ b/src/caoscrawler/exceptions.py @@ -27,15 +27,6 @@ class ForbiddenTransaction(Exception): pass -class MissingReferencingEntityError(Exception): - """Thrown if the identifiable requires that some entity references the given entity but there - is no such reference """ - - def __init__(self, *args, rts=None, **kwargs): - self.rts = rts - super().__init__(self, *args, **kwargs) - - class ImpossibleMergeError(Exception): """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes should be merged, but there is conflicting information that prevents this. @@ -47,8 +38,29 @@ class ImpossibleMergeError(Exception): super().__init__(self, *args, **kwargs) +class InvalidIdentifiableYAML(Exception): + """Thrown if the identifiable definition is invalid.""" + pass + + class MissingIdentifyingProperty(Exception): """Thrown if a SyncNode does not have the properties required by the corresponding registered identifiable """ pass + + +class MissingRecordType(Exception): + """Thrown if an record type can not be found although it is expected that it exists on the + server. 
+ """ + pass + + +class MissingReferencingEntityError(Exception): + """Thrown if the identifiable requires that some entity references the given entity but there + is no such reference """ + + def __init__(self, *args, rts=None, **kwargs): + self.rts = rts + super().__init__(self, *args, **kwargs) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 3aae9353cb4c0cf4d6c264616d770837d87e801e..854ee614638712bdcf957c592ef2946dbdd43afc 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -36,7 +36,12 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text -from .exceptions import MissingIdentifyingProperty, MissingReferencingEntityError +from .exceptions import ( + InvalidIdentifiableYAML, + MissingIdentifyingProperty, + MissingRecordType, + MissingReferencingEntityError, +) from .identifiable import Identifiable from .sync_node import SyncNode from .utils import has_parent @@ -48,7 +53,10 @@ def get_children_of_rt(rtname): """Supply the name of a recordtype. 
This name and the name of all children RTs are returned in a list""" escaped = escape_squoted_text(rtname) - return [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + recordtypes = [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + if not recordtypes: + raise MissingRecordType(f"Record type could not be found on server: {rtname}") + return recordtypes def convert_value(value: Any) -> str: @@ -165,7 +173,10 @@ class IdentifiableAdapter(metaclass=ABCMeta): """ if node.registered_identifiable is None: if raise_exception: - raise RuntimeError("no registered_identifiable") + parents = [p.name for p in node.parents] + parents_str = "\n".join(f"- {p}" for p in parents) + raise RuntimeError("No registered identifiable for node with these parents:\n" + + parents_str) else: return False for prop in node.registered_identifiable.properties: @@ -576,19 +587,32 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): """Load identifiables defined in a yaml file""" with open(path, "r", encoding="utf-8") as yaml_f: identifiable_data = yaml.safe_load(yaml_f) + self.load_from_yaml_object(identifiable_data) - for key, value in identifiable_data.items(): - rt = db.RecordType().add_parent(key) - for prop_name in value: + def load_from_yaml_object(self, identifiable_data): + """Load identifiables defined in a yaml object. + """ + + for rt_name, id_list in identifiable_data.items(): + rt = db.RecordType().add_parent(rt_name) + if not isinstance(id_list, list): + raise InvalidIdentifiableYAML( + f"Identifiable contents must be lists, but this was not: {rt_name}") + for prop_name in id_list: if isinstance(prop_name, str): rt.add_property(name=prop_name) elif isinstance(prop_name, dict): for k, v in prop_name.items(): + if k == "is_referenced_by" and not isinstance(v, list): + raise InvalidIdentifiableYAML( + f"'is_referenced_by' must be a list. 
Found in: {rt_name}") rt.add_property(name=k, value=v) else: - NotImplementedError("YAML is not structured correctly") + raise InvalidIdentifiableYAML( + "Identifiable properties must be str or dict, but this one was not:\n" + f" {rt_name}/{prop_name}") - self.register_identifiable(key, rt) + self.register_identifiable(rt_name, rt) def register_identifiable(self, name: str, definition: db.RecordType): self._registered_identifiables[name] = definition diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py index c6b5de27d7f498d9b1db6b6a90d986487340a880..d85883011db3cf651da0dda6c110015128fbe439 100644 --- a/src/caoscrawler/macros/macro_yaml_object.py +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -25,12 +25,17 @@ # Function to expand a macro in yaml # A. Schlemmer, 05/2022 +import re from dataclasses import dataclass from typing import Any, Dict from copy import deepcopy from string import Template +_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$") +_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$") + + @dataclass class MacroDefinition: """ @@ -53,6 +58,12 @@ def substitute(propvalue, values: dict): Substitution of variables in strings using the variable substitution library from python's standard library. """ + # Simple matches are simply replaced by the raw dict entry. 
+ if match := (_SAFE_SUBST_PAT.fullmatch(propvalue) + or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)): + key = match.group("key") + if key in values: + return values[key] propvalue_template = Template(propvalue) return propvalue_template.safe_substitute(**values) diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py index 927d4dcaf7f6123a50d30657beff1cb1b32d381e..ba8e6e39cc03e9be1923d72ec5c8d699c01fa8f9 100644 --- a/src/caoscrawler/scripts/generators.py +++ b/src/caoscrawler/scripts/generators.py @@ -104,17 +104,27 @@ metadata: directory: # corresponds to the directory given to the crawler type: Directory match: .* # we do not care how it is named here + records: + DirRecord: # One record for each directory. subtree: # This is the file thisfile: type: []{file} match: []{match} + records: + DatFileRecord: # One record for each matching file + role: File + path: $thisfile + file: $thisfile subtree: entry: type: Dict match: .* # Name is irrelevant records: - MyParent: + BaseElement: # One BaseElement record for each row in the CSV/TSV file + DatFileRecord: $DatFileRecord + DirRecord: + BaseElement: +$BaseElement subtree: !macro """ @@ -196,8 +206,24 @@ cfood: str defs.append(def_str) del defs + sep = repr(sniffed.delimiter) + sep = f'"{sep[1:-1]}"' + match_str = f"""'.*[ct]sv' + sep: {sep} + # "header": [int] + # "names": [str] + # "index_col": [int] + # "usecols": [int] + # "true_values": [str] + # "false_values": [str] + # "na_values": [str] + # "skiprows": [int] + # "nrows": [int] + # "keep_default_na": [bool] + """ + cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter", - "match": ".*\\[ct]sv"}) + "match": match_str}) + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) ) diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index 
eda9f3c2bc98c8d2561f152f9f6ddd422daee00a..ce08bc6bc05caa84f342cdc25f3243c5bab0b79c 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -20,9 +20,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Definition of default transformer functions. + +See https://docs.indiscale.com/caosdb-crawler/converters.html#transform-functions for more +information. + """ -Defnition of default transformer functions. -""" + +import datetime import re from typing import Any @@ -61,3 +66,36 @@ def replace(in_value: Any, in_parameters: dict): if not isinstance(in_value, str): raise RuntimeError("must be string") return in_value.replace(in_parameters['remove'], in_parameters['insert']) + + +def date_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. + +Parameters +========== + +- date_format: str, optional + A format string using the ``datetime`` specificaton: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%d" + fmt = params.get("date_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str + + +def datetime_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. 
+ + +Parameters +========== + +- datetime_format: str, optional + A format string using the ``datetime`` specificaton: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes + """ + fmt_default = "%Y-%m-%dT%H:%M:%S" + fmt = params.get("datetime_format", fmt_default) + dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default) + return dt_str diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md index a75193783d861707adf3b3d45311c392e22626f4..32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b 100644 --- a/src/doc/README_SETUP.md +++ b/src/doc/README_SETUP.md @@ -13,7 +13,10 @@ see INSTALL.md We use sphinx to create the documentation. Docstrings in the code should comply with the Googly style (see link below). -Build documentation in `src/doc` with `make html`. +Build documentation in `src/doc` with `make doc`. Note that for the +automatic generation of the complete API documentation, it is +necessary to first install this library with all its optional +dependencies, i.e., `pip install .[h5-crawler,spss]`. ### Requirements ### diff --git a/src/doc/conf.py b/src/doc/conf.py index 3248726ed63dd80fdee7c06da3c27caace93f22c..fc4ca0bea1742c99375c2a30bad42924180f7507 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.7.2' +version = '0.8.1' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.7.2-dev' +release = '0.8.1-dev' # -- General configuration --------------------------------------------------- diff --git a/src/doc/converters.rst b/src/doc/converters.rst index d7e11c235fafa1e42f53342a24255ceb0d275ed4..f59e6d3dff0a1f75dc4e0e5bcbbee0b4ceb7e81d 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -31,20 +31,20 @@ The yaml definition may look like this: .. 
code-block:: yaml <NodeName>: - type: <ConverterName> - match: ".*" - records: - Experiment1: - parents: - - Experiment - - Blablabla - date: $DATUM - (...) - Experiment2: - parents: - - Experiment - subtree: - (...) + type: <ConverterName> + match: ".*" + records: + Experiment1: + parents: + - Experiment + - Blablabla + date: $DATUM + (...) + Experiment2: + parents: + - Experiment + subtree: + (...) The **<NodeName>** is a description of what the current block represents (e.g. ``experiment-folder``) and is used as an identifier. @@ -76,35 +76,35 @@ applied to the respective variables when the converter is executed. .. code-block:: yaml <NodeName>: - type: <ConverterName> - match: ".*" - transform: - <TransformNodeName>: - in: $<in_var_name> - out: $<out_var_name> - functions: - - <func_name>: # name of the function to be applied - <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters - <func_arg2>: <func_arg2_value> - # ... + type: <ConverterName> + match: ".*" + transform: + <TransformNodeName>: + in: $<in_var_name> + out: $<out_var_name> + functions: + - <func_name>: # name of the function to be applied + <func_arg1>: <func_arg1_value> # key value pairs that are passed as parameters + <func_arg2>: <func_arg2_value> + # ... An example that splits the variable ``a`` and puts the generated list in ``b`` is the following: .. code-block:: yaml Experiment: - type: Dict - match: ".*" - transform: - param_split: - in: $a - out: $b - functions: - - split: # split is a function that is defined by default - marker: "|" # its only parameter is the marker that is used to split the string - records: - Report: - tags: $b + type: Dict + match: ".*" + transform: + param_split: + in: $a + out: $b + functions: + - split: # split is a function that is defined by default + marker: "|" # its only parameter is the marker that is used to split the string + records: + Report: + tags: $b This splits the string in '$a' and stores the resulting list in '$b'. 
This is here used to add a list valued property to the Report Record. @@ -218,21 +218,21 @@ Example: type: CSVTableConverter match: ^test_table.csv$ records: - (...) # Records edited for the whole table file + (...) # Records edited for the whole table file subtree: - ROW: # Any name for a data row in the table - type: DictElement - match_name: .* - match_value: .* - records: - (...) # Records edited for each row - subtree: - COLUMN: # Any name for a specific type of column in the table - type: FloatElement - match_name: measurement # Name of the column in the table file - match_value: (?P<column_value).*) - records: - (...) # Records edited for each cell + ROW: # Any name for a data row in the table + type: DictElement + match_name: .* + match_value: .* + records: + (...) # Records edited for each row + subtree: + COLUMN: # Any name for a specific type of column in the table + type: FloatElement + match_name: measurement # Name of the column in the table file + match_value: (?P<column_value>.*) + records: + (...) # Records edited for each cell XLSXTableConverter @@ -245,6 +245,140 @@ CSVTableConverter CSV File → DictElement +PropertiesFromDictConverter +=========================== + +The :py:class:`~caoscrawler.converters.PropertiesFromDictConverter` is +a specialization of the +:py:class:`~caoscrawler.converters.DictElementConverter` and offers +all its functionality. It is meant to operate on dictionaries (e.g., +from reading in a json or a table file), the keys of which correspond +closely to properties in a LinkAhead datamodel. This is especially +handy in cases where properties may be added to the data model and +data sources that are not yet known when writing the cfood definition. + +The converter definition of the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has an +additional required entry ``record_from_dict`` which specifies the +Record to which the properties extracted from the dict are +attached. 
This Record is identified by its ``variable_name`` by which it can +be referred to further down the subtree. You can also use the name of +a Record that was specified earlier in the CFood definition in order +to extend it by the properties extracted from a dict. Let's have a +look at a simple example. A CFood definition + +.. code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + +applied to a dictionary + +.. code-block:: json + + { + "name": "New name", + "a": 5, + "b": ["a", "b", "c"], + "author": { + "full_name": "Silvia Scientist" + } + } + +will create a Record ``New name`` with parents ``MyType1`` and +``MyType2``. It has a scalar property ``a`` with value 5, a list +property ``b`` with values "a", "b" and "c", and an ``author`` +property which references an ``author`` with a ``full_name`` property +with value "Silvia Scientist": + +.. image:: img/properties-from-dict-records-author.png + :height: 210 + +Note how the different dictionary keys are handled differently +depending on their types: scalar and list values are understood +automatically, and a dictionary-valued entry like ``author`` is +translated into a reference to an ``author`` Record automatically. + +You can further specify how references are treated with an optional +``references`` key in ``record_from_dict``. Let's assume that in the +above example, we have an ``author`` **Property** with datatype +``Person`` in our data model. We could add this information by +extending the above example definition by + + +.. 
code-block:: yaml + + PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person + +so that now, a ``Person`` record with a ``full_name`` property with +value "Silvia Scientist" is created as the value of the ``author`` +property: + +.. image:: img/properties-from-dict-records-person.png + :height: 200 + +For the time being, only the parents of the referenced record can be +set via this option. More complicated treatments can be implemented +via the ``referenced_record_callback`` (see below). + +Properties can be blacklisted with the ``properties_blacklist`` +keyword, i.e., all keys listed under ``properties_blacklist`` will be +excluded from automated treatment. Since the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` has +all the functionality of the +:py:class:`~caoscrawler.converters.DictElementConverter`, individual +properties can still be used in a subtree. Together with +``properties_blacklist`` this can be used to add custom treatment to +specific properties by blacklisting them in ``record_from_dict`` and +then treating them in the subtree the same as you would do it in the +standard +:py:class:`~caoscrawler.converters.DictElementConverter`. Note that +the blacklisted keys are excluded on **all** levels of the dictionary, +i.e., also when they occur in a referenced entity. + +For further customization, the +:py:class:`~caoscrawler.converters.PropertiesFromDictConverter` can be +used as a basis for :ref:`custom converters<Custom Converters>` which +can make use of its ``referenced_record_callback`` argument. The +``referenced_record_callback`` can be a callable object which takes +exactly a Record as an argument and needs to return that Record after +doing whatever custom treatment is needed. 
Additionally, it is given +the ``RecordStore`` and the ``ValueStore`` in order to be able to +access the records and values that have already been defined from +within ``referenced_record_callback``. Such a function might look like the +following: + +.. code-block:: python + + def my_callback(rec: db.Record, records: RecordStore, values: GeneralStore): + # do something with rec, possibly using other records or values from the stores... + rec.description = "This was updated in a callback" + return rec + +It is applied to all Records that are created from the dictionary and +it can be used to, e.g., transform values of some properties, or add +special treatment to all Records of a specific +type. ``referenced_record_callback`` is applied **after** the +properties from the dictionary have been applied as explained above. + + Further converters ++++++++++++++++++ @@ -293,7 +427,7 @@ datamodel like H5Ndarray: obligatory_properties: internal_hdf5-path: - datatype: TEXT + datatype: TEXT although the names of both property and record type can be configured within the cfood definition. @@ -407,11 +541,11 @@ First we will create our package and module structure, which might be: tox.ini src/ scifolder/ - __init__.py - converters/ - __init__.py - sources.py # <- the actual file containing - # the converter class + __init__.py + converters/ + __init__.py + sources.py # <- the actual file containing + # the converter class doc/ unittests/ @@ -436,74 +570,74 @@ that would be given using a yaml definition (see next section below). """ def __init__(self, definition: dict, name: str, - converter_registry: dict): - """ - Initialize a new directory converter. 
+ """ + super().__init__(definition, name, converter_registry) def create_children(self, generalStore: GeneralStore, - element: StructureElement): + element: StructureElement): - # The source resolver does not create children: + # The source resolver does not create children: - return [] + return [] def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix): - if not isinstance(element, TextElement): - raise RuntimeError() - - # This function must return a list containing tuples, each one for a modified - # property: (name_of_entity, name_of_property) - keys_modified = [] - - # This is the name of the entity where the source is going to be attached: - attach_to_scientific_activity = self.definition["scientific_activity"] - rec = records[attach_to_scientific_activity] - - # The "source" is a path to a source project, so it should have the form: - # /<Category>/<project>/<scientific_activity>/ - # obtain these information from the structure element: - val = element.value - regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' - '/(?P<project_date>.*?)_(?P<project_identifier>.*)' - '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') - - res = re.match(regexp, val) - if res is None: - raise RuntimeError("Source cannot be parsed correctly.") - - # Mapping of categories on the file system to corresponding record types in CaosDB: - cat_map = { - "SimulationData": "Simulation", - "ExperimentalData": "Experiment", - "DataAnalysis": "DataAnalysis"} - linkrt = cat_map[res.group("category")] - - keys_modified.extend(create_records(values, records, { - "Project": { - "date": res.group("project_date"), - "identifier": res.group("project_identifier"), - }, - linkrt: { - "date": res.group("date"), - "identifier": res.group("identifier"), - "project": "$Project" - }, - attach_to_scientific_activity: { - "sources": "+$" + linkrt - }}, file_path_prefix)) - - # Process the records 
section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - - # The create_records function must return the modified keys to make it compatible - # to the crawler functions: - return keys_modified + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + 
super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified If the recommended (python) package structure is used, the package containing the converter @@ -530,8 +664,8 @@ function signature: .. code-block:: python def create_records(values: GeneralStore, # <- pass the current variables store here - records: RecordStore, # <- pass the current store of CaosDB records here - def_records: dict): # <- This is the actual definition of new records! + records: RecordStore, # <- pass the current store of CaosDB records here + def_records: dict): # <- This is the actual definition of new records! `def_records` is the actual definition of new records according to the yaml cfood specification @@ -547,7 +681,7 @@ Let's have a look at a few examples: match: (?P<dir_name>.*) records: Experiment: - identifier: $dir_name + identifier: $dir_name This block will just create a new record with parent `Experiment` and one property `identifier` with a value derived from the matching regular expression. @@ -565,7 +699,7 @@ Let's formulate that using `create_records`: } keys_modified = create_records(values, records, - record_def) + record_def) The `dir_name` is set explicitely here, everything else is identical to the yaml statements. @@ -588,9 +722,9 @@ So, a sketch of a typical implementation within a custom converter could look li .. 
code-block:: python def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix: str): + records: RecordStore, + element: StructureElement, + file_path_prefix: str): # Modify some records: record_def = { @@ -598,15 +732,15 @@ So, a sketch of a typical implementation within a custom converter could look li } keys_modified = create_records(values, records, - record_def) + record_def) # You can of course do it multiple times: keys_modified.extend(create_records(values, records, - record_def)) + record_def)) # You can also process the records section of the yaml definition: keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) + super().create_records(values, records, element, file_path_prefix)) # This essentially allows users of your converter to customize the creation of records # by providing a custom "records" section additionally to the modifications provided # in this implementation of the Converter. @@ -627,12 +761,12 @@ Let's have a look at a more complex examples, defining multiple records: match: (?P<dir_name>.*) records: Project: - identifier: project_name + identifier: project_name Experiment: - identifier: $dir_name - Project: $Project + identifier: $dir_name + Project: $Project ProjectGroup: - projects: +$Project + projects: +$Project This block will create two new Records: @@ -665,7 +799,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here) } keys_modified = create_records(values, records, - record_def) + record_def) Debugging ========= @@ -681,7 +815,7 @@ output for the match step. 
The following snippet illustrates this: debug_match: True records: Project: - identifier: project_name + identifier: project_name Whenever this Converter tries to match a StructureElement, it logs what was tried to macht against diff --git a/src/doc/getting_started/helloworld.md b/src/doc/getting_started/helloworld.md index 723fb88d08047350d9f4bc3d3d2bd84ec9b27efb..67fdf88974391ac6209f1010bfb4f2d883e51021 100644 --- a/src/doc/getting_started/helloworld.md +++ b/src/doc/getting_started/helloworld.md @@ -33,7 +33,7 @@ Then you can do the following interactively in (I)Python. But we recommend that copy the code into a script and execute it to spare yourself typing. ```python -import caosdb as db +import linkahead as db from datetime import datetime from caoscrawler import Crawler, SecurityMode from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter diff --git a/src/doc/getting_started/optionalfeatures.rst b/src/doc/getting_started/optionalfeatures.rst index d326d7fce6f77a0278c9f2d05a641888203a2089..7b77646501d677b7a99799b97fae752107b11d6f 100644 --- a/src/doc/getting_started/optionalfeatures.rst +++ b/src/doc/getting_started/optionalfeatures.rst @@ -30,6 +30,13 @@ to decide what tool is used for sending mails (use the upper one if you want to actually send mails. See ``sendmail`` configuration in the LinkAhead docs. +You can even supply the name of a custom CSS file that shall be used: + +.. 
code:: ini + + [advancedtools] + crawler.customcssfile = theme-research.css + Crawler Status Records ---------------------- diff --git a/src/doc/img/properties-from-dict-records-author.png b/src/doc/img/properties-from-dict-records-author.png new file mode 100644 index 0000000000000000000000000000000000000000..20ee9497ab5ae577c3d515f11da6294c88601fed Binary files /dev/null and b/src/doc/img/properties-from-dict-records-author.png differ diff --git a/src/doc/img/properties-from-dict-records-person.png b/src/doc/img/properties-from-dict-records-person.png new file mode 100644 index 0000000000000000000000000000000000000000..8b026056a42ff3ba203c6077a426640c864b24c1 Binary files /dev/null and b/src/doc/img/properties-from-dict-records-person.png differ diff --git a/unittests/broken_cfoods/broken_record_from_dict.yml b/unittests/broken_cfoods/broken_record_from_dict.yml new file mode 100644 index 0000000000000000000000000000000000000000..fd8ffdbd29f6ad7b8b38fc17eb43686f4170dbcb --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict.yml @@ -0,0 +1,7 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/broken_cfoods/broken_record_from_dict_2.yml b/unittests/broken_cfoods/broken_record_from_dict_2.yml new file mode 100644 index 0000000000000000000000000000000000000000..ca321373c6c4d6bcc8c104c8c4b3c7147bf71375 --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict_2.yml @@ -0,0 +1,11 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + record_from_dict: + parents: + - MyType1 + - MyType2 + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/example_cfood.yml b/unittests/example_cfood.yml index 713bd4be0f3c816e1e8c8b7a057b30a4b400f13c..798e540fa25e49bf610ea21653db41a0bddc4d5f 100644 --- a/unittests/example_cfood.yml +++ b/unittests/example_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - 
crawler-version: 0.3.1 + crawler-version: 0.7.2 --- Definitions: type: Definitions diff --git a/unittests/h5_cfood.yml b/unittests/h5_cfood.yml index f688de6a2171da6533626449b030bcd95a43b37b..4b95a0a31bc43a902eb63dc3aa09b805fc28c2aa 100644 --- a/unittests/h5_cfood.yml +++ b/unittests/h5_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.6.1 + crawler-version: 0.7.2 --- Converters: H5Dataset: diff --git a/unittests/record_from_dict_cfood.yml b/unittests/record_from_dict_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..1ea2159df9d63256d9a0b2e293d82a9ad694608f --- /dev/null +++ b/unittests/record_from_dict_cfood.yml @@ -0,0 +1,12 @@ +PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 9d6e8cf3ea325ad14641530f2e6cafd43f0dc1bb..ca5fa589b5903e0c0d8ef3dcb2528ea79e0f8cee 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -4,7 +4,7 @@ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.7.2 --- Definitions: type: Definitions diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 2f62ef9216974bc4939667c0cb28971044c1f80c..e12302514d16f077882e41d6ff5995953f2228f8 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -29,26 +29,32 @@ import importlib import json import logging import os +import pytest import sys +import yaml + from itertools import product from pathlib import Path -import pytest -import yaml +import linkahead as db + from caoscrawler.converters import (Converter, ConverterValidationError, DateElementConverter, DictElementConverter, DictIntegerElementConverter, DirectoryConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, - ListElementConverter, - MarkdownFileConverter, YAMLFileConverter, 
+ ListElementConverter, MarkdownFileConverter, + PropertiesFromDictConverter, + YAMLFileConverter, _AbstractScalarValueElementConverter, handle_value, replace_variables) from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, - create_transformer_registry, load_definition) -from caoscrawler.stores import GeneralStore + create_transformer_registry, + load_definition, + scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, IntegerElement, ListElement, @@ -73,6 +79,10 @@ def converter_registry(): "DictElement": { "converter": "DictElementConverter", "package": "caoscrawler.converters"}, + "PropertiesFromDictElement": { + "converter": "PropertiesFromDictConverter", + "package": "caoscrawler.converters" + }, "TextElement": { "converter": "TextElementConverter", "package": "caoscrawler.converters"}, @@ -497,7 +507,7 @@ MyElement: two_doc_yaml = """ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.7.2 Converters: MyNewType: converter: MyNewTypeConverter @@ -633,7 +643,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! - assert len(converter_registry) == 23 + assert len(converter_registry) == 25 # All of them are contained in caoscrawler.converters for conv_key, conv in converter_registry.items(): @@ -660,3 +670,342 @@ def test_create_path_value(converter_registry): dc.create_values(values, Directory("a", "/a")) assert "Test.path" in values assert values["Test.path"] == "/a" + + +def test_properties_from_dict_basic(converter_registry): + """Test that a record with the correct name and properties is created, and + that the children are still created correctly. 
+ + """ + # definitions with blacklist and named references + pfdc = PropertiesFromDictConverter( + definition={ + "type": "PropertiesFromDictElement", + "match": ".*", + "record_from_dict": { + "variable_name": "MyRec", + "parents": ["DictRT1", "DictRT2"], + "properties_blacklist": ["blacklisted_int", "blacklisted_ref"], + "references": { + "authors": { + "parents": ["Person"] + } + } + } + }, + name="Test", converter_registry=converter_registry) + # Tests for Dict with scalars, dict with lists, dict with reference, + # dict with list of references, dict with reference with reference, named + # reference + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "a": 5, + "b": ["a", "b", "c"], + "scalar_ref": { + "name": "Scalar Ref", + "a": 23, + "blacklisted_int": 42 + }, + "list_ref": [ + { + "c": True + }, + { + "c": False + } + ], + "ref_with_ref": { + "a": 789, + "ref_in_ref": { + "b": "something" + } + }, + "blacklisted_int": -123, + "blacklisted_ref": { + "a": 25 + }, + "authors": { + "full_name": "Some Author" + } + }) + pfdc.create_records(values=values, records=records, element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 2 + assert "DictRT1" in [par.name for par in my_rec.parents] + assert "DictRT2" in [par.name for par in my_rec.parents] + + # scalar prop + assert my_rec.get_property("a") is not None + assert my_rec.get_property("a").value == 5 + + # list prop + assert my_rec.get_property("b") is not None + assert len(my_rec.get_property("b").value) == 3 + for elt in ["a", "b", "c"]: + assert elt in my_rec.get_property("b").value + + # scalar ref + assert my_rec.get_property("scalar_ref") is not None + referenced = my_rec.get_property("scalar_ref").value + assert isinstance(referenced, db.Record) + assert referenced.name == "Scalar Ref" + assert len(referenced.parents) == 1 + assert "scalar_ref" in 
[par.name for par in referenced.parents] + assert referenced.get_property("a") is not None + assert referenced.get_property("a").value == 23 + # blacklisted + assert referenced.get_property("blacklisted_int") is None + + # list of ref + assert my_rec.get_property("list_ref") is not None + assert isinstance(my_rec.get_property("list_ref").value, list) + assert len(my_rec.get_property("list_ref").value) == 2 + for rec in my_rec.get_property("list_ref").value: + assert isinstance(rec, db.Record) + assert len(rec.parents) == 1 + assert "list_ref" in [par.name for par in rec.parents] + assert rec.get_property("c") is not None + assert type(rec.get_property("c").value) is bool + assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] + assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value] + + # ref with ref + assert my_rec.get_property("ref_with_ref") is not None + outer_rec = my_rec.get_property("ref_with_ref").value + assert isinstance(outer_rec, db.Record) + assert len(outer_rec.parents) == 1 + assert "ref_with_ref" in [par.name for par in outer_rec.parents] + assert outer_rec.get_property("a") is not None + assert outer_rec.get_property("a").value == 789 + assert outer_rec.get_property("ref_in_ref") is not None + inner_rec = outer_rec.get_property("ref_in_ref").value + assert isinstance(inner_rec, db.Record) + assert len(inner_rec.parents) == 1 + assert "ref_in_ref" in [par.name for par in inner_rec.parents] + assert inner_rec.get_property("b") is not None + assert inner_rec.get_property("b").value == "something" + + # blacklisted + assert my_rec.get_property("blacklisted_int") is None + assert my_rec.get_property("blacklisted_ref") is None + + # named reference property + assert my_rec.get_property("authors") is not None + author_rec = my_rec.get_property("authors").value + assert isinstance(author_rec, db.Record) + assert len(author_rec.parents) == 1 + assert "Person" in [par.name for 
par in author_rec.parents] + assert author_rec.get_property("full_name") is not None + assert author_rec.get_property("full_name").value == "Some Author" + + +def test_properties_from_dict_callable(converter_registry): + + def convert_some_values(rec: db.Record, records: RecordStore, values: GeneralStore): + """Add an URL prefix to a property value if applicable.""" + + if rec.get_property("url") is not None: + + old_val = rec.get_property("url").value + if not (old_val is None or old_val.startswith("http")): + + # only add if there is a value that doesn't look like an URL + rec.get_property("url").value = f"https://test.com/{old_val}" + + return rec + + pdfc = PropertiesFromDictConverter( + definition={ + "record_from_dict": { + "variable_name": "MyRec", + "name": "My New Record" + } + }, + name="TestConverter", + converter_registry=converter_registry, + referenced_record_callback=convert_some_values + ) + + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "url": "something", + "referenced1": { + "url": "referenced" + }, + "referenced2": { + "nourl": "something else", + "url": "https://indiscale.com" + } + }) + pdfc.create_records(values=values, records=records, element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 1 + assert "MyRec" in [par.name for par in my_rec.parents] + assert my_rec.name == "My New Record" + + # simple conversion + assert my_rec.get_property("url") is not None + assert my_rec.get_property("url").value == "https://test.com/something" + + # also works in referenced + assert my_rec.get_property("referenced1") is not None + referenced1 = my_rec.get_property("referenced1").value + assert isinstance(referenced1, db.Record) + assert referenced1.get_property("url") is not None + assert referenced1.get_property("url").value == "https://test.com/referenced" + + # ... 
and works as expected + assert my_rec.get_property("referenced2") is not None + referenced2 = my_rec.get_property("referenced2").value + assert isinstance(referenced2, db.Record) + assert referenced2.get_property("nourl") is not None + assert referenced2.get_property("nourl").value == "something else" + assert referenced2.get_property("url") is not None + assert referenced2.get_property("url").value == "https://indiscale.com" + + +def test_properties_from_dict_nested(converter_registry): + """Test the PropertiesFromDictConverter with a nested dict, + together with the regular DictElementConverter and Records created + and used on different subtree levels. + + """ + root_dict_element = DictElement("RootDict", { + "TopLevelRec": "MyRec", + "propertiesDict": { + "a": 5, + "blacklisted": { + "bl_name": "BlackList", + "date": "2023-12-31" + } + }, + "otherDict": { + "additional_from_other": "other" + } + }) + def_dict = { + "RootElt": { + # Root dictionary + "type": "DictElement", + "match": ".*", + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "parents": ["MyType"] + } + }, + "subtree": { + # Top-level text element for the Record name + "NameElt": { + "type": "TextElement", + "match_name": "^TopLevelRec$", + "match_value": "(?P<name>.*)", + "records": { + "MyRec": { + "name": "$name" + } + } + }, + "PFDElement": { + "type": "PropertiesFromDictElement", + "match_name": "^propertiesDict$", + "record_from_dict": { + "variable_name": "MyRec", + "properties_blacklist": ["blacklisted"] + }, + "subtree": { + "BLElement": { + "type": "DictElement", + "match_name": "^blacklisted$", + "records": { + "BLRec": { + "parents": ["BlackListedType"], + "MyRec": "$MyRec" + } + }, + "subtree": { + "BLNameElt": { + "type": "TextElement", + "match_name": "^bl_name$", + "match_value": "(?P<name>.*)", + "records": { + "BLRec": { + "name": "$name" + } + } + }, + "BLDateElt": { + "type": "TextElement", + "match_name": "^date$", + "match_value": "(?P<date>.*)", + 
"records": { + "BLRec": { + "creation_date": "$date" + } + } + } + } + } + } + }, + # Other dict which uses the DictElementConverter + "OtherDictElement": { + "type": "DictElement", + "match_name": "^otherDict$", + "subtree": { + "additionalElt": { + "type": "TextElement", + "match_name": "^additional_from_other$", + "match_value": "(?P<val>.*)", + "records": { + "MyRec": { + "additional_from_other": "$val" + } + } + } + } + } + } + } + } + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + # All records need to be there + assert len(records) == 2 + myrec = None + blrec = None + for rec in records: + if rec.name == "MyRec": + myrec = rec + elif rec.name == "BlackList": + blrec = rec + assert myrec is not None + assert blrec is not None + + # Parent is set from top level + assert len(myrec.parents) == 1 + assert "MyType" in [par.name for par in myrec.parents] + + # Set automatically, with blacklist + assert myrec.get_property("a") is not None + assert myrec.get_property("a").value == 5 + assert myrec.get_property("blacklisted") is None + + # Now check blacklisted record from subtree + assert len(blrec.parents) == 1 + assert "BlackListedType" in [par.name for par in blrec.parents] + assert blrec.get_property("MyRec") is not None + assert blrec.get_property("MyRec").value == myrec + assert blrec.get_property("creation_date") is not None + assert blrec.get_property("creation_date").value == "2023-12-31" + + # The "old" DictConverter should have added the additional property: + assert myrec.get_property("additional_from_other") is not None + assert myrec.get_property("additional_from_other").value == "other" diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index e7a03e3322da0d937bf3c1330f21b90768b478d8..0a6aee44a1892f1c950a80b936adf184616fd612 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -173,7 +173,15 @@ A: model.get_deep("A").id = 2 return result + [model.get_deep("B")] 
print(query_string) - raise NotImplementedError("Mock for this case is missing") + raise NotImplementedError(f"Mock for this case is missing: {query_string}") + + +def mock_cached_only_rt_allow_empty(query_string: str): + try: + result = mock_cached_only_rt(query_string) + except NotImplementedError: + result = db.Container() + return result @pytest.fixture(autouse=True) diff --git a/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee572a190bd7f439f638ef7c9a5d94a831aca81 --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml @@ -0,0 +1,4 @@ +Experiment: + date: + - 1 + - 2 diff --git a/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a33c4ace9f8709a9b4a77c5fd8f38514acbe1e9c --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml @@ -0,0 +1,3 @@ +Experiment: +- date +- 23 diff --git a/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a504eab748d4891c3e1088ee785afcf6347fbbab --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml @@ -0,0 +1,5 @@ +Experiment: +- date +Event: +- is_referenced_by: Experiment +- event_id diff --git a/unittests/test_directories/examples_json/invalidjson.json b/unittests/test_directories/examples_json/invalidjson.json index 9c012bf062264014278fc2df7be6cf33b65c7469..49a00fc6df33fe8d82ec2735e39c400a2342f0bf 100644 --- a/unittests/test_directories/examples_json/invalidjson.json +++ 
b/unittests/test_directories/examples_json/invalidjson.json @@ -1,13 +1,13 @@ { - "projectId": 10002, - "archived": false, - "coordinator": { - "firstname": "Miri", - "lastname": "Mueller", - "email": "miri.mueller@science.de" - }, - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "projectId": 10002, + "archived": false, + "coordinator": { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json index d37ea2defc21d767e4e13ad3b39d6682b3c452ef..29d59780f4824d9c2edbc8fe1da3a6b380def57b 100644 --- a/unittests/test_directories/examples_json/testjson.json +++ b/unittests/test_directories/examples_json/testjson.json @@ -1,22 +1,21 @@ { - "name": "DEMO", - "projectId": 10002, - "archived": false, - "Person": [ - { - "firstname": "Miri", - "lastname": "Mueller", - "other": null, - "email": "miri.mueller@science.de" - }, + "name": "DEMO", + "projectId": 10002, + "archived": false, + "Person": [{ + "firstname": "Miri", + "lastname": "Mueller", + "other": null, + "email": "miri.mueller@science.de" + }, { "firstname": "Mara", "lastname": "Mueller", - "email": "mara.mueller@science.de" + "email": "mara.mueller@science.de" } ], - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 549bc4f42a59765d25446d44fbb845e49ca4d9b9..0f62475b6c61d82feb3e550cf5ab53e91183f80a 100644 --- a/unittests/test_entity_comparison.py +++ 
b/unittests/test_entity_comparison.py @@ -2,7 +2,7 @@ # Tests for entity comparison # A. Schlemmer, 06/2021 -import caosdb as db +import linkahead as db import pytest from pytest import raises diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py index 2f7fae5d8d32bb7e5c90a535b63158c33df55daa..7f244e2cbdccb0d4eee6a62f59e9cea5684295a6 100644 --- a/unittests/test_h5_converter.py +++ b/unittests/test_h5_converter.py @@ -23,7 +23,7 @@ from functools import partial from pathlib import Path from pytest import fixture, importorskip -import caosdb as db +import linkahead as db from caoscrawler.debug_tree import DebugTree from caoscrawler.hdf5_converter import (convert_basic_element_with_nd_array, diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 074c3843e351b20d17813a661974fdc59ca0442a..d94d852583523a3b3f29f002eaacb9ae0b616c4f 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,7 +24,7 @@ test identifiable module """ -import caosdb as db +import linkahead as db import pytest from caoscrawler.identifiable import Identifiable from caoscrawler.sync_node import SyncNode diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index e37c1ad4953880f988bb1efc3f6804766805b4ee..53490bc0413a95d960d94186c639dac2c6223b80 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -32,8 +32,10 @@ from datetime import datetime from unittest.mock import MagicMock, Mock, patch from pathlib import Path -import caosdb as db +import linkahead as db import pytest +from caoscrawler.exceptions import (InvalidIdentifiableYAML, + ) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -122,6 +124,23 @@ def test_load_from_yaml_file(): assert project_i.get_property("title") is not None +def test_invalid_yaml(): + ident = 
CaosDBIdentifiableAdapter() + invalid_dir = UNITTESTDIR / "test_data" / "invalid_identifiable" + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_content_no_list.yaml") + assert str(exc.value) == "Identifiable contents must be lists, but this was not: Experiment" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_referenced_no_list.yaml") + assert str(exc.value) == "'is_referenced_by' must be a list. Found in: Event" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_no_str_or_dict.yaml") + assert str(exc.value) == ("Identifiable properties must be str or dict, but this one was not:\n" + " Experiment/23") + + def test_non_default_name(): ident = CaosDBIdentifiableAdapter() identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") @@ -141,8 +160,8 @@ def test_wildcard_ref(): dummy.id = 1 identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() .add_parent(name="Person") - .add_property(name="is_referenced_by", value=["*"])), - + .add_property(name="is_referenced_by", + value=["*"])), [dummy] ) assert identifiable.backrefs[0] == 1 diff --git a/unittests/test_json.py b/unittests/test_json.py index fdb332df60d73dce3356a563e09ae0d02cf845b7..be65a26ea01e11e11968bd927c80513708e73850 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -31,7 +31,7 @@ import os from pytest import raises -import caosdb as db +import linkahead as db from caoscrawler.converters import JSONFileConverter from pathlib import Path diff --git a/unittests/test_macros.py b/unittests/test_macros.py index 53837e920e93f2cc318d62549145a0e8ac757372..020098676407f1f70932559b1a995af9f9644fe9 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -59,7 +59,7 @@ def _temp_file_load(txt: str): def test_macros(register_macros, macro_store_reset): dat = 
yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -85,7 +85,7 @@ testnode: def test_macro_list_replacment(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -112,7 +112,7 @@ testnode: def test_multi_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test_one params: {} @@ -142,7 +142,7 @@ def test_multi_macros_toplevel(register_macros, macro_store_reset): dat_loader = list(yaml.safe_load_all(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_one @@ -171,7 +171,7 @@ def test_load_definition(register_macros, macro_store_reset): txt = """ --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 --- extroot: type: Directory @@ -188,12 +188,13 @@ extroot: cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_one params: {} definition: + type: TextElement replaced1: ok - !defmacro name: test_two @@ -213,6 +214,7 @@ extroot: extroot2: !macro # test top level macro test_one: extroot3: + type: Directory subtree: SimulationData: !macro test_two: @@ -223,38 +225,44 @@ extroot3: assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" -@pytest.mark.xfail def test_replace_arbitrary_objects(register_macros, macro_store_reset): """ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 """ dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: b: 25 + testvar_list_empty: [] testvar_list: - a - $b + testvar_dict_empty: {} testvar_dict: t1: a t2: $b definition: replaced1: $b: ok - c: $testvar_dict - d: $testvar_list + dict_empty: $testvar_dict_empty + dict: $testvar_dict + list_empty: $testvar_list_empty + list: ${testvar_list} testnode: obl: !macro test: """, Loader=yaml.SafeLoader) print(yaml.dump(dat)) - assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a" - assert 
dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25" - assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a" - assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25" + replaced = dat["testnode"]["obl"]["replaced1"] + assert replaced["dict_empty"] == {} + assert replaced["dict"]["t1"] == "a" + assert replaced["dict"]["t2"] == 25 + assert replaced["list_empty"] == [] + assert replaced["list"][0] == "a" + assert replaced["list"][1] == 25 def test_macros_in_macros(register_macros, macro_store_reset): @@ -264,13 +272,14 @@ def test_macros_in_macros(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: one_macro params: a: 25 definition: + type: DictElement macro_sub_$a: b: $a another_param: 3 @@ -278,6 +287,7 @@ metadata: name: test_macrodef params: {} definition: + type: DictElement macro_top: !macro one_macro: - a: 17 @@ -293,11 +303,11 @@ extroot: !macro assert "test_macro" not in cfood["extroot"] assert cfood["extroot"]["macro_top"]["not_macro"]["a"] == 26 d = cfood["extroot"]["macro_top"] - assert d["macro_sub_17"]["b"] == "17" + assert d["macro_sub_17"]["b"] == 17 assert d["macro_sub_17"]["another_param"] == 3 - assert d["macro_sub_25"]["b"] == "25" + assert d["macro_sub_25"]["b"] == 25 assert d["macro_sub_25"]["another_param"] == 3 - assert d["macro_sub_98"]["b"] == "98" + assert d["macro_sub_98"]["b"] == 98 assert d["macro_sub_98"]["another_param"] == 3 @@ -309,7 +319,7 @@ def test_silent_overwrite(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: one_macro @@ -340,12 +350,13 @@ def test_circular_macro_definition(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_one params: {} definition: !macro + type: TextElement test_two: - !defmacro name: 
test_two @@ -361,6 +372,7 @@ metadata: name: test_four params: {} definition: !macro + type: TextElement test_four: --- extroot: !macro @@ -389,7 +401,7 @@ def test_use_macro_twice(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_twice @@ -397,6 +409,7 @@ metadata: macro_name: default_name a: 4 definition: + type: DictElement $macro_name: something: a: $a @@ -410,9 +423,9 @@ extroot: !macro """) for name in ["once", "twice", "default_name"]: assert name in cfood["extroot"] - assert cfood["extroot"]["once"]["something"]["a"] == "4" - assert cfood["extroot"]["twice"]["something"]["a"] == "5" - assert cfood["extroot"]["default_name"]["something"]["a"] == "4" + assert cfood["extroot"]["once"]["something"]["a"] == 4 + assert cfood["extroot"]["twice"]["something"]["a"] == 5 + assert cfood["extroot"]["default_name"]["something"]["a"] == 4 # Code sample to generate the expanded macro: # with open("expanded_test_macro.yaml", "w") as f: # f.write(yaml.dump(cfood)) @@ -423,7 +436,7 @@ def test_documentation_example_2(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: MarkdownFile @@ -461,7 +474,7 @@ def test_documentation_example_1(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: SimulationDatasetFile @@ -510,7 +523,7 @@ def test_def_replacements(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_def_replacements @@ -549,7 +562,7 @@ extroot: !macro def test_list_macro_application(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -573,14 +586,14 @@ testnode: test2: a: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert 
dat["testnode"]["obl"]["expanded_4_test2"]["param"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == 4 def test_variable_in_macro_definition(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -598,7 +611,7 @@ testnode: - a: 2 b: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == 4 diff --git a/unittests/test_parent_cfood.yml b/unittests/test_parent_cfood.yml index b8d0eaf597641d311cb70017dc2bc75c7c3434f3..cd63e81b270117841128a34765a9635a036c52ec 100644 --- a/unittests/test_parent_cfood.yml +++ b/unittests/test_parent_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.6.1 + crawler-version: 0.7.2 --- Definitions: type: Definitions diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index c0ce736fc4bed18f371f1626b6bc451ee103db49..226b5040547f0e003729dba63622edf836552f18 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -31,7 +31,7 @@ from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch -import caosdb as db +import linkahead as db import pytest import yaml from caoscrawler.crawl import Crawler diff --git a/unittests/test_schema.py b/unittests/test_schema.py index 0d5bebce98fbc8c789c1080bcf3919f128bdbf54..ea8549b0b8dfd1f1af35784082a9e46320cfcff4 100644 --- a/unittests/test_schema.py +++ 
b/unittests/test_schema.py @@ -3,7 +3,7 @@ # A. Schlemmer, 06/2021 from importlib_resources import files -import caosdb as db +import linkahead as db from os.path import join, dirname from caoscrawler import Crawler @@ -27,6 +27,13 @@ def rfp(*pathcomponents): def test_schema_validation(): load_definition(rfp("scifolder_cfood.yml")) load_definition(rfp("scifolder_extended.yml")) + load_definition(rfp("record_from_dict_cfood.yml")) with raises(ValidationError, match=".*enum.*"): load_definition(rfp("broken_cfoods", "broken1.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict_2.yml")) diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index a7c1539118a4cd87d8c46bf6e18b07b90a90361a..9015e74be69c60c43ece80a2f742d6e9b7badda6 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -25,10 +25,15 @@ from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest -from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by +from test_crawler import (basic_retrieve_by_name_mock_up, + mock_cached_only_rt_allow_empty, + mock_get_entity_by, + ) from caoscrawler.exceptions import (ImpossibleMergeError, - MissingIdentifyingProperty) + MissingIdentifyingProperty, + MissingRecordType, + ) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value @@ -651,3 +656,30 @@ def test_set_each_scalar_value(): assert a.properties[0].value == 42 _set_each_scalar_value(a, lambda x: x == 42, lambda x: None) assert a.properties[0].value is None + + +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt_allow_empty)) +def 
test_merge_referenced_by(): + """Merging two entities that are referenced by a third entity with nonexistent RecordType. + + See also https://gitlab.com/linkahead/linkahead-crawler/-/issues/95 + """ + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object({ + "RT_A": ["name"], + "RT_B": [{"is_referenced_by": ["RT_A"]}, "my_id"] + }) + crawled_data: list = [] + references: list = [] + for ii in [0, 1]: + rec = db.Record().add_parent("RT_B").add_property("my_id", value=ii) + references.append(rec) + crawled_data.append(rec) + rec_a = db.Record(name="Rec_A").add_parent("RT_A") + rec_a.add_property("my_ref", value=references) + crawled_data.append(rec_a) + + with pytest.raises(MissingRecordType) as mrt: + SyncGraph(crawled_data, ident) + assert str(mrt.value).endswith("Record type could not be found on server: RT_A") diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index 178393d9345bd8a6846b66e362ce4f7edac382ee..3b563fd3179968fd90b1c92b9bc5bf0db9ed0858 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -32,7 +32,7 @@ import os from os.path import basename, dirname, join from pathlib import Path -import caosdb as db +import linkahead as db import pytest from caoscrawler import Crawler from caoscrawler.converters import (Converter, ConverterValidationError, diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py index 02d932d13cc3fad52048b08e2b9fe56f11db2ae7..4ed12751d9052c839aa4db4abd586c419bed1018 100644 --- a/unittests/test_transformers.py +++ b/unittests/test_transformers.py @@ -34,7 +34,7 @@ from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch -import caosdb as db +import linkahead as db import pytest import yaml from caoscrawler.converters import Converter, ListElementConverter @@ -46,6 +46,38 @@ from pytest import raises UNITTESTDIR = Path(__file__).parent +@pytest.fixture +def converter_registry(): + 
converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, + "DictElement": { + "converter": "DictElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "ListElement": { + "converter": "ListElementConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + def test_simple_transformer(): """ Test the correct list of returned records by the scanner using the @@ -82,38 +114,6 @@ def test_simple_transformer(): assert False -@pytest.fixture -def converter_registry(): - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "Date": { - "converter": "DateElementConverter", - "package": "caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - } - - for key, value in converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, 
value["converter"]) - return converter_registry - - def test_apply_replace(converter_registry): cfood_def = {"type": 'ListElement', "match_name": ".*", 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ @@ -146,3 +146,21 @@ def test_apply_replace_from_def(converter_registry): conv.apply_transformers(values, transformer_functions) assert values['b'] == "16:45" + + +def test_empty_functions_list(converter_registry): + cfood_def = {"type": 'ListElement', + "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', + 'functions': []}}} + values = GeneralStore() + values["a"] = "16_45" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16_45" diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index 09f78df661d82970e7264996102eff8881ee19ec..90d144b04a4e1271f74b769759e3f201007af705 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -25,7 +25,7 @@ from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock -import caosdb as db +import linkahead as db import pytest import yaml from caoscrawler import Crawler @@ -35,7 +35,7 @@ from caoscrawler.identifiable_adapters import (IdentifiableAdapter, from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities +from linkahead.apiutils import compare_entities from pytest import raises from utils import dircheckstr as dircheckstr_base