diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 539ac0d4e70bfbde2f630d4254cacc7419105611..1e9763f3496c9dca6cc33e6ba8217a654bed487e 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -1,27 +1,31 @@ -FROM debian:bullseye +FROM debian:bookworm RUN apt-get update && \ apt-get install \ curl \ git \ - openjdk-11-jdk-headless \ + openjdk-17-jdk-headless \ python3-autopep8 \ python3-pip \ python3-pytest \ python3-sphinx \ tox \ -y -RUN pip3 install pylint recommonmark sphinx-rtd-theme +RUN pip3 install --break-system-packages \ + pylint \ + recommonmark \ + sphinx-rtd-theme \ + ; COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . + cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ advanced_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] COPY . /git # Delete .git because it is huge. @@ -30,7 +34,7 @@ RUN rm -r /git/.git # Install pycaosdb.ini for the tests RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini -RUN cd /git/ && pip3 install . +RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss] WORKDIR /git/integrationtests # wait for server, diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 879291320a7a715c10113f850a9f43f9465a7196..8812abacc0ef157c418e8f658a4fa7261bb04743 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -113,32 +113,33 @@ info: script: - *env -unittest_py3.9: +unittest_py3.11: tags: [cached-dind] stage: test image: $CI_REGISTRY_IMAGE script: - - tox + - python3 -c "import sys; assert sys.version.startswith('3.11')" + - tox -unittest_py3.7: +unittest_py3.8: tags: [cached-dind] stage: test - image: python:3.7 + image: python:3.8 script: &python_test_script # install dependencies - pip install pytest pytest-cov # TODO: Use f-branch logic here - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install .[h5-crawler] + - pip install .[h5-crawler,spss] # actual test - caosdb-crawler --help - pytest --cov=caosdb -vv ./unittests -unittest_py3.8: +unittest_py3.9: tags: [cached-dind] stage: test - image: python:3.8 + image: python:3.9 script: *python_test_script unittest_py3.10: @@ -147,12 +148,31 @@ unittest_py3.10: image: python:3.10 script: *python_test_script -unittest_py3.11: +unittest_py3.12: tags: [cached-dind] stage: test - image: python:3.11 + image: python:3.12 script: *python_test_script - + +unittest_py3.13: + allow_failure: true + tags: [cached-dind] + stage: test + image: python:3.13-rc + script: + # TODO: Replace by '*python_test_script' as soon as 3.13 has been officially released. + # TODO Remove the "!" after 3.13 release, which serves as an xfail + - apt update && apt install -y cargo + # install dependencies + - pip install pytest pytest-cov + # TODO: Use f-branch logic here + - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev + - (! 
pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev)
+    - (! pip install .[h5-crawler,spss])
+    # actual test
+    - (! caosdb-crawler --help)
+    - (! pytest --cov=caosdb -vv ./unittests)
+
 inttest:
   tags: [docker]
   services:
@@ -287,7 +307,8 @@ code-style:
       - job: build-testenv
         optional: true
   script:
-      - autopep8 -r --diff --exit-code .
+      - autopep8 --version
+      - autopep8 -r --diff --exit-code .
   allow_failure: true
 
 pylint:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 352311d0910bcf0c7f60183b6f58dd7ffdcb0ed4..995ad6eedf391f2219cbd25fbd7fa12e1f32126a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,18 +9,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added ###
 
+* Support for Python 3.12 and experimental support for 3.13
+* CFood macros now accept complex objects as values, not just strings.
+* More options for the `CSVTableConverter`
+* New converters:
+  * `DatetimeElementConverter`
+  * `SPSSConverter`
+* New scripts:
+  * `spss_to_datamodel`
+  * `csv_to_datamodel`
+* New transformer functions:
+  * `date_parse`
+  * `datetime_parse`
+
 ### Changed ###
 
+* CFood macros no longer render everything into strings.
+* Better internal handling of identifiable/reference resolving and merging of entities. This also
+  includes more understandable output for users.
+* Better handling of missing imports, with nice messages for users.
+* No longer use the configuration of advancedtools to set the To and From email addresses
+
 ### Deprecated ###
 
 ### Removed ###
 
+* Support for Python 3.7
+
 ### Fixed ###
 
+* [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression
+* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors
+
 ### Security ###
 
 ### Documentation ###
 
+* General improvement of the documentation, in many small places.
+* The API documentation should now also include documentation of the constructors.
+
 ## [0.7.1] - 2024-03-21 ##
 
 ### Fixed ###
@@ -68,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for
   entities that don't exist remotely) instead of entity objects for keeping
   track of references.
+* Log output is either written to $SHARED_DIR/ (when this variable is set) or just to the terminal.
 
 ### Deprecated ###
 
@@ -161,6 +189,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - ``add_prefix`` and ``remove_prefix`` arguments for the command line interface
   and the ``crawler_main`` function for the adding/removal of path prefixes when
   creating file entities.
+- More strict checking of `identifiables.yaml`.
+- Better error messages when the server does not conform to the expected data model.
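The new SPSS tooling listed above ships both as a converter and as standalone helpers. As an illustration (not part of the patch; the file names are hypothetical), the helpers introduced later in this diff in `src/caoscrawler/conv_impl/spss.py` could be used like this:

```python
# Usage sketch, assuming caoscrawler is installed with the `spss` extra
# (which pulls in pyreadstat and pandas[spss]).
from caoscrawler.conv_impl.spss import read_column_types, spss_to_yaml

# Map SAV columns to LinkAhead datatypes, e.g. {"weight": "DOUBLE"}.
dtypes = read_column_types("survey.sav")  # hypothetical input file
print(dtypes)

# Generate a datamodel YAML skeleton and, optionally, a cfood skeleton.
spss_to_yaml(savfile="survey.sav", yamlfile="datamodel.yaml", cfood="cfood.yaml")
```

The same functionality is exposed on the command line through the new `spss_to_datamodel` entry point registered in `setup.cfg` below.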
### Changed ### diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index c906a81d86af56669f7c522169bceb3b5fcb3e01..6fd322e5f6425e9bce25b970d6de7d99892762a5 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -32,7 +32,7 @@ import sys from argparse import RawTextHelpFormatter from pathlib import Path -import caosdb as db +import linkahead as db import pytest import yaml from caosadvancedtools.crawler import Crawler as OldCrawler @@ -42,8 +42,8 @@ from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.scanner import scan_directory -from caosdb import EmptyUniqueQueryError -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead import EmptyUniqueQueryError +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 814e82ad75512ec8fe217294e1a9e86c6aa01ab3..76392f3a4ce20d7ed6b6ccc30c79f1ce400001f7 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -16,20 +16,18 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import fixture, mark, raises - import linkahead as db -from linkahead.cached import cache_clear from caosadvancedtools.models.parser import parse_model_from_string - from caoscrawler.crawl import Crawler from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import (create_converter_registry, + scan_structure_elements) from caoscrawler.structure_elements import DictElement - -from caoscrawler.scanner import create_converter_registry, scan_structure_elements - +from linkahead.cached import cache_clear from linkahead.utils.register_tests import clear_database, set_test_key +from pytest import fixture, mark, raises + set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -171,8 +169,9 @@ def test_issue_83(clear_database): name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1]) referencing2 = db.Record(name="Referencing2").add_parent( name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target2]) - referencing3 = db.Record(name="Referencing3").add_parent(name=referencing_type.name).add_property( - name=referenced_type.name, value=[ref_target1, ref_target2]) + referencing3 = db.Record(name="Referencing3").add_parent( + name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1, + ref_target2]) records = db.Container().extend( [ref_target1, ref_target2, referencing1, referencing2, referencing3]) diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index cf38e951b78534806c0ea76ef58051436aa22704..05b0a543deb03eb524d40d6a386876812e6b54e2 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -27,12 +27,12 @@ import os import pytest from subprocess import run -import caosdb as db +import linkahead as db from caosadvancedtools.loadFiles import loadpath -from 
caosdb.cached import cache_clear +from linkahead.cached import cache_clear from caosadvancedtools.models import parser as parser from caoscrawler.crawl import crawler_main -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") diff --git a/setup.cfg b/setup.cfg index 88898530f7b7e049e84b230bdcbd45ff5170fabf..848150363c42776993029c54e777f4ff6ccf72ea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,15 +17,15 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.7 +python_requires = >=3.8 install_requires = caosadvancedtools >= 0.7.0 importlib-resources - importlib_metadata;python_version<'3.8' linkahead > 0.13.2 odfpy #make optional packaging pandas + pyarrow # Will be required by Pandas >= 3.0. pyyaml yaml-header-tools >= 0.2.1 @@ -40,8 +40,12 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = caosdb-crawler = caoscrawler.crawl:main + spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main + csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main [options.extras_require] h5-crawler = h5py >= 3.8 numpy +spss = + pandas[spss] diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 05bad0b54d9098c0b7f165d8295a0faa2966fa32..41b96323b1106d8ce28caadc4a2da012f3dc22ea 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,4 +1,15 @@ +from . import converters, utils +try: + from .conv_impl.spss import SPSSConverter +except ImportError as err: + SPSSConverter: type = utils.MissingImport( + name="SPSSConverter", hint="Try installing with the `spss` extra option.", + err=err) from .crawl import Crawler, SecurityMode from .version import CfoodRequiredVersionError, get_caoscrawler_version __version__ = get_caoscrawler_version() + +# Convenience members ######################################################### +# mypy: disable-error-code="attr-defined" +converters.SPSSConverter = SPSSConverter diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 5a6e1e50345382ca6e5a1e6ef3a8fbeafb806b84..340e5b9dec0e8f05b1c39ec2511196249ec87d31 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -28,9 +28,12 @@ cfood: - Definitions - Dict - Date + - Datetime - JSONFile + - YAMLFile - CSVTableConverter - XLSXTableConverter + - SPSSFile - H5File - H5Dataset - H5Group diff --git a/src/caoscrawler/conv_impl/__init__.py b/src/caoscrawler/conv_impl/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/conv_impl/spss.py b/src/caoscrawler/conv_impl/spss.py new file mode 100644 index 0000000000000000000000000000000000000000..5dfad0ff8be55e2ca3ddf0db3397dbac5fc9f2b0 --- /dev/null +++ b/src/caoscrawler/conv_impl/spss.py @@ -0,0 +1,303 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converter for SAV files (stored by SPSS)."""
+
+from __future__ import annotations  # Can be removed with 3.10.
+
+import argparse
+from collections import OrderedDict
+
+import numpy as np
+import pandas as pd
+import pyreadstat
+import yaml
+
+from .. import converters
+from ..stores import GeneralStore
+from ..structure_elements import (File, StructureElement)
+from typing import Optional, Any
+
+
+READSTAT_TYPES = {
+    "double": "DOUBLE",
+    "string": "TEXT",
+}
+ORIGINAL_TYPES = {
+    "EDATE8": "DATETIME",
+}
+
+
+class SPSSConverter(converters.TableConverter):
+    """Converter for SAV files (stored by SPSS)."""
+
+    def create_children(self, values: GeneralStore, element: StructureElement) -> list:
+        assert isinstance(element, File)
+        # The default dtype backend "numpy_nullable" does not handle dates well.
+        # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
+        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
+        dtypes = read_column_types(element.path)
+
+        # Fix datetime columns
+        for name, dtype in dtypes.items():
+            if dtype != "DATETIME":
+                continue
+            col = df.loc[:, name]
+            col.fillna(np.nan, inplace=True)
+            col.replace([np.nan], [None], inplace=True)
+
+        return self._children_from_dataframe(df)
+
+
+def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
+    """Read a SAV file and return the column types.
+
+Optionally, take data from a previous reading.
+
+Parameters
+----------
+savfile : Optional[str]
+    The SAV file to read.
+
+meta : Optional
+    The metadata result from `pyreadstat.read_sav(...)`.
+
+Returns
+-------
+out : dict[str, str]
+    The column names and types.
+    """
+    if not meta:
+        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    elif savfile is not None:
+        raise ValueError("Only one of `savfile` and `meta` may be given.")
+    dtypes: dict[str, str] = {}
+    for name in meta.column_names:
+        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
+                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
+        dtypes[name] = datatype
+    return dtypes
+
+
+def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
+    """Parse the *.sav file and create a basic datamodel in ``yamlfile``.
+
+Parameters
+----------
+cfood: str
+  If given, also create a cfood skeleton.
+    """
+    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    dtypes = read_column_types(meta=meta)
+
+    cfood_str = """
+---
+metadata:
+  macros:
+  - !defmacro
+    # Simple column value -> property rule
+    name: ColumnValue
+    params:
+      name: null
+      belongsto: BaseElement
+      type: TextElement
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${belongsto}:
+            ${name}: $$val
+  - !defmacro
+    # column value -> reference property
+    name: ColumnValueReference
+    params:
+      name: null
+      reftype: null  # RecordType of the reference
+      belongsto: BaseElement
+      type: TextElement  # References are always text, right?
+ definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${reftype}: + name: $$val + ${belongsto}: + ${name}: $$${reftype} + - !defmacro + # Same as "ColumnValue", but also give name of property. + name: ColumnValuePropname + params: + name: null + propname: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${propname}: $$val +--- +directory: # corresponds to the directory given to the crawler + type: Directory + match: .* # we do not care how it is named here + subtree: + # This is the file + thisfile: + type: SPSSFile + match: ".*sav" + subtree: + entry: + type: Dict + match: .* # Name is irrelevant + records: + MyParent: + subtree: !macro +""" + + enums: dict[str, list[str]] = {} + properties = OrderedDict() + + for name in meta.column_names: + prop = { + "datatype": dtypes[name], + } + desc = meta.column_names_to_labels.get(name) + if desc and desc != name: + prop["description"] = desc + # Handle categorial variables + if var_label := meta.variable_to_label.get(name): + vvl = meta.variable_value_labels[name] + # reproducible (and sensible) order + label_values = [vvl[key] for key in sorted(vvl.keys())] + if label_values not in enums.values(): + enums[var_label] = label_values + else: + var_label = [key for key, value in enums.items() if value == label_values][0] + prop["datatype"] = var_label + properties[name] = prop + + output = f"""# auto-generated data model from file "{savfile}". +# To insert a datamodel into LinkAhead, run: +# +# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync + +""" + + # Actual datamodel + output += """ +######### +# Enums # +######### + +""" + for name, values in enums.items(): + output += f"""{name}: + description: + # possible values: {values}\n""" + + output += (""" +############### +# RecordTypes # +############### + +DummyRT: + description: Note: Change name and enter description. + recommended_properties: + """ + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True))) + + # Experimental: Enum creation + output += """ +############### +# Enum values # +############### +""" + for name, values in enums.items(): + output += f"\n# ### {name} ###\n" + for value in values: + output += f""" +{value}: + role: Record + inherit_from_suggested: + - {name} +""" + + with open(yamlfile, encoding="utf-8", mode="w") as myfile: + myfile.write(output) + + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? 
+ if propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] in ("TEXT", "DATETIME"): + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-i', '--input', help="The *.sav file.", required=True) + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def spss_to_datamodel_main(): + """The main function of this script.""" + args = _parse_arguments() + spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood) + print(f"Written datamodel to: {args.outfile}") + if args.cfood: + print(f"Written cfood to: {args.cfood}") diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 535a14745282016cd55acd4ca3fcf0ceb0ccd7ec..dad11ec902d638b6c9f7d746b2e628a6af9c0c83 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python3 # encoding: utf-8 # -# ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021 Henrik tom Wörden -# 2021 Alexander Schlemmer +# Copyright (C) 2021 Alexander Schlemmer +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -19,9 +19,8 @@ # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# + +"""Converters take structure elements and create Records and new structure elements from them.""" from __future__ import annotations @@ -34,7 +33,7 @@ import warnings from abc import ABCMeta, abstractmethod from inspect import signature from string import Template -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import linkahead as db import pandas as pd @@ -53,12 +52,16 @@ from .utils import has_parent # by the converters: SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") -SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$") +ID_PATTERN = r"\D[.\w]*" +SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>" + ID_PATTERN + r")(\})?$") logger = logging.getLogger(__name__) class CrawlerTemplate(Template): - braceidpattern = r"(?a:[_a-z][_\.a-z0-9]*)" + # This also adds a dot to the default pattern. 
+    # See: https://docs.python.org/3/library/string.html#template-strings
+    # The default flags are re.IGNORECASE
+    braceidpattern = ID_PATTERN
 
 
 def _only_max(children_with_keys):
@@ -134,8 +137,8 @@ def replace_variables(propvalue: Any, values: GeneralStore):
     This function replaces variables in property values (and possibly other locations,
     where the crawler can replace cfood-internal variables).
 
-    If `propvalue` is a single variable name preceeded with a '$' (e.g. '$var' or '${var}'), then
-    the corresponding value stored in `values` is returned.
+    If ``propvalue`` is a single variable name preceded by a ``$`` (e.g. ``$var`` or ``${var}``),
+    then the corresponding value stored in ``values`` is returned.
     In any other case the variable substitution is carried out as defined by string templates
     and a new string with the replaced variables is returned.
     """
@@ -160,16 +163,16 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore):
               add as an additional property (multiproperty).
 
     Variable names (starting with a "$") are replaced by the corresponding value stored in the
-    `values` GeneralStore.
+    ``values`` GeneralStore.
 
 Parameters
 ----------
 
-value:
-  - if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc.
-  - if dict, must have keys "value" and "collection_mode". The returned tuple is directly
+value: Union[dict, str, list]
+  - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" etc.
+  - If *dict*, must have keys ``value`` and ``collection_mode``. The returned tuple is directly
     created from the corresponding values.
-  - if list, each element is checked for replacement and the resulting list will be used
+  - If *list*, each element is checked for replacement and the resulting list will be used
     as (list) value for the property
 
 Returns
 -------
 
 out: tuple
@@ -181,15 +184,15 @@
 
     """
     # @review Florian Spreckelsen 2022-05-13
 
-    if type(value) == dict:
+    if isinstance(value, dict):
         if "value" not in value:
             # TODO: how do we handle this case? Just ignore?
             #       or disallow?
-            raise NotImplementedError()
+            raise NotImplementedError(f"This definition has no \"value\": {value}")
         propvalue = value["value"]
         # can be "single", "list" or "multiproperty"
         collection_mode = value["collection_mode"]
-    elif type(value) == str:
+    elif isinstance(value, str):
         propvalue = value
         collection_mode = "single"
         if propvalue.startswith("+"):
@@ -198,7 +201,7 @@ out: tuple
         elif propvalue.startswith("*"):
             collection_mode = "multiproperty"
             propvalue = propvalue[1:]
-    elif type(value) == list:
+    elif isinstance(value, list):
         # TODO: (for review)
         # This is a bit dirty right now and needed for
         # being able to directly set list values. Semantics is, however, a bit
@@ -209,7 +212,7 @@ out: tuple
         propvalue = list()
         for element in value:
             # Do the element-wise replacement only, when its type is string:
-            if type(element) == str:
+            if isinstance(element, str):
                 propvalue.append(replace_variables(element, values))
             else:
                 propvalue.append(element)
@@ -286,9 +289,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict
                 propvalue = os.path.normpath(propvalue)
                 setattr(c_record, key.lower(), propvalue)
             else:
-                if c_record.get_property(key) is None:
-
                     if collection_mode == "list":
                         c_record.add_property(name=key, value=[propvalue])
                     elif (collection_mode == "multiproperty" or
@@ -322,10 +323,13 @@ class Converter(object, metaclass=ABCMeta):
 
         Parameters
         ----------
-        definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure
-            that the definition dict must have.
- converter_registry: dict, A dictionary that contains converter names as keys and dicts as - values. Those value dicts have the keys 'converter' and 'package'. + definition: dict + Please refer to ``src/doc/converters.rst`` to learn about the structure that the + definition dict must have. + converter_registry: dict + A dictionary that contains converter names as keys and dicts as values. Those value dicts + have the keys 'converter', 'package' and 'class'. 'converter' is the class name, + 'package' the module and 'class' the class instance of converters. """ self.definition = definition @@ -363,7 +367,7 @@ class Converter(object, metaclass=ABCMeta): @staticmethod def converter_factory(definition: dict, name: str, converter_registry: dict): - """creates a Converter instance of the appropriate class. + """Create a Converter instance of the appropriate class. The `type` key in the `definition` defines the Converter class which is being used. """ @@ -424,10 +428,11 @@ class Converter(object, metaclass=ABCMeta): pass """ - if not "transform" in self.definition: + if "transform" not in self.definition: return for transformer_key, transformer in self.definition["transform"].items(): in_value = replace_variables(transformer["in"], values) + out_value = in_value for tr_func_el in transformer["functions"]: if not isinstance(tr_func_el, dict): @@ -460,13 +465,13 @@ class Converter(object, metaclass=ABCMeta): values[match.group('varname')] = out_value @abstractmethod - def create_children(self, values: GeneralStore, - element: StructureElement): + def create_children(self, values: GeneralStore, element: StructureElement): pass def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): # TODO why is element passed but not used??? + # ANSWER: because it might be used by overriding child classes. if "records" not in self.definition: return [] @@ -477,7 +482,7 @@ class Converter(object, metaclass=ABCMeta): self.definition["records"]) def filter_children(self, children_with_strings: - List[Tuple[StructureElement, str]], expr: str, + list[tuple[StructureElement, str]], expr: str, group: str, rule: str): """Filter children according to regexp `expr` and `rule`.""" @@ -515,8 +520,8 @@ class Converter(object, metaclass=ABCMeta): result: Optional[dict]): """ Template for the debugging output for the match function """ msg = "\n--------" + name + "-----------\n" - for re, ma in zip(regexp, matched): - msg += "matching reg:\t" + re + "\n" + for exp, ma in zip(regexp, matched): + msg += "matching reg:\t" + exp + "\n" msg += "matching val:\t" + ma + "\n" msg += "---------\n" if result is None: @@ -620,7 +625,7 @@ class DirectoryConverter(Converter): element: A directory (of type Directory) which will be traversed. """ - children: List[StructureElement] = [] + children: list[StructureElement] = [] for name in sorted(os.listdir(element.path)): path = os.path.join(element.path, name) @@ -660,7 +665,7 @@ class SimpleFileConverter(Converter): class FileConverter(SimpleFileConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use SimpleFileConverter.")) + "This class is deprecated. 
Please use SimpleFileConverter.")) super().__init__(*args, **kwargs) @@ -693,12 +698,12 @@ class MarkdownFileConverter(SimpleFileConverter): "Error during the validation (yaml header cannot be read) of the markdown file " "located at the following node in the data structure:\n" "{}\nError:\n{}".format(path, err)) - children: List[StructureElement] = [] + children: list[StructureElement] = [] for name, entry in header.items(): - if type(entry) == list: + if isinstance(entry, list): children.append(ListElement(name, entry)) - elif type(entry) == str: + elif isinstance(entry, str): children.append(TextElement(name, entry)) else: if generalStore is not None and self.name in generalStore: @@ -713,7 +718,9 @@ class MarkdownFileConverter(SimpleFileConverter): def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, msg_prefix=""): """Convert basic Python objects to the corresponding StructureElements""" - if isinstance(element, list): + if isinstance(element, StructureElement): + return element + elif isinstance(element, list): return ListElement(name, element) elif isinstance(element, dict): return DictElement(name, element) @@ -963,14 +970,14 @@ class PropertiesFromDictConverter(DictElementConverter): class DictConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictElementConverter.")) + "This class is deprecated. Please use DictElementConverter.")) super().__init__(*args, **kwargs) class DictDictElementConverter(DictElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictElementConverter.")) + "This class is deprecated. Please use DictElementConverter.")) super().__init__(*args, **kwargs) @@ -1035,7 +1042,7 @@ out: """ if "match_name" in definition: if "match" in definition: - raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.") + raise RuntimeError("Do not supply both, 'match_name' and 'match'.") m1 = re.match(definition["match_name"], name) if m1 is None: @@ -1158,7 +1165,7 @@ class BooleanElementConverter(_AbstractScalarValueElementConverter): class DictBooleanElementConverter(BooleanElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use BooleanElementConverter.")) + "This class is deprecated. Please use BooleanElementConverter.")) super().__init__(*args, **kwargs) @@ -1174,7 +1181,7 @@ class FloatElementConverter(_AbstractScalarValueElementConverter): class DictFloatElementConverter(FloatElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use FloatElementConverter.")) + "This class is deprecated. Please use FloatElementConverter.")) super().__init__(*args, **kwargs) @@ -1189,7 +1196,7 @@ class TextElementConverter(_AbstractScalarValueElementConverter): def __init__(self, definition, *args, **kwargs): if "match" in definition: raise ValueError(""" -The 'match' key will in future be used to match a potential name of a TextElement. Please use +The 'match' key is used to match a potential name of a TextElement. Please use the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name. 
""") @@ -1199,7 +1206,7 @@ the 'match_value' key to match the value of the TextElement and 'match_name' for class DictTextElementConverter(TextElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use TextElementConverter.")) + "This class is deprecated. Please use TextElementConverter.")) super().__init__(*args, **kwargs) @@ -1215,7 +1222,7 @@ class IntegerElementConverter(_AbstractScalarValueElementConverter): class DictIntegerElementConverter(IntegerElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use IntegerElementConverter.")) + "This class is deprecated. Please use IntegerElementConverter.")) super().__init__(*args, **kwargs) @@ -1225,7 +1232,7 @@ class ListElementConverter(Converter): # TODO: See comment on types and inheritance if not isinstance(element, ListElement): raise RuntimeError( - "This converter can only process DictListElements.") + "This converter can only process ListElements.") children: list[StructureElement] = [] for index, list_element in enumerate(element.value): children.append( @@ -1257,7 +1264,7 @@ class ListElementConverter(Converter): class DictListElementConverter(ListElementConverter): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning( - "This class is depricated. Please use ListElementConverter.")) + "This class is deprecated. Please use ListElementConverter.")) super().__init__(*args, **kwargs) @@ -1271,15 +1278,22 @@ class TableConverter(Converter): The rows can be matched using a DictElementConverter. """ - @abstractmethod - def get_options(self): - """ - This method needs to be overwritten by the specific table converter to provide - information about the possible options. + + def get_options(self) -> dict: + """Get specific options, e.g. from ``self.definitions``. + +This method may to be overwritten by the specific table converter to provide information about the +possible options. Implementors may use ``TableConverter._get_options(...)`` to get (and convert) +options from ``self.definitions``. + +Returns +------- +out: dict + An options dict. 
""" - pass + return {} - def _get_options(self, possible_options): + def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict: option_dict = dict() for opt_name, opt_conversion in possible_options: if opt_name in self.definition: @@ -1307,6 +1321,14 @@ class TableConverter(Converter): return None return m.groupdict() + @staticmethod + def _children_from_dataframe(dataframe: pd.DataFrame): + child_elements = list() + for index, row in dataframe.iterrows(): + child_elements.append( + DictElement(str(index), row.to_dict())) + return child_elements + class XLSXTableConverter(TableConverter): """ @@ -1336,11 +1358,7 @@ class XLSXTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_excel(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class CSVTableConverter(TableConverter): @@ -1365,22 +1383,19 @@ class CSVTableConverter(TableConverter): if not isinstance(element, File): raise RuntimeError("Element must be a File.") table = pd.read_csv(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements + return self._children_from_dataframe(table) class DateElementConverter(TextElementConverter): """allows to convert different text formats of dates to Python date objects. The text to be parsed must be contained in the "date" group. The format string can be supplied - under "dateformat" in the Converter definition. The library used is datetime so see its + under "date_format" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. """ + # TODO make `date` parameter name configurable def match(self, element: StructureElement): matches = super().match(element) if matches is not None and "date" in matches: @@ -1389,3 +1404,24 @@ class DateElementConverter(TextElementConverter): self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" ).date()}) return matches + + +class DatetimeElementConverter(TextElementConverter): + """Convert text so that it is formatted in a way that LinkAhead can understand it. + +The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the +``datetime_format`` node. 
This class uses the ``datetime`` module, so ``datetime_format`` must
+follow this specification:
+https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
+
+    """
+
+    # TODO make `val` parameter name configurable
+    def match(self, element: StructureElement):
+        matches = super().match(element)
+        if matches is not None and "val" in matches:
+            fmt_default = "%Y-%m-%dT%H:%M:%S"
+            fmt = self.definition.get("datetime_format", fmt_default)
+            dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default)
+            matches.update({"val": dt_str})
+        return matches
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index d21e6e2521578dc407e445d8220506677be84e26..0f23acfdfde2a863a66f25901a85748b538f5d04 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -51,26 +51,27 @@ from caosadvancedtools.cache import UpdateCache
 from caosadvancedtools.crawler import Crawler as OldCrawler
 from caosadvancedtools.serverside.helper import send_mail
 from caosadvancedtools.utils import create_entity_link
-from linkahead.apiutils import (EntityMergeConflictError, compare_entities,
+from linkahead.apiutils import (compare_entities,
                                 merge_entities)
 from linkahead.cached import cache_clear, cached_get_entity_by
 from linkahead.common.datatype import get_list_datatype, is_reference
-from linkahead.exceptions import EmptyUniqueQueryError
+from linkahead.exceptions import (
+    TransactionError,
+)
 from linkahead.utils.escape import escape_squoted_text
 
 from .config import get_config_setting
 from .converters import Converter, ConverterValidationError
 from .debug_tree import DebugTree
-from .identifiable import Identifiable
 from .identifiable_adapters import (CaosDBIdentifiableAdapter,
-                                    IdentifiableAdapter,
-                                    LocalStorageIdentifiableAdapter)
+                                    IdentifiableAdapter)
 from .logging import configure_server_side_logging
 from .macros import defmacro_constructor, macro_constructor
 from .scanner import (create_converter_registry, initialize_converters,
                       load_definition, scan_directory,
                       scan_structure_elements)
 from .stores import GeneralStore
 from .structure_elements import StructureElement
+from .sync_graph import SyncGraph
 
 logger = logging.getLogger(__name__)
 
@@ -172,163 +173,12 @@ def _resolve_datatype(prop: db.Property, remote_entity: db.Entity):
     return prop
 
 
-def _treat_merge_error_of(newrecord, record):
-    """
-    The parameters are two entities that cannot be merged with the merge_entities function.
-
-    # This function checks for two obvious cases where no merge will ever be possible:
-    # 1. Two Entities with differing IDs
-    # 2. Two non-Entity values which differ
-
-    It creates a more informative logger message and raises an Exception in those cases.
-    """
-    for this_p in newrecord.properties:
-        that_p = record.get_property(this_p.name)
-
-        if that_p is None:
-            logger.debug(f"Property {this_p.name} does not exist in the second entity. Note that "
-                         "this should not be the reason for the merge conflict.")
-            continue
-
-        if (isinstance(this_p.value, db.Entity)
-                and isinstance(that_p.value, db.Entity)):
-            if this_p.value.id is not None and that_p.value.id is not None:
-                if this_p.value.id != that_p.value.id:
-                    logger.error("The Crawler is trying to merge two entities "
-                                 "because they should be the same object (same"
-                                 " identifiables), but they reference "
-                                 "different Entities with the same Property."
- f"Problematic Property: {this_p.name}\n" - f"Referenced Entities: {this_p.value.id} and " - f"{that_p.value.id}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - elif (not isinstance(this_p.value, db.Entity) - and not isinstance(that_p.value, db.Entity)): - if ((this_p.value != that_p.value) - # TODO can we also compare lists? - and not isinstance(this_p.value, list) - and not isinstance(that_p.value, list)): - logger.error( - "The Crawler is trying to merge two entities because they should be the same " - "object (same identifiables), but they have different values for the same " - "Property.\n" - f"Problematic Property: {this_p.name}\n" - f"Values: {this_p.value} and {that_p.value}\n" - f"{record}\n{newrecord}") - raise RuntimeError("Cannot merge Entities") - - class SecurityMode(Enum): RETRIEVE = 0 INSERT = 1 UPDATE = 2 -class TreatedRecordLookUp(): - """tracks Records and Identifiables for which it was checked whether they exist in the remote - server - - For a given Record it can be checked, whether it exists in the remote sever if - - it has a (valid) ID - - it has a (valid) path (FILEs only) - - an identifiable can be created for the Record. - - Records are added by calling the `add` function and they are then added to the internal - existing or missing list depending on whether the Record has a valid ID. - Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and - the representation of the identifiables. - - The extreme case, that one could imagine, would be that the same Record occurs three times as - different Python objects: one that only has an ID, one with only a path and one without ID and - path but with identifying properties. During `split_into_inserts_and_updates` all three - must be identified with each other (and must be merged). Since we require, that treated - entities have a valid ID if they exist in the remote server, all three objects would be - identified with each other simply using the IDs. - - In the case that the Record is not yet in the remote server, there cannot be a Python object - with an ID. Thus we might have one with a path and one with an identifiable. If that Record - does not yet exist, it is necessary that both Python objects have at least either the path or - the identifiable in common. - """ - - def __init__(self): - self._id_look_up: dict[int, db.Entity] = {} - self._path_look_up: dict[str, db.Entity] = {} - self._identifiable_look_up: dict[str, db.Entity] = {} - self.remote_missing_counter = -1 - self._missing: dict[int, db.Entity] = {} - self._existing: dict[int, db.Entity] = {} - - def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Add a Record that was treated, such that it is contained in the internal look up dicts - - This Record MUST have an ID if it was found in the remote server. - """ - if record.id is None: - if record.path is None and identifiable is None: - raise RuntimeError("Record must have ID or path or an identifiable must be given." - f"Record is\n{record}") - record.id = self.remote_missing_counter - self.remote_missing_counter -= 1 - self._add_any(record, self._missing, identifiable) - else: - self._add_any(record, self._existing, identifiable) - - def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Check whether this Record was already added. 
Identity is based on ID, path or Identifiable - represenation - """ - if record.id is not None and record.id in self._id_look_up: - return self._id_look_up[record.id] - if record.path is not None and record.path in self._path_look_up: - return self._path_look_up[record.path] - if (identifiable is not None and identifiable.get_representation() in - self._identifiable_look_up): - return self._identifiable_look_up[identifiable.get_representation()] - - def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record exists on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._existing: - return rec - else: - return None - - def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record is missing on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._missing: - return rec - else: - return None - - def get_missing_list(self): - """ Return all Records that are missing in the remote server """ - return list(self._missing.values()) - - def get_existing_list(self): - """ Return all Records that exist in the remote server """ - return list(self._existing.values()) - - def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): - if record.id is not None: - self._id_look_up[record.id] = record - if record.path is not None: - self._path_look_up[record.path] = record - if identifiable is not None: - self._identifiable_look_up[identifiable.get_representation()] = record - lookup[id(record)] = record - - class Crawler(object): """ Crawler class that encapsulates crawling functions. @@ -365,14 +215,13 @@ class Crawler(object): # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.treated_records_lookup = TreatedRecordLookUp() # TODO does it make sense to have this as member variable? self.securityMode = securityMode # TODO does it make sense to have this as member variable(run_id)? self.generate_run_id() - self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() + self.identifiableAdapter: IdentifiableAdapter = CaosDBIdentifiableAdapter() if identifiableAdapter is not None: self.identifiableAdapter = identifiableAdapter @@ -449,396 +298,59 @@ class Crawler(object): self.crawled_data = data return data - def _has_reference_value_without_id(self, ident: Identifiable) -> bool: - """ - Returns True if there is at least one value in the properties and backrefs attributes of - ``ident`` which: - - a) is a reference property AND - b) where the value is set to a - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` - (instead of an ID) AND - c) where the ID of the value (the - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` object in b)) - is not set (to an integer) - - Returns - ------- - bool - True if there is a value without id (see above) - - Raises - ------ - ValueError - If no Identifiable is given. 
- """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - if isinstance(pvalue, list): - for el in pvalue: - if isinstance(el, db.Entity) and el.id is None: - return True - elif isinstance(pvalue, db.Entity) and pvalue.id is None: - return True - return False - - @staticmethod - def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): - """ - Recursively adds entities and all their properties contained in ent_list to - the output list flat. - - TODO: This function will be moved to pylib as it is also needed by the - high level API. - """ - # Note: A set would be useful here, but we do not want a random order. - if flat is None: - flat = list() - for el in ent_list: - if el not in flat: - flat.append(el) - for ent in ent_list: - for p in ent.properties: - # For lists append each element that is of type Entity to flat: - if isinstance(p.value, list): - for el in p.value: - if isinstance(el, db.Entity): - if el not in flat: - flat.append(el) - Crawler.create_flat_list([el], flat) - elif isinstance(p.value, db.Entity): - if p.value not in flat: - flat.append(p.value) - Crawler.create_flat_list([p.value], flat) - return flat - - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict): - """ - returns False if any value in the properties attribute is a db.Entity object that - is contained in the `remote_missing_cache`. If ident has such an object in - properties, it means that it references another Entity, where we checked - whether it exists remotely and it was not found. - """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - # Entity instead of ID and not cached locally - if (isinstance(pvalue, list)): - for el in pvalue: - elident = self.identifiableAdapter.get_identifiable( - el, referencing_entities[id(el)]) - if (isinstance(el, db.Entity) - and self.treated_records_lookup.get_missing(el, elident) is not None): - return True - if (isinstance(pvalue, db.Entity) and self.treated_records_lookup.get_missing( - pvalue, - self.identifiableAdapter.get_identifiable(pvalue, - referencing_entities[id(pvalue)]) - ) is not None): - # might be checked when reference is resolved - return True - return False - - def replace_references_with_cached(self, record: db.Record, referencing_entities: dict): - """ - Replace all references with the versions stored in the cache. - - If the cache version is not identical, raise an error. 
- """ - for p in record.properties: - if (isinstance(p.value, list)): - lst = [] - for el in p.value: - if (isinstance(el, db.Entity) and el.id is None): - cached = self.treated_records_lookup.get_any( - el, - self.identifiableAdapter.get_identifiable( - el, referencing_entities[id(el)])) - if cached is None: - lst.append(el) - continue - if not check_identical(cached, el, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - else: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{el}" - ) - lst.append(cached) - else: - lst.append(el) - p.value = lst - if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.treated_records_lookup.get_any( - p.value, self.identifiableAdapter.get_identifiable( - p.value, referencing_entities[id(p.value)])) - if cached is None: - continue - if not check_identical(cached, p.value, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - else: - raise RuntimeError( - "The cached and the referenced entity are not identical.\n" - f"Cached:\n{cached}\nReferenced:\n{p.value}" - ) - p.value = cached - - @staticmethod - def bend_references_to_new_object(old, new, entities): - """ Bend references to the other object - Iterate over all entities in `entities` and check the values of all properties of - occurances of old Entity and replace them with new Entity - """ - for el in entities: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is old: - p.value[index] = new - else: - if p.value is old: - p.value = new + def _split_into_inserts_and_updates(self, st: SyncGraph): + """Classify nodes in the SyncGraph ``st`` with respect to their state on the server. - def _merge_identified(self, newrecord, record, try_to_merge_later, all_records): - """ tries to merge record into newrecord +This method iteratively checks whether those nodes exist on the remote server and creates two lists, +one with the entities that need to be updated and the other with entities to be inserted. - If it fails, record is added to the try_to_merge_later list. - In any case, references are bent to the newrecord object. +.. todo:: - """ - try: - merge_entities( - newrecord, record, merge_references_with_empty_diffs=False, - merge_id_with_resolved_entity=True) - except EntityMergeConflictError: - _treat_merge_error_of(newrecord, record) - # We cannot merge but it is none of the clear case where merge is - # impossible. Thus we try later - try_to_merge_later.append(record) - if newrecord.id is not None: - record.id = newrecord.id - except NotImplementedError: - print(newrecord) - print(record) - raise - Crawler.bend_references_to_new_object( - old=record, new=newrecord, - entities=all_records - ) - - def _identity_relies_on_unchecked_entities(self, record: db.Record, referencing_entities): - """ - If a record for which it could not yet be verified whether it exists in LA or not is part - of the identifying properties, this returns True, otherwise False - """ + Should this be made into a public method of SyncGraph instead? At the moment, this is a + purely static method that only operates on the state of ``st``. 
- registered_identifiable = self.identifiableAdapter.get_registered_identifiable(record) - if registered_identifiable is None: - return False - refs = self.identifiableAdapter.get_identifying_referencing_entities(referencing_entities, - registered_identifiable) - if any(el is None for el in refs): - return True - - refs = self.identifiableAdapter.get_identifying_referenced_entities( - record, registered_identifiable) - if any([self.treated_records_lookup.get_any(el) is None for el in refs]): - return True - - return False - - @staticmethod - def create_reference_mapping(flat: list[db.Entity]): """ - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[Union[int,None]]]] - - - The integer index is the Python id of the value object. - - The string is the name of the first parent of the referencing object. - - Each value objects is taken from the values of all properties from the list flat. - - So the returned mapping maps ids of entities to the ids of objects which are referring - to them. - """ - # TODO we need to treat children of RecordTypes somehow. - references: dict[int, dict[str, list[Union[int, None]]]] = {} - for ent in flat: - if id(ent) not in references: - references[id(ent)] = {} - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - if isinstance(v, db.Entity): - if id(v) not in references: - references[id(v)] = {} - if ent.parents[0].name not in references[id(v)]: - references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent.id) - - return references - - def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - flat = Crawler.create_flat_list(ent_list) - all_records = list(flat) - - # TODO: can the following be removed at some point - for ent in flat: - if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError(f"Records must have a parent.\n{ent}") - - try_to_merge_later = [] - - # Check whether Records can be identified without identifiable - for i in reversed(range(len(flat))): - record = flat[i] - # 1. Can it be identified via an ID? - if record.id is not None: - treated_record = self.treated_records_lookup.get_existing(record) - if treated_record is not None: - self._merge_identified(treated_record, record, try_to_merge_later, all_records) - all_records.remove(record) - referencing_entities = self.create_reference_mapping(all_records) - else: - self.treated_records_lookup.add(record, None) - assert record.id - del flat[i] - # 2. Can it be identified via a path? 
- elif record.path is not None: - try: - existing = cached_get_entity_by(path=record.path) - except EmptyUniqueQueryError: - existing = None - if existing is not None: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - treated_record = self.treated_records_lookup.get_any(record) - if treated_record is not None: - self._merge_identified(treated_record, record, try_to_merge_later, all_records) - all_records.remove(record) - referencing_entities = self.create_reference_mapping(all_records) - else: - # TODO add identifiable if possible - self.treated_records_lookup.add(record, None) - assert record.id - del flat[i] - entity_was_treated = True - # flat contains Entities which could not yet be checked against the remote server - while entity_was_treated and len(flat) > 0: + # st.unchecked contains Entities which could not yet be checked against the remote server + while entity_was_treated and len(st.unchecked) > 0: entity_was_treated = False - referencing_entities = self.create_reference_mapping(all_records) - - # For each element we try to find out whether we can find it in the server or whether - # it does not yet exist. Since a Record may reference other unkown Records it might not - # be possible to answer this right away. - # The following checks are done on each Record: - # 1. Is it in the cache of already checked Records? - # 2. Can it be checked on the remote server? - # 3. Does it have to be new since a needed reference is missing? - for i in reversed(range(len(flat))): - record = flat[i] - - if self._identity_relies_on_unchecked_entities(record, - referencing_entities[id(record)]): + + for se in st.unchecked: + if se.identifiable is None: # we cannot yet identify this node continue - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities[id(record)]) - - # 1. Is it in the cache of already checked Records? - if self.treated_records_lookup.get_any(record, identifiable) is not None: - treated_record = self.treated_records_lookup.get_any(record, identifiable) - # Since the identifiables are the same, treated_record and record actually - # describe the same object. - # We merge record into treated_record in order to prevent loss of information - self._merge_identified(treated_record, record, try_to_merge_later, all_records) - all_records.remove(record) - referencing_entities = self.create_reference_mapping(all_records) - - del flat[i] - entity_was_treated = True - - # 2. Can it be checked on the remote server? - elif not self._has_reference_value_without_id(identifiable): - identified_record = ( - self.identifiableAdapter.retrieve_identified_record_for_identifiable( - identifiable)) - if identified_record is None: - # identifiable does not exist remotely -> record needs to be inserted - self.treated_records_lookup.add(record, identifiable) - else: - # side effect - record.id = identified_record.id - record.path = identified_record.path - self.treated_records_lookup.add(record, identifiable) - assert record.id - del flat[i] - entity_was_treated = True - - # 3. Does it have to be new since a needed reference is missing? - # (Is it impossible to check this record because an identifiable references a - # missing record?) 
- elif self._has_missing_object_in_references(identifiable, referencing_entities): - self.treated_records_lookup.add(record, identifiable) - assert record.id - del flat[i] - entity_was_treated = True - - for record in flat: - self.replace_references_with_cached(record, referencing_entities) - - # We postponed the merge for records where it failed previously and try it again now. + # check remote server + identified_record = ( + st.identifiableAdapter.retrieve_identified_record_for_identifiable( + se.identifiable)) + remote_id = None + if identified_record is not None: + remote_id = identified_record.id + # set id of node. if node is missing, remote_id is None and the SyncGraph marks it + # as missing + st.set_id_of_node(se, remote_id) + entity_was_treated = True + break # one or more nodes were just removed from st.unchecked -> back to start + # This only might add properties of the postponed records to the already used ones. - for record in try_to_merge_later: - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities[id(record)]) - newrecord = self.treated_records_lookup.get_any(record, identifiable) - merge_entities(newrecord, record, merge_id_with_resolved_entity=True) - if len(flat) > 0: - circle = self.detect_circular_dependency(flat) - if circle is None: - logger.error("Failed, but found NO circular dependency. The data is as follows:" - + str(self.compact_entity_list_representation(flat, - referencing_entities))) - else: - logger.error("Found circular dependency (Note that this might include references " - "that are not identifying properties): " - + self.compact_entity_list_representation(circle, - referencing_entities)) + if len(st.unchecked) > 0: + # circle = st.unchecked_contains_circular_dependency() + # if circle is None: + # logger.error("Failed, but found NO circular dependency. The data is as follows:" + # + "\n".join([str(el) for el in st.unchecked]) + + # ) + # else: + # logger.error("Found circular dependency (Note that this might include references " + # "that are not identifying properties): " + # + "\n".join([str(el) for el in st.unchecked]) + # ) raise RuntimeError( - f"Could not finish split_into_inserts_and_updates. Circular dependency: " - f"{circle is not None}") - - # remove negative IDs - missing = self.treated_records_lookup.get_missing_list() - for el in missing: - if el.id is None: - raise RuntimeError("This should not happen") # TODO remove - if el.id >= 0: - raise RuntimeError("This should not happen") # TODO remove - el.id = None + "Could not finish _split_into_inserts_and_updates. " + "It might be due to a circular dependency") - return (missing, self.treated_records_lookup.get_existing_list()) + return st.export_record_lists() def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -851,7 +363,7 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id - @ staticmethod + @staticmethod def compact_entity_list_representation(entities, referencing_entities: List) -> str: """ a more readable representation than the standard xml representation @@ -883,40 +395,7 @@ class Crawler(object): return text + "--------\n" - @ staticmethod - def detect_circular_dependency(flat: list[db.Entity]): - """ - Detects whether there are circular references in the given entity list and returns a list - where the entities are ordered according to the chain of references (and only the entities - contained in the circle are included. Returns None if no circular dependency is found. 
- - TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only - consider references that are identifying properties. - """ - circle = [flat[0]] - closed = False - while not closed: - current = circle[-1] - added_to_circle = False - for p in current.properties: - if isinstance(p.value, list): - for pval in p.value: - if pval in flat: - if pval in circle: - closed = True - circle.append(pval) - added_to_circle = True - else: - if p.value in flat: - if p.value in circle: - closed = True - circle.append(p.value) - added_to_circle = True - if not added_to_circle: - return None - return circle - - @ staticmethod + @staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -958,7 +437,7 @@ class Crawler(object): return to_be_updated - @ staticmethod + @staticmethod def remove_unnecessary_updates( crawled_data: list[db.Record], identified_records: list[db.Record] @@ -984,7 +463,7 @@ class Crawler(object): return actual_updates - @ staticmethod + @staticmethod def execute_parent_updates_in_list(to_be_updated, securityMode, run_id, unique_names): """ Execute the updates of changed parents. @@ -1027,13 +506,13 @@ class Crawler(object): "mode. This might lead to a failure of inserts that follow.") logger.info(parent_updates) - @ staticmethod + @staticmethod def _get_property_id_for_datatype(rtname: str, name: str): return cached_get_entity_by( query=f"FIND Entity '{escape_squoted_text(rtname)}' " - f"with name='{escape_squoted_text(name)}'").id + f"with name='{escape_squoted_text(name)}'").id - @ staticmethod + @staticmethod def replace_name_with_referenced_entity_id(prop: db.Property): """changes the given property in place if it is a reference property that has a name as value @@ -1078,7 +557,7 @@ class Crawler(object): propval.append(el) prop.value = propval - @ staticmethod + @staticmethod def execute_inserts_in_list(to_be_inserted, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -1098,7 +577,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) - @ staticmethod + @staticmethod def set_ids_and_datatype_of_parents_and_properties(rec_list): for record in rec_list: for parent in record.parents: @@ -1110,7 +589,7 @@ class Crawler(object): prop.id = entity.id _resolve_datatype(prop, entity) - @ staticmethod + @staticmethod def execute_updates_in_list(to_be_updated, securityMode, run_id: Optional[uuid.UUID] = None, unique_names=True): @@ -1124,7 +603,7 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - @ staticmethod + @staticmethod def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): """ returns a list of all records in `records` that have a parent that is in `parents`""" problems = [] @@ -1180,7 +659,8 @@ class Crawler(object): "use for example the Scanner to create this data.")) crawled_data = self.crawled_data - to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data) + to_be_inserted, to_be_updated = self._split_into_inserts_and_updates( + SyncGraph(crawled_data, self.identifiableAdapter)) for el in to_be_updated: # all entity objects are replaced by their IDs except for the not yet inserted ones @@ -1211,8 +691,10 @@ class Crawler(object): if len(ins_problems) > 0 or len(upd_problems) > 0: raise ForbiddenTransaction( "One or more Records that have a parent which is excluded from inserts or updates." 
-                f"\nRecords excluded from inserts have the following RecordTypes:\n{[el.parents[0].name for el in ins_problems]}"
-                f"\nRecords excluded from updates have the following RecordTypes:\n{[el.parents[0].name for el in upd_problems]}"
+                f"\nRecords excluded from inserts have the following RecordTypes:\n"
+                f"{[el.parents[0].name for el in ins_problems]}"
+                f"\nRecords excluded from updates have the following RecordTypes:\n"
+                f"{[el.parents[0].name for el in upd_problems]}"
             )

         logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
@@ -1221,14 +703,14 @@ class Crawler(object):
         cache_clear()
         self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode,
                                             run_id=self.run_id, unique_names=unique_names)
-        logger.info(f"Added parent RecordTypes where necessary.")
+        logger.info("Added parent RecordTypes where necessary.")
         self.execute_inserts_in_list(
             to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
-        logger.info(f"Executed inserts:\n"
+        logger.info("Executed inserts:\n"
                     + self.create_entity_summary(to_be_inserted))
         self.execute_updates_in_list(
             to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
-        logger.info(f"Executed updates:\n"
+        logger.info("Executed updates:\n"
                     + self.create_entity_summary(to_be_updated))

         update_cache = UpdateCache()
@@ -1244,7 +726,7 @@ class Crawler(object):

         return (to_be_inserted, to_be_updated)

-    @ staticmethod
+    @staticmethod
     def create_entity_summary(entities: list[db.Entity]):
         """ Creates a summary string reprensentation of a list of entities."""
         parents = {}
@@ -1263,13 +745,35 @@ class Crawler(object):
             output = output[:-2] + "\n"
         return output

-    @ staticmethod
+    @staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
         # Sending an Email with a link to a form to authorize updates is
         if get_config_setting("send_crawler_notifications"):
-            filename = OldCrawler.save_form(
-                [el[3] for el in pending_changes], path, run_id)
-            OldCrawler.send_mail([el[3] for el in pending_changes], filename)
+            filename = OldCrawler.save_form([el[3] for el in pending_changes], path, run_id)
+            text = """Dear Curator,
+    there were changes that need your authorization. Please check the following
+    carefully and if the changes are ok, click on the following link:
+
+    {url}/Shared/{filename}
+
+    {changes}
+    """.format(url=db.configuration.get_config()["Connection"]["url"],
+               filename=filename,
+               changes="\n".join([el[3] for el in pending_changes]))
+            try:
+                fro = get_config_setting("sendmail_from_address")
+                to = get_config_setting("sendmail_to_address")
+            except KeyError:
+                logger.error("Server Configuration is missing a setting for "
+                             "sending mails. The administrator should check "
+                             "'sendmail_from_address' and 'sendmail_to_address'.")
+                return
+
+            send_mail(
+                from_addr=fro,
+                to=to,
+                subject="Crawler Update",
+                body=text)

         for i, el in enumerate(pending_changes):
@@ -1284,7 +788,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
               + " by invoking the crawler"
                 " with the run id: {rid}\n".format(rid=run_id))

-    @ staticmethod
+    @staticmethod
     def debug_build_usage_tree(converter: Converter):
         res: dict[str, dict[str, Any]] = {
             converter.name: {
@@ -1302,7 +806,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                 res[converter.name]["subtree"][k[0]] = d[k[0]]
         return res

-    def save_debug_data(self, filename: str, debug_tree: DebugTree = None):
+    def save_debug_data(self, filename: str, debug_tree: Optional[DebugTree] = None):
         """
         Save the information contained in a debug_tree to a file named filename.
         """
@@ -1361,13 +865,13 @@ def _update_status_record(run_id, n_inserts, n_updates, status):
     cr_rec.get_property('status').value = status
     (cr_rec
         .add_property(db.execute_query(
-            f"FIND Property with name='number_of_inserted_entities'", unique=True).id,
+            "FIND Property with name='number_of_inserted_entities'", unique=True).id,
             n_inserts)
        .add_property(
-            db.execute_query(f"FIND Property with name='number_of_updated_entities'",
+            db.execute_query("FIND Property with name='number_of_updated_entities'",
                              unique=True).id, n_updates)
        .add_property(
-            db.execute_query(f"FIND Property with name='finished'",
+            db.execute_query("FIND Property with name='finished'",
                              unique=True).id, datetime.now().isoformat()))
     cr_rec.update()
@@ -1380,6 +884,7 @@ def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
     The email contains some basic information and a link to the log and the CrawlerRun Record.
""" if not get_config_setting("send_crawler_notifications"): + logger.debug("Crawler email notifications are disabled.") return if n_inserts == 0 and n_updates == 0: return @@ -1390,8 +895,8 @@ the CaosDB Crawler successfully crawled the data and """ + domain = get_config_setting("public_host_url") if get_config_setting("create_crawler_status_records"): - domain = get_config_setting("public_host_url") text += ("You can checkout the CrawlerRun Record for more information:\n" f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n") text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile) @@ -1550,11 +1055,19 @@ def crawler_main(crawled_directory_path: str, try: crawler = Crawler(securityMode=securityMode) - # setup logging and reporting if serverside execution - if "SHARED_DIR" in os.environ: + if "SHARED_DIR" in os.environ: # setup logging and reporting if serverside execution userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging() + # TODO make this optional _create_status_record( - get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, crawler.run_id) + get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, + crawler.run_id) + else: # setup stdout logging for other cases + root_logger = logging.getLogger() + root_logger.setLevel(level=(logging.DEBUG if debug else logging.INFO)) + handler = logging.StreamHandler(stream=sys.stdout) + handler.setLevel(logging.DEBUG if debug else logging.INFO) + root_logger.addHandler(handler) + logger.handlers.clear() debug_tree = DebugTree() crawled_data = scan_directory( @@ -1569,6 +1082,10 @@ def crawler_main(crawled_directory_path: str, ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(identifiables_definition_file) crawler.identifiableAdapter = ident + else: + # TODO + # raise ValueError("An identifiable file is needed.") + pass remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix) @@ -1594,15 +1111,24 @@ def crawler_main(crawled_directory_path: str, logger.error(err) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 + except TransactionError as err: + logger.debug(traceback.format_exc()) + logger.error(err) + logger.error("Transaction error details:") + for suberr in err.errors: + logger.error("---") + logger.error(suberr.msg) + logger.error(suberr.entity) + return 1 except Exception as err: logger.debug(traceback.format_exc()) - logger.debug(err) + logger.error(err) if "SHARED_DIR" in os.environ: # pylint: disable=E0601 domain = get_config_setting("public_host_url") - logger.error("Unexpected Error: Please tell your administrator about this and provide the" - f" following path.\n{domain}/Shared/" + debuglog_public) + logger.error("Unexpected Error: Please tell your administrator about this and provide " + f"the following path.\n{domain}/Shared/" + debuglog_public) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml index ac8561e6e170007c1e1501a00ed61b152296b9a5..82e2f635f621b2e21e43b728fd9ed6865454f828 100644 --- a/src/caoscrawler/default_converters.yml +++ b/src/caoscrawler/default_converters.yml @@ -8,6 +8,9 @@ BooleanElement: Date: converter: DateElementConverter package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters Dict: converter: DictElementConverter package: caoscrawler.converters @@ -27,7 +30,7 @@ TextElement: 
  converter: TextElementConverter
  package: caoscrawler.converters

- 
+
 DictDictElement:  # deprecated
   converter: DictElementConverter
   package: caoscrawler.converters
@@ -63,7 +66,7 @@
 File:  # deprecated
   converter: SimpleFileConverter
   package: caoscrawler.converters
- 
+
 SimpleFile:
   converter: SimpleFileConverter
   package: caoscrawler.converters
@@ -84,6 +87,10 @@ CSVTableConverter:
   converter: CSVTableConverter
   package: caoscrawler.converters

+SPSSFile:
+  converter: SPSSConverter
+  package: caoscrawler.converters
+
 XLSXTableConverter:
   converter: XLSXTableConverter
   package: caoscrawler.converters
diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml
index d0ad23912176bdfbf2446aa6e04bd7fa6b858777..ffcb1b15bd2bad71083cc8f0ba84172ee3daf2b0 100644
--- a/src/caoscrawler/default_transformers.yml
+++ b/src/caoscrawler/default_transformers.yml
@@ -1,4 +1,4 @@
-
+# Lookup table for matching functions and cfood yaml node names.
 submatch:
   package: caoscrawler.transformer_functions
@@ -9,3 +9,9 @@ split:
 replace:
   package: caoscrawler.transformer_functions
   function: replace
+date_parse:
+  package: caoscrawler.transformer_functions
+  function: date_parse
+datetime_parse:
+  package: caoscrawler.transformer_functions
+  function: datetime_parse
diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7c61c34e2abbebef4790bde42f50d4b5b29f957
--- /dev/null
+++ b/src/caoscrawler/exceptions.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+class ForbiddenTransaction(Exception):
+    """Thrown if a transaction is needed that is not allowed,
+    for example an update of an entity if the security level is INSERT.
+    """
+    pass
+
+
+class ImpossibleMergeError(Exception):
+    """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes
+    should be merged, but there is conflicting information that prevents this.
+    """
+
+    def __init__(self, *args, pname, values, **kwargs):
+        self.pname = pname
+        self.values = values
+        super().__init__(*args, **kwargs)
+
+
+class InvalidIdentifiableYAML(Exception):
+    """Thrown if the identifiable definition is invalid."""
+    pass
+
+
+class MissingIdentifyingProperty(Exception):
+    """Thrown if a SyncNode does not have the properties required by the corresponding registered
+    identifiable.
+    """
+    pass
+
+
+class MissingRecordType(Exception):
+    """Thrown if a record type cannot be found although it is expected that it exists on the
+    server.
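+
+    Raised, for instance, by ``get_children_of_rt`` in ``identifiable_adapters``
+    when a ``FIND RECORDTYPE`` query returns no results.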
+    """
+    pass
+
+
+class MissingReferencingEntityError(Exception):
+    """Thrown if the identifiable requires that some entity references the given entity but there
+    is no such reference."""
+
+    def __init__(self, *args, rts=None, **kwargs):
+        self.rts = rts
+        super().__init__(*args, **kwargs)
diff --git a/src/caoscrawler/hdf5_converter.py b/src/caoscrawler/hdf5_converter.py
index 5b1ff5775fb74919c989507c449636fd822db7f0..482d59c12d2d0b8540c01bd04da718d9c514ddc4 100644
--- a/src/caoscrawler/hdf5_converter.py
+++ b/src/caoscrawler/hdf5_converter.py
@@ -18,6 +18,8 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #

+from typing import Optional
+
 try:
     import h5py
 except ModuleNotFoundError:
@@ -94,8 +96,8 @@ def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str):
     raise ValueError("The given element must be either a HDF5 Group or Dataset object.")

-def convert_basic_element_with_nd_array(value, name: str = None,
-                                        internal_path: str = None, msg_prefix: str = ""):
+def convert_basic_element_with_nd_array(value, name: Optional[str] = None,
+                                        internal_path: Optional[str] = None, msg_prefix: str = ""):
     """Convert a given object either to an ndarray structure element or to a basic scalar
     structure element.
diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py
index cefdf4a0f42b1f610e0712fdefebc2dc3b78d69f..f6c85c694e5ef0be7e6a9be8154a34c400bab008 100644
--- a/src/caoscrawler/identifiable.py
+++ b/src/caoscrawler/identifiable.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # encoding: utf-8
 #
-# This file is a part of the CaosDB Project.
+# This file is a part of the LinkAhead Project.
 #
 # Copyright (C) 2022 Henrik tom Wörden
 #
@@ -20,23 +20,27 @@
 #

 from __future__ import annotations
-import linkahead as db
-from datetime import datetime
+
 import json
-from hashlib import sha256
-from typing import Union
 import logging
+from datetime import datetime
+from hashlib import sha256
+from typing import Optional, Union
+
+import linkahead as db
+
+from .exceptions import MissingIdentifyingProperty
+from .sync_node import SyncNode

 logger = logging.getLogger(__name__)


 class Identifiable():
     """
-    The fingerprint of a Record in CaosDB.
+    The fingerprint of a Record in LinkAhead.

-    This class contains the information that is used by the CaosDB Crawler to identify Records.
-    On one hand, this can be the ID or a Record or the path of a File.
-    On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can
+    This class contains the information that is used by the LinkAhead Crawler to identify Records.
+    In order to check whether a Record exists in the LinkAhead Server, a query can
     be created using the information contained in the Identifiable.

     Parameters
@@ -46,23 +50,22 @@ class Identifiable():
     properties: dict, keys are names of Properties; values are Property values
                Note, that lists are not checked for equality but are interpreted as multiple
                conditions for a single Property.
-    path: str, In case of files: The path where the file is stored.
    backrefs: list, TODO future
     """

-    def __init__(self, record_id: int = None, path: str = None, record_type: str = None,
-                 name: str = None, properties: dict = None,
-                 backrefs: list[Union[int, str]] = None):
-        if (record_id is None and path is None and name is None
+    def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None,
+                 name: Optional[str] = None, properties: Optional[dict] = None,
+                 backrefs: Optional[list[Union[int, str]]] = None):
+        if (record_id is None and name is None
                 and (backrefs is None or len(backrefs) == 0)
                 and (properties is None or len(properties) == 0)):
-            raise ValueError("There is no identifying information. You need to add a path or "
-                             "properties or other identifying attributes.")
+            raise ValueError(
+                "There is no identifying information. You need to add "
+                "properties or other identifying attributes.")
         if properties is not None and 'name' in [k.lower() for k in properties.keys()]:
             raise ValueError("Please use the separete 'name' keyword instead of the properties "
                              "dict for name")
         self.record_id = record_id
-        self.path = path
         self.record_type = record_type
         self.name = name
         if name == "":
@@ -77,24 +80,21 @@ class Identifiable():
     def get_representation(self) -> str:
         return sha256(Identifiable._create_hashable_string(self).encode('utf-8')).hexdigest()

     @staticmethod
     def _value_representation(value) -> str:
         """returns the string representation of property values to be used in the hash function

-        The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities
-        (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int
-        and str.
+        The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID)
+        and the string representation of None, bool, float, int, datetime and str.
         """
         if value is None:
             return "None"
-        elif isinstance(value, db.File):
-            return str(value.path)
-        elif isinstance(value, db.Entity):
+        elif isinstance(value, SyncNode):
             if value.id is not None:
                 return str(value.id)
             else:
-                return "PyID=" + str(id(value))
+                raise RuntimeError("Python Entity (SyncNode) without ID not allowed")
         elif isinstance(value, list):
             return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]"
         elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
@@ -103,7 +103,7 @@ class Identifiable():
         else:
             raise ValueError(f"Unknown datatype of the value: {value}")

     @staticmethod
     def _create_hashable_string(identifiable: Identifiable) -> str:
         """
         creates a string from the attributes of an identifiable that can be hashed
@@ -120,27 +120,20 @@ class Identifiable():
         return rec_string

     def __eq__(self, other) -> bool:
-        """
-        Identifiables are equal if they belong to the same Record. Since ID and path are on their
-        own enough to identify the Record it is sufficient if those attributes are equal.
-        1. both IDs are set (not None) -> equal if IDs are equal
-        2. both paths are set (not None) -> equal if paths are equal
-        3. 
equal if attribute representations are equal - """ + """ Identifiables are equal if they share the same ID or if the representation is equal """ if not isinstance(other, Identifiable): raise ValueError("Identifiable can only be compared to other Identifiable objects.") - elif self.record_id is not None and other.record_id is not None: + if self.record_id is not None and other.record_id is not None: return self.record_id == other.record_id - elif self.path is not None and other.path is not None: - return self.path == other.path elif self.get_representation() == other.get_representation(): return True else: return False def __repr__(self): - pstring = json.dumps(self.properties) + """ deterministic text representation of the identifiable """ + pstring = json.dumps({k: str(v) for k, v in self.properties.items()}) return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; " - f"name={self.name}\n\tpath={self.path}\n" + f"name={self.name}\n" f"\tproperties:\n{pstring}\n" f"\tbackrefs:\n{self.backrefs}") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index d95112ee1aec6ca1526c96421a8052282b6ef9a7..854ee614638712bdcf957c592ef2946dbdd43afc 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -2,7 +2,7 @@ # encoding: utf-8 # # ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2021-2022 Henrik tom Wörden # 2021-2022 Alexander Schlemmer @@ -29,7 +29,6 @@ import logging import warnings from abc import ABCMeta, abstractmethod from datetime import datetime -from functools import lru_cache from typing import Any import linkahead as db @@ -37,7 +36,14 @@ import yaml from linkahead.cached import cached_get_entity_by, cached_query from linkahead.utils.escape import escape_squoted_text +from .exceptions import ( + InvalidIdentifiableYAML, + MissingIdentifyingProperty, + MissingRecordType, + MissingReferencingEntityError, +) from .identifiable import Identifiable +from .sync_node import SyncNode from .utils import has_parent logger = logging.getLogger(__name__) @@ -47,11 +53,14 @@ def get_children_of_rt(rtname): """Supply the name of a recordtype. This name and the name of all children RTs are returned in a list""" escaped = escape_squoted_text(rtname) - return [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + recordtypes = [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + if not recordtypes: + raise MissingRecordType(f"Record type could not be found on server: {rtname}") + return recordtypes def convert_value(value: Any) -> str: - """ Return a string representation of the value suitable for the search query. + """Return a string representation of the value suitable for the search query. This is for search queries looking for the identified record. @@ -82,27 +91,27 @@ def convert_value(value: Any) -> str: class IdentifiableAdapter(metaclass=ABCMeta): """Base class for identifiable adapters. -Some terms: + Some terms: -- A *registered identifiable* defines an identifiable template, for example by specifying: - - Parent record types - - Properties - - ``is_referenced_by`` statements -- An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill - in" the *registered identifiable*. In code, it can be represented as a Record based on the - *registered identifiable* with all the values filled in. 
-- An *identified record* is the result of retrieving a record from the database, based on the
-  *identifiable* (and its values).
+    - A *registered identifiable* defines an identifiable template, for example by specifying:
+      - Parent record types
+      - Properties
+      - ``is_referenced_by`` statements
+    - An *identifiable* belongs to a concrete record. It consists of identifying attributes which
+      "fill in" the *registered identifiable*. In code, it can be represented as a Record based on
+      the *registered identifiable* with all the values filled in.
+    - An *identified record* is the result of retrieving a record from the database, based on the
+      *identifiable* (and its values).

-General question to clarify:
+    General question to clarify:

-- Do we want to support multiple identifiables per RecordType?
-- Current implementation supports only one identifiable per RecordType.
+    - Do we want to support multiple identifiables per RecordType?
+    - Current implementation supports only one identifiable per RecordType.

-The list of referenced by statements is currently not implemented.
+    The list of referenced by statements is currently not implemented.

-The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered
-identifiabel, identifiable and identified record) for a Record.
+    The IdentifiableAdapter can be used to retrieve the three above-mentioned objects (registered
+    identifiable, identifiable and identified record) for a Record.

     """

@@ -127,7 +136,7 @@
                 eid = ref
                 if isinstance(ref, db.Entity):
                     eid = ref.id
-                query_string += (" WHICH IS REFERENCED BY " + str(eid) + " AND")
+                query_string += " WHICH IS REFERENCED BY " + str(eid) + " AND"

         query_string += " WITH "
@@ -136,22 +145,84 @@
         if len(ident.properties) > 0:
             query_string += " AND "
-        query_string += IdentifiableAdapter.create_property_query(ident, startswith=startswith)
+        query_string += IdentifiableAdapter.create_property_query(
+            ident, startswith=startswith
+        )

         # TODO Can these cases happen at all with the current code?
         if query_string.endswith(" AND WITH "):
-            query_string = query_string[:-len(" AND WITH ")]
+            query_string = query_string[: -len(" AND WITH ")]
         if query_string.endswith(" AND "):
-            query_string = query_string[:-len(" AND ")]
+            query_string = query_string[: -len(" AND ")]
         return query_string

+    def all_identifying_properties_exist(
+        self, node: SyncNode, raise_exception: bool = True
+    ):
+        """checks whether all identifying properties exist and raises an error if
+        that's not the case. It furthermore raises an error if "name" is part of
+        the identifiable, but the node does not have a name.
+
+        If raise_exception is False, the function returns False instead of raising an error.
+
+        Backreferences are not checked.
+
+        Returns True if all identifying properties exist.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
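+
+        A minimal sketch of both calling modes (``adapter`` and ``node`` are
+        hypothetical ``IdentifiableAdapter`` and ``SyncNode`` instances)::
+
+            adapter.all_identifying_properties_exist(node)  # raises on missing properties
+            ok = adapter.all_identifying_properties_exist(node, raise_exception=False)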
+        """
+        if node.registered_identifiable is None:
+            if raise_exception:
+                parents = [p.name for p in node.parents]
+                parents_str = "\n".join(f"- {p}" for p in parents)
+                raise RuntimeError("No registered identifiable for node with these parents:\n"
+                                   + parents_str)
+            else:
+                return False
+        for prop in node.registered_identifiable.properties:
+            if prop.name.lower() == "is_referenced_by":
+                continue
+            if prop.name.lower() == "name":
+                if node.name is None:
+                    if raise_exception:
+                        i = MissingIdentifyingProperty("The node has no name.")
+                        i.prop = "name"
+                        raise i
+                    else:
+                        return False
+                else:
+                    continue
+
+            # multiple occurrences are ok here. We deal with that when actually creating an
+            # identifiable (IDs of referenced Entities might need to get resolved first).
+            if (
+                len(
+                    [
+                        el
+                        for el in node.properties
+                        if el.name.lower() == prop.name.lower()
+                    ]
+                )
+                == 0
+            ):
+                if raise_exception:
+                    i = MissingIdentifyingProperty(
+                        f"The property {prop.name} is missing."
+                    )
+                    i.prop = prop.name
+                    raise i
+                else:
+                    return False
+
+        return True
+
     @staticmethod
     def __create_pov_snippet(pname: str, pvalue, startswith: bool = False):
         """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``.

-If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE``
-operator will be used to find entities matching at the beginning.
-"""
+        If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE``
+        operator will be used to find entities matching at the beginning.
+        """
         if startswith and isinstance(pvalue, str) and len(pvalue) > 200:
             operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'"
         else:
@@ -163,14 +234,14 @@
     def create_property_query(entity: Identifiable, startswith: bool = False):
         """Create a POV query part with the entity's properties.

-Parameters
-----------
+        Parameters
+        ----------

-entity: Identifiable
-    The Identifiable whose properties shall be used.
+        entity: Identifiable
+            The Identifiable whose properties shall be used.

-startswith: bool, optional
-    If True, check string typed properties against the first 200 characters only. Default is False.
+        startswith: bool, optional
+            If True, check string typed properties against the first 200 characters only. Default is False.
         """
         query_string = ""
         pov = IdentifiableAdapter.__create_pov_snippet  # Shortcut
@@ -197,61 +268,38 @@
         return query_string[:-4]

     @abstractmethod
-    def get_registered_identifiable(self, record: db.Record):
+    def get_registered_identifiable(self, record: db.Entity):
         """
         Check whether an identifiable is registered for this record and return its definition.
         If there is no identifiable registered, return None.
         """
         pass

-    @abstractmethod
-    def resolve_reference(self, record: db.Record):
-        pass
-
     @abstractmethod
     def get_file(self, identifiable: db.File):
-        warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it."))
+        warnings.warn(
+            DeprecationWarning("This function is deprecated. Please do not use it.")
+        )
         """
         Retrieve the file object for a (File) identifiable.
""" pass - @staticmethod - def get_identifying_referencing_entities(referencing_entities, registered_identifiable): - refs = [] - for prop in registered_identifiable.properties: - if prop.name.lower() != "is_referenced_by": - continue - for looking_for_rt in prop.value: - found = False - if looking_for_rt == "*": - for val in referencing_entities.values(): - if len(val) > 0: - found = True - refs.extend(val) - else: - rt_and_children = get_children_of_rt(looking_for_rt) - for rtname in rt_and_children: - if (rtname in referencing_entities): - refs.extend(referencing_entities[rtname]) - found = True - if not found: - raise RuntimeError( - f"Could not find referencing entities of type(s): {prop.value}\n" - f"for registered identifiable:\n{registered_identifiable}\n" - f"There were {len(referencing_entities)} referencing entities to choose from.\n" - f"This error can also occur in case of merge conflicts in the referencing entities." - ) - return refs - @staticmethod def get_identifying_referenced_entities(record, registered_identifiable): + """Create a list of all entities that are referenced by record + and that are used as identying properties of the identifiable. + + Last review by Alexander Schlemmer on 2024-05-29. + """ refs = [] for prop in registered_identifiable.properties: pname = prop.name.lower() if pname == "name" or pname == "is_referenced_by": continue if record.get_property(prop.name) is None: + logger.error(f"Record with missing identifying property:\n{record}\n" + f"This property is missing: {prop.name}\n") raise RuntimeError("Missing identifying Property") pval = record.get_property(prop.name).value if not isinstance(prop.value, list): @@ -261,83 +309,101 @@ startswith: bool, optional refs.append(val) return refs - def get_identifiable(self, record: db.Record, referencing_entities=None): + def get_identifiable(self, se: SyncNode, identifiable_backrefs: set[SyncNode]) -> Identifiable: """ - Retrieve the registered identifiable and fill the property values to create an - identifiable. + Take the registered identifiable of given SyncNode ``se`` and fill the property values to + create an identifiable. Args: - record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that - allows to look up entities with a certain RecordType, that reference ``record`` + se: the SyncNode for which the Identifiable shall be created. + identifiable_backrefs: a set (Type: set[SyncNode]), that contains SyncNodes + with a certain RecordType, that reference ``se`` Returns: Identifiable, the identifiable for record. - """ - registered_identifiable = self.get_registered_identifiable(record) - - if referencing_entities is None: - referencing_entities = {} + Last review by Alexander Schlemmer on 2024-05-29. + """ property_name_list_A = [] - property_name_list_B = [] identifiable_props = {} - identifiable_backrefs = [] - name_is_identifying_property = False - - if registered_identifiable is not None: - identifiable_backrefs = self.get_identifying_referencing_entities( - referencing_entities, registered_identifiable) - # fill the values: - for prop in registered_identifiable.properties: - if prop.name == "name": - # The name can be an identifiable, but it isn't a property - name_is_identifying_property = True - continue - # problem: what happens with multi properties? 
-                # case A: in the registered identifiable
-                # case B: in the identifiable
-
-                # treated above
-                if prop.name.lower() == "is_referenced_by":
-                    continue
+        name = None
+
+        if se.registered_identifiable is None:
+            raise ValueError("no registered_identifiable")
+
+        # fill the values:
+        for prop in se.registered_identifiable.properties:
+            # TODO:
+            # If there are multiproperties in the registered_identifiable, then only the LAST is
+            # taken into account (later properties overwrite previous one in the dict below).
+            if prop.name == "name":
+                name = se.name
+                continue

-                record_prop = record.get_property(prop.name)
-                if record_prop is None:
-                    # TODO: how to handle missing values in identifiables
-                    #       raise an exception?
-                    # TODO: is this the appropriate error?
-                    raise NotImplementedError(
-                        f"The following record is missing an identifying property:\n"
-                        f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}"
+            if prop.name.lower() == "is_referenced_by":
+                for el in identifiable_backrefs:
+                    if not isinstance(el, SyncNode):
+                        raise ValueError("Elements of `identifiable_backrefs` must be SyncNodes")
+                if len(identifiable_backrefs) == 0:
+                    raise MissingReferencingEntityError(
+                        f"Could not find referencing entities of type(s): {prop.value}\n"
+                        f"for registered identifiable:\n{se.registered_identifiable}\n"
+                        f"There were {len(identifiable_backrefs)} referencing entities to "
+                        "choose from.\n"
+                        f"This error can also occur in case of merge conflicts in the referencing"
+                        " entities."
                     )
-                identifiable_props[record_prop.name] = record_prop.value
-                property_name_list_A.append(prop.name)
-
-        # check for multi properties in the record:
-        for prop in property_name_list_A:
-            property_name_list_B.append(prop)
-        if (len(set(property_name_list_B)) != len(property_name_list_B) or len(
-                set(property_name_list_A)) != len(property_name_list_A)):
-            raise RuntimeError(
-                "Multi properties used in identifiables could cause unpredictable results and "
-                "are not allowed. You might want to consider a Property with a list as value.")
+                elif len([e.id for e in identifiable_backrefs if e.id is None]) > 0:
+                    raise RuntimeError("Referencing entity has no id")
+                # At this point we know that there is at least one referencing SyncNode
+                # with an ID. We do not need to set any property value (the reference will be used
+                # in the backrefs argument below) and can thus continue with the next identifying
+                # property
+                continue
+
+            options = [p.value for p in se.properties if p.name.lower() == prop.name.lower()]
+            if len(options) == 0:
+                raise MissingIdentifyingProperty(
+                    f"The following record is missing an identifying property:\n"
+                    f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}"
+                )
+            for ii, el in enumerate(options):
+                if isinstance(el, SyncNode):
+                    options[ii] = el.id
+                    if el.id is None:
+                        raise RuntimeError(
+                            "Reference to unchecked in identifiable:\n"
+                            f"{prop.name}:\n{el}"
+                        )
+                else:
+                    options[ii] = el
+            if not all([f == options[0] for f in options]):
+                raise RuntimeError("differing prop values")
+
+            identifiable_props[prop.name] = options[0]
+            property_name_list_A.append(prop.name)
+
+        # check for multi properties in the record:
+        if len(set(property_name_list_A)) != len(property_name_list_A):
+            raise RuntimeError(
+                "Multi properties used in identifiables could cause unpredictable results and "
+                "are not allowed. You might want to consider a Property with a list as value."
+ ) # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple try: return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name if name_is_identifying_property else None, + record_id=se.id, + record_type=se.registered_identifiable.parents[0].name, + name=name, properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs + backrefs=[e.id for e in identifiable_backrefs], ) - except Exception: - logger.error(f"Error while creating identifiable for this record:\n{record}") + except Exception as exc: + logger.error(exc) + logger.error(f"Error while creating identifiable for this record:\n{se}") raise @abstractmethod @@ -352,23 +418,29 @@ startswith: bool, optional """ pass - def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): - """ - This function combines all functionality of the IdentifierAdapter by - returning the identifiable after having checked for an appropriate - registered identifiable. + @staticmethod + def referencing_entity_has_appropriate_type(parents, register_identifiable): + """returns true if one of the parents is listed by the 'is_referenced_by' property - In case there was no appropriate registered identifiable or no identifiable could - be found return value is None. - """ - if record.path is not None: - return cached_get_entity_by(path=record.path) - if record.id is not None: - return cached_get_entity_by(eid=record.id) + This function also returns True if 'is_referenced_by' contains the wildcard '*'. - identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) + Last review by Alexander Schlemmer on 2024-05-29. + """ + if register_identifiable.get_property("is_referenced_by") is None: + return False + if register_identifiable.get_property("is_referenced_by").value is None: + return False - return self.retrieve_identified_record_for_identifiable(identifiable) + appropriate_types = [] + for rt in register_identifiable.get_property("is_referenced_by").value: + appropriate_types.extend(get_children_of_rt(rt)) + appropriate_types = [el.lower() for el in appropriate_types] + if "*" in appropriate_types: + return True + for parent in parents: + if parent.name.lower() in appropriate_types: + return True + return False class LocalStorageIdentifiableAdapter(IdentifiableAdapter): @@ -377,8 +449,11 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): - warnings.warn(DeprecationWarning( - "This class is deprecated. Please use the CaosDBIdentifiableAdapter.")) + warnings.warn( + DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter." + ) + ) self._registered_identifiables = dict() self._records = [] @@ -393,7 +468,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] - warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) + warnings.warn( + DeprecationWarning("This function is deprecated. 
Please do not use it.")
+        )
         for record in self._records:
             if record.role == "File" and record.path == identifiable.path:
                 candidates.append(record)
@@ -405,15 +482,18 @@

     def store_state(self, filename):
         with open(filename, "w") as f:
-            f.write(db.common.utils.xml2str(
-                db.Container().extend(self._records).to_xml()))
+            f.write(
+                db.common.utils.xml2str(db.Container().extend(self._records).to_xml())
+            )

     def restore_state(self, filename):
         with open(filename, "r") as f:
             self._records = db.Container().from_xml(f.read())

     # TODO: move to super class?
-    def is_identifiable_for_record(self, registered_identifiable: db.RecordType, record: db.Record):
+    def is_identifiable_for_record(
+        self, registered_identifiable: db.RecordType, record: db.Record
+    ):
         """
         Check whether this registered_identifiable is an identifiable for the record.
@@ -424,8 +504,7 @@
         Return True in that case and False otherwise.
         """
         if len(registered_identifiable.parents) != 1:
-            raise RuntimeError(
-                "Multiple parents for identifiables not supported.")
+            raise RuntimeError("Multiple parents for identifiables not supported.")

         if not has_parent(record, registered_identifiable.parents[0].name):
             return False
@@ -435,14 +514,13 @@
             return False
         return True

-    def get_registered_identifiable(self, record: db.Record):
+    def get_registered_identifiable(self, record: db.Entity):
         identifiable_candidates = []
         for _, definition in self._registered_identifiables.items():
             if self.is_identifiable_for_record(definition, record):
                 identifiable_candidates.append(definition)
         if len(identifiable_candidates) > 1:
-            raise RuntimeError(
-                "Multiple candidates for an identifiable found.")
+            raise RuntimeError("Multiple candidates for an identifiable found.")
         if len(identifiable_candidates) == 0:
             return None
         return identifiable_candidates[0]
@@ -457,8 +535,9 @@
         record is the record from the local database to check against.
         identifiable is the record that was created during the crawler run.
         """
-        if (identifiable.record_type is not None
-                and not has_parent(record, identifiable.record_type)):
+        if identifiable.record_type is not None and not has_parent(
+            record, identifiable.record_type
+        ):
             return False
         for propname, propvalue in identifiable.properties.items():
             prop_record = record.get_property(propname)
@@ -487,27 +566,12 @@
                 candidates.append(record)
         if len(candidates) > 1:
             raise RuntimeError(
-                f"Identifiable was not defined unambigiously. Possible candidates are {candidates}")
+                f"Identifiable was not defined unambiguously. 
Possible candidates are {candidates}" + ) if len(candidates) == 0: return None return candidates[0] - def resolve_reference(self, value: db.Record): - if self.get_registered_identifiable(value) is None: - raise NotImplementedError("Non-identifiable references cannot" - " be used as properties in identifiables.") - # TODO: just resolve the entity - - value_identifiable = self.retrieve_identified_record_for_record(value) - if value_identifiable is None: - raise RuntimeError("The identifiable which is used as property" - " here has to be inserted first.") - - if value_identifiable.id is None: - raise RuntimeError("The entity has not been assigned an ID.") - - return value_identifiable.id - class CaosDBIdentifiableAdapter(IdentifiableAdapter): """ @@ -521,27 +585,42 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): def load_from_yaml_definition(self, path: str): """Load identifiables defined in a yaml file""" - with open(path, 'r', encoding="utf-8") as yaml_f: + with open(path, "r", encoding="utf-8") as yaml_f: identifiable_data = yaml.safe_load(yaml_f) + self.load_from_yaml_object(identifiable_data) + + def load_from_yaml_object(self, identifiable_data): + """Load identifiables defined in a yaml object. + """ - for key, value in identifiable_data.items(): - rt = db.RecordType().add_parent(key) - for prop_name in value: + for rt_name, id_list in identifiable_data.items(): + rt = db.RecordType().add_parent(rt_name) + if not isinstance(id_list, list): + raise InvalidIdentifiableYAML( + f"Identifiable contents must be lists, but this was not: {rt_name}") + for prop_name in id_list: if isinstance(prop_name, str): rt.add_property(name=prop_name) elif isinstance(prop_name, dict): for k, v in prop_name.items(): + if k == "is_referenced_by" and not isinstance(v, list): + raise InvalidIdentifiableYAML( + f"'is_referenced_by' must be a list. Found in: {rt_name}") rt.add_property(name=k, value=v) else: - NotImplementedError("YAML is not structured correctly") + raise InvalidIdentifiableYAML( + "Identifiable properties must be str or dict, but this one was not:\n" + f" {rt_name}/{prop_name}") - self.register_identifiable(key, rt) + self.register_identifiable(rt_name, rt) def register_identifiable(self, name: str, definition: db.RecordType): self._registered_identifiables[name] = definition def get_file(self, identifiable: Identifiable): - warnings.warn(DeprecationWarning("This function is deprecated. Please do not use it.")) + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) # TODO is this needed for Identifiable? # or can we get rid of this function? if isinstance(identifiable, db.Entity): @@ -555,7 +634,7 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): return None return candidates[0] - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ returns the registered identifiable for the given Record @@ -570,22 +649,17 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): if definition.parents[0].name.lower() == rt_name.lower(): return definition - def resolve_reference(self, record: db.Record): - """ - Current implementation just sets the id for this record - as a value. It needs to be verified that references all contain an ID. 
- """ - if record.id is None: - return record - return record.id - def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) try: candidates = cached_query(query_string) - except db.exceptions.HTTPServerError as err: - query_string = self.create_query_for_identifiable(identifiable, startswith=True) - candidates = cached_query(query_string).copy() # Copy against cache poisoning + except db.exceptions.HTTPServerError: + query_string = self.create_query_for_identifiable( + identifiable, startswith=True + ) + candidates = cached_query( + query_string + ).copy() # Copy against cache poisoning # Test if the candidates really match all properties for pname, pvalue in identifiable.properties.items(): @@ -604,7 +678,8 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): raise RuntimeError( f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." - f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") + f"Identifiable:\n{identifiable.record_type}{identifiable.properties}" + ) if len(candidates) == 0: return None return candidates[0] diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py index c6b5de27d7f498d9b1db6b6a90d986487340a880..d85883011db3cf651da0dda6c110015128fbe439 100644 --- a/src/caoscrawler/macros/macro_yaml_object.py +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -25,12 +25,17 @@ # Function to expand a macro in yaml # A. Schlemmer, 05/2022 +import re from dataclasses import dataclass from typing import Any, Dict from copy import deepcopy from string import Template +_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$") +_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$") + + @dataclass class MacroDefinition: """ @@ -53,6 +58,12 @@ def substitute(propvalue, values: dict): Substitution of variables in strings using the variable substitution library from python's standard library. """ + # Simple matches are simply replaced by the raw dict entry. 
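+    # A hypothetical illustration: with values = {"obj": {"a": 1}},
+    # substitute("$obj", values) returns the dict {"a": 1} itself, while
+    # substitute("x_$obj", values) matches neither pattern and falls through
+    # to the string-based Template substitution below.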
+ if match := (_SAFE_SUBST_PAT.fullmatch(propvalue) + or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)): + key = match.group("key") + if key in values: + return values[key] propvalue_template = Template(propvalue) return propvalue_template.safe_substitute(**values) diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 9d1f538732858ff2fbf949d45c359ebb16fe3480..9f8f5e40beb729d73151bad38f3e390a4a8cecb4 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -62,11 +62,10 @@ def load_definition(crawler_definition_path: str): """ # Load the cfood from a yaml file: - with open(crawler_definition_path, "r") as f: + with open(crawler_definition_path, encoding="utf-8") as f: crawler_definitions = list(yaml.safe_load_all(f)) - crawler_definition = _load_definition_from_yaml_dict( - crawler_definitions) + crawler_definition = _load_definition_from_yaml_dict(crawler_definitions) return _resolve_validator_paths(crawler_definition, crawler_definition_path) @@ -362,16 +361,19 @@ def scanner(items: list[StructureElement], debug_tree.debug_metadata["usage"][str(element)].add( "/".join(converters_path + [converter.name])) mod_info = debug_tree.debug_metadata["provenance"] - for record_name, prop_name in keys_modified: - # TODO: check - internal_id = record_store_copy.get_internal_id( - record_name) - record_identifier = record_name + \ - "_" + str(internal_id) - converter.metadata["usage"].add(record_identifier) - mod_info[record_identifier][prop_name] = ( - structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + # TODO: actually keys_modified must not be None. create_records should + # always return a list. + if keys_modified is not None: + for record_name, prop_name in keys_modified: + # TODO: check + internal_id = record_store_copy.get_internal_id( + record_name) + record_identifier = record_name + \ + "_" + str(internal_id) + converter.metadata["usage"].add(record_identifier) + mod_info[record_identifier][prop_name] = ( + structure_elements_path + [element.get_name()], + converters_path + [converter.name]) scanner(children, converter.converters, general_store_copy, record_store_copy, diff --git a/src/caoscrawler/scripts/__init__.py b/src/caoscrawler/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8e6e39cc03e9be1923d72ec5c8d699c01fa8f9 --- /dev/null +++ b/src/caoscrawler/scripts/generators.py @@ -0,0 +1,247 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +"""Scripts and functions to generate datamodel yaml files and cfood skeletons. + +For example from actual data files. +""" + +import argparse +import csv +from collections import OrderedDict +from string import Template +from typing import Optional + +import pandas as pd +import yaml + + +DM_TEMPLATE = """# auto-generated data model from file "[]{infile}". +# To insert a datamodel into LinkAhead, run: +# +# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync +""" + +HEADER_RT = """ +############### +# RecordTypes # +############### + +DummyRT: + description: Note: Change name and enter description. + recommended_properties: + """ + +CFOOD_TEMPLATE = """ +--- +metadata: + macros: + - !defmacro + # Simple column value -> property rule + name: ColumnValue + params: + name: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${name}: $$val + - !defmacro + # column value -> reference property + name: ColumnValueReference + params: + name: null + reftype: null # RecordType of the reference + belongsto: BaseElement + type: TextElement # References are always text, right? + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${reftype}: + name: $$val + ${belongsto}: + ${name}: $$${reftype} + - !defmacro + # Same as "ColumnValue", but also give name of property. + name: ColumnValuePropname + params: + name: null + propname: null + belongsto: BaseElement + type: TextElement + definition: + ${name}: + type: ${type} + match_name: ^${name}$$ + match_value: (?P<val>.*) + records: + ${belongsto}: + ${propname}: $$val +--- +directory: # corresponds to the directory given to the crawler + type: Directory + match: .* # we do not care how it is named here + records: + DirRecord: # One record for each directory. + subtree: + # This is the file + thisfile: + type: []{file} + match: []{match} + records: + DatFileRecord: # One record for each matching file + role: File + path: $thisfile + file: $thisfile + subtree: + entry: + type: Dict + match: .* # Name is irrelevant + records: + BaseElement: # One BaseElement record for each row in the CSV/TSV file + DatFileRecord: $DatFileRecord + DirRecord: + BaseElement: +$BaseElement + subtree: !macro +""" + + +class _CustomTemplate(Template): + delimiter = "[]" # "$" is used too much by the yaml template. + + +def csv_to_datamodel(infile: str, outfile: str, cfood: Optional[str] = None): + """Parse the input csv and create basic datamodel in ``outfile``. + +Parameters +---------- +cfood: str + If given, also create a cfood skeleton. 
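+
+A hypothetical call (file names are placeholders)::
+
+    csv_to_datamodel("measurements.csv", "datamodel.yaml", cfood="cfood.yml")
+
+Assuming the console script is installed under the name used by
+``csv_to_datamodel_main`` below, the equivalent command line would be::
+
+    csv_to_datamodel -i measurements.csv -o datamodel.yaml --cfood cfood.yml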
+ """ + sniffer = csv.Sniffer() + with open(infile, encoding="utf-8") as f_infile: + max_sniff = 50000 + sniffed = sniffer.sniff(f_infile.read(max_sniff)) + df = pd.read_table(infile, sep=sniffed.delimiter, quotechar=sniffed.quotechar, + escapechar=sniffed.escapechar) + + properties = OrderedDict() + for colname in df.columns: + column = df[colname] + dtype: Optional[str] = "TEXT" + if pd.api.types.is_bool_dtype(column.dtype): + dtype = "BOOLEAN" + if pd.api.types.is_float_dtype(column.dtype): + dtype = "DOUBLE" + elif pd.api.types.is_integer_dtype(column.dtype): + dtype = "INTEGER" + properties[colname] = { + "datatype": dtype + } + + result = (_CustomTemplate(DM_TEMPLATE).substitute({"infile": infile}) + + HEADER_RT + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True)) + ) + with open(outfile, encoding="utf-8", mode="w") as myfile: + myfile.write(result) + + ################# + # cfood section # + ################# + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? + if propdef["datatype"] == "BOOLEAN": + dtype = "BooleanElement" + elif propdef["datatype"] == "INTEGER": + dtype = "IntegerElement" + elif propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] == "TEXT": + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + sep = repr(sniffed.delimiter) + sep = f'"{sep[1:-1]}"' + match_str = f"""'.*[ct]sv' + sep: {sep} + # "header": [int] + # "names": [str] + # "index_col": [int] + # "usecols": [int] + # "true_values": [str] + # "false_values": [str] + # "na_values": [str] + # "skiprows": [int] + # "nrows": [int] + # "keep_default_na": [bool] + """ + + cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter", + "match": match_str}) + + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_args_csv(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description="Create datamodel and cfood from CSV files.") + parser.add_argument('-i', '--input', help="The input file.", required=True, dest="infile") + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def csv_to_datamodel_main(): + """The main function for csv data handling.""" + args = _parse_args_csv() + csv_to_datamodel(**vars(args)) diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py index ff070626ebfdd580c16bbbf2dc30ab330dc162f0..0efba91c185446e0bfbecbbb53f68aaa8a8e15d1 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements.py @@ -23,7 +23,6 @@ # ** end header # -from typing import Dict as tDict import warnings @@ -39,7 +38,7 @@ name: str def __init__(self, name: str): # Used to store usage information for debugging: - 
self.metadata: tDict[str, set[str]] = { + self.metadata: dict[str, set[str]] = { "usage": set() } diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..9c021a10f35e95ca56d45151b8d064ec905993ec --- /dev/null +++ b/src/caoscrawler/sync_graph.py @@ -0,0 +1,719 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +A data model class for the graph of entities that shall be created during synchronization of the +crawler. +""" + +from __future__ import annotations + +import logging +from typing import Any, Optional, Union, Callable + +import linkahead as db +from linkahead.cached import cached_get_entity_by +from linkahead.exceptions import EmptyUniqueQueryError + +from .identifiable_adapters import IdentifiableAdapter +from .identifiable import Identifiable +from .sync_node import SyncNode, TempID + +import re + +logger = logging.getLogger(__name__) + + +def _set_each_scalar_value( + node: SyncNode, condition: Callable[[Any], bool], value: Any +): + """helper function that conditionally replaces each value element of each property of a node + + If the property value is a list, the replacement is done for each list entry. + The replacement is only performed if the condition that + is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable + ``condition`` must take the property value (or list element) as the sole argument. + + Args: + node (SyncNode): The node which provides the properties (and their values) to operate on. + condition (Callable): A function with one argument which is interpreted as a condition: + Only if it returns True for the property value, the action is + executed. + value (Callable): A function returning a new value that is set as the property value. This + function receives the old value as the single argument. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for p in node.properties: + if isinstance(p.value, list): + for ii, el in enumerate(p.value): + if condition(el): + p.value[ii] = value(el) + elif condition(p.value): + p.value = value(p.value) + + +class SyncGraph: + """ + A data model class for the graph of entities that shall be created during synchronization of + the crawler. + + The SyncGraph combines nodes in the graph based on their identity in order to create a graph of + objects that can either be inserted or updated in(to) the remote server. This combination of + SyncNodes happens during initialization and later on when the ID of SyncNodes is set. + + When the SyncGraph is initialized, the properties of given entities are scanned and used to + create multiple reference maps that track how SyncNodes reference each other. 
+    These maps are kept up to date when SyncNodes are merged because they are identified with each
+    other. During initialization, SyncNodes are first merged based on their ID, path or
+    identifiable.
+
+    When additional information is added to the graph by setting the ID of a node
+    (via `set_id_of_node`) then the graph is updated accordingly:
+    - if this information implies that the node is equivalent to another node (e.g. has same ID),
+      then they are merged
+    - if a node is known not to exist on the remote server, this might imply that some other node
+      does not exist either, namely if its identity relies on the former.
+    - The new ID might make it possible to create the identifiables of connected nodes and thus
+      might trigger further merging of nodes based on the new identifiables.
+
+    A SyncGraph should only be manipulated via one function:
+    - set_id_of_node: a positive integer means the Entity exists, None means it is missing
+    TODO what about String IDs
+
+    The SyncGraph can be converted back to lists of entities which allow performing the desired
+    inserts and updates.
+
+    Usage:
+    - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the
+      graph.
+    - SyncNodes that can be merged are automatically merged and SyncNodes where the existence can
+      be determined are automatically removed from the list of unchecked SyncNodes:
+      graph.unchecked.
+    - You manipulate the graph by setting the ID of a SyncNode (either to a valid ID or to None).
+      For example, you can check whether a SyncNode has an identifiable and then query the remote
+      server and use the result to set the ID.
+    - After each manipulation, the graph updates accordingly (see above).
+    - Ideally, the unchecked list is empty after some manipulation.
+    - You can export a list of entities to be inserted and one of entities to be updated with
+      export_record_lists.
+
+    Last review by Alexander Schlemmer on 2024-05-24.
+    """
+
+    # General implementation remark:
+    # There are three cases where an update of one SyncNode can affect other nodes:
+    # - mark existing (add identifiables)
+    # - mark missing (add identifiables and add (negative) IDs)
+    # - merge (add identifiables)
+    #
+    # We cannot get an infinite recursion where one update triggers another update and so on
+    # because updates are conditional:
+    # Setting an ID removes the node (immediately) from the unchecked list, and an ID is only set
+    # in _mark_missing if a node is in the unchecked list. Thus, setting the ID once
+    # prevents future attempts to set the ID of the same node.
+    # Also, setting an identifiable is only done when needed, i.e. there is no identifiable.
+    # Note that whenever one node is changed, we check all dependent nodes (see usage of
+    # `_get_nodes_whose_identity_relies_on`) whether something should be updated. Thus, we cannot
+    # miss a necessary update.
+    def __init__(
+        self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter
+    ):
+        self.identifiableAdapter = identifiableAdapter
+        # A dictionary allowing for quick lookup of sync nodes using their (possibly negative) IDs.
+        # This dictionary is initially set using _mark_entities_with_path_or_id and later updated
+        # using set_id_of_node or during merges of nodes.
+        self._id_look_up: dict[Union[int, TempID, str], SyncNode] = {}
+        # Same as above, but for looking up nodes using paths
+        self._path_look_up: dict[str, SyncNode] = {}
+        # Same as above, but for looking up nodes using identifiables.
This dictionary uses the text
+        # representation generated by the get_representation method of Identifiable as keys.
+        self._identifiable_look_up: dict[str, SyncNode] = {}
+        # look up for the nodes that were marked as being missing (on the remote server)
+        self._missing: dict[int, SyncNode] = {}
+        # same for existing
+        self._existing: dict[int, SyncNode] = {}
+        # entities that are missing get negative IDs to allow identifiable creation
+        self._remote_missing_counter = -1
+
+        self.nodes: list[SyncNode] = []
+        self._initialize_nodes(entities)  # list of all SyncNodes
+        # list of all SyncNodes that have not yet been checked
+        self.unchecked = list(self.nodes)
+
+        # initialize reference mappings (see _create_reference_mapping)
+        (
+            self.forward_references,  # id(node) -> full set of nodes referenced by the given node
+            self.backward_references,  # id(node) -> full set of nodes referencing the given node
+            # as above, subset where the reference properties are part of identifiables
+            self.forward_references_id_props,
+            self.backward_references_id_props,
+            # as above, subset where references are part of identifiables due to "referenced_by"
+            self.forward_references_backref,
+            self.backward_references_backref,
+        ) = self._create_reference_mapping(self.nodes)
+
+        # remove entities with path or ID from unchecked list
+        self._mark_entities_with_path_or_id()
+
+        # add identifiables where possible
+        for node in list(self.nodes):
+            if self._identifiable_is_needed(node):
+                self._set_identifiable_of_node(node)
+
+        # Everything that is still unchecked has neither an ID nor a path.
+        # Thus, it must be possible to create an identifiable for it,
+        # which is checked using the following function:
+        for node in self.unchecked:
+            self.identifiableAdapter.all_identifying_properties_exist(node)
+
+    def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None):
+        """sets the ID attribute of the given SyncNode to node_id.
+
+        If node_id is None, a negative ID will be
+        given indicating that the node does not exist on the remote server.
+        Furthermore, it will be marked as missing using _mark_missing.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
+        """
+        if node.id is not None:
+            raise RuntimeError(
+                "Cannot update ID.\n"
+                f"It already is {node.id} and shall be set to {node_id}."
+            )
+        if node_id is None:
+            node_id = TempID(self._get_new_id())
+        node.id = node_id
+        if node_id in self._id_look_up:
+            self._merge_into(node, self._id_look_up[node.id])
+        else:
+            self._id_look_up[node.id] = node
+            if isinstance(node.id, TempID):
+                self._mark_missing(node)
+            else:
+                self._mark_existing(node)
+
+    def export_record_lists(self):
+        """exports the SyncGraph in the form of db.Entities
+
+        All nodes are converted to db.Entity objects and reference values that are SyncNodes are
+        replaced by their corresponding (newly created) db.Entity objects.
+
+        Since the result is returned in the form of two lists, one with Entities that have a valid
+        ID and one with those that do not, an error is raised if there are any SyncNodes without a
+        (possibly negative) ID.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
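+
+        Example (a sketch; assumes that all nodes have already been checked)::
+
+            to_insert, to_update = graph.export_record_lists()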
+        """
+        # TODO reactivate once the implementation is appropriate
+        # if len(self.unchecked) > 1:
+        # self.unchecked_contains_circular_dependency()
+
+        for el in self.nodes:
+            if el.id is None:
+                raise RuntimeError("Exporting unchecked entities is not supported")
+
+        entities = []
+        node_map = {}
+        for el in self.nodes:
+            entities.append(el.export_entity())
+            node_map[id(el)] = entities[-1]
+
+        for ent in entities:
+            _set_each_scalar_value(
+                ent,
+                condition=lambda val: isinstance(val, SyncNode),
+                value=lambda val: node_map[id(val)],
+            )
+
+        missing = [el for el in entities if el.id < 0]
+        existing = [el for el in entities if el.id > 0]
+        # remove negative IDs
+        for el in missing:
+            el.id = None
+
+        return (missing, existing)
+
+    def _identity_relies_on_unchecked_entity(self, node: SyncNode):
+        """
+        Returns True if the identifying properties contain a record for which it could not yet be
+        verified whether it exists in LinkAhead; otherwise False.
+
+        Last review by Alexander Schlemmer on 2024-05-27.
+        """
+
+        return any(
+            [
+                id(ent) not in self._missing and id(ent) not in self._existing
+                for ent in self.forward_references_id_props[id(node)]
+            ]
+            + [
+                id(ent) not in self._missing and id(ent) not in self._existing
+                for ent in self.backward_references_backref[id(node)]
+            ]
+        )
+
+    def unchecked_contains_circular_dependency(self):
+        """
+        Detects whether there are circular references in the given entity list and returns a list
+        where the entities are ordered according to the chain of references (only the entities
+        contained in the circle are included). Returns None if no circular dependency is found.
+
+        TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only
+        consider references that are identifying properties.
+        """
+        raise NotImplementedError("This function is not yet properly implemented")
+        # TODO if the first element is not part of the circle, then
+        # this will not work
+        # We must create a better implementation (see also TODO in docstring)
+        circle = [self.unchecked[0]]
+        closed = False
+        while not closed:
+            added_to_circle = False
+            for referenced in self.forward_references[id(circle[-1])]:
+                if referenced in self.unchecked:
+                    if referenced in circle:
+                        closed = True
+                    circle.append(referenced)
+                    added_to_circle = True
+            if not added_to_circle:
+                return None
+        return circle
+
+    def get_equivalent(self, entity: SyncNode) -> Optional[SyncNode]:
+        """
+        Return an equivalent SyncNode.
+
+        Equivalent means that ID, path or identifiable are the same.
+        If new information was added to the given SyncNode (e.g. the ID), it might be possible
+        then to identify an equivalent node (i.e. one with the same ID in this example).
+        There might be more than one equivalent node in the graph. However, simply the first one
+        found is returned. (When an equivalent node is found, the given node is
+        typically merged into the one that was found, and after the merge the graph is again
+        checked for equivalent nodes.)
+
+        Returns None if no equivalent node is found.
+
+        Last review by Alexander Schlemmer on 2024-05-28.
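+
+        Example (sketch)::
+
+            equivalent = graph.get_equivalent(node)
+            if equivalent is not None:
+                # "node" denotes the same entity and may be merged into "equivalent"
+                ...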
+ """ + if entity.id is not None and entity.id in self._id_look_up: + candidate = self._id_look_up[entity.id] + if candidate is not entity: + return candidate + if entity.path is not None and entity.path in self._path_look_up: + candidate = self._path_look_up[entity.path] + if candidate is not entity: + return candidate + if ( + entity.identifiable is not None + and entity.identifiable.get_representation() in self._identifiable_look_up + ): + candidate = self._identifiable_look_up[ + entity.identifiable.get_representation() + ] + if candidate is not entity: + return candidate + return None + + def _get_new_id(self): + """returns the next unused temporary ID + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._remote_missing_counter -= 1 + return self._remote_missing_counter + + def _set_identifiable_of_node( + self, node: SyncNode, identifiable: Optional[Identifiable] = None + ): + """sets the identifiable and checks whether an equivalent node can be found with that new + information. If an equivalent node is found, 'node' is merged into that node. + + if no identifiable is given, the identifiable is retrieved from the identifiable adapter + + Raises a ValueError if the equivalent node found does not have an identifiable. + Raises a RuntimeError if there is no equivalent node found and + the (unique) string representation of the identifiable of node is already contained in + the identifiable_look_up. + + Last review by Alexander Schlemmer on 2024-05-29. + """ + if identifiable is None: + self.identifiableAdapter.all_identifying_properties_exist(node) + identifiable = self.identifiableAdapter.get_identifiable( + node, self.backward_references_backref[id(node)] + ) + node.identifiable = identifiable + equivalent_se = self.get_equivalent(node) + if equivalent_se is not None: + self._merge_into(node, equivalent_se) + else: + if node.identifiable.get_representation() in self._identifiable_look_up: + raise RuntimeError("Identifiable is already in the look up") + self._identifiable_look_up[node.identifiable.get_representation()] = node + + @staticmethod + def _sanity_check(entities: list[db.Entity]): + """ + Checks whether each record in entities has at least one parent. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for ent in entities: + if ent.role == "Record" and len(ent.parents) == 0: + raise ValueError(f"Records must have a parent.\n{ent}") + if isinstance(ent.id, int) and ent.id < 0: + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + if isinstance(ent.id, str) and re.match(r"^-\d+$", ent.id): + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + + def _get_nodes_whose_identity_relies_on(self, node: SyncNode): + """returns a set of nodes that reference the given node as identifying property or are + referenced by the given node and the parent of the given node is listed as + "is_referenced_by" + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return self.backward_references_id_props[id(node)].union( + self.forward_references_backref[id(node)] + ) + + @staticmethod + def _create_flat_list( + ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None + ): + """ + Recursively adds entities and all their properties contained in ent_list to + the output list flat. + + TODO: This function will be moved to pylib as it is also needed by the + high level API. + + Last review by Alexander Schlemmer on 2024-05-29. 
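+
+        Example (a sketch with two hypothetical records)::
+
+            a = db.Record(name="A").add_parent("RT")
+            b = db.Record(name="B").add_parent("RT")
+            a.add_property(name="ref", value=b)
+            flat = SyncGraph._create_flat_list([a])
+            # flat now contains both records: [a, b]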
+        """
+        # Note: A set would be useful here, but we do not want a random order.
+        if flat is None:
+            flat = list()
+        for el in ent_list:
+            if el not in flat:
+                flat.append(el)
+        for ent in ent_list:
+            for p in ent.properties:
+                # For lists append each element that is of type Entity to flat:
+                if isinstance(p.value, list):
+                    for el in p.value:
+                        if isinstance(el, db.Entity):
+                            if el not in flat:
+                                flat.append(el)
+                                SyncGraph._create_flat_list([el], flat)
+                elif isinstance(p.value, db.Entity):
+                    if p.value not in flat:
+                        flat.append(p.value)
+                        SyncGraph._create_flat_list([p.value], flat)
+        return flat
+
+    @staticmethod
+    def _create_reference_mapping(flat: list[SyncNode]):
+        """
+        Create six dictionaries that describe references among SyncNodes. All dictionaries use the
+        Python ID of SyncNodes as keys.
+        There is always one dictionary to describe the direction of the reference, i.e.
+        map[id(node)] -> other where other is a set of SyncNodes that are being referenced by node.
+        And then there is always one dictionary for the inverse direction. The two dictionaries are
+        named "forward_" and "backward_", respectively.
+
+        Then there are three kinds of maps being generated: One includes all references
+        ("_references"), one includes references that are values of identifying properties
+        ("_references_id_props") and one includes references that are relevant for identifying
+        backreferences/"is_referenced_by" ("_references_backref"). I.e. the latter two are subsets
+        of the former reference map.
+
+        Arguments:
+        ----------
+        flat: list[SyncNode]
+            all SyncNodes that span the graph for which the reference map shall be created
+
+        Last review by Alexander Schlemmer on 2024-05-29.
+        """
+        # TODO we need to treat children of RecordTypes somehow.
+        forward_references: dict[int, set[SyncNode]] = {}
+        backward_references: dict[int, set[SyncNode]] = {}
+        forward_references_id_props: dict[int, set[SyncNode]] = {}
+        backward_references_id_props: dict[int, set[SyncNode]] = {}
+        forward_references_backref: dict[int, set[SyncNode]] = {}
+        backward_references_backref: dict[int, set[SyncNode]] = {}
+
+        # initialize with empty sets
+        for node in flat:
+            forward_references[id(node)] = set()
+            backward_references[id(node)] = set()
+            forward_references_id_props[id(node)] = set()
+            backward_references_id_props[id(node)] = set()
+            forward_references_backref[id(node)] = set()
+            backward_references_backref[id(node)] = set()
+        for node in flat:
+            for p in node.properties:
+                val = p.value
+                if not isinstance(val, list):
+                    val = [val]
+                for v in val:
+                    if isinstance(v, SyncNode):
+                        forward_references[id(node)].add(v)
+                        backward_references[id(v)].add(node)
+                        if (
+                            node.registered_identifiable is not None
+                            and len(
+                                [
+                                    el.name
+                                    for el in node.registered_identifiable.properties
+                                    if el.name == p.name
+                                ]
+                            )
+                            > 0
+                        ):
+                            forward_references_id_props[id(node)].add(v)
+                            backward_references_id_props[id(v)].add(node)
+                        if (
+                            v.registered_identifiable is not None
+                            and IdentifiableAdapter.referencing_entity_has_appropriate_type(
+                                node.parents, v.registered_identifiable
+                            )
+                        ):
+                            forward_references_backref[id(node)].add(v)
+                            backward_references_backref[id(v)].add(node)
+
+        return (
+            forward_references,
+            backward_references,
+            forward_references_id_props,
+            backward_references_id_props,
+            forward_references_backref,
+            backward_references_backref,
+        )
+
+    def _mark_entities_with_path_or_id(self):
+        """A path or an ID is sufficiently identifying. 
Thus, those entities can be marked as
+        checked.
+
+        When this function returns, there is only one node for each ID (i.e. no two nodes with the
+        same ID). The same is true for paths.
+
+        This function also updates _id_look_up and _path_look_up.
+
+        Last review by Alexander Schlemmer on 2024-05-29.
+        """
+        for node in list(self.nodes):
+            if node.id is not None:
+                eq_node = self.get_equivalent(node)
+                if eq_node is not None:
+                    self._basic_merge_into(node, eq_node)
+                else:
+                    self._id_look_up[node.id] = node
+                    self._mark_existing(node)
+
+        for node in list(self.nodes):
+            if node.path is not None:
+                eq_node = self.get_equivalent(node)
+                if eq_node is not None:
+                    self._basic_merge_into(node, eq_node)
+                else:
+                    self._path_look_up[node.path] = node
+                    try:
+                        existing = cached_get_entity_by(path=node.path)
+                    except EmptyUniqueQueryError:
+                        existing = None
+                    remote_id = None
+                    if existing is not None:
+                        remote_id = existing.id
+                    self.set_id_of_node(node, remote_id)
+
+    def _basic_merge_into(self, source: SyncNode, target: SyncNode):
+        """tries to merge source into target and updates member variables
+
+        - reference maps are updated
+        - self.nodes is updated
+        - self.unchecked is updated
+        - lookups are being updated
+        """
+        # sanity checks
+        if source is target:
+            raise ValueError("source must not be target")
+
+        target.update(source)
+
+        # replace actual reference property values
+        for node in self.backward_references[id(source)]:
+            _set_each_scalar_value(
+                node, condition=lambda val: val is source, value=lambda val: target
+            )
+
+        # update reference mappings
+        for setA, setB in (
+            (self.forward_references, self.backward_references),  # ref: source -> other
+            (self.backward_references, self.forward_references),  # ref: other -> source
+            (self.forward_references_id_props, self.backward_references_id_props),
+            (self.backward_references_id_props, self.forward_references_id_props),
+            (self.forward_references_backref, self.backward_references_backref),
+            (self.backward_references_backref, self.forward_references_backref),
+        ):
+            for node in setA.pop(id(source)):
+                setA[id(target)].add(node)
+                setB[id(node)].remove(source)
+                setB[id(node)].add(target)
+
+        # remove unneeded SyncNode
+        self.nodes.remove(source)
+        if source in self.unchecked:
+            self.unchecked.remove(source)
+        # update look ups
+        if target.id is not None:
+            self._id_look_up[target.id] = target
+        if target.path is not None:
+            self._path_look_up[target.path] = target
+        if target.identifiable is not None:
+            self._identifiable_look_up[target.identifiable.get_representation()] = target
+
+    def _merge_into(self, source: SyncNode, target: SyncNode):
+        """tries to merge source into target and performs the necessary updates:
+        - updates the member variables of target using source (``target.update(source)``).
+        - replaces reference values to source by target
+        - updates the reference map
+        - updates lookup tables
+        - removes source from node lists
+        - marks target as missing/existing if source was marked that way
+        - adds an identifiable if now possible (e.g. merging based on ID might allow creating an
+          identifiable when neither of the two nodes had sufficient properties on its own before)
+        - checks whether dependent nodes can now get an identifiable (the merge might have set the
+          ID such that dependent nodes can now create an identifiable)
+
+        Last review by Alexander Schlemmer on 2024-05-29.
+        """
+        self._basic_merge_into(source, target)
+
+        if (id(source) in self._existing and id(target) in self._missing) or (
+            id(target) in self._existing and id(source) in self._missing
+        ):
+            raise RuntimeError("Trying to merge missing and existing")
+
+        if id(source) in self._missing and id(target) not in self._missing:
+            self._mark_missing(target)
+        elif id(source) in self._existing and id(target) not in self._existing:
+            self._mark_existing(target)
+
+        # due to the merge it might now be possible to create an identifiable
+        if self._identifiable_is_needed(target):
+            self._set_identifiable_of_node(target)
+        # This is one of three cases that affect other nodes:
+        # - mark existing
+        # - mark missing
+        # - merge
+        self._add_identifiables_to_dependent_nodes(target)
+
+        eq_node = self.get_equivalent(target)
+        if eq_node is not None:
+            self._merge_into(target, eq_node)
+
+    def _identifiable_is_needed(self, node: SyncNode):
+        """
+        This function checks that:
+        - the identifiable of node is None
+        - the node has all properties that are needed for the identifiable
+        - there are no unchecked entities that are needed for the identifiable of the node,
+          neither as forward nor as backward references
+
+        Last review by Alexander Schlemmer on 2024-05-24.
+        """
+        return (
+            node.identifiable is None
+            and not self._identity_relies_on_unchecked_entity(node)
+            and self.identifiableAdapter.all_identifying_properties_exist(
+                node, raise_exception=False
+            )
+        )
+
+    def _initialize_nodes(self, entities: list[db.Entity]):
+        """create initial set of SyncNodes from provided Entity list"""
+        self._sanity_check(entities)
+        entities = self._create_flat_list(entities)
+        se_lookup: dict[int, SyncNode] = {}  # lookup: python id -> SyncNode
+
+        # Create new sync nodes from the list of entities, their registered identifiables
+        # are set from the identifiable adapter.
+        for el in entities:
+            self.nodes.append(
+                SyncNode(el, self.identifiableAdapter.get_registered_identifiable(el))
+            )
+            se_lookup[id(el)] = self.nodes[-1]
+
+        # replace db.Entity objects with SyncNodes in references:
+        for node in self.nodes:
+            _set_each_scalar_value(
+                node,
+                condition=lambda val: id(val) in se_lookup,
+                value=lambda val: se_lookup[id(val)],
+            )
+
+    def _add_identifiables_to_dependent_nodes(self, node):
+        """For each dependent node, we check whether this allows creating an identifiable
+
+        Last review by Alexander Schlemmer on 2024-05-29.
+        """
+        for other_node in self._get_nodes_whose_identity_relies_on(node):
+            if self._identifiable_is_needed(other_node):
+                self._set_identifiable_of_node(other_node)
+
+    def _mark_missing(self, node: SyncNode):
+        """Mark a sync node as missing and remove it from the list of unchecked nodes.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
+        """
+        self._missing[id(node)] = node
+        self.unchecked.remove(node)
+
+        # This is one of three cases that affect other nodes:
+        # - mark existing
+        # - mark missing
+        # - merge
+        self._add_identifiables_to_dependent_nodes(node)
+        # For each dependent node, we set the ID to None (missing)
+        # (None is the default second argument of set_id_of_node.)
+        for other_node in self._get_nodes_whose_identity_relies_on(node):
+            if other_node in self.unchecked:
+                self.set_id_of_node(other_node)
+
+    def _mark_existing(self, node: SyncNode):
+        """Mark a sync node as existing and remove it from the list of unchecked nodes.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
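+
+        Example (sketch): this is typically triggered via ``set_id_of_node(node, remote_id)``
+        with a valid (positive) remote ID.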
+        """
+        if isinstance(node.id, TempID):
+            raise ValueError("ID must be that of a valid existing entity, not a TempID")
+        self._existing[id(node)] = node
+        self.unchecked.remove(node)
+        # This is one of three cases that affect other nodes:
+        # - mark existing
+        # - mark missing
+        # - merge
+        self._add_identifiables_to_dependent_nodes(node)
diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..141e743bffa09f0caf661bcd1939a4233cb7249c
--- /dev/null
+++ b/src/caoscrawler/sync_node.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Henrik tom Wörden
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import linkahead as db
+import yaml
+from linkahead.common.models import Parent, _ParentList, _Properties
+from warnings import warn
+
+from .exceptions import ImpossibleMergeError
+
+if TYPE_CHECKING:
+    from .identifiable import Identifiable
+
+logger = logging.getLogger(__name__)
+
+
+class TempID(int):
+    """A special kind of int for negative temporary IDs.
+
+    This allows identifying TempIDs in the presence of String IDs.
+    A string ID might look like a negative integer.
+    """
+    pass
+
+
+class SyncNode(db.Entity):
+    """represents the information of an Entity as it shall be created in LinkAhead
+
+    The following information is taken from a db.Entity object during initialization or when the
+    object is updated using the `update` member function:
+    - id
+    - role
+    - path
+    - file
+    - name
+    - description
+    - parents
+    - properties
+
+    Typically, this class is used in the following way:
+    1. A SyncNode is initialized with a db.Entity object.
+    2. The SyncNode object is possibly updated one or more times with other SyncNode objects.
+    3. A db.Entity object is created (`export_entity`) that contains the combined information.
+    """
+
+    def __init__(
+        self, entity: db.Entity, registered_identifiable: Optional[db.RecordType] = None,
+        **kwargs
+    ):
+        super().__init__(name=entity.name,
+                         id=entity.id,
+                         description=entity.description,
+                         **kwargs)
+        # db.Entity properties
+        self.role = entity.role
+        self.path = entity.path
+        self.file = entity.file
+        self.parents = _ParentList().extend(entity.parents)
+        self.properties = _Properties().extend(entity.properties)
+        self._check_for_multiproperties()
+        # other members
+        self.identifiable: Optional[Identifiable] = None
+        self.registered_identifiable = registered_identifiable
+
+    def update(self, other: SyncNode) -> None:
+        """update this node with information of given ``other`` SyncNode.
+
+        Parents are added if they are not yet in the list;
+        properties are added in any case. This may lead to duplication of properties.
+        We allow this duplication here and remove it when we create a db.Entity (export_entity
+        function) because if property values are SyncNode objects, they might not be comparable (no
+        ID, no identifiable) yet.
+        """
+
+        if other.identifiable is not None and self.identifiable is not None:
+            if (
+                other.identifiable.get_representation()
+                != self.identifiable.get_representation()
+            ):
+                raise ValueError(
+                    "The SyncNode that is used with update must have an equivalent"
+                    " identifiable. I.e. you cannot merge entities with differing identifiables."
+                    " The identifiables were:\n"
+                    f"{self.identifiable._create_hashable_string(self.identifiable)}\n"
+                    f"and\n{other.identifiable._create_hashable_string(other.identifiable)}."
+                )
+
+        if other.identifiable:
+            self.identifiable = other.identifiable
+        for attr in ["id", "role", "path", "file", "name", "description"]:
+            if other.__getattribute__(attr) is not None:
+                if self.__getattribute__(attr) is None:
+                    self.__setattr__(attr, other.__getattribute__(attr))
+                else:
+                    if self.__getattribute__(attr) != other.__getattribute__(attr):
+                        raise ImpossibleMergeError(
+                            f"Trying to update {attr} but this would lead to an "
+                            f"override of the value '{self.__getattribute__(attr)}' "
+                            f"by the value '{other.__getattribute__(attr)}'",
+                            pname=attr, values=(self.__getattribute__(attr),
+                                                other.__getattribute__(attr))
+                        )
+        for p in other.parents:
+            if not parent_in_list(p, self.parents):
+                self.parents.append(p)
+        for p in other.properties:
+            self.properties.append(p)
+
+    def export_entity(self) -> db.Entity:
+        """create a db.Entity object from this SyncNode
+
+        Properties are only added once (based on id or name). If values do not match, an Error is
+        raised. If values are SyncNode objects with IDs, they are considered equal if their IDs are
+        equal.
+        """
+        ent = None
+        if self.role == "Record":
+            ent = db.Record()
+        elif self.role == "File":
+            ent = db.File()
+        else:
+            raise RuntimeError("Invalid role")
+        for attr in ["id", "role", "path", "file", "name", "description"]:
+            ent.__setattr__(attr, self.__getattribute__(attr))
+        for p in self.parents:
+            ent.add_parent(p)
+        for p in self.properties:
+            entval: Any = ent.get_property(p)
+            if entval is None:
+                ent.add_property(id=p.id, name=p.name, value=p.value, description=p.description,
+                                 datatype=p.datatype, unit=p.unit)
+            else:
+                entval = entval.value
+                unequal = False
+                pval = p.value
+                if isinstance(entval, list) != isinstance(pval, list):
+                    unequal = True
+                if not isinstance(entval, list):
+                    entval = [entval]
+                if not isinstance(pval, list):
+                    pval = [pval]
+                if len(entval) != len(pval):
+                    unequal = True
+                else:
+                    for e_el, p_el in zip(entval, pval):
+                        if isinstance(e_el, SyncNode) and e_el.id is not None:
+                            e_el = e_el.id
+                        if isinstance(p_el, SyncNode) and p_el.id is not None:
+                            p_el = p_el.id
+                        if e_el != p_el:
+                            unequal = True
+
+                if unequal:
+                    logger.error(
+                        "The Crawler is trying to create an entity,"
+                        " but there are conflicting property values.\n"
+ f"Problematic Property: {p.name}\n" + f"First value:\n{entval}\n" + f"Second value:\n{pval}\n" + f"{self}" + ) + ime = ImpossibleMergeError( + "Cannot merge Entities", pname=p.name, values=(entval, pval) + ) + raise ime + return ent + + def __repr__(self) -> str: + """ somewhat concise text representation of the SyncNode """ + res = f"\n=====================================================\n{self.role}\n" + res += yaml.dump( + { + "id": self.id, + "name": self.name, + "path": self.path, + "parents": [el.name for el in self.parents], + }, + allow_unicode=True, + ) + res += "---------------------------------------------------\n" + res += "properties:\n" + d: dict[str, Any] = {} + for p in self.properties: + v = p.value + d[p.name] = [] + if not isinstance(p.value, list): + v = [v] + for el in v: + if isinstance(el, SyncNode): + d[p.name].append( + { + "id": el.id, + "name": el.name, + "path": el.path, + "parents": [e.name for e in el.parents], + } + ) + else: + d[p.name].append(el) + + return ( + res + + yaml.dump(d, allow_unicode=True) + + "=====================================================\n" + ) + + def _check_for_multiproperties(self): + """ warns if multiproperties are present """ + ids = set() + names = set() + for p in self.properties: + if p.name is not None: + if p.name in names: + warn("Multiproperties are not supported by the crawler.") + names.add(p.name) + if p.id is not None: + if p.id in ids: + warn("Multiproperties are not supported by the crawler.") + ids.add(p.id) + + +def parent_in_list(parent: Parent, plist: _ParentList) -> bool: + """helper function that checks whether a parent with the same name or ID is in the plist""" + missing = False + if parent.name is not None: + if parent.name not in plist._element_by_name: + missing = True + if parent.id is not None: + if str(parent.id) not in plist._element_by_id: + missing = True + return not missing + + +def property_in_list(prop: db.Property, plist: _Properties) -> bool: + """helper function that checks whether a property with the same name or ID is in the plist""" + missing = False + if prop.name is not None: + if prop.name not in plist._element_by_name: + missing = True + if prop.id is not None: + if str(prop.id) not in plist._element_by_id: + missing = True + return not missing diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py index eda9f3c2bc98c8d2561f152f9f6ddd422daee00a..ce08bc6bc05caa84f342cdc25f3243c5bab0b79c 100644 --- a/src/caoscrawler/transformer_functions.py +++ b/src/caoscrawler/transformer_functions.py @@ -20,9 +20,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Definition of default transformer functions. + +See https://docs.indiscale.com/caosdb-crawler/converters.html#transform-functions for more +information. + """ -Defnition of default transformer functions. -""" + +import datetime import re from typing import Any @@ -61,3 +66,36 @@ def replace(in_value: Any, in_parameters: dict): if not isinstance(in_value, str): raise RuntimeError("must be string") return in_value.replace(in_parameters['remove'], in_parameters['insert']) + + +def date_parse(in_value: str, params: dict) -> str: + """Transform text so that it is formatted in a way that LinkAhead can understand it. 
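+
+For example (a sketch): with ``date_format: "%d.%m.%Y"``, the input ``"24.05.2024"`` would be
+transformed into ``"2024-05-24"``.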
+
+Parameters
+==========
+
+- date_format: str, optional
+  A format string using the ``datetime`` specification:
+  https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
+    """
+    fmt_default = "%Y-%m-%d"
+    fmt = params.get("date_format", fmt_default)
+    dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default)
+    return dt_str
+
+
+def datetime_parse(in_value: str, params: dict) -> str:
+    """Transform text so that it is formatted in a way that LinkAhead can understand it.
+
+Parameters
+==========
+
+- datetime_format: str, optional
+  A format string using the ``datetime`` specification:
+  https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
+    """
+    fmt_default = "%Y-%m-%dT%H:%M:%S"
+    fmt = params.get("datetime_format", fmt_default)
+    dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default)
+    return dt_str
diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py
index c62f44eeaa75ca42579aa3d6ead437e901cd38ff..096fde9b573f4ff60995498144cad3589ce7dbb2 100644
--- a/src/caoscrawler/utils.py
+++ b/src/caoscrawler/utils.py
@@ -25,6 +25,9 @@
 
 # Some utility functions, e.g. for extending pylib.
 
+import sys
+from typing import Optional
+
 import linkahead as db
 
 
@@ -39,3 +42,30 @@ def has_parent(entity: db.Entity, name: str):
         if parent.name == name:
             return True
     return False
+
+
+def MissingImport(name: str, hint: str = "", err: Optional[Exception] = None) -> type:
+    """Factory for dummy classes that may be assigned to variables but raise an error on use."""
+    def _error():
+        error_msg = f"This class ({name}) cannot be used, because some libraries are missing."
+        if hint:
+            error_msg += "\n\n" + hint
+
+        if err:
+            print(error_msg, file=sys.stdout)
+            raise RuntimeError(error_msg) from err
+        raise RuntimeError(error_msg)
+
+    class _Meta(type):
+        def __getattribute__(cls, *args, **kwargs):
+            _error()
+
+        def __call__(cls, *args, **kwargs):
+            _error()
+
+    class _DummyClass(metaclass=_Meta):
+        pass
+
+    _DummyClass.__name__ = name
+
+    return _DummyClass
diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py
index fdc8323452cd190cc3628efa57c15992f30fabeb..0b72dd65116fbc102a4dc2492d726698cad5a13b 100644
--- a/src/caoscrawler/version.py
+++ b/src/caoscrawler/version.py
@@ -17,11 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
-try:
-    from importlib import metadata as importlib_metadata
-except ImportError:  # Python<3.8 dowesn"t support this so use
-    import importlib_metadata
-
+from importlib import metadata as importlib_metadata
 from packaging.version import parse as parse_version
 from warnings import warn
 
@@ -43,7 +39,7 @@ def check_cfood_version(metadata: dict):
 
     if not metadata or "crawler-version" not in metadata:
         msg = """
-No crawler version specified in cfood definition, so there is now guarantee that
+No crawler version specified in cfood definition, so there is no guarantee that
 the cfood definition matches the installed crawler version.
 
 Specifying a version is highly recommended to ensure that the definition works
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 7100bcd1790edb3e040a1a90663a32a09b7c8eaf..770731857112b93205f0e80d623fa9183c4aa885 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -1,3 +1,4 @@
+========
 Concepts
 ========
 
@@ -5,6 +6,10 @@ The CaosDB Crawler can handle any kind of hierarchical data structure. The typic
 directory tree that is traversed. 
We use the following terms/concepts to describe how the CaosDB Crawler works.
 
+Basics
+======
+
+
 Structure Elements
 ++++++++++++++++++
 
@@ -29,7 +34,7 @@ existing StructureElements, Converters create a tree of StructureElements.
 .. image:: img/converter.png
   :height: 170
 
-See :std:doc:`converters<converters>` for details.
+See the chapter :std:doc:`Converters<converters>` for details.
 
 
 Relevant sources in:
@@ -183,8 +188,7 @@ TODO
 Caching
 +++++++
 
-The Crawler uses the cached library function ``cached_get_entity_by``. The
-cache is cleared automatically, when the Crawler does updates, but if you would
-run the same Python process indefinetely the Crawler would not see changes due
-to the Cache. Thus, please make sure to clear the cache if you create long
-running Python processes.
+The Crawler uses the cached library function ``cached_get_entity_by``. The cache is cleared
+automatically when the Crawler does updates, but if you ran the same Python process indefinitely,
+the Crawler would not see changes in LinkAhead due to the cache. Thus, please make sure to clear the
+cache if you create long running Python processes.
diff --git a/src/doc/conf.py b/src/doc/conf.py
index 3cce99d03728d229c848ba6374d15de9fe73ec7b..3248726ed63dd80fdee7c06da3c27caace93f22c 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -53,6 +53,7 @@ extensions = [
     'sphinx.ext.autosectionlabel',
     'sphinx.ext.intersphinx',
     'sphinx.ext.napoleon',     # For Google style docstrings
+    "sphinx.ext.todo",
     "recommonmark",  # For markdown files.
     "sphinx_rtd_theme",
 ]
@@ -213,6 +214,10 @@ intersphinx_mapping = {
 
 # TODO Which options do we want?
 autodoc_default_options = {
-    'members': None,
-    'undoc-members': None,
+    'members': True,
+    'undoc-members': True,
+    'member-order': 'bysource',
+    'special-members': "__init__",
 }
+
+todo_include_todos = True
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 9b28c9a61eec4d9707b9640720b9c6a44a8fe25e..d7e11c235fafa1e42f53342a24255ceb0d275ed4 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -8,10 +8,6 @@ existing StructureElements, Converters create a tree of StructureElements.
 .. image:: img/converter.png
   :height: 170
 
-The ``cfood.yml`` definition also describes which
-Converters shall be used to treat the generated child StructureElements. The
-definition therefore itself also defines a tree.
-
 Each StructureElement in the tree has a set of properties, organized as
 key-value pairs.
 Some of those properties are specified by the type of StructureElement. For example,
@@ -19,15 +15,18 @@ a file could have the file name as property: ``'filename': myfile.dat``.
 Converters may define additional functions that create further values. For
 example, a regular expression could be used to get a date from a file name.
 
+CFood definition
+++++++++++++++++
 
-A converter is defined via a yml file or part of it. The definition states
-what kind of StructureElement it treats (typically one).
-Also, it defines how children of the current StructureElement are
-created and what Converters shall be used to treat those.
+Converter application to data is specified via a tree-like yml file (called ``cfood.yml``, by
+convention). The yml file specifies which Converters shall be used on which StructureElements, and
+how to treat the generated *child* StructureElements.
 
 The yaml definition may look like this:
 
-TODO: outdated, see cfood-schema.yml
+.. todo::
+
+  This is outdated, see ``cfood-schema.yml`` for the current specification of a ``cfood.yml``.
 
 .. 
code-block:: yaml @@ -47,13 +46,18 @@ TODO: outdated, see cfood-schema.yml subtree: (...) -The **<NodeName>** is a description of what it represents (e.g. -'experiment-folder') and is used as identifier. +The **<NodeName>** is a description of what the current block represents (e.g. +``experiment-folder``) and is used as an identifier. **<type>** selects the converter that is going to be matched against the current structure element. If the structure element matches (this is a combination of a typecheck and a detailed -match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used -to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`. +match, see the :py:class:`~caoscrawler.converters.Converter` source documentation for details), the +converter will: + +- generate records (with :py:meth:`~caoscrawler.converters.Converter.create_records`) +- possibly process a subtree (with :py:meth:`caoscrawler.converters.Converter.create_children`) + +**match** *TODO* **records** is a dict of definitions that define the semantic structure (see details below). @@ -151,6 +155,9 @@ The following StructureElement types are typically created by the DictElement co - ListElement - DictElement +Note that you may use ``TextElement`` for anything that exists in a text format that can be +interpreted by the server, such as date and datetime strings in ISO-8601 format. + Scalar Value Converters ======================= `BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and @@ -253,13 +260,13 @@ HDF5 Converters For treating `HDF5 Files <https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total -four individual converters corresponding to the internal structure of HDF5 files: -the :ref:`H5FileConverter` which opens the file itself and creates further -structure elements from HDF5 groups, datasets, and included multi-dimensional -arrays that are in turn treated by the :ref:`H5GroupConverter`, the -:ref:`H5DatasetConverter`, and the :ref:`H5NdarrayConverter`, respectively. You -need to install the LinkAhead crawler with its optional ``h5crawler`` dependency -for using these converters. +four individual converters corresponding to the internal structure of HDF5 +files: the :ref:`H5FileConverter` which opens the file itself and creates +further structure elements from HDF5 groups, datasets, and included +multi-dimensional arrays that are in turn treated by the +:ref:`H5GroupConverter`, the :ref:`H5DatasetConverter`, and the +:ref:`H5NdarrayConverter`, respectively. You need to install the LinkAhead +crawler with its optional ``h5-crawler`` dependency for using these converters. The basic idea when crawling HDF5 files is to treat them very similar to :ref:`dictionaries <DictElement Converter>` in which the attributes on root, diff --git a/src/doc/getting_started/furtherreading.rst b/src/doc/getting_started/furtherreading.rst index eb600416c1fce3857d28fc2e856ceabebb3a8bb7..8d8d3ecc4b5575f71e90e9e5a17b060a63403a07 100644 --- a/src/doc/getting_started/furtherreading.rst +++ b/src/doc/getting_started/furtherreading.rst @@ -6,3 +6,4 @@ Further reading - Some useful examples can be found in the `integration tests <https://gitlab.com/caosdb/caosdb-crawler/-/tree/main/integrationtests>`_ (and to a certain extent in the unit tests). 
+- TODO: Information on caching diff --git a/src/doc/getting_started/helloworld.md b/src/doc/getting_started/helloworld.md index 723fb88d08047350d9f4bc3d3d2bd84ec9b27efb..67fdf88974391ac6209f1010bfb4f2d883e51021 100644 --- a/src/doc/getting_started/helloworld.md +++ b/src/doc/getting_started/helloworld.md @@ -33,7 +33,7 @@ Then you can do the following interactively in (I)Python. But we recommend that copy the code into a script and execute it to spare yourself typing. ```python -import caosdb as db +import linkahead as db from datetime import datetime from caoscrawler import Crawler, SecurityMode from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter diff --git a/src/doc/getting_started/optionalfeatures.rst b/src/doc/getting_started/optionalfeatures.rst index d326d7fce6f77a0278c9f2d05a641888203a2089..7b77646501d677b7a99799b97fae752107b11d6f 100644 --- a/src/doc/getting_started/optionalfeatures.rst +++ b/src/doc/getting_started/optionalfeatures.rst @@ -30,6 +30,13 @@ to decide what tool is used for sending mails (use the upper one if you want to actually send mails. See ``sendmail`` configuration in the LinkAhead docs. +You can even supply the name of a custom CSS file that shall be used: + +.. code:: ini + + [advancedtools] + crawler.customcssfile = theme-research.css + Crawler Status Records ---------------------- diff --git a/src/doc/macros.rst b/src/doc/macros.rst index d093d9b69f5d2c14b5bfbb2fe292545fc7943ca7..3a234973ee17791aaa2a0bd9e4b81836207a07e0 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -1,6 +1,9 @@ Macros ------ +Introduction +============ + Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following common example: @@ -83,16 +86,46 @@ The expanded version of `ExperimentalData` will look like: This :ref:`example<example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`). -Complex Example -=============== -The following, more complex example, demonstrates the use -of macro variable substitutions that generate crawler variable substitutions: +Mixing macros and plain definitions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can also mix macros and plain definitions. Whenever a name cannot be resolved to a macro, a +plain yaml node definition is used as a fallback: + +.. code:: yaml + + --- + metadata: + macros: + - !defmacro + name: MarkdownFile + # ... Definition here ... + --- + ExperimentalData: + type: Directory + match: ExperimentalData + subtree: !macro + MarkdownFile: + - name: README + filename: ^README.md$ + OtherContent: # There is no macro named "OtherContent", so this is parsed as normal content. + type: SimpleFile + match: .*txt + records: + # ... Normal content ... + -- `$$$nodename` will lead to a macro variable substitution of variable `$nodename` during macro expansion. -- `$$` will be turned into `$` -- So in the crawler cfood, the string will appear as `$value` if variable `nodename` would be set to `value` when using the macro. +Complex example +=============== + +Let's try something more complex: what happens to multiple ``$``? This example demonstrates the use +of `macro` variable substitutions to generate `crawler` variable substitutions: +- ``$$`` will be converted into ``$``. +- ``$$$nodename`` will retain a single ``$`` and substitute ``$nodename`` during macro expansion. +- So in the cfood, if ``nodename: value``, the string ``$$$nodename`` will be converted to + ``$value``. .. _example_1: .. 
code-block:: yaml @@ -118,7 +151,8 @@ of macro variable substitutions that generate crawler variable substitutions: Simulation: $recordtype: +$File -The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<example_1_expanded>`. +The expanded version of the :ref:`example above<example_1>` (with ``nodename: Dataset``) can be seen +:ref:`here<example_1_expanded>`: .. _example_1_expanded: @@ -141,11 +175,11 @@ The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<e type: SimpleFile type: Directory -This :ref:`example<example_1>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_1`). - +This example can also be found in the macro unit tests (see +:func:`unittests.test_macros.test_documentation_example_1`). -Using Macros Multiple Times +Using macros multiple times =========================== To use the same macro multiple times in the same yaml node, lists can be used: @@ -198,11 +232,11 @@ use the same top level key. Because later versions would overwrite previous ones. Here we used ``$macro_name`` to prevent that. -Limitation -========== +Limitations +=========== -Currently it is not possible to use the same macro twice in the same yaml node, but in different -positions. Consider: +Currently it is not possible to use the same macro twice in the same yaml node, if it occurs in +different positions. Consider: .. _example_multiple_limitation: .. code-block:: yaml @@ -227,14 +261,13 @@ positions. Consider: Other_node: type: test - test_twice: # This is NOT possible as each - # dictionary element can only appear once in a yaml node. + test_twice: # This is NOT possible as each key + # can only appear once in a yaml node. - macro_name: twice # <- This is the second one, with different arguments a: 5 - {} # <- This is the third one, just using default arguments -However, this should not be a real limitation, as the crawler is designed in a way, -that the order of the nodes in the same level should not matter. +This should not be a real limitation however, as the order of nodes does not matter for the crawler. Using macros within macro definitions diff --git a/src/doc/tutorials/parameterfile.rst b/src/doc/tutorials/parameterfile.rst index 9369ba8b83df8c484a4af8f240e1a1de2f4c10fb..2442969541eebf9a4e058b797b48995b39372a3e 100644 --- a/src/doc/tutorials/parameterfile.rst +++ b/src/doc/tutorials/parameterfile.rst @@ -88,6 +88,10 @@ regular expressions do: We can use the groups from the regular expressions that are used for matching. In our example, we use the "value" group to assign the "frequency" value to the "Experiment". +.. note:: + + For more information on the ``cfood.yml`` specification, read on in the chapter :ref:`Converters`. + A fully grown CFood ------------------- @@ -148,4 +152,6 @@ the CFood file is in the current working directory): caosdb-crawler -s update -i identifiables.yml cfood.yml . +.. note:: + ``caosdb-crawler`` currently only works with cfoods which have a directory as top level element. diff --git a/tox.ini b/tox.ini index 03e02ebeff196430129e10c4c0d853ca77c47302..41249e4277391c5ffa4ec13fc4da1a6ee1f48491 100644 --- a/tox.ini +++ b/tox.ini @@ -1,21 +1,23 @@ [tox] -envlist = py37, py38, py39, py310, py311 +envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = . 
+deps = .[h5-crawler,spss] pytest pytest-cov - h5py # TODO: Make this f-branch sensitive git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev commands = caosdb-crawler --help - py.test --cov=caosdb -vv {posargs} + py.test --cov=caoscrawler -vv {posargs} [flake8] max-line-length = 100 +[pycodestyle] +max-line-length = 100 + [pytest] testpaths = unittests -xfail_strict = True \ No newline at end of file +xfail_strict = True diff --git a/unittests/example_cfood.yml b/unittests/example_cfood.yml index 713bd4be0f3c816e1e8c8b7a057b30a4b400f13c..798e540fa25e49bf610ea21653db41a0bddc4d5f 100644 --- a/unittests/example_cfood.yml +++ b/unittests/example_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.7.2 --- Definitions: type: Definitions diff --git a/unittests/h5_cfood.yml b/unittests/h5_cfood.yml index f688de6a2171da6533626449b030bcd95a43b37b..4b95a0a31bc43a902eb63dc3aa09b805fc28c2aa 100644 --- a/unittests/h5_cfood.yml +++ b/unittests/h5_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.6.1 + crawler-version: 0.7.2 --- Converters: H5Dataset: diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 9d6e8cf3ea325ad14641530f2e6cafd43f0dc1bb..ca5fa589b5903e0c0d8ef3dcb2528ea79e0f8cee 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -4,7 +4,7 @@ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.7.2 --- Definitions: type: Definitions diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 665169d785c1ed604314c4aff4a640d4418e80a9..8e5441ce00a7dca8bc69e90b6a96576a07187bfb 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -3,8 +3,9 @@ # # This file is a part of the CaosDB Project. # -# Copyright (C) 2021,2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -148,7 +149,7 @@ def test_markdown_converter(converter_registry): converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter", converter_registry) - with pytest.raises(ConverterValidationError) as err: + with pytest.raises(ConverterValidationError): converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py")) m = converter.match(test_readme) @@ -504,7 +505,7 @@ MyElement: two_doc_yaml = """ --- metadata: - crawler-version: 0.3.1 + crawler-version: 0.7.2 Converters: MyNewType: converter: MyNewTypeConverter @@ -640,7 +641,7 @@ def test_load_converters(): # converter classes can be loaded from their respective packages. # Please adapt, if defaults change! 
-    assert len(converter_registry) == 23
+    assert len(converter_registry) == 24
 
     # All of them are contained in caoscrawler.converters
     for conv_key, conv in converter_registry.items():
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index a48b5e16ad1a71beeb4a5bf1c2ac52f67bbd7afe..0a6aee44a1892f1c950a80b936adf184616fd612 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -39,10 +39,12 @@ import linkahead.common.models as dbmodels
 import pytest
 import yaml
 from caosadvancedtools.models.parser import parse_model_from_string
-from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp,
-                               _treat_deprecated_prefix, crawler_main,
-                               split_restricted_path)
+from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix,
+                               crawler_main, split_restricted_path)
 from caoscrawler.debug_tree import DebugTree
+from caoscrawler.exceptions import (ImpossibleMergeError,
+                                    MissingIdentifyingProperty,
+                                    MissingReferencingEntityError)
 from caoscrawler.identifiable import Identifiable
 from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
                                                IdentifiableAdapter,
@@ -52,6 +54,7 @@ from caoscrawler.scanner import (create_converter_registry, scan_directory,
 from caoscrawler.stores import GeneralStore, RecordStore
 from caoscrawler.structure_elements import (DictElement, DictListElement,
                                             DictTextElement, File)
+from caoscrawler.sync_graph import SyncGraph
 from linkahead.apiutils import compare_entities
 from linkahead.cached import cache_clear
 from linkahead.exceptions import EmptyUniqueQueryError
@@ -87,6 +90,20 @@ NEW_ELEMENT = (db.Record()
                .add_property(name="result", value="homogeneous"))
 
 
+def reset_mocks(mocks):
+    for mock in mocks:
+        mock.reset_mock()
+
+
+def mock_create_values(values, element):
+    pass
+
+
+def mock_get_entity_by_query(query=None):
+    if query is not None:
+        return db.Record(id=1111, name='rec_name').add_parent('RT')
+
+
 def mock_get_entity_by(eid=None, name=None, path=None):
     if eid is not None:
         candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid]
@@ -110,6 +127,14 @@ def mock_get_entity_by(eid=None, name=None, path=None):
     raise EmptyUniqueQueryError("")
 
 
+def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None):
+    """ returns a stored Record if rec.name is an existing key, None otherwise """
+    if rec.name in known:
+        return known[rec.name]
+    else:
+        return None
+
+
 def mock_retrieve_record(identifiable: Identifiable):
     """ assumes that the identifiable is always only the date"""
 
@@ -148,7 +173,15 @@ A:
         model.get_deep("A").id = 2
         return result + [model.get_deep("B")]
     print(query_string)
-    raise NotImplementedError("Mock for this case is missing")
+    raise NotImplementedError(f"Mock for this case is missing: {query_string}")
+
+
+def mock_cached_only_rt_allow_empty(query_string: str):
+    try:
+        result = mock_cached_only_rt(query_string)
+    except NotImplementedError:
+        result = db.Container()
+    return result
 
 
 @pytest.fixture(autouse=True)
@@ -156,8 +189,51 @@ def clear_cache():
     cache_clear()
 
 
+@pytest.fixture
+def crawler_mocked_identifiable_retrieve():
+    crawler = Crawler()
+    # TODO use minimal setup
+    # mock retrieval of registered identifiables: return Record with just a parent
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name'))
+
+    # Simulate remote server content by using the names to identify records
+    # There is only a single known Record with name A
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+    return crawler
+
+
+@pytest.fixture
+def crawler_mocked_for_backref_test():
+    crawler = Crawler()
+    # mock retrieval of registered identifiables: return Record with just a parent
+
+    def get_reg_ident(x):
+        if x.parents[0].name == "C":
+            return db.Record().add_parent(x.parents[0].name).add_property(
+                "is_referenced_by", value=["BR"]).add_property("name")
+        elif x.parents[0].name == "D":
+            return db.Record().add_parent(x.parents[0].name).add_property(
+                "is_referenced_by", value=["BR", "BR2"]).add_property("name")
+        else:
+            return db.Record().add_parent(x.parents[0].name).add_property("name")
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident)
+
+    # Simulate remote server content by using the names to identify records
+    # There is only a single known Record with name A
+    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A":
+                                                   db.Record(id=1111, name="A").add_parent("BR")}))
+    return crawler
+
+
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_constructor():
+    # tests that appropriate DeprecationWarnings are triggered by the constructor when deprecated
+    # arguments are being passed.
     with warnings.catch_warnings(record=True) as w:
         # Cause all warnings to always be triggered.
         warnings.filterwarnings("ignore")
@@ -174,6 +250,7 @@ def test_constructor():
 
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 def test_deprecated_functions():
+    # tests that appropriate DeprecationWarnings are triggered by deprecated methods
     with warnings.catch_warnings(record=True) as w:
         # Cause all warnings to always be triggered.
         warnings.filterwarnings("ignore")
@@ -218,113 +295,58 @@ def test_check_whether_parent_exists():
 
 
 def test_remove_unnecessary_updates():
     # test trivial case
-    upl = [db.Record().add_parent("A")]
-    irs = [db.Record().add_parent("A")]
-    updates = Crawler.remove_unnecessary_updates(upl, irs)
+    crawled_data = [db.Record().add_parent("A")]
+    identified_records = [db.Record().add_parent("A")]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
     assert len(updates) == 0
 
     # test property difference case
-    # TODO this should work right?
- # upl = [db.Record().add_parent("A").add_property("a", 3)] - # irs = [db.Record().add_parent("A")] # ID should be s - # Crawler.remove_unnecessary_updates(upl, irs) - # assert len(upl) == 1 + crawled_data = [db.Record().add_parent("A").add_property("a", 3)] + identified_records = [db.Record().add_parent("A")] # ID should be s + Crawler.remove_unnecessary_updates(crawled_data, identified_records) + assert len(crawled_data) == 1 # test value difference case - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", 5)] + identified_records = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", 5)] + identified_records = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 0 # test unit difference case - upl = [db.Record().add_parent("A").add_property("a", unit='cm')] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a", unit='cm')] + identified_records = [db.Record().add_parent("A").add_property("a")] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 # test None difference case - upl = [db.Record().add_parent("A").add_property("a")] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) + crawled_data = [db.Record().add_parent("A").add_property("a")] + identified_records = [db.Record().add_parent("A").add_property("a", 5)] + updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records) assert len(updates) == 1 def test_split_into_inserts_and_updates_trivial(): crawler = Crawler() - crawler.split_into_inserts_and_updates([]) - - -def test_split_into_inserts_and_updates_unidentified(): - crawler = Crawler() - with raises(ValueError) as err: - crawler.split_into_inserts_and_updates([db.Record(name="recname").add_parent("someparent")]) - assert str(err.value).startswith("There is no identifying information.") - - -def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): - """ returns a stored Record if rec.name is an existing key, None otherwise """ - if rec.name in known: - return known[rec.name] - else: - return None - - -@pytest.fixture -def crawler_mocked_identifiable_retrieve(): - crawler = Crawler() - # TODO use minimal setup - # mock retrieval of registered identifiabls: return Record with just a parent - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name')) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - 
crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - return crawler + st = SyncGraph([], crawler.identifiableAdapter) + crawler._split_into_inserts_and_updates(st) -def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): +def test_split_into_inserts_and_updates_simple(crawler_mocked_identifiable_retrieve): + # basic test that checks whether two records are correctly sorted to update and insert based on + # whether an entity can be found using the identifiable crawler = crawler_mocked_identifiable_retrieve identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] - entlist = [db.Record(name="A").add_parent( - "C"), db.Record(name="B").add_parent("C")] - - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None - assert not crawler._has_reference_value_without_id(identlist[0]) - assert not crawler._has_reference_value_without_id(identlist[1]) - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + entlist = [db.Record(name="A").add_parent("C"), + db.Record(name="B").add_parent("C")] + st = SyncGraph(entlist, crawler.identifiableAdapter) + # check setup -def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - # This is identical to a and should be removed - c = db.Record(name="A").add_parent("C") - entlist = [a, b, c] - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) + insert, update = crawler._split_into_inserts_and_updates(st) assert len(insert) == 1 assert insert[0].name == "B" assert len(update) == 1 @@ -334,31 +356,20 @@ def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiab crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() -def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): +def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_retrieve): + # test trying to split circular dependency crawler = crawler_mocked_identifiable_retrieve - # try it with a reference - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - + 
crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='a')
+    )
+    # two records that reference each other via identifying properties
+    a = db.Record().add_parent("C")
+    b = db.Record().add_parent("C").add_property(name='a', value=a)
+    a.add_property(name='a', value=b)
 
-def test_split_into_inserts_and_updates_with_circ():
-    # try circular
-    a = db.Record(name="A").add_parent("C")
-    b = db.Record(name="B").add_parent("C")
-    b.add_property("A", a)
-    a.add_property("B", b)
-    entlist = [a, b]
-    # TODO this does not seem to be complete!
+    st = SyncGraph([a, b], crawler.identifiableAdapter)
+    with pytest.raises(RuntimeError):
+        crawler._split_into_inserts_and_updates(st)
 
 
 def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve):
@@ -372,11 +383,12 @@
     b = db.Record(name="B").add_parent("C")
     g = db.Record(name="G").add_parent("C")
     f = db.Record(name="F").add_parent("C")
-    g.add_property("A", a)
-    b.add_property("A", f)
+    g.add_property("C", b)
     b.add_property("A", a)
+    b.add_property("C", f)
     entlist = [a, b, g]
-    insert, update = crawler.split_into_inserts_and_updates(entlist)
+    st = SyncGraph(entlist, crawler.identifiableAdapter)
+    insert, update = crawler._split_into_inserts_and_updates(st)
     assert len(insert) == 3
     assert "B" in [el.name for el in insert]
     assert len(update) == 1
@@ -388,23 +400,8 @@
     # TODO write test where the unresolved entity is not part of the identifiable
 
 
-def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve):
-    crawler = crawler_mocked_identifiable_retrieve
-    # assume identifiable is only the name
-    a = db.Record(name="A").add_parent("C")
-    a.add_property("foo", 1)
-    b = db.Record(name="A").add_parent("C")
-    b.add_property("bar", 2)
-    entlist = [a, b]
-    insert, update = crawler.split_into_inserts_and_updates(entlist)
-
-    assert update[0].get_property("bar").value == 2
-    assert update[0].get_property("foo").value == 1
-    # if this ever fails, the mock up may be removed
-    crawler.identifiableAdapter.get_registered_identifiable.assert_called()
-    crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()
-
-
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
 @patch("caoscrawler.identifiable_adapters.cached_query",
        new=Mock(side_effect=mock_cached_only_rt))
 def test_split_iiau_with_unmergeable_list_items():
@@ -440,6 +437,12 @@
 b1: ("same", c1)
 b2: ("same", c2)
 a: ([b1, b2])
+
+
+
+- a can be identified.
+- bs can be identified with each other once a is identified
+- cs depend on b(s), but cannot be put in one Entity because they have conflicting properties
 """
     prop_ident = db.Property("prop_ident", datatype=db.INTEGER)
     prop_other = db.Property("prop_ident", datatype=db.INTEGER)
@@ -472,82 +475,104 @@ a: ([b1, b2])
 
     crawler = Crawler(identifiableAdapter=ident_adapter)
 
-    with raises(RuntimeError) as rte:
-        crawler.synchronize(commit_changes=False,
-                            crawled_data=[rec_a, *rec_b, *rec_c])
-    assert not isinstance(rte.value, NotImplementedError), \
-        "Exception must not be NotImplementedError, but plain RuntimeError."
- assert "Could not find referencing entities" in rte.value.args[0] - assert "merge conflicts in the referencing" in rte.value.args[0] + st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter) + assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False + assert st._identity_relies_on_unchecked_entity(st.nodes[1]) + assert st._identity_relies_on_unchecked_entity(st.nodes[2]) + assert st._identity_relies_on_unchecked_entity(st.nodes[3]) + assert st._identity_relies_on_unchecked_entity(st.nodes[4]) + assert len(st.unchecked) == 5 + + # The Cs cannot be merged due to different identifying properties + # The Bs cannot be merged due to different references to Cs + with raises(ImpossibleMergeError) as rte: + crawler._split_into_inserts_and_updates(st) + # TODO + # assert not isinstance(rte.value, NotImplementedError), \ + # "Exception must not be NotImplementedError, but plain RuntimeError." + # assert "Could not find referencing entities" in rte.value.args[0] + # assert "merge conflicts in the referencing" in rte.value.args[0] + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): + # test that backrefs are appropriately considered in the identifiable + crawler = crawler_mocked_for_backref_test + identlist = [Identifiable(name="A", record_type="BR"), + Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] + # Test without referencing object + # currently a RuntimeError is raised if necessary properties are missing. + with raises(MissingReferencingEntityError): + st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) -def test_has_missing_object_in_references(): - crawler = Crawler() - # Simulate remote server content by using the names to identify records - # There are only two known Records with name A and B - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") - .add_property("d").add_property("name"), - "D": db.Record(name="D").add_parent("RTD") - .add_property("d").add_property("e").add_property("name"), - })) - - # one reference with id -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), {}) - # one ref with Entity with id -> check - rec = db.Record(id=123).add_parent("C") - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}}) - # one ref with id one with Entity with id (mixed) -> check - rec = db.Record(id=123).add_parent("RTC") - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}}) - # entity to be referenced in the following - a = db.Record(name="C").add_parent("C").add_property("d", 12311) - # one ref with id one with Entity without id (but not identifying) -> fail - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), - {id(a): {'C': [None]}}) - - # one ref with id one with Entity without id (mixed) -> fail - assert not crawler._has_missing_object_in_references( - 
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), - {id(a): {'C': [None]}}) - - crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) - # one ref with id one with Entity without id but in cache -> check - assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), - {id(a): {'C': [None]}}) + # identifiables were not yet checked + st = SyncGraph(entlist, crawler.identifiableAdapter) + assert st.get_equivalent(st.nodes[1]) is None + assert st.get_equivalent(st.nodes[0]) is None + # one can be found remotely, one not - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() + # check the split... + insert, update = crawler._split_into_inserts_and_updates(st) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" -@ pytest.mark.xfail() -def test_references_entities_without_ids(): - crawler = Crawler() - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) - # id and rec with id - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) - # id and rec with id and one unneeded prop - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) - - # one identifying prop is missing - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + assert len(identifiable.backrefs) == 2 + + # check the split... 
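+    # A and C already carry IDs and thus count as existing on the remote side,
+    # so they end up in the update list; B has no ID and must be inserted.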
+    insert, update = crawler._split_into_inserts_and_updates(st)
+    assert len(update) == 2
+    assert len(insert) == 1
+
+
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test):
+    # test whether multiple references of different record types are correctly used
+    crawler = crawler_mocked_for_backref_test
+    referenced = db.Record(name="B").add_parent("D")
+    entlist = [referenced,
+               db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced),
+               db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced),
+               ]
+
+    # test whether both entities are listed in the backref attribute of the identifiable
+    st = SyncGraph(entlist, crawler.identifiableAdapter)
+    identifiable = crawler.identifiableAdapter.get_identifiable(
+        st.nodes[0],
+        st.backward_references_backref[id(st.nodes[0])])
+
+    assert len(identifiable.backrefs) == 2
+
+    # check the split...
+    insert, update = crawler._split_into_inserts_and_updates(st)
+    assert len(update) == 2
+    assert len(insert) == 1
 
 
 def test_replace_entities_with_ids():
@@ -562,20 +587,15 @@ def test_replace_entities_with_ids():
     assert a.get_property("C").value == [12345, 233324]
 
 
-def reset_mocks(mocks):
-    for mock in mocks:
-        mock.reset_mock()
-
-
-@ patch("caoscrawler.crawl.cached_get_entity_by",
-        new=Mock(side_effect=mock_get_entity_by))
-@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
-        new=Mock(side_effect=mock_get_entity_by))
-@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
-        "retrieve_identified_record_for_identifiable",
-        new=Mock(side_effect=mock_retrieve_record))
-@ patch("caoscrawler.crawl.db.Container.insert")
-@ patch("caoscrawler.crawl.db.Container.update")
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+@patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
+       "retrieve_identified_record_for_identifiable",
+       new=Mock(side_effect=mock_retrieve_record))
+@patch("caoscrawler.crawl.db.Container.insert")
+@patch("caoscrawler.crawl.db.Container.update")
 def test_synchronization_no_commit(upmock, insmock):
     crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
     # change one; add one
@@ -592,20 +612,19 @@ def test_synchronization_no_commit(upmock, insmock):
     assert len(ups) == 1
 
 
-@ patch("caoscrawler.crawl.cached_get_entity_by",
-        new=Mock(side_effect=mock_get_entity_by))
-@ patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
-        new=Mock(side_effect=mock_get_entity_by))
-@ patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
-        "retrieve_identified_record_for_identifiable",
-        new=Mock(side_effect=mock_retrieve_record))
-@ patch("caoscrawler.crawl.db.Container.insert")
-@ patch("caoscrawler.crawl.db.Container.update")
-@ patch("caoscrawler.crawl.UpdateCache.insert")
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+@patch("caoscrawler.identifiable_adapters.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter."
+ "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +@patch("caoscrawler.crawl.UpdateCache.insert") def test_security_mode(updateCacheMock, upmock, insmock): # trivial case: nothing to do crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] - print(crawled_data) crawler = Crawler(securityMode=SecurityMode.RETRIEVE) crawler.synchronize(commit_changes=True, crawled_data=crawled_data) assert crawler.run_id is not None @@ -640,9 +659,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): assert crawler.run_id is not None insmock.assert_not_called() upmock.assert_not_called() - # import IPython - # IPython.embed() - # print(updateCacheMock.call_args_list) assert updateCacheMock.call_count == 1 # reset counts reset_mocks([updateCacheMock, insmock, upmock]) @@ -698,65 +714,6 @@ def test_security_mode(updateCacheMock, upmock, insmock): crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy() -def test_create_reference_mapping(): - a = db.Record().add_parent("A") - b = db.Record(id=132).add_parent("B").add_property('a', a) - ref = Crawler.create_reference_mapping([a, b]) - assert id(a) in ref - assert id(b) in ref - assert "B" in ref[id(a)] - assert {} == ref[id(b)] - assert ref[id(a)]["B"] == [132] - - -def test_create_flat_list(): - a = db.Record() - b = db.Record() - a.add_property(name="a", value=a) - a.add_property(name="b", value=b) - flat = Crawler.create_flat_list([a]) - assert len(flat) == 2 - assert a in flat - assert b in flat - c = db.Record() - c.add_property(name="a", value=a) - # This would caus recursion if it is not dealt with properly. - a.add_property(name="c", value=c) - flat = Crawler.create_flat_list([c]) - assert len(flat) == 3 - assert a in flat - assert b in flat - assert c in flat - - -@ pytest.fixture -def crawler_mocked_for_backref_test(): - crawler = Crawler() - # mock retrieval of registered identifiabls: return Record with just a parent - - def get_reg_ident(x): - if x.parents[0].name == "C": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR"]).add_property("name") - elif x.parents[0].name == "D": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR", "BR2"]).add_property("name") - else: - return db.Record().add_parent(x.parents[0].name).add_property("name") - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - return crawler - - def test_validation_error_print(caplog): caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") # there should be no server interaction since we only test the behavior if a validation error @@ -773,96 +730,7 @@ def test_validation_error_print(caplog): caplog.clear() -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def 
test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test - identlist = [Identifiable(name="A", record_type="BR"), - Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - - # Test without referencing object - # currently a RuntimeError is raised if necessary properties are missing. - with raises(RuntimeError): - crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) - - # identifiables were not yet checked - assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None - assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None - # one with reference, one without - assert not crawler._has_reference_value_without_id(identlist[0]) - assert crawler._has_reference_value_without_id(identlist[1]) - # one can be found remotely, one not - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - # A was found remotely and is therefore in the update list - assert len(update) == 1 - assert update[0].name == "A" - # B does not exist on the (simulated) remote server - assert len(insert) == 1 - assert insert[0].name == "B" - - -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="C").add_parent("BR").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable( - referenced, - referencing_entities[id(referenced)]) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 1 - assert len(insert) == 2 - - -@ patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=lambda x: [x])) -def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("D") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="A").add_parent("BR2").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable( - referenced, - referencing_entities[id(referenced)]) - - assert len(identifiable.backrefs) == 2 - - # check the split... 
-    insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
-    assert len(update) == 2
-    assert len(insert) == 1
-
-
-def mock_create_values(values, element):
-    pass
-
-
-@ patch("caoscrawler.converters.IntegerElementConverter.create_values")
+@patch("caoscrawler.converters.IntegerElementConverter.create_values")
 def test_restricted_path(create_mock):
     """
     The restricted_path argument allows to ignore part of the crawled data structure. Here, we make
@@ -955,7 +823,7 @@ def test_split_restricted_path():
 
 # Filter the warning because we want to have it here and this way it does not hinder running
 # tests with -Werror.
-@ pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning")
+@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning")
 def test_deprecated_prefix_option():
     """Test that calling the crawler's main function with the deprecated `prefix` option raises
     the correct errors and warnings.
@@ -993,36 +861,8 @@ def test_create_entity_summary():
     assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text
 
 
-def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
-    crawler = crawler_mocked_identifiable_retrieve
-    crawler.identifiableAdapter.get_registered_identifiable = Mock(
-        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
-    a = db.Record(name='a').add_parent("C")
-    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
-    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
-                                                         ).add_property(name="C", value=b)
-    d = db.Record(name='c').add_parent("C")
-    a.add_property(name="C", value=c)
-    flat = [a, b, c]
-    circle = Crawler.detect_circular_dependency(flat)
-    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]
-
-    assert Crawler.detect_circular_dependency([d]) is None
-    with raises(RuntimeError):
-        _, _ = crawler.split_into_inserts_and_updates(flat)
-    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
-    assert "Found circular dependency" in caplog.text
-    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text
-    caplog.clear()
-
-
-def mock_get_entity_by_query(query=None):
-    if query is not None:
-        return db.Record(id=1111, name='rec_name').add_parent('RT')
-
-
-@ patch("caoscrawler.crawl.cached_get_entity_by",
-        new=Mock(side_effect=mock_get_entity_by_query))
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by_query))
 def test_replace_name_with_referenced_entity():
     test_text = 'lkajsdf'
     test_int = 134343
@@ -1090,72 +930,3 @@ def test_replace_name_with_referenced_entity():
     assert isinstance(prop.value[2], int)
     assert prop.value[2] == test_id
     assert caoscrawler.crawl.cached_get_entity_by.call_count == 3
-
-
-def test_treated_record_lookup():
-    trlu = TreatedRecordLookUp()
-    exist = db.Record(id=1)
-    trlu.add(exist)
-    assert len(trlu._existing) == 1
-    # was added to existing
-    assert trlu._existing[id(exist)] is exist
-    # is in ID lookup
-    assert trlu._id_look_up[exist.id] is exist
-    # can be accessed via get_existing
-    assert trlu.get_existing(db.Record(id=1)) is exist
-
-    miss = db.Record()
-    # exception when identifiable is missing
-    with raises(RuntimeError):
-        trlu.add(miss)
-    ident = Identifiable(name='a')
-    trlu.add(miss, ident)
-    # was added to missing
-    assert trlu._missing[id(miss)] is miss
-    # is in ident lookup
-    assert trlu._identifiable_look_up[ident.get_representation()] is miss
-    # can be accessed via get_missing
-    assert trlu.get_missing(db.Record(),
Identifiable(name='a')) is miss - - fi = db.File(path='a', id=2) - trlu.add(fi) - assert len(trlu._existing) == 2 - # was added to existing - assert trlu._existing[id(fi)] is fi - # is in ID lookup - assert trlu._id_look_up[fi.id] is fi - # is in path lookup - assert trlu._path_look_up[fi.path] is fi - # can be accessed via get_existing - assert trlu.get_existing(fi) is fi - - all_exi = trlu.get_existing_list() - assert fi in all_exi - assert exist in all_exi - all_mi = trlu.get_missing_list() - assert miss in all_mi - - # If a Record was added using the ID, the ID must be used to identify it even though later an - # identifiable may be passed as well - assert trlu.get_any(exist, Identifiable(name='b')) is exist - - fi2 = db.File(path='b') - trlu.add(fi2) - assert trlu.get_any(db.File(path='b'), Identifiable(name='c')) is fi2 - - -def test_merge_entity_with_identifying_reference(crawler_mocked_identifiable_retrieve): - # When one python object representing a record is merged into another python object - # representing the same record, the former object can be forgotten and references from it to - # other records must not play a role - crawler = crawler_mocked_identifiable_retrieve - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent('C').add_property(name='name') if - x.parents[0].name == "C" else - db.Record().add_parent('D').add_property(name='is_referenced_by', value="*") - ) - a = db.Record(name='a').add_parent("D") - b = db.Record(name='b').add_parent("C") - c = db.Record(name='b').add_parent("C").add_property(name="C", value=a) - flat = [a, c, b] - _, _ = crawler.split_into_inserts_and_updates(flat) diff --git a/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee572a190bd7f439f638ef7c9a5d94a831aca81 --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml @@ -0,0 +1,4 @@ +Experiment: + date: + - 1 + - 2 diff --git a/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a33c4ace9f8709a9b4a77c5fd8f38514acbe1e9c --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml @@ -0,0 +1,3 @@ +Experiment: +- date +- 23 diff --git a/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a504eab748d4891c3e1088ee785afcf6347fbbab --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml @@ -0,0 +1,5 @@ +Experiment: +- date +Event: +- is_referenced_by: Experiment +- event_id diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 549bc4f42a59765d25446d44fbb845e49ca4d9b9..0f62475b6c61d82feb3e550cf5ab53e91183f80a 100644 --- a/unittests/test_entity_comparison.py +++ b/unittests/test_entity_comparison.py @@ -2,7 +2,7 @@ # Tests for entity comparison # A. 
Schlemmer, 06/2021 -import caosdb as db +import linkahead as db import pytest from pytest import raises diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py deleted file mode 100644 index 4ec02aa3fc497f8dc35adc709533ef5b35066f3a..0000000000000000000000000000000000000000 --- a/unittests/test_file_identifiables.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/python -# Tests for file identifiables -# A. Schlemmer, 06/2021 - -from unittest.mock import Mock, patch - -import caosdb as db -import pytest -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter -from caosdb.cached import cache_clear -from caosdb.exceptions import EmptyUniqueQueryError -from pytest import raises - -from test_crawler import mock_get_entity_by - - -@pytest.fixture(autouse=True) -def clear_cache(): - cache_clear() - - -@patch("caoscrawler.identifiable_adapters.get_children_of_rt", - new=Mock(side_effect=id)) -@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", - new=Mock(side_effect=mock_get_entity_by)) -def test_file_identifiable(): - ident = LocalStorageIdentifiableAdapter() - - # Without a path there is no identifying information - with raises(ValueError): - ident.get_identifiable(db.File(), []) - - fp = "/test/bla/bla.txt" - file_obj = db.File(path=fp) - identifiable = ident.get_identifiable(file_obj) - - # the path is copied to the identifiable - assert fp == identifiable.path - assert isinstance(identifiable, Identifiable) - - # __eq__ function is only defined for Identifiable objects - with raises(ValueError): - file_obj != identifiable - - # since the path does not exist in the data in ident, the follwoing functions return None - with raises(EmptyUniqueQueryError): - ident.retrieve_identified_record_for_record(file_obj) - assert ident.get_file(identifiable) is None - - # Try again with actual files in the store: - records = ident.get_records() - test_record_wrong_path = db.File(path="/bla/bla/test.txt") - test_record_correct_path = db.File(path="/test/bla/bla.txt") - test_record_alsocorrect_path = db.File(path="/test/bla/bla.txt") - records.append(test_record_wrong_path) - # Now, there is a file, but still wrong path -> result is still None - identified_file = ident.get_file(file_obj) - assert identified_file is None - - records.append(test_record_correct_path) - # now there is a match - identified_file = ident.get_file(file_obj) - assert identified_file is not None - assert identified_file.path == file_obj.path - - with raises(RuntimeError, match=".*unambigiously.*"): - records.append(test_record_alsocorrect_path) - identified_file = ident.get_file(file_obj) diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py index 2f7fae5d8d32bb7e5c90a535b63158c33df55daa..7f244e2cbdccb0d4eee6a62f59e9cea5684295a6 100644 --- a/unittests/test_h5_converter.py +++ b/unittests/test_h5_converter.py @@ -23,7 +23,7 @@ from functools import partial from pathlib import Path from pytest import fixture, importorskip -import caosdb as db +import linkahead as db from caoscrawler.debug_tree import DebugTree from caoscrawler.hdf5_converter import (convert_basic_element_with_nd_array, diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 28bdb7a2ad75d5b9389b47ca3f0ec2b2e2a1404b..d94d852583523a3b3f29f002eaacb9ae0b616c4f 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,9 +24,10 @@ test identifiable module """ -import caosdb as db +import 
linkahead as db import pytest from caoscrawler.identifiable import Identifiable +from caoscrawler.sync_node import SyncNode def test_create_hashable_string(): @@ -42,25 +43,20 @@ def test_create_hashable_string(): assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", - properties={'a': db.Record(id=12)}) + properties={'a': SyncNode(db.Record(id=12))}) ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) + Identifiable(name="A", record_type="B", properties={'a': [SyncNode(db.Record(id=12))]})) assert (a == "P<B>N<A>R<[]>a:[12]") assert (Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={ - 'a': [db.Record(id=12), 11]}) + 'a': [SyncNode(db.Record(id=12)), 11]}) ) == "P<B>N<A>R<[]>a:[12, 11]") - assert ( - Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}) - ) != Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}))) assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], + Identifiable(name="A", record_type="B", backrefs=[123, SyncNode(db.Record(id=124))], properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" @@ -73,9 +69,9 @@ def test_repr(): # only test that something meaningful is returned assert 'properties' in str(Identifiable(name="A", record_type="B")) assert str(Identifiable(name="A", record_type="B", properties={'a': 0})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0}' + "properties:\n")[1].split('\n')[0] == '{"a": "0"}' assert str(Identifiable(name="A", record_type="B", properties={'a': 0, 'b': "test"})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0, "b": "test"}' + "properties:\n")[1].split('\n')[0] == '{"a": "0", "b": "test"}' # TODO(henrik): Add a test using backrefs once that's implemented. 
@@ -87,13 +83,5 @@ def test_equality(): record_id=12, properties={"a": 0}) != Identifiable(record_id=13, properties={"a": 0}) assert Identifiable( record_id=12, properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) != Identifiable(path="b", properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(path="a", properties={"a": 1}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable(properties={"a": 0}) == Identifiable( - properties={"a": 0}) - assert Identifiable(properties={"a": 0}) != Identifiable( - properties={"a": 1}) + assert Identifiable(properties={"a": 0}) == Identifiable(properties={"a": 0}) + assert Identifiable(properties={"a": 0}) != Identifiable(properties={"a": 1}) diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index ee0e0d6cd7c791f78e7cd2307dc6f34698326b4a..53490bc0413a95d960d94186c639dac2c6223b80 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -29,14 +29,18 @@ test identifiable_adapters module import os from datetime import datetime +from unittest.mock import MagicMock, Mock, patch from pathlib import Path -import caosdb as db +import linkahead as db import pytest +from caoscrawler.exceptions import (InvalidIdentifiableYAML, + ) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, convert_value) +from caoscrawler.sync_graph import SyncNode UNITTESTDIR = Path(__file__).parent @@ -120,30 +124,45 @@ def test_load_from_yaml_file(): assert project_i.get_property("title") is not None +def test_invalid_yaml(): + ident = CaosDBIdentifiableAdapter() + invalid_dir = UNITTESTDIR / "test_data" / "invalid_identifiable" + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_content_no_list.yaml") + assert str(exc.value) == "Identifiable contents must be lists, but this was not: Experiment" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_referenced_no_list.yaml") + assert str(exc.value) == "'is_referenced_by' must be a list. 
Found in: Event" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_no_str_or_dict.yaml") + assert str(exc.value) == ("Identifiable properties must be str or dict, but this one was not:\n" + " Experiment/23") + + def test_non_default_name(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="last_name")) - identifiable = ident.get_identifiable(db.Record(name="don't touch it") - .add_parent("Person") - .add_property(name="last_name", value='Tom') - ) + identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") + .add_parent("Person") + .add_property(name="last_name", value='Tom'), + db.RecordType() + .add_parent(name="Person") + .add_property(name="last_name")), []) assert identifiable.name is None def test_wildcard_ref(): ident = CaosDBIdentifiableAdapter() - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="is_referenced_by", value=["*"])) rec = (db.Record(name="don't touch it").add_parent("Person") .add_property(name="last_name", value='Tom')) - identifiable = ident.get_identifiable(rec, - referencing_entities={ - 'A': [1]} + dummy = SyncNode(db.Record(), None) + dummy.id = 1 + identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() + .add_parent(name="Person") + .add_property(name="is_referenced_by", + value=["*"])), + [dummy] ) assert identifiable.backrefs[0] == 1 @@ -158,25 +177,63 @@ def test_convert_value(): def test_get_identifiable(): - # TODO modify this such that it becomes a test that acutally tests (sufficiently) the - # get_identifable function - ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") - r_cur = (db.Record(id=5) - .add_parent(name="Experiment", id=3) - .add_property(name="date", value="2022-02-01") - .add_property(name="result", value="FAIL")) - id_r0 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property( - "date").value == id_r0.properties["date"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 2 + rec = (db.Record(id=5) + .add_parent(name="Experiment", id=3) + .add_property(name="date", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SyncNode(rec, + ident.get_registered_identifiable(rec)) + id_r0 = ident.get_identifiable(se, []) + assert rec.parents[0].name == id_r0.record_type + assert rec.get_property("date").value == id_r0.properties["date"] + assert len(rec.parents) == 1 + assert len(rec.properties) == 2 + assert len(id_r0.properties) == 1 + + ident = CaosDBIdentifiableAdapter() + ident_a = db.RecordType(name="A").add_parent("A").add_property("name").add_property("a") + ident.register_identifiable("A", ident_a) + rec = (db.Record(id=5) + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL")) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) + for el in [ + db.Record() + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + db.Record(name='a') + .add_parent(name="A", id=3) + .add_property(name="a", value="2022-02-01") + .add_property(name="result", value="FAIL"), + ]: + se.update(SyncNode(el)) + + id_r0 = ident.get_identifiable(se, []) + assert "A" == id_r0.record_type + assert "2022-02-01" == 
id_r0.properties["a"] + assert 'a' == id_r0.name assert len(id_r0.properties) == 1 + rec = (db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="2") + ) + se = SyncNode(rec, ident.get_registered_identifiable(rec)) + se.update(SyncNode( + db.Record(name='a') + .add_parent(name="A") + .add_property(name="a", value="3") + )) -@pytest.mark.xfail + with pytest.raises(RuntimeError): + id_r0 = ident.get_identifiable(se, []) + + +@ pytest.mark.xfail def test_retrieve_identified_record_for_identifiable(): # TODO modify this such that it becomes a test that acutally tests (sufficiently) the # retrieve_identified_record_for_identifiable function @@ -190,7 +247,7 @@ def test_retrieve_identified_record_for_identifiable(): r_cur = r break - id_r1 = ident.get_identifiable(r_cur) + id_r1 = ident.get_identifiable(r_cur, []) assert r_cur.parents[0].name == id_r1.record_type assert r_cur.get_property( "identifier").value == id_r1.properties["identifier"] @@ -211,3 +268,19 @@ def test_retrieve_identified_record_for_identifiable(): assert r_cur.get_property( "responsible").value == idr_r1.get_property("responsible").value assert r_cur.description == idr_r1.description + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_referencing_entity_has_appropriate_type(): + dummy = db.Record().add_parent("A") + registered_identifiable = db.RecordType() + rft = IdentifiableAdapter.referencing_entity_has_appropriate_type + assert not rft([], registered_identifiable) + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.add_property("is_referenced_by", "B") + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "A"] + assert rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "*"] + assert rft(dummy.parents, registered_identifiable) diff --git a/unittests/test_issues.py b/unittests/test_issues.py index cbbe9cabcfd17daaf07165757351f00dc051eeab..1678280555e739bae55819fa7fe42a53c938c4e5 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -22,13 +22,10 @@ from pytest import mark -import caosdb as db - +from caoscrawler.converters import replace_variables, CrawlerTemplate from caoscrawler.crawl import Crawler -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import DictElement - +from caoscrawler.stores import GeneralStore from caoscrawler.scanner import create_converter_registry, scan_structure_elements @@ -110,3 +107,43 @@ def test_list_datatypes(): assert isinstance(records[0].get_property("Subject").value, list) assert records[0].get_property("Subject").datatype is not None assert records[0].get_property("Subject").datatype.startswith("LIST") + + +def test_issue_93(): + """https://gitlab.com/linkahead/linkahead-crawler/-/issues/93 + + cfood.yaml does not allow umlaut in $expression""" + values = GeneralStore() + expressions = [ + "foo", + "foo.bär", + "_1", + "Ä", + "ųøîµ", + ] + for exp in expressions: + values[exp] = f"This is {exp}" + # ## Test preliminary check + # With braces + for exp in expressions: + assert replace_variables(f"${{{exp}}}", values) == f"This is {exp}" + # Without braces + for exp in expressions: + assert replace_variables(f"${exp}", values) == f"This is {exp}" + + # ## Test actual replacement + for exp in expressions: + # as-is + propvalue = f"${{{exp}}}" + 
propvalue_template = CrawlerTemplate(propvalue) + # from IPython import embed + # embed() + + assert propvalue_template.safe_substitute(**values.get_storage()) == f"This is {exp}" + + # String embedded into context + propvalue = f"some text before >> ${{{exp}}} << some text after" + print(propvalue) + propvalue_template = CrawlerTemplate(propvalue) + assert (propvalue_template.safe_substitute(**values.get_storage()) + == f"some text before >> This is {exp} << some text after") diff --git a/unittests/test_json.py b/unittests/test_json.py index fdb332df60d73dce3356a563e09ae0d02cf845b7..be65a26ea01e11e11968bd927c80513708e73850 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -31,7 +31,7 @@ import os from pytest import raises -import caosdb as db +import linkahead as db from caoscrawler.converters import JSONFileConverter from pathlib import Path diff --git a/unittests/test_macros.py b/unittests/test_macros.py index 53837e920e93f2cc318d62549145a0e8ac757372..85fe56cd2d49581bcf07b1c7af8456ad219b0111 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -142,7 +142,7 @@ def test_multi_macros_toplevel(register_macros, macro_store_reset): dat_loader = list(yaml.safe_load_all(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_one @@ -171,7 +171,7 @@ def test_load_definition(register_macros, macro_store_reset): txt = """ --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 --- extroot: type: Directory @@ -188,7 +188,7 @@ extroot: cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_one @@ -223,7 +223,6 @@ extroot3: assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" -@pytest.mark.xfail def test_replace_arbitrary_objects(register_macros, macro_store_reset): """ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 @@ -234,27 +233,34 @@ defs: name: test params: b: 25 + testvar_list_empty: [] testvar_list: - a - $b + testvar_dict_empty: {} testvar_dict: t1: a t2: $b definition: replaced1: $b: ok - c: $testvar_dict - d: $testvar_list + dict_empty: $testvar_dict_empty + dict: $testvar_dict + list_empty: $testvar_list_empty + list: ${testvar_list} testnode: obl: !macro test: """, Loader=yaml.SafeLoader) print(yaml.dump(dat)) - assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a" - assert dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25" - assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a" - assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25" + replaced = dat["testnode"]["obl"]["replaced1"] + assert replaced["dict_empty"] == {} + assert replaced["dict"]["t1"] == "a" + assert replaced["dict"]["t2"] == 25 + assert replaced["list_empty"] == [] + assert replaced["list"][0] == "a" + assert replaced["list"][1] == 25 def test_macros_in_macros(register_macros, macro_store_reset): @@ -264,7 +270,7 @@ def test_macros_in_macros(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: one_macro @@ -293,11 +299,11 @@ extroot: !macro assert "test_macro" not in cfood["extroot"] assert cfood["extroot"]["macro_top"]["not_macro"]["a"] == 26 d = cfood["extroot"]["macro_top"] - assert d["macro_sub_17"]["b"] == "17" + assert d["macro_sub_17"]["b"] == 17 assert d["macro_sub_17"]["another_param"] == 3 - assert d["macro_sub_25"]["b"] == "25" + assert d["macro_sub_25"]["b"] == 25 assert 
d["macro_sub_25"]["another_param"] == 3 - assert d["macro_sub_98"]["b"] == "98" + assert d["macro_sub_98"]["b"] == 98 assert d["macro_sub_98"]["another_param"] == 3 @@ -309,7 +315,7 @@ def test_silent_overwrite(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: one_macro @@ -340,7 +346,7 @@ def test_circular_macro_definition(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_one @@ -389,7 +395,7 @@ def test_use_macro_twice(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_twice @@ -410,9 +416,9 @@ extroot: !macro """) for name in ["once", "twice", "default_name"]: assert name in cfood["extroot"] - assert cfood["extroot"]["once"]["something"]["a"] == "4" - assert cfood["extroot"]["twice"]["something"]["a"] == "5" - assert cfood["extroot"]["default_name"]["something"]["a"] == "4" + assert cfood["extroot"]["once"]["something"]["a"] == 4 + assert cfood["extroot"]["twice"]["something"]["a"] == 5 + assert cfood["extroot"]["default_name"]["something"]["a"] == 4 # Code sample to generate the expanded macro: # with open("expanded_test_macro.yaml", "w") as f: # f.write(yaml.dump(cfood)) @@ -423,7 +429,7 @@ def test_documentation_example_2(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: MarkdownFile @@ -461,7 +467,7 @@ def test_documentation_example_1(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: SimulationDatasetFile @@ -510,7 +516,7 @@ def test_def_replacements(): cfood = _temp_file_load(""" --- metadata: - crawler-version: 0.5.1 + crawler-version: 0.7.2 macros: - !defmacro name: test_def_replacements @@ -573,9 +579,9 @@ testnode: test2: a: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == 4 def test_variable_in_macro_definition(register_macros, macro_store_reset): @@ -598,7 +604,7 @@ testnode: - a: 2 b: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == 4 diff --git a/unittests/test_parent_cfood.yml b/unittests/test_parent_cfood.yml index b8d0eaf597641d311cb70017dc2bc75c7c3434f3..cd63e81b270117841128a34765a9635a036c52ec 100644 --- a/unittests/test_parent_cfood.yml +++ b/unittests/test_parent_cfood.yml @@ -1,6 +1,6 @@ --- metadata: - crawler-version: 0.6.1 + crawler-version: 0.7.2 --- Definitions: type: Definitions diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 
c0ce736fc4bed18f371f1626b6bc451ee103db49..226b5040547f0e003729dba63622edf836552f18 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -31,7 +31,7 @@ from pathlib import Path from tempfile import NamedTemporaryFile from unittest.mock import MagicMock, Mock, patch -import caosdb as db +import linkahead as db import pytest import yaml from caoscrawler.crawl import Crawler diff --git a/unittests/test_schema.py b/unittests/test_schema.py index 0d5bebce98fbc8c789c1080bcf3919f128bdbf54..3b576c9b72e41b799355f927d6e5387f1c187a18 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -3,7 +3,7 @@ # A. Schlemmer, 06/2021 from importlib_resources import files -import caosdb as db +import linkahead as db from os.path import join, dirname from caoscrawler import Crawler diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..7ffc18dba43a6f7cd3c9fbc9273da349b4ec3c6e --- /dev/null +++ b/unittests/test_spss_converter.py @@ -0,0 +1,83 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
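+#
+# Note: this module exercises the SPSSConverter that is listed under "New
+# converters" in the CHANGELOG; each row of a .sav file is expected to show
+# up as one DictElement child.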
+ +"""Testing converter for SPSS files.""" + +import datetime +import importlib +import re +from pathlib import Path + +import numpy as np +import pytest + +from caoscrawler.converters import ( + ConverterValidationError, + SPSSConverter, +) +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_spss_converter(converter_registry): + converter = SPSSConverter({ + "match": ("sample.sav") + }, + "ThisConverterNameIsIrrelevant", converter_registry + ) + + spss_dir = UNITTESTDIR / "test_tables" / "spss" + for sav_file, length, thistype in [ + (File("sample.sav", spss_dir / "sample.sav"), 5, str), + (File("sample.sav", spss_dir / "sample_large.sav"), 485, int), + ]: + m = converter.match(sav_file) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, sav_file) + assert len(children) == length + + for ii, child in enumerate(children): + assert child.__class__ == DictElement + assert child.name == str(ii) + my_dict = child.value + assert isinstance(my_dict["mychar"], str) + assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"]) + assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"]) + assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) + assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}" + assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}" diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..9015e74be69c60c43ece80a2f742d6e9b7badda6 --- /dev/null +++ b/unittests/test_sync_graph.py @@ -0,0 +1,685 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
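+#
+# Note: SyncGraph and SyncNode are the reworked internals for identifiable
+# resolving and merging of entities mentioned in the CHANGELOG; the tests
+# below cover graph flattening, the reference maps and node merging.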
+# + +import logging +from functools import partial +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest +from test_crawler import (basic_retrieve_by_name_mock_up, + mock_cached_only_rt_allow_empty, + mock_get_entity_by, + ) + +from caoscrawler.exceptions import (ImpossibleMergeError, + MissingIdentifyingProperty, + MissingRecordType, + ) +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value +from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list + +from itertools import product + + +@pytest.fixture +def simple_adapter(): + # different RTs with different registered identifiables to allow to test various behavior + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable( + "RT1", + db.RecordType().add_parent("RT1").add_property("RT2")) + ident_adapter.register_identifiable( + "RT2", + db.RecordType().add_parent("RT2").add_property("is_referenced_by", ["RT1", "RT3"])) + ident_adapter.register_identifiable( + "RT3", + db.RecordType().add_parent("RT3").add_property("a")) + ident_adapter.register_identifiable( + "RT4", + db.RecordType().add_parent("RT4").add_property("RT3")) + ident_adapter.register_identifiable( + "RT5", + db.RecordType().add_parent("RT5").add_property("name")) + return ident_adapter + + +def test_create_flat_list(): + a = db.Record() + b = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="b", value=b) + flat = SyncGraph._create_flat_list([a]) + assert len(flat) == 2 + assert a in flat + assert b in flat + c = db.Record() + c.add_property(name="a", value=a) + # This would cause a recursion error if it is not dealt with properly. + a.add_property(name="c", value=c) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert c in flat + + # Test for lists: + a = db.Record() + b = db.Record() + d = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="list", value=[b, d]) + flat = SyncGraph._create_flat_list([a]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert d in flat + + c = db.Record() + c.add_property(name="a", value=a) + # This would cause a recursion error if it is not dealt with properly. 
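+    # (c references a via its property "a", and the list added below makes a
+    # reference c in turn, so the flattening again has to survive a cycle.)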
+ a.add_property(name="second_list", value=[b, d, c]) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 4 + assert a in flat + assert b in flat + assert c in flat + assert d in flat + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_create_reference_mapping(): + a = SyncNode(db.Record().add_parent("RT1"), + db.RecordType().add_property("is_referenced_by", ["RT2"])) + b = SyncNode(db.Record(id=132).add_parent("RT2").add_property('a', a), + db.RecordType().add_property("a")) + ses = [a, b] + + mappings = SyncGraph._create_reference_mapping(ses) + # test initialization + for index, mapping in product((0, 1), mappings): + assert id(ses[index]) in mapping + + (forward_references, backward_references, forward_references_id_props, + backward_references_id_props, forward_references_backref, + backward_references_backref) = mappings + + # a has no ref + assert len(forward_references[id(a)]) == 0 + assert backward_references[id(a)] == set([b]) + # b does + assert forward_references[id(b)] == set([a]) + assert backward_references[id(b)] == set() + # a has no identifying reference + assert forward_references_id_props[id(a)] == set() + assert backward_references_id_props[id(a)] == set([b]) + # b has an identifying reference + assert forward_references_id_props[id(b)] == set([a]) + assert backward_references_id_props[id(b)] == set() + # a has an identifying back reference + assert forward_references_backref[id(a)] == set() + assert backward_references_backref[id(a)] == set([b]) + # b does not + assert forward_references_backref[id(b)] == set([a]) + assert backward_references_backref[id(b)] == set() + + +@patch("caoscrawler.sync_graph.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +def test_SyncGraph_init(): + # trivial case + a = db.Record(id=101).add_parent("A") + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + SyncGraph([a], ident_adapter) + SyncGraph([], ident_adapter) # should not fail either... 
+ # test whether missing identifying properties cause an exception + with pytest.raises(MissingIdentifyingProperty): + SyncGraph([db.Record().add_parent("A")], ident_adapter) + + entlist = [ + db.Record(id=101).add_parent("A"), + db.Record(id=102).add_parent("A"), + db.File(path='a').add_parent("A"), + db.File(path='b').add_parent("A"), + db.Record(id=103).add_parent("A"), + db.Record(id=104).add_parent("A").add_property(name='prop_ident', value="MERGEME"), + db.Record().add_parent("A").add_property(name='prop_ident', value="MERGEME"), + db.File(path='a', file='b').add_parent("A"), + db.Record(id=101).add_parent("A"), + db.Record().add_parent("A").add_property(name='prop_ident', value="other"), + db.Record().add_parent("A").add_property(name='prop_ident', + value=db.Record().add_parent("A") + .add_property(name='prop_ident', value="other")), + db.File(path='a', file='b').add_parent("A"), + db.Record(id=101).add_parent("A"), + ] + st = SyncGraph(entlist, ident_adapter) + # all nodes with ID=101 have been merged + assert len([el for el in st.nodes if el.id == 101]) == 1 + # all nodes with path='a' have been merged + assert len([el for el in st.nodes if el.path == 'a']) == 1 + # all nodes with ID or path were removed from unchecked + for el in st.nodes: + if el.id is not None or el.path is not None: + assert el not in st.unchecked + # all nodes with ID are in the ID lookup + for el in st.nodes: + if el.id is not None: + assert st._id_look_up[el.id] is el + # all nodes with path are in the path lookup + for el in st.nodes: + if el.path is not None: + assert st._path_look_up[el.path] is el + # all nodes with identifiable are in the identifiable lookup + for el in st.nodes: + if el.identifiable is not None: + assert st._identifiable_look_up[el.identifiable.get_representation()] is el + # The node, which has no ID but has an identifiable, was merged with another node with ID (due + # to the shared identifiable) + new_one = [el for el in st.nodes if len(el.properties) > 0 + and el.properties[0].value == "MERGEME"] + assert len(new_one) == 1 + assert new_one[0].id == 104 + # every node that does not rely on something unchecked has an identifiable or an ID + for el in st.nodes: + if not st._identity_relies_on_unchecked_entity(el): + assert el.identifiable is not None or el.id is not None + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_merge_into_trivial(simple_adapter): + # simplest case: a -> c + # b + # (a reference c; b does not reference anything; a & b have the same target + # record) + c = db.Record(name='c').add_parent("RT2") + a = db.Record(name='a').add_parent("RT1").add_property('RT2', c) + b = db.Record(id=101).add_parent("RT1") + + st = SyncGraph([a, b], simple_adapter) + se_a, se_b, se_c = st.nodes + assert se_a.name == 'a' + assert se_b.id == 101 + assert se_c.name == 'c' + + # CHECK REFERENCE MAP (before merge): + # c is referenced by a + assert len(st.forward_references[id(se_a)]) == 1 + assert se_c in st.forward_references[id(se_a)] + assert len(st.forward_references[id(se_b)]) == 0 + assert len(st.forward_references[id(se_c)]) == 0 + assert len(st.backward_references[id(se_a)]) == 0 + assert len(st.backward_references[id(se_b)]) == 0 + assert len(st.backward_references[id(se_c)]) == 1 + assert se_a in st.backward_references[id(se_c)] + + assert len(st.forward_references_id_props[id(se_a)]) == 1 + assert se_c in st.forward_references_id_props[id(se_a)] + assert len(st.forward_references_id_props[id(se_b)]) == 0 + 
assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert len(st.backward_references_id_props[id(se_a)]) == 0
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_a in st.backward_references_id_props[id(se_c)]
+
+    assert len(st.forward_references_backref[id(se_a)]) == 1
+    assert se_c in st.forward_references_backref[id(se_a)]
+    assert len(st.forward_references_backref[id(se_b)]) == 0
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert len(st.backward_references_backref[id(se_a)]) == 0
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_a in st.backward_references_backref[id(se_c)]
+
+    st.set_id_of_node(se_a, 101)
+
+    # CHECK REFERENCE MAP (after merge):
+    # c is now referenced by b
+    assert id(se_a) not in st.forward_references
+    assert len(st.forward_references[id(se_b)]) == 1
+    assert se_c in st.forward_references[id(se_b)]
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 1
+    assert se_b in st.backward_references[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_id_props
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_id_props
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_backref
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_backref
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_b in st.backward_references_backref[id(se_c)]
+
+
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_merge_into_simple(simple_adapter):
+    # simple case: a -> c <- b (a & b reference c; a & b have the same target record)
+    c = db.Record(name='c').add_parent("RT2")
+    a = db.Record().add_parent("RT1").add_property('RT2', c)
+    b = db.Record().add_parent("RT1").add_property('RT2', c)
+
+    st = SyncGraph([a, b], simple_adapter)
+    se_a = st.nodes[0]
+    se_b = st.nodes[1]
+    se_c = st.nodes[2]
+
+    # CHECK REFERENCE MAP:
+    # c is referenced by a & b
+    assert len(st.forward_references[id(se_a)]) == 1
+    assert se_c in st.forward_references[id(se_a)]
+    assert len(st.forward_references[id(se_b)]) == 1
+    assert se_c in st.forward_references[id(se_b)]
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert len(st.backward_references[id(se_a)]) == 0
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 2
+    assert se_a in st.backward_references[id(se_c)]
+    assert se_b in st.backward_references[id(se_c)]
+
+    assert len(st.forward_references_id_props[id(se_a)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_a)]
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert len(st.backward_references_id_props[id(se_a)]) == 0
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 2
+    assert se_a in st.backward_references_id_props[id(se_c)]
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert len(st.forward_references_backref[id(se_a)]) == 1
+    assert se_c in st.forward_references_backref[id(se_a)]
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert len(st.backward_references_backref[id(se_a)]) == 0
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 2
+    assert se_a in st.backward_references_backref[id(se_c)]
+    assert se_b in st.backward_references_backref[id(se_c)]
+
+    st._merge_into(se_a, se_b)
+
+    # CHECK REFERENCE MAP (after merge):
+    # c is now referenced by b
+    # (same situation as above)
+    assert id(se_a) not in st.forward_references
+    assert len(st.forward_references[id(se_b)]) == 1
+    assert se_c in st.forward_references[id(se_b)]
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 1
+    assert se_b in st.backward_references[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_id_props
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_id_props
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_backref
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_backref
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_b in st.backward_references_backref[id(se_c)]
+
+
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_backward_references_backref():
+    # We use the reference as identifying reference in both directions. Thus the map is the same
+    # for all three categories: references, id_references and id_referenced_by
+    ident_a = db.RecordType().add_parent("BR").add_property("name")
+    ident_b = db.RecordType().add_parent("C").add_property("is_referenced_by", ["BR"])
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_adapter.register_identifiable("BR", ident_a)
+    ident_adapter.register_identifiable("C", ident_b)
+
+    referenced = db.Record(name="B").add_parent("C")
+    ent_list = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]
+
+    st = SyncGraph(ent_list, ident_adapter)
+    assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])]
+
+
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_set_id_of_node(simple_adapter):
+    # setting the id should lead to the node being marked as existing
+    ent_list = [db.Record(name='a').add_parent("RT5")]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 1
+    st.set_id_of_node(st.unchecked[0], 101)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._existing
+
+    # setting the id to None should lead to the node being marked as missing
+    ent_list = [db.Record().add_parent("RT1").add_property(name="RT2", value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 1
+    # is automatically set during initialization of the graph
+    assert st.nodes[0].identifiable is not None
+    st.set_id_of_node(st.unchecked[0])
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._missing
+
+    # setting the id to one that already exists should lead to a merge
+    ent_list = [
+        db.Record(id=101).add_parent("RT5"),
+        db.Record(name='a').add_parent("RT5").add_property(name="RT2", value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 1
+    st.set_id_of_node(st.unchecked[0], 101)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert st.nodes[0].properties[0].name == "RT2"
+
+    # setting the id to None should lead to dependent nodes being marked as missing
+    ent_list = [
+        db.Record().add_parent("RT3").add_property(name="a", value=1).add_property(
+            name="RT2", value=db.Record().add_parent("RT2")),
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    st.set_id_of_node(st.unchecked[0])
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._missing
+    assert id(st.nodes[1]) in st._missing
+
+    # same as above but with backref
+    ent_list = [
+        db.Record()
+        .add_parent("RT4")
+        .add_property(name="RT3",
+                      value=db.Record().add_parent("RT3").add_property(name="a", value=1)),
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert st.unchecked[1].identifiable is not None
+    st.set_id_of_node(st.unchecked[1])
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._missing
+    assert id(st.nodes[1]) in st._missing
+
+    # setting an id might allow checking another node that depends on the former
+    ent_list = [
+        db.Record()
+        .add_parent("RT4")
+        .add_property(name="RT3",
+                      value=db.Record().add_parent("RT3").add_property(name="a", value=1)),
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert st.nodes[0].identifiable is None
+    assert st.nodes[1].identifiable is not None
+    st.set_id_of_node(st.unchecked[1], 111)
+    assert st.nodes[0].identifiable is not None
+    assert st.nodes[1].identifiable is not None
+
+    # same as above but going one step further: the new identifiable allows merging that node
+    ent_list = [
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3",
+                       value=db.Record().add_parent("RT3").add_property(name="a", value=1))),
+
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3", value=111))
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert st.nodes[0].identifiable is None
+    assert st.nodes[1].identifiable is not None
+    assert st.nodes[2].identifiable is not None
+    assert len(st.nodes) == 3
+    st.set_id_of_node(st.unchecked[2], 111)
+    assert st.nodes[0].identifiable is not None
+    assert len(st.nodes) == 2
+
+
+@patch("caoscrawler.sync_graph.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+def test_merging(simple_adapter):
+    # identifying information can be given at various locations in the hierarchical tree
+    # test whether an object is correctly combined for all cases
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType().add_parent("A").add_property("name").add_property("a")
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+
+    # merging based on id
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101).add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert 101 == st.nodes[0].id
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on path
+    ent_list = [
+        db.File(path='101').add_parent("A"),
+        db.File(path='101').add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert '101' == st.nodes[0].path
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on identifiable (non-identifying properties are ignored)
+    ent_list = [
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=1),
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=2)]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert st.nodes[0].id is None
+    assert '101' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
+
+    # Merging a mix. One Record needs the identifiable to be merged. But the identifying
+    # information is scattered in the other case.
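+    # (Below, the ID, the name and the identifying property arrive on
+    # different records and must all be combined into a single node.)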
+ ent_list = [ + db.Record(id=101).add_parent("A"), + db.Record(id=101, name='a').add_parent("A"), + db.Record(id=101).add_parent("A").add_property('a', value=1), + db.Record(name='a').add_parent("A").add_property('a', value=1)] + + st = SyncGraph(ent_list, ident_adapter) + assert len(st.nodes) == 1 + assert len(st.unchecked) == 0 + assert 'a' == st.nodes[0].name + assert "A" == st.nodes[0].parents[0].name + assert 1 == st.nodes[0].properties[0].value + assert "a" == st.nodes[0].properties[0].name + assert 101 == st.nodes[0].id + + # test that adding an ID can lead to a cascade of merges + # This also tests whether setting something to missing allows to create an identifiable + # and thus allows a merge + subtree = db.Record(name='a').add_parent("A").add_property('a', value=db.Record( + name='b').add_parent("A").add_property('a', value=db.Record( + name='c').add_parent("A").add_property('a', value="missing"))) + ent_list = [ + db.Record(id=101).add_parent("A"), + db.Record(id=101, name='z').add_parent("A"), + db.Record(id=101).add_parent("A").add_property('a', value=subtree), + db.Record(name='z').add_parent("A").add_property('a', value=subtree), + ] + + st = SyncGraph(ent_list, ident_adapter) + assert len(st.nodes) == 5 + assert len(st.unchecked) == 4 + missing_one = [el for el in st.nodes if el.name == 'c'][0] + st.set_id_of_node(missing_one) + # setting c to missing means that b cannot exist which means that a cannot exist, this allows + # to merge the two z nodes + assert len(st.nodes) == 4 + assert len(st.unchecked) == 0 + + +def test_update_of_reference_values(simple_adapter): + # multiple nodes are merged including one that is referenced + # assure that this still leads to the value of the property of the referencing node to be + # updated, when the id is set. 
(Value object is replaced appropriately)
+    a = db.Record().add_parent("RT3").add_property('a', value=1)
+    ent_list = [
+        a,
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT4").add_property('RT3', value=a),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert 'RT4' == st.nodes[1].parents[0].name
+    st.set_id_of_node(st.nodes[0], 101)
+    b_prop = st.nodes[1].properties[0].value
+    assert b_prop.id == 101
+
+
+def test_ignoring_irrelevant_references(simple_adapter):
+    # make sure that a circle of references is no problem if one reference is not identifying
+    b = db.Record(name='b').add_parent("RT5")
+    a = db.Record().add_parent("RT3").add_property('a', value=b)
+    b.add_property('a', value=a)
+    ent_list = [a, b]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert st.nodes[1].name == 'b'
+
+    # a relies on b
+    assert st._identity_relies_on_unchecked_entity(st.nodes[0])
+    # b relies on nothing
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[1])
+    # set ID of b
+    st.set_id_of_node(st.nodes[1], 101)
+    assert len(st.unchecked) == 1
+    # now a no longer relies on unchecked
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[0])
+
+# 'is implementation insufficient'
+
+
+@pytest.mark.xfail()
+def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
+    crawler = crawler_mocked_identifiable_retrieve
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
+    a = db.Record(name='a').add_parent("C")
+    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
+    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
+                                                         ).add_property(name="C", value=b)
+    d = db.Record(name='c').add_parent("C")
+    a.add_property(name="C", value=c)
+    flat = [a, b, c]
+    circle = Crawler.detect_circular_dependency(flat)
+    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]
+
+    assert Crawler.detect_circular_dependency([d]) is None
+    st = SyncGraph(flat, crawler.identifiableAdapter)
+    with pytest.raises(RuntimeError):
+        _, _ = crawler._split_into_inserts_and_updates(st)
+    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
+    assert "Found circular dependency" in caplog.text
+    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text
+    caplog.clear()
+
+
+def test_set_each_scalar_value():
+    """Test whether properties with None as value are treated appropriately."""
+    a = SyncNode(db.Record().add_parent("RT1").add_property(name="bla"),
+                 db.RecordType().add_property("is_referenced_by", ["RT2"]))
+    _set_each_scalar_value(a, lambda x: False, None)
+    _set_each_scalar_value(a, lambda x: isinstance(x, SyncNode), None)
+    _set_each_scalar_value(a, lambda x: x is None, lambda x: 42)
+    assert a.properties[0].value == 42
+    _set_each_scalar_value(a, lambda x: x == 42, lambda x: None)
+    assert a.properties[0].value is None
+
+
+@patch("caoscrawler.identifiable_adapters.cached_query",
+       new=Mock(side_effect=mock_cached_only_rt_allow_empty))
+def test_merge_referenced_by():
+    """Merging two
entities that are referenced by a third entity with nonexistent RecordType. + + See also https://gitlab.com/linkahead/linkahead-crawler/-/issues/95 + """ + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object({ + "RT_A": ["name"], + "RT_B": [{"is_referenced_by": ["RT_A"]}, "my_id"] + }) + crawled_data: list = [] + references: list = [] + for ii in [0, 1]: + rec = db.Record().add_parent("RT_B").add_property("my_id", value=ii) + references.append(rec) + crawled_data.append(rec) + rec_a = db.Record(name="Rec_A").add_parent("RT_A") + rec_a.add_property("my_ref", value=references) + crawled_data.append(rec_a) + + with pytest.raises(MissingRecordType) as mrt: + SyncGraph(crawled_data, ident) + assert str(mrt.value).endswith("Record type could not be found on server: RT_A") diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py new file mode 100644 index 0000000000000000000000000000000000000000..668a53470d028dfcfce7bb5785d68b685b034595 --- /dev/null +++ b/unittests/test_sync_node.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +from functools import partial +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest +from caoscrawler.exceptions import ImpossibleMergeError +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.sync_graph import SyncGraph +from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list + +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + + +def assert_parents_equal(p1, p2): + """Special assertion for comparing parents.""" + for a, b in zip(p1, p2): + assert a.id == b.id + assert a.name == b.name + + +def assert_properties_equal(p1, p2): + """Special assertion for comparing properties.""" + for a, b in zip(p1, p2): + assert a.id == b.id + assert a.name == b.name + assert a.value == b.value + assert a.datatype == b.datatype + + +def test_sync_node(): + # initialization + rec = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent("B") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec.description = "hallo" + sna = SyncNode(rec) + # check information stored in initialized SyncNode + assert "Record" in str(sna) + assert sna.id == rec.id + assert sna.role == rec.role + assert sna.name == rec.name + assert sna.description == rec.description + assert_parents_equal(sna.parents, rec.parents) + assert_properties_equal(sna.properties, rec.properties) + # ... 
special case File (path and file attributes) + fi = db.File(id=101, name='101', path='/a/') + snb = SyncNode(fi) + assert snb.role == fi.role + assert snb.name == fi.name + assert snb.id == fi.id + assert snb.path == fi.path + assert snb.file == fi.file + + # check information in exported db.Entity + export = sna.export_entity() + assert export.id == rec.id + assert export.role == rec.role + assert export.name == rec.name + assert export.description == rec.description + assert_parents_equal(export.parents, rec.parents) + assert_properties_equal(export.properties, rec.properties) + export = snb.export_entity() + assert export.role == fi.role + assert export.name == fi.name + assert export.id == fi.id + assert export.path == fi.path + assert export.file == fi.file + + # merge no common information + # --------------------------- + rec_a = (db.Record(name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + + rec_b = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec_b.description = "tja" + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + # test information in updated node + assert sn_a.id == rec_b.id + assert sn_a.role == rec_a.role + assert sn_a.name == rec_a.name + assert sn_a.description == rec_b.description + for p in rec_a.parents + rec_b.parents: + assert p in sn_a.parents + for p in rec_a.properties + rec_b.properties: + assert p in sn_a.properties + # Check for duplicated property: + ps = [p for p in sn_a.properties if p.name == "a"] + assert len(ps) == 2 + assert ps[0].value == "a" + assert ps[1].value == "a" + + # test information in exported entity + export = sn_a.export_entity() + assert export.id == rec_b.id + assert export.name == rec_a.name + for p in rec_a.parents + rec_b.parents: + assert parent_in_list(p, export.parents) + for p in rec_a.properties + rec_b.properties: + if p.name is not None: + assert p.name in [el.name for el in export.properties] + if p.id is not None: + assert p.id in [el.id for el in export.properties] + assert len(export.properties) == 2 + assert export.get_property('a').value == 'a' + assert export.get_property(103).value == 'b' + assert export.description == rec_b.description + assert export.role == rec_a.role + + # merge with common information + # ----------------------------- + rec_a = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + rec_b = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + assert sn_a.id == rec_b.id + assert sn_a.name == rec_a.name + for p in rec_a.parents + rec_b.parents: + assert parent_in_list(p, sn_a.parents) + for p in rec_a.properties + rec_b.properties: + assert property_in_list(p, sn_a.properties) + assert sn_a.description == rec_b.description + assert sn_a.role == rec_a.role + + # merge with conflicting information + # ---------------------------------- + # ID mismatch + sn_a = SyncNode(db.Record(id=102)) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(id=101))) + + # name mismatch + sn_a = SyncNode(db.Record(name='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(name='101'))) + + # type mismatch + sn_a = 
SyncNode(db.Record(name='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.File(name='102'))) + + # description mismatch + sn_a = SyncNode(db.Record(description='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(description='101'))) + + # path mismatch + sn_a = SyncNode(db.File(path='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.File(path='101'))) + + # identifiable mismatch + sn_a = SyncNode(db.File(path='102')) + sn_a.identifiable = Identifiable(name='a') + sn_b = SyncNode(db.File(path='101')) + sn_b.identifiable = Identifiable(name='b') + with pytest.raises(ValueError, match="identifiable"): + sn_a.update(sn_b) + + +def test_export_node(): + rec_a = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value=[SyncNode(db.Record())]) + .add_property(name='b', id=103, value='b')) + + sn_a = SyncNode(rec_a) + exp = sn_a.export_entity() + assert exp.id == rec_a.id + assert exp.name == rec_a.name + for p in rec_a.parents: + assert len([el for el in exp.parents if p.name == el.name]) == 1 + for p in rec_a.properties: + assert p.value == exp.get_property(p.name).value + if isinstance(p.value, list): + assert len(p.value) == len(exp.get_property(p.name).value) + assert len(exp.properties) == len(rec_a.properties) + assert len(exp.parents) == len(rec_a.parents) + + # --------------------------------------------------------------------------------------------- + # NOTE: in the following we create a SyncNode object with twice the same Property as a short + # hand for a SyncNode that was created from one Entity with such a Property and then updating + # it with another SyncNode that also has the Property + # --------------------------------------------------------------------------------------------- + + # same property name, different values + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value='b') + .add_property(name="a", value='a')) + + # there should be a warning when multiproperties are used + with pytest.warns(UserWarning) as caught: + SyncNode(rec_a) + messages = {str(w.message) for w in caught} + assert ("Multiproperties are not supported by the crawler.") in messages + + with pytest.raises(ImpossibleMergeError): + exp = SyncNode(rec_a).export_entity() + + # SyncNodes with same ID are considered equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record(id=1))) + .add_property(name="a", value=SyncNode(db.Record(id=1)))) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value.id == 1 + # SyncNodes convert multi properties into single properties + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # same SyncNode object is obviously equal + sn = SyncNode(db.Record(id=1)) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=sn) + .add_property(name="a", value=sn)) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value.id == 1 + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # different SyncNode Objects (without an ID) are not equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record())) + .add_property(name="a", value=SyncNode(db.Record()))) + + with pytest.raises(ImpossibleMergeError): + exp = SyncNode(rec_a).export_entity() + + # different SyncNode Objects with 
differing ID are not equal
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record(id=1)))
+             .add_property(name="a", value=SyncNode(db.Record(id=2))))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # SyncNodes with same ID are considered equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]))
+
+    exp = SyncNode(rec_a).export_entity()
+    assert exp.get_property('a').value[0].id == 1
+    assert len([p for p in exp.properties if p.name == "a"]) == 1
+
+    # SyncNodes with same ID are not equal when in different order (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # same SyncNode object is obviously equal (list)
+    sn = SyncNode(db.Record(id=1))
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[sn])
+             .add_property(name="a", value=[sn]))
+
+    exp = SyncNode(rec_a).export_entity()
+    assert exp.get_property('a').value[0].id == 1
+
+    # different SyncNode Objects are not equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record())])
+             .add_property(name="a", value=[SyncNode(db.Record())]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # different SyncNode Objects with differing IDs are not equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=2))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # list vs no list
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record(id=1)))
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
+
+    # different list sizes
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError):
+        exp = SyncNode(rec_a).export_entity()
diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py
index 178393d9345bd8a6846b66e362ce4f7edac382ee..3b563fd3179968fd90b1c92b9bc5bf0db9ed0858 100644
--- a/unittests/test_table_converter.py
+++ b/unittests/test_table_converter.py
@@ -32,7 +32,7 @@
 import os
 from os.path import basename, dirname, join
 from pathlib import Path
-import caosdb as db
+import linkahead as db
 import pytest
 from caoscrawler import Crawler
 from caoscrawler.converters import (Converter, ConverterValidationError,
diff --git a/unittests/test_tables/spss/CITATION.cff b/unittests/test_tables/spss/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..140fcc071bf2d5f5709cf31bf11bd9676b81ca5f
--- /dev/null
+++ b/unittests/test_tables/spss/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Fajardo"
+  given-names: "Otto"
+  orcid: "https://orcid.org/0000-0002-3363-9287"
+title: "Pyreadstat"
+version: 1.2.7
+doi: 10.5281/zenodo.6612282
+date-released: 2018-09-24
+url: "https://github.com/Roche/pyreadstat"
diff --git a/unittests/test_tables/spss/LICENSE b/unittests/test_tables/spss/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a2f94b1a2a5a4255fc8ef6d0beb94cce89f545e8
--- /dev/null
+++ b/unittests/test_tables/spss/LICENSE
@@ -0,0 +1,210 @@
+Test data files were copied from [pyreadstat](https://github.com/Roche/pyreadstat); they are
+licensed under the Apache License, cited below.
+
+Copyright (C) 2018-2024 Otto Fajardo
+Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+
+pyreadstat license:
+---------------------------------------------------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner.
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/unittests/test_tables/spss/sample.sav b/unittests/test_tables/spss/sample.sav
new file mode 100644
index 0000000000000000000000000000000000000000..20d0c5ce6689a60adfa329a17b4347274e9a863b
Binary files /dev/null and b/unittests/test_tables/spss/sample.sav differ
diff --git a/unittests/test_tables/spss/sample_large.sav b/unittests/test_tables/spss/sample_large.sav
new file mode 100644
index 0000000000000000000000000000000000000000..b0c16c1390a15a4f62a859ade76aa17b89c6ae40
Binary files /dev/null and b/unittests/test_tables/spss/sample_large.sav differ
diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py
index 02d932d13cc3fad52048b08e2b9fe56f11db2ae7..4ed12751d9052c839aa4db4abd586c419bed1018 100644
--- a/unittests/test_transformers.py
+++ b/unittests/test_transformers.py
@@ -34,7 +34,7 @@ from pathlib import Path
 from tempfile import NamedTemporaryFile
 from unittest.mock import MagicMock, Mock, patch
 
-import caosdb as db
+import linkahead as db
 import pytest
 import yaml
 from caoscrawler.converters import Converter, ListElementConverter
@@ -46,6 +46,38 @@ from pytest import raises
 
 UNITTESTDIR = Path(__file__).parent
 
+@pytest.fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "Directory": {
+            "converter": "DirectoryConverter",
+            "package": "caoscrawler.converters"},
+        "MarkdownFile": {
+            "converter": "MarkdownFileConverter",
+            "package": "caoscrawler.converters"},
+        "Date": {
+            "converter": "DateElementConverter",
+            "package": "caoscrawler.converters"},
+        "DictElement": {
+            "converter": "DictElementConverter",
+            "package": "caoscrawler.converters"},
+        "TextElement": {
+            "converter": "TextElementConverter",
+            "package": "caoscrawler.converters"},
+        "ListElement": {
+            "converter": "ListElementConverter",
+            "package": "caoscrawler.converters"},
+        "JSONFile": {
+            "converter": "JSONFileConverter",
+            "package": "caoscrawler.converters"},
+    }
+
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
+
+
 def test_simple_transformer():
     """
     Test the correct list of returned records by the scanner using the
@@ -82,38 +114,6 @@ def test_simple_transformer():
         assert False
 
 
-@pytest.fixture
-def converter_registry():
-    converter_registry: dict[str, dict[str, str]] = {
-        "Directory": {
-            "converter": "DirectoryConverter",
-            "package": "caoscrawler.converters"},
-        "MarkdownFile": {
-            "converter": "MarkdownFileConverter",
-            "package": "caoscrawler.converters"},
-        "Date": {
-            "converter": "DateElementConverter",
-            "package": "caoscrawler.converters"},
-        "DictElement": {
-            "converter": "DictElementConverter",
-            "package": "caoscrawler.converters"},
-        "TextElement": {
-            "converter": "TextElementConverter",
-            "package": "caoscrawler.converters"},
-        "ListElement": {
-            "converter": "ListElementConverter",
-            "package": "caoscrawler.converters"},
-        "JSONFile": {
-            "converter": "JSONFileConverter",
-            "package": "caoscrawler.converters"},
-    }
-
-    for key, value in converter_registry.items():
-        module = importlib.import_module(value["package"])
-        value["class"] = getattr(module, value["converter"])
-    return converter_registry
-
-
 def test_apply_replace(converter_registry):
     cfood_def = {"type": 'ListElement',
                  "match_name": ".*",
                  'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{
@@ -146,3 +146,21 @@
 
     conv.apply_transformers(values, transformer_functions)
     assert values['b'] == "16:45"
+
+
+def test_empty_functions_list(converter_registry):
+    cfood_def = {"type": 'ListElement',
+                 "match_name": ".*",
+                 'transform': {'test': {'in': '$a', 'out': '$b',
+                                        'functions': []}}}
+    values = GeneralStore()
+    values["a"] = "16_45"
+
+    # transformer_functions = create_transformer_registry(crawler_definition)
+    transformer_functions = {"replace": replace}
+
+    conv = ListElementConverter(definition=cfood_def, name='test',
+                                converter_registry=converter_registry)
+
+    conv.apply_transformers(values, transformer_functions)
+    assert values['b'] == "16_45"
diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py
index 5a80ab9b230db4540d741bf8fa4f9d11b5158aab..dfb79c8b6b10909952174cf24c3aa9198f3b7743 100644
--- a/unittests/test_utilities.py
+++ b/unittests/test_utilities.py
@@ -19,7 +19,10 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
# +import pytest + from caoscrawler.crawl import split_restricted_path +from caoscrawler.utils import MissingImport def test_split_restricted_path(): @@ -33,3 +36,33 @@ def test_split_restricted_path(): assert split_restricted_path("/test//bla") == ["test", "bla"] assert split_restricted_path("//test/bla") == ["test", "bla"] assert split_restricted_path("///test//bla////") == ["test", "bla"] + + +def test_dummy_class(): + Missing = MissingImport(name="Not Important", hint="Do the thing instead.") + with pytest.raises(RuntimeError) as err_info_1: + print(Missing.__name__) + with pytest.raises(RuntimeError) as err_info_2: + Missing() + with pytest.raises(RuntimeError) as err_info_3: + print(Missing.foo) + + for err_info in (err_info_1, err_info_2, err_info_3): + msg = str(err_info.value) + assert "(Not Important)" in msg + assert msg.endswith("Do the thing instead.") + + MissingErr = MissingImport(name="Not Important", hint="Do the thing instead.", + err=ImportError("Old error")) + with pytest.raises(RuntimeError) as err_info_1: + print(MissingErr.__name__) + with pytest.raises(RuntimeError) as err_info_2: + MissingErr() + with pytest.raises(RuntimeError) as err_info_3: + print(MissingErr.foo) + + for err_info in (err_info_1, err_info_2, err_info_3): + msg = str(err_info.value) + assert "(Not Important)" in msg + orig_msg = str(err_info.value.__cause__) + assert orig_msg == "Old error" diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index 09f78df661d82970e7264996102eff8881ee19ec..90d144b04a4e1271f74b769759e3f201007af705 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -25,7 +25,7 @@ from os.path import basename, dirname, join from pathlib import Path from unittest.mock import MagicMock, Mock -import caosdb as db +import linkahead as db import pytest import yaml from caoscrawler import Crawler @@ -35,7 +35,7 @@ from caoscrawler.identifiable_adapters import (IdentifiableAdapter, from caoscrawler.scanner import scan_directory from caoscrawler.structure_elements import (DictListElement, DictTextElement, File) -from caosdb.apiutils import compare_entities +from linkahead.apiutils import compare_entities from pytest import raises from utils import dircheckstr as dircheckstr_base