diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 8e18096490632372d589749867c41e2244a67c11..dd4f3d258443dc1f8b2bacb8d535780e8e37e5e8 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -7,8 +7,10 @@ RUN apt-get update && \ python3-autopep8 \ python3-pip \ python3-pytest \ + python3-sphinx \ tox \ -y +RUN pip3 install recommonmark sphinx-rtd-theme COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 67415c1b3a7da52e3179bec8463cd69ac3c667aa..8840e613f1e1eb86f30779b8b3535e2ff97ad0cc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -296,8 +296,9 @@ style: pages_prepare: &pages_prepare tags: [ cached-dind ] stage: deploy - needs: [] - image: $CI_REGISTRY/caosdb/src/caosdb-pylib/testenv:latest + needs: + - job: build-testenv + image: $CI_REGISTRY_IMAGE only: refs: - /^release-.*$/i diff --git a/CHANGELOG.md b/CHANGELOG.md index c7e51f3018e78e22fbcaa48400baab5e868281ad..29013891d7f53dd4c4d8164e79eecfa169fcb289 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### ### Changed ### +- If the `parents` key is used in a cfood at a lower level for a Record that + already has a Parent (because it was explicitly given or the default Parent), + the old Parent(s) are now overwritten with the value belonging to the + `parents` key. +- If a registered identifiable states, that a reference by a Record with parent + RT1 is needed, then now also references from Records that have a child of RT1 + as parent are accepted. ### Deprecated ### ### Removed ### ### Fixed ### +- Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) + +* [#58](https://gitlab.com/caosdb/caosdb-crawler/-/issues/58) Documentation builds API docs in pipeline now. ### Security ### diff --git a/setup.cfg b/setup.cfg index fbbdc1031b4e023b351bd8c331a2aa579e0cf5b9..32edcde630172cb991ea28898ae0c5e9f5770f90 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,8 +20,8 @@ packages = find: python_requires = >=3.7 install_requires = importlib-resources - caosdb > 0.11.2 caosadvancedtools >= 0.7.0 + linkahead >= 0.13.1 yaml-header-tools >= 0.2.1 pyyaml odfpy #make optional diff --git a/src/caoscrawler/config.py b/src/caoscrawler/config.py index 18993b539a09aa58fa280759333b3e7fd315c5e0..8a5a2c48e714f721855d05d6ae6df2412c27836e 100644 --- a/src/caoscrawler/config.py +++ b/src/caoscrawler/config.py @@ -17,7 +17,7 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -import caosdb as db +import linkahead as db DEFAULTS = { "send_crawler_notifications": False, diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index c8f4d2293a820d8d41f482741de7a7f2a3bf1f62..930a08c167e17efaed39b4b9045589cde68b2e8c 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -24,29 +24,29 @@ # from __future__ import annotations -from jsonschema import validate, ValidationError -import os -import re import datetime -import caosdb as db import json +import logging +import os +import re import warnings -from .utils import has_parent -from .stores import GeneralStore, RecordStore -from .structure_elements import (StructureElement, Directory, File, DictElement, JSONFile, - IntegerElement, BooleanElement, FloatElement, NoneElement, - TextElement, TextElement, ListElement) -from typing import List, Optional, Tuple, Union from abc import ABCMeta, abstractmethod from string import Template -import yaml_header_tools +from typing import List, Optional, Tuple, Union +import caosdb as db import pandas as pd -import logging - - import yaml +import yaml_header_tools +from jsonschema import ValidationError, validate + +from .stores import GeneralStore, RecordStore +from .structure_elements import (BooleanElement, DictElement, Directory, File, + FloatElement, IntegerElement, JSONFile, + ListElement, NoneElement, StructureElement, + TextElement) +from .utils import has_parent # These are special properties which are (currently) treated differently # by the converters: @@ -235,6 +235,12 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict keys_modified = [] for name, record in def_records.items(): + # If only a name was given (Like this: + # Experiment: + # ) set record to an empty dict / empty configuration + if record is None: + record = {} + role = "Record" # This allows us to create e.g. Files if "role" in record: @@ -300,6 +306,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict # no matter whether the record existed in the record store or not, # parents will be added when they aren't present in the record yet: if "parents" in record: + c_record.parents.clear() for parent in record["parents"]: # Do the variables replacement: var_replaced_parent = replace_variables(parent, values) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 56a7d8ca4b27d6e6bad0bce17810472f1cb4bf75..d422b0d6ea0ef34ab19826af14e2101a03f97dbc 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -36,6 +36,7 @@ import importlib import logging import os import sys +import traceback import uuid import warnings from argparse import RawTextHelpFormatter @@ -790,6 +791,8 @@ class Crawler(object): for i in reversed(range(len(crawled_data))): if not check_identical(crawled_data[i], identified_records[i]): + logger.debug("Sheduled update because of the folllowing diff:\n" + + str(compare_entities(crawled_data[i], identified_records[i]))) actual_updates.append(crawled_data[i]) return actual_updates @@ -1342,14 +1345,17 @@ def crawler_main(crawled_directory_path: str, _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK") return 0 except ForbiddenTransaction as err: + logger.debug(traceback.format_exc()) logger.error(err) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 except ConverterValidationError as err: + logger.debug(traceback.format_exc()) logger.error(err) _update_status_record(crawler.run_id, 0, 0, status="FAILED") return 1 except Exception as err: + logger.debug(traceback.format_exc()) logger.debug(err) if "SHARED_DIR" in os.environ: diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 776baeaeac7caf961d6dba97641804c9e1608114..d9c9c00b22443121b4989bf988a40308b143dbf1 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -40,6 +40,12 @@ from .utils import has_parent logger = logging.getLogger(__name__) +def get_children_of_rt(rtname): + """Supply the name of a recordtype. This name and the name of all children RTs are returned in + a list""" + return [p.name for p in db.execute_query(f"FIND RECORDTYPE {rtname}")] + + def convert_value(value: Any): """ Returns a string representation of the value that is suitable to be used in the query @@ -212,11 +218,16 @@ identifiabel, identifiable and identified record) for a Record. # TODO: similar to the Identifiable class, Registred Identifiable should be a # separate class too if prop.name.lower() == "is_referenced_by": - for rtname in prop.value: - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.extend(referencing_entities[id(record)][rtname]) - else: + for givenrt in prop.value: + rt_and_children = get_children_of_rt(givenrt) + found = False + for rtname in rt_and_children: + if (id(record) in referencing_entities + and rtname in referencing_entities[id(record)]): + identifiable_backrefs.extend( + referencing_entities[id(record)][rtname]) + found = True + if not found: # TODO: is this the appropriate error? raise NotImplementedError( f"The following record is missing an identifying property:" diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 6f5545b1d0d53114282a3477b6855900f5294520..2f4f4228ccba232fb94654ae2b67756de8bf121c 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -264,6 +264,8 @@ def scanner(items: list[StructureElement], converters_path = [] for element in items: + element_path = os.path.join(*(structure_elements_path + [element.get_name()])) + logger.debug(f"Dealing with {element_path}") for converter in converters: # type is something like "matches files", replace isinstance with "type_matches" @@ -276,8 +278,7 @@ def scanner(items: list[StructureElement], record_store_copy = record_store.create_scoped_copy() # Create an entry for this matched structure element that contains the path: - general_store_copy[converter.name] = ( - os.path.join(*(structure_elements_path + [element.get_name()]))) + general_store_copy[converter.name] = element_path # extracts values from structure element and stores them in the # variable store diff --git a/src/doc/how-to-upgrade.md b/src/doc/how-to-upgrade.md index 2e26531d27a1cb038afa1b487007157b532a6fb7..30d23f8f3a4ad88f6b3f4fca18013e26fbcb1dc1 100644 --- a/src/doc/how-to-upgrade.md +++ b/src/doc/how-to-upgrade.md @@ -1,5 +1,10 @@ # How to upgrade +## 0.6.x to 0.7.0 +If you added Parents to Records at multiple places in the CFood, you must now +do this at a single location because this key now overwrites previously set +parents. + ## 0.5.x to 0.6.0 [#41](https://gitlab.com/caosdb/caosdb-crawler/-/issues/41) was fixed. This means that you previously used the name of Entities as an identifying diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index dc53cb099eb5e4b225b13461176504a003f2d2ba..91e0e86a6d6cf2967ab3567a2ef93b7ccde56e64 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -607,7 +607,7 @@ def test_create_flat_list(): assert c in flat -@ pytest.fixture +@pytest.fixture def crawler_mocked_for_backref_test(): crawler = Crawler() # mock retrieval of registered identifiabls: return Record with just a parent @@ -651,6 +651,8 @@ def test_validation_error_print(caplog): caplog.clear() +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): crawler = crawler_mocked_for_backref_test identlist = [Identifiable(name="A", record_type="BR"), @@ -685,6 +687,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test) assert insert[0].name == "B" +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): # test whether multiple references of the same record type are correctly used crawler = crawler_mocked_for_backref_test @@ -705,6 +709,8 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_ assert len(insert) == 2 +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): # test whether multiple references of the different record types are correctly used crawler = crawler_mocked_for_backref_test diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py index 2852b40ffde98180d5dd7b11b9109cc5875502da..4ec02aa3fc497f8dc35adc709533ef5b35066f3a 100644 --- a/unittests/test_file_identifiables.py +++ b/unittests/test_file_identifiables.py @@ -20,6 +20,8 @@ def clear_cache(): cache_clear() +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=id)) @patch("caoscrawler.identifiable_adapters.cached_get_entity_by", new=Mock(side_effect=mock_get_entity_by)) def test_file_identifiable(): diff --git a/unittests/test_parent_cfood.yml b/unittests/test_parent_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..b8d0eaf597641d311cb70017dc2bc75c7c3434f3 --- /dev/null +++ b/unittests/test_parent_cfood.yml @@ -0,0 +1,39 @@ +--- +metadata: + crawler-version: 0.6.1 +--- +Definitions: + type: Definitions + +data: + type: Dict + match_name: '.*' + records: + Experiment: + Projekt: + parents: ["project"] + name: "p" + Campaign: + name: "c" + Stuff: + name: "s" + subtree: + Experiment: + type: DictElement + match: '.*' + records: + Experiment: + parents: ["Exp"] + name: "e" + Projekt: + parents: ["Projekt"] + Campaign: + parents: ["Cap"] + Stuff: + name: "s" + Experiment2: + type: DictElement + match: '.*' + records: + Campaign: + parents: ["Cap2"] diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index 1a1c8c1823aee5e84d35b9ddaf7dc919b56e40dc..21b7da10ef860378a4779f34f8f1b26c6a54f359 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -1,3 +1,25 @@ +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# 2021-2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# import json import logging @@ -276,3 +298,33 @@ def test_variable_deletion_problems(): assert record.get_property("var2").value == "test" else: raise RuntimeError("Wrong name") + + +def test_record_parents(): + """ Test the correct list of returned records by the scanner """ + + data = { + 'Experiments': {} + } + + crawler_definition = load_definition(UNITTESTDIR / "test_parent_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 4 + for rec in records: + if rec.name == 'e': + assert rec.parents[0].name == 'Exp' # default parent was overwritten + assert len(rec.parents) == 1 + elif rec.name == 'c': + assert rec.parents[0].name == 'Cap2' # default parent was overwritten by second + # converter + assert len(rec.parents) == 1 + elif rec.name == 'p': + assert rec.parents[0].name == 'Projekt' # top level set parent was overwritten + assert len(rec.parents) == 1 + elif rec.name == 's': + assert rec.parents[0].name == 'Stuff' # default parent stays if no parent is given on + # lower levels + assert len(rec.parents) == 1