diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 14c3c1efc5b3974f6952b5ed439723c58b4627a5..ab4c5ca9aa1a30231f50ba7b8e60a748b11b6d66 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -37,6 +37,7 @@ RUN pip3 install --break-system-packages \ pylint \ recommonmark \ sphinx-rtd-theme \ + sphinx-jsonschema \ ; COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB diff --git a/CHANGELOG.md b/CHANGELOG.md index f54c2a254cae78a242630207434ec73b77ad2abc..ddd0bbe315d34debb7ec2010ec683b4ccd746638 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.12.0] - 2025-03-25 ## + +### Changed ### + +- `scanner.scan_structure_elements` now auto-generates the + `converter_registry` and the `registered_transformer_functions` from + the `crawler_definition` if none are given. Therefore, the + `converter_registry` argument is now optional. + +### Fixed ### + +- A RecordType with multiple Parents no longer causes an error during + collection of identifiables + ## [0.11.0] - 2025-03-05 ## ### Added ### diff --git a/CITATION.cff b/CITATION.cff index 8f4e22a4f8b56c8640e7d0a9a5ccae93010b4847..2a883a07766846e452fe65183f4e7e51a59238c3 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,6 +17,6 @@ authors: given-names: Alexander orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.11.0 +version: 0.12.0 doi: 10.3390/data9020024 -date-released: 2025-03-05 \ No newline at end of file +date-released: 2025-03-25 \ No newline at end of file diff --git a/README_SETUP.md b/README_SETUP.md index 32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b..3ad25de75f8bf78c775c4a9f5db21bf7f475af19 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -1,4 +1,4 @@ -# Getting started with the CaosDB Crawler # +# Getting started with the LinkAhead Crawler # ## Installation see INSTALL.md @@ -24,6 +24,7 @@ dependencies, i.e., `pip install .[h5-crawler,spss]`. 
- `sphinx-autoapi` - `recommonmark` - `sphinx-rtd-theme` +- `sphinx-jsonschema` ### How to contribute ### diff --git a/setup.cfg b/setup.cfg index da645c0d7615a1a3caab8dabd8af1893b72bdf61..f7c6df7b49a2d3870daf8473f67e9913df2ab8c5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.11.0 +version = 0.12.0 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for LinkAhead diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e0d243979faee8f44cdcee3b0e49c15af640c378..3d1e9f754b6de1c8db959be26dc24da1d05e9d16 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -873,15 +873,12 @@ def _update_status_record(run_id, n_inserts, n_updates, status): cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id={run_id}", unique=True) cr_rec.get_property('status').value = status (cr_rec - .add_property(db.execute_query( - "FIND Property with name='number_of_inserted_entities'", unique=True).id, - n_inserts) .add_property( - db.execute_query("FIND Property with name='number_of_updated_entities'", - unique=True).id, n_updates) + cached_get_entity_by(name='number_of_inserted_entities').id, n_inserts) .add_property( - db.execute_query("FIND Property with name='finished'", - unique=True).id, datetime.now().isoformat())) + cached_get_entity_by(name='number_of_updated_entities').id, n_updates) + .add_property( + cached_get_entity_by(name='finished').id, datetime.now().isoformat())) cr_rec.update() diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 6169a99e7bf47daffb53332b7e0b6513730f2561..444b73f5d9a42cf8ec23eec7cb718b1fc183dd30 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -672,11 +672,15 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): registered = [] for parent in rt.parents: prt = _retrieve_RecordType(id=parent.id, name=parent.name) - registered.append(self._get_registered_for_rt(prt)) + reg = self._get_registered_for_rt(prt) + if reg is not None: + registered.append(reg) # TODO we might in future want to check whether the registered identifiables are the same if len(registered) > 1: - raise RuntimeError("Multiple registered identifiables found for the RecordType " - f" {rt.name} with the following parents: {rt.parents}") + ri_names = [i.name for i in registered] + raise RuntimeError(f"Multiple registered identifiables found for the RecordType " + f" {rt.name} with the following parents: {rt.parents}\n" + f"Registered identifiables: {', '.join(ri_names)}") elif len(registered) == 1: return registered[0] else: diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index af1f4173e95827606a02979ddd6d7fcd9f133271..cb8dfdf2bf65dba9fd206366722ad69ecfc251e4 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -486,7 +486,7 @@ def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str, def scan_structure_elements(items: Union[list[StructureElement], StructureElement], crawler_definition: dict, - converter_registry: dict, + converter_registry: Optional[dict] = None, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None, registered_transformer_functions: Optional[dict] = None) -> ( @@ -508,6 +508,15 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen Traverse the data tree only along the given path. 
When the end of the given path is reached, traverse the full tree as normal. See docstring of 'scanner' for more details. + converter_registry: dict, optional + Optional dictionary containing the converter definitions + needed for the crawler definition. If none is given, it will + be generated from the `crawler_definition`. Default is None. + registered_transformer_functions: dict, optional + Optional dictionary containing the transformer function + definitions needed for the crawler definition. If none is + given, it will be generated from the + `crawler_definition`. Default is None. Returns ------- @@ -519,6 +528,10 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen if not isinstance(items, list): items = [items] + if converter_registry is None: + converter_registry = create_converter_registry(crawler_definition) + if registered_transformer_functions is None: + registered_transformer_functions = create_transformer_registry(crawler_definition) # TODO: needs to be covered somewhere else # self.run_id = uuid.uuid1() converters = initialize_converters(crawler_definition, converter_registry) diff --git a/src/doc/cfood-schema.yml b/src/doc/cfood-schema.yml new file mode 120000 index 0000000000000000000000000000000000000000..c34280ed44cdd07eccb20e314e08a60c899dfe8a --- /dev/null +++ b/src/doc/cfood-schema.yml @@ -0,0 +1 @@ +../caoscrawler/cfood-schema.yml \ No newline at end of file diff --git a/src/doc/cfood-specification.rst b/src/doc/cfood-specification.rst new file mode 100644 index 0000000000000000000000000000000000000000..89588b2da9e4bb828eba05cc353dedb3abd6c821 --- /dev/null +++ b/src/doc/cfood-specification.rst @@ -0,0 +1,10 @@ +CFood-Specification +((((((((((((((((((( + + +CFoods are defined using a YAML file that has to abide by the following +specification. The specification is defined using a JSON schema (see +`src/caoscrawler/cfood-schema.yml`). A CFood is basically composed of converter +definitions. A converter definition must have the following structure: + +.. jsonschema:: cfood-schema.yml#/cfood/$defs/converter diff --git a/src/doc/conf.py b/src/doc/conf.py index a1e9dbded97fe82fdee4d0df1e30a6fe46be6bae..85de34ab531cd7c4ebe2a566a98447d4c8ab1bba 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.11.0' +version = '0.12.0' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.11.0' +release = '0.12.0' # -- General configuration --------------------------------------------------- @@ -56,6 +56,8 @@ extensions = [ "sphinx.ext.todo", "recommonmark", # For markdown files. "sphinx_rtd_theme", + 'sphinx.ext.autodoc', + 'sphinx-jsonschema', ] # Add any paths that contain templates here, relative to this directory.
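For illustration, the now-optional `converter_registry` argument of `scan_structure_elements` can simply be omitted; a minimal sketch (the cfood file name and the example data are hypothetical and not part of this changeset):

```python
# Minimal sketch: scan an in-memory dict without passing a converter registry.
from caoscrawler.scanner import load_definition, scan_structure_elements
from caoscrawler.structure_elements import DictElement

crawler_definition = load_definition("cfood.yml")  # hypothetical cfood file
data = {"Experiments": {"exp_1": {"date": "2025-03-25"}}}

# converter_registry and registered_transformer_functions are omitted here and
# are therefore derived from crawler_definition inside scan_structure_elements.
records = scan_structure_elements(DictElement(name="", value=data), crawler_definition)
```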
diff --git a/src/doc/index.rst b/src/doc/index.rst index a72389b1f4b94430b2c5ff2bfee9757193327ed7..4cf6fd8c6e68874e2b4bb1a604c1c07b2cac2659 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -9,9 +9,11 @@ CaosDB-Crawler Documentation Getting started<getting_started/index> Tutorials<tutorials/index> + Workflow<workflow> Concepts<concepts> Converters<converters/index> CFoods (Crawler Definitions)<cfood> + CFood-Specification<cfood-specification> Macros<macros> How to upgrade<how-to-upgrade> API documentation<_apidoc/modules> diff --git a/src/doc/workflow.rst b/src/doc/workflow.rst new file mode 100644 index 0000000000000000000000000000000000000000..b8d48f1ae299431e6aeaf8a173a9e9ffbc0388f2 --- /dev/null +++ b/src/doc/workflow.rst @@ -0,0 +1,65 @@ +Crawler Workflow +================ + +The LinkAhead crawler aims to provide a very flexible framework for synchronizing +data on file systems (or potentially other sources of information) with a +running LinkAhead instance. The workflow that is used in the scientific environment +should be chosen according to the user's needs. It is also possible to combine +multiple workflows or use them in parallel. + +In this document we will describe several workflows for crawler operation. + +Local Crawler Operation +----------------------- + +A very simple setup, which can also reliably be used for testing, +runs the crawler on a local computer. The files that +are being crawled need to be visible to both the locally running crawler and +the LinkAhead server. + +Prerequisites ++++++++++++++ + +- Make sure that LinkAhead is running, that your computer has a network connection to LinkAhead and + that your pycaosdb.ini is pointing to the correct instance of LinkAhead. Please refer to the + pylib manual for questions related to the configuration in pycaosdb.ini + (https://docs.indiscale.com/caosdb-pylib/README_SETUP.html). +- Make sure that caosdb-crawler and caosdb-advanced-user-tools are installed (e.g. using pip). +- Make sure that you have created: + - The data model needed for the crawler. + - A file "identifiables.yml" describing the identifiables. + - A cfood file, e.g. cfood.yml. +Running the crawler ++++++++++++++++++++ + +Running the crawler currently involves two steps: +- Inserting the files +- Running the crawler program + +Inserting the files +))))))))))))))))))) + +This can be done using the module "loadFiles" from caosadvancedtools. +(See https://docs.indiscale.com/caosdb-advanced-user-tools/ for installation.) + +The generic syntax is: + +python3 -m caosadvancedtools.loadFiles -p <prefix-in-caosdb-file-system> <path-to-crawled-folder> + +Important: The <path-to-crawled-folder> is the location of the files **as seen by LinkAhead**, e.g. for a LinkAhead instance running in a Docker container (see https://gitlab.com/caosdb/caosdb-docker), the command line could look like: + +python3 -m caosadvancedtools.loadFiles -p / /opt/caosdb/mnt/extroot/ExperimentalData + +This command line would load the folder "ExperimentalData" contained in the extroot folder within the Docker container to the LinkAhead prefix "/", which is the root prefix. + +Running the crawler +))))))))))))))))))) + +The following command line assumes that the extroot folder visible in the LinkAhead Docker container is located in "../extroot": + +caosdb-crawler -i identifiables.yml --prefix /extroot --debug --provenance=provenance.yml -s update cfood.yml ../extroot/ExperimentalData/ + +Server Side Crawler Operation +----------------------------- +To be filled.
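Before running the full `caosdb-crawler` command line from the workflow above, it can help to preview which records the cfood produces. A minimal Python sketch using the same placeholder paths (not part of this changeset):

```python
# Preview what the cfood extracts from the locally visible copy of the data.
# Paths mirror the placeholder examples above; scanning by itself does not
# modify anything on the LinkAhead server.
from caoscrawler.scanner import scan_directory

records = scan_directory("../extroot/ExperimentalData", "cfood.yml")
for rec in records:
    print(rec.name, [p.name for p in rec.get_parents()])
```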
diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 5108e83c83db16f1b44d836bf22d21d8e871ee8f..1c7733acfe952a2f47eff2853c2b90684c098dbf 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -54,7 +54,9 @@ def mock_retrieve_RecordType(id, name): "Lab": db.RecordType(name="Lab"), "Analysis": db.RecordType(name="Analysis"), "MetaAnalysis": db.RecordType(name="MetaAnalysis").add_parent("Analysis"), - "Measurement": db.RecordType(name="Measurement").add_parent("Experiment") + # Test that two parents are possible; only one of them + # (Experiment) has an identifiable. + "Measurement": db.RecordType(name="Measurement").add_parent("Experiment").add_parent("A") }[name] @@ -330,8 +332,10 @@ def test_get_registered_identifiable(): with pytest.raises(RuntimeError): registered = ident.get_registered_identifiable(rec) - # Test the case that the record has a parent for which no identifiable is registered - # and there is a registered identifiable for a grand parent + # Test the case that the record has a parent for which no + # identifiable is registered and there is a registered + # identifiable for a grand parent. Note that this also tests the + # case of two grandparents, only one of which has an identifiable. ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") rec = db.Record().add_parent(name="Measurement") diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py index c531f66fd38a714ba4f6f538d41c9fbaeb364d44..80c7b3e937b586d5e1e0c847876e8328e03a7f77 100644 --- a/unittests/test_scanner.py +++ b/unittests/test_scanner.py @@ -61,6 +61,12 @@ def test_scan_structure_elements(): converter_registry) assert len(recs) == 4 + # Converter registry is generated automatically from the crawler + # definition in scan_structure_elements. + recs_without_converter_registry = scan_structure_elements(DictElement(name="", value=data), + crawler_definition) + assert len(recs) == len(recs_without_converter_registry) + def test_provenance_debug_data(): # TODO rewrite the test to use a smaller example setup
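The fix for RecordTypes with multiple parents can be exercised outside the test suite roughly as follows; this is only a sketch and assumes a running instance whose data model contains the RecordTypes referenced in the identifiable definition:

```python
# Mirrors the adjusted unit test: "Measurement" has two parents, but only the
# "Experiment" grandparent carries a registered identifiable, which is now
# resolved without raising an error.
import linkahead as db  # older setups may import this as "caosdb" instead

from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter

ident = CaosDBIdentifiableAdapter()
ident.load_from_yaml_definition("example_identifiables.yml")  # file name as in the unit tests

rec = db.Record().add_parent(name="Measurement")
registered = ident.get_registered_identifiable(rec)
```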