Commit f200440c authored by Alexander Schlemmer

Merge branch 'dev' into f-new-debug-tree

parents 4c925e19 96ae0ada
Pipeline #59484 failed
Showing 701 additions and 78 deletions
-FROM debian:bullseye
+FROM debian:bookworm
RUN apt-get update && \
    apt-get install \
    curl \
    git \
-    openjdk-11-jdk-headless \
+    openjdk-17-jdk-headless \
    python3-autopep8 \
    python3-pip \
    python3-pytest \
    python3-sphinx \
    tox \
    -y
-RUN pip3 install recommonmark sphinx-rtd-theme
+RUN pip3 install --break-system-packages \
+    pylint \
+    recommonmark \
+    sphinx-rtd-theme \
+    ;
COPY .docker/wait-for-it.sh /wait-for-it.sh
ARG PYLIB
ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \
    pylib_version.json
RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \
-    cd caosdb-pylib && git checkout ${PYLIB} && pip3 install .
+    cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages .
ARG ADVANCED
ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \
    advanced_version.json
RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \
-    cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler]
+    cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler]
COPY . /git
# Delete .git because it is huge.
@@ -30,7 +34,7 @@ RUN rm -r /git/.git
# Install pycaosdb.ini for the tests
RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini
-RUN cd /git/ && pip3 install .
+RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate]
WORKDIR /git/integrationtests
# wait for server,
......
@@ -113,46 +113,47 @@ info:
  script:
    - *env

-unittest_py3.9:
+unittest_py3.11:
  tags: [cached-dind]
  stage: test
  image: $CI_REGISTRY_IMAGE
  script:
+    - python3 -c "import sys; assert sys.version.startswith('3.11')"
    - tox

-unittest_py3.7:
+unittest_py3.9:
  tags: [cached-dind]
  stage: test
-  image: python:3.7
+  image: python:3.9
  script: &python_test_script
    # install dependencies
    - pip install pytest pytest-cov
    # TODO: Use f-branch logic here
    - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
    - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
-    - pip install .[h5-crawler]
+    - pip install .[h5-crawler,spss,rocrate]
    # actual test
    - caosdb-crawler --help
    - pytest --cov=caosdb -vv ./unittests

-unittest_py3.8:
+unittest_py3.10:
  tags: [cached-dind]
  stage: test
-  image: python:3.8
+  image: python:3.10
  script: *python_test_script

-unittest_py3.10:
+unittest_py3.12:
  tags: [cached-dind]
  stage: test
-  image: python:3.10
+  image: python:3.12
  script: *python_test_script

-unittest_py3.11:
+unittest_py3.13:
  tags: [cached-dind]
  stage: test
-  image: python:3.11
+  image: python:3.13
  script: *python_test_script

inttest:
  tags: [docker]
  services:
@@ -279,7 +280,7 @@ cert:
    - cd .docker
    - CAOSHOSTNAME=caosdb-server ./cert.sh

-style:
+code-style:
  tags: [docker]
  stage: style
  image: $CI_REGISTRY_IMAGE
@@ -287,9 +288,21 @@ style:
    - job: build-testenv
      optional: true
  script:
+    - autopep8 --version
    - autopep8 -r --diff --exit-code .
  allow_failure: true

+pylint:
+  tags: [docker]
+  stage: style
+  image: $CI_REGISTRY_IMAGE
+  needs:
+    - job: build-testenv
+      optional: true
+  allow_failure: true
+  script:
+    - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler

# Build the sphinx documentation and make it ready for deployment by Gitlab Pages
# Special job for serving a static website. See https://docs.gitlab.com/ee/ci/yaml/README.html#pages
# Based on: https://gitlab.indiscale.com/caosdb/src/caosdb-pylib/-/ci/editor?branch_name=main
......
@@ -9,6 +9,196 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added ###
- Validation module for checking a list of generated records against a list of json schemas
that can be generated from a yaml data model file.
- DictElementConverters can now make use of `match_properties`, which
  works analogously to `match_properties` in ROCrateEntityConverter and
  `match_attrib` in XMLConverter.
- `match_properties` is a method of class Converter and can, for
  example, be used by CustomConverters.
- ZipFileConverter that opens zip files and exposes their contents as
File and Directory structure elements.
- `linkahead-crawler` script as alias for `caosdb-crawler`.
- New transformers of the form `cast_to_*` which allow casting
variables to `int`, `float`, `str` and `bool`.
- Transformer function definitions in the cfood now support variable
  substitutions.
- `crawler_main` and `scanner.scan_directory` now also accept a list
  of directories to be crawled. Note that passing a list of
  directories is currently incompatible with
  `securityMode=SecurityMode.RETRIEVE` or
  `securityMode=SecurityMode.INSERT`, since the functionality to
  authorize pending inserts or updates doesn't support path lists yet
  and will raise a NotImplementedError for now.
- `match_newer_than_file` option for `DirectoryConverter`: A reference
  file containing (only) an ISO-formatted datetime string can be
  specified here. Directories with this option won't match if all
  their contents were last modified before that datetime (see the
  sketch after this list).
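A minimal, hypothetical cfood sketch combining two of the additions above, `match_newer_than_file` and a `cast_to_*` transformer. The directory pattern, the reference-file path, and the record and variable names are invented for illustration; whether `cast_to_int` takes arguments is also an assumption:

```yaml
DatasetDir:
  type: Directory
  match: ^dataset_(?P<number_str>[0-9]+)
  # Skip this directory unless its contents changed after the ISO
  # datetime stored in this (hypothetical) reference file:
  match_newer_than_file: .crawler/last_run_datetime.txt
  transform:
    CastNumber:
      in: $number_str       # matched as a string by the regexp above
      out: $number          # cast to int for use in the record below
      functions:
        - cast_to_int: {}   # assumed to take no arguments
  records:
    Dataset:
      dataset_number: $number
```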
### Changed ###
### Deprecated ###
### Removed ###
### Fixed ###
- `spss_to_datamodel` script works again.
- The cfood now supports bi-directional references when defining records on the same level.
(See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175)
### Security ###
### Documentation ###
## [0.10.1] - 2024-11-13 ##
### Fixed ###
* Removed the optional rocrate dependency, which prevented package
  publication on PyPI due to a violation of PEP 440 (see
  https://github.com/pypi/warehouse/issues/7136). It will be
  re-activated once
  https://github.com/ResearchObject/ro-crate-py/issues/203 has been
  resolved upstream. For now, if you want to use the ROCrate or ELN
  converters, manually install the fix from
  https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids
```sh
pip install git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids
```
## [0.10.0] - 2024-11-13 ##
### Added ###
- XMLTextNodeConverter for converting text nodes created by XMLTagConverter
- XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter
- Units for properties. They can be specified by giving the property as a dict in the form
```yaml
MyRecord:
my_prop:
value: 5
unit: m
```
- Support for Python 3.13
- ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files
- `max_log_level` parameter to `logging.configure_server_side_logging`
  to control the server-side debuglog's verbosity, and an optional
  `sss_max_log_level` parameter to `crawler_main` to control the SSS
  loglevel separately from the global `debug` option.
### Changed ###
- Property values specified by dicts do not have to contain a
  `collection_mode` key anymore. If none is given, the
  `collection_mode` is determined from the `value` as it is done for
  values specified by strings (see the sketch after this list):
  - if `value` starts with '+', collection mode is "list".
  - if `value` starts with '*', collection mode is "multiproperty".
  - in all other cases, collection mode is "single".
- The default server-side scripting debug level is now controlled by
  the global `debug` option and set to log level `INFO` in case of
  `debug=False`. The previous behavior can be restored by calling
  `crawler_main` with `sss_max_log_level=logging.DEBUG`.
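A minimal sketch of the dict notation without an explicit `collection_mode`; the record and variable names are invented:

```yaml
records:
  Sample:
    temperature:
      value: +$temp      # leading '+' implies collection mode "list"
      unit: K
    keyword:
      value: "*$kw"      # leading '*' implies "multiproperty"; quoted because '*' is special in YAML
    operator:
      value: $operator   # no prefix: collection mode "single"
```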
### Removed ###
* Support for Python 3.8 (end of life)
### Fixed ###
- Added better error message for some cases of broken converter and
record definitions.
- [#108](https://gitlab.com/linkahead/linkahead-crawler/-/issues/108)
Too verbose server-side scripting logs that could lead to high disk
usage.
### Documentation ###
- Tutorial on crawling a simple CSV file
## [0.9.1] - 2024-09-26 ##
### Fixed ###
* ImpossibleMergeErrors now correctly include the problematic property
and its values in their string representation.
## [0.9.0] - 2024-09-05 ##
### Added ###
* New converters for XML documents/trees/tags: XMLFile, XMLTag, XMLTextNode
### Changed ###
* Moved the optional `hdf5_converter` to the `converters`
submodule. When updating from 0.8 or below, this means that you have
to adapt the converter package path in your cfood definition from
`caoscrawler.hdf5_converter` to
`caoscrawler.converters.hdf5_converter`.
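For a cfood that declares the optional HDF5 converters in a `Converters` section, the upgrade is just the package path. The converter name and class name below follow the usual naming pattern but are not taken from this diff, so treat them as an assumption:

```yaml
Converters:
  H5File:
    converter: H5FileConverter
    package: caoscrawler.converters.hdf5_converter  # formerly: caoscrawler.hdf5_converter
```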
### Fixed ###
* Use `urllib.parse.urljoin` to generate link addresses in status
mails, preventing wrong addresses, e.g., due to superfluous `/`.
## [0.8.0] - 2024-08-23 ##
### Added ###
* Support for Python 3.12 and experimental support for 3.13
* CFood macros now accept complex objects as values, not just strings.
* More options for the `CSVTableConverter`
* New converters:
* `DatetimeElementConverter`
* `SPSSConverter`
* New scripts:
* `spss_to_datamodel`
* `csv_to_datamodel`
* New transformer functions (see the sketch after this list):
  * `date_parse`
  * `datetime_parse`
* New ``PropertiesFromDictConverter`` which allows automatically
  creating property values from dictionary keys.
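A minimal, hypothetical sketch of `date_parse` inside a transformer block; the file pattern, the `date_format` parameter name, and the record names are assumptions for illustration:

```yaml
MeasurementFile:
  type: File
  match: measurement_(?P<date_str>[0-9-]+)\.dat
  transform:
    ParseDate:
      in: $date_str
      out: $date
      functions:
        - date_parse:
            date_format: "%Y-%m-%d"   # assumed parameter name
  records:
    Measurement:
      date: $date
```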
### Changed ###
* CFood macros no longer render everything into strings.
* Better internal handling of identifiable/reference resolving and merging of entities. This also
includes more understandable output for users.
* Better handling of missing imports, with nice messages for users.
* No longer use the configuration of advancedtools to set the To and From email addresses
### Removed ###
* Support for Python 3.7
### Fixed ###
* [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression
* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors
### Security ###
### Documentation ###
* General improvement of the documentation, in many small places.
* The API documentation should now also include documentation of the constructors.
## [0.7.1] - 2024-03-21 ##
### Fixed ###
* `crawler_main` doesn't need the deprecated `debug=True` anymore to put out a
provenance file if the `provenance_file` parameter is provided.
* [indiscale#129](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/129)
missing packaging dependency.
## [0.7.0] - 2024-03-04 ##
### Added ###
* `transform` sections can be added to a CFood to apply functions to values stored in variables.
  * default transform functions: submatch, split and replace.
* New command line option "--new-debug-tree" that allows saving a full tree of debug information for crawler runs in yaml and HTML format.
@@ -44,6 +234,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for
  entities that don't exist remotely) instead of entity objects for keeping
  track of references.
* Log output is either written to $SHARED_DIR/ (when this variable is set) or just to the terminal.
### Deprecated ###
@@ -72,10 +263,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101), this is now mitigated.
* [indiscale#128](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/128) Yet another corner case of referencing resolution resolved.

-### Security ###
-
-### Documentation ###
## [0.6.0] - 2023-06-23 ##
(Florian Spreckelsen)
@@ -141,6 +328,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- ``add_prefix`` and ``remove_prefix`` arguments for the command line interface
  and the ``crawler_main`` function for the adding/removal of path prefixes when
  creating file entities.
+- More strict checking of `identifiables.yaml`.
+- Better error messages when server does not conform to expected data model.

### Changed ###
@@ -189,7 +378,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Some StructureElements changed (see "How to upgrade" in the docs):
  - Dict, DictElement and DictDictElement were merged into DictElement.
  - DictTextElement and TextElement were merged into TextElement. The "match"
    keyword is now invalid for TextElements.
- JSONFileConverter creates another level of StructureElements (see "How to upgrade" in the docs)
- create_flat_list function now collects entities in a set and also adds the entities
  contained in the given list directly
......
@@ -17,6 +17,6 @@ authors:
    given-names: Alexander
    orcid: https://orcid.org/0000-0003-4124-9649
title: CaosDB - Crawler
-version: 0.6.0
+version: 0.10.1
doi: 10.3390/data9020024
-date-released: 2023-06-23
+date-released: 2024-11-13
\ No newline at end of file
-src/doc/README_SETUP.md
\ No newline at end of file
# Getting started with the CaosDB Crawler #
## Installation
see INSTALL.md
## Run Unit Tests
1. Install additional dependencies:
- h5py
2. Run `pytest unittests`.
## Documentation ##
We use sphinx to create the documentation. Docstrings in the code should comply
with the Google style (see link below).
Build documentation in `src/doc` with `make doc`. Note that for the
automatic generation of the complete API documentation, it is
necessary to first install this library with all its optional
dependencies, i.e., `pip install .[h5-crawler,spss]`.
### Requirements ###
- `sphinx`
- `sphinx-autoapi`
- `recommonmark`
- `sphinx-rtd-theme`
### How to contribute ###
- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)
- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings)
- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external)
@@ -32,7 +32,7 @@ import sys
from argparse import RawTextHelpFormatter
from pathlib import Path

-import caosdb as db
+import linkahead as db
import pytest
import yaml
from caosadvancedtools.crawler import Crawler as OldCrawler
@@ -42,8 +42,8 @@ from caoscrawler.debug_tree import DebugTree
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.scanner import scan_directory
-from caosdb import EmptyUniqueQueryError
-from caosdb.utils.register_tests import clear_database, set_test_key
+from linkahead import EmptyUniqueQueryError
+from linkahead.utils.register_tests import clear_database, set_test_key

set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
......
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import logging
import tempfile
from pathlib import Path
import linkahead as db
from caoscrawler import crawl
from caoscrawler.crawl import (crawler_main, SecurityMode)
from linkahead.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
INTTESTDIR = Path(__file__).parent
def test_list_of_paths(clear_database, monkeypatch):
# Mock the status record
dummy_status = {
"n_calls": 0
}
def _mock_update_status_record(run_id, n_inserts, n_updates, status):
print("Update mocked status")
dummy_status["run_id"] = run_id
dummy_status["n_inserts"] = n_inserts
dummy_status["n_updates"] = n_updates
dummy_status["status"] = status
dummy_status["n_calls"] += 1
monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record)
# mock SSS environment
monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir())
# We need only one dummy RT
rt = db.RecordType(name="TestType").insert()
basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
dirlist = [basepath / "dir1", basepath / "dir2"]
crawler_main(
dirlist,
cfood_file_name=basepath / "cfood.yml",
identifiables_definition_file=basepath / "identifiable.yml"
)
recs = db.execute_query("FIND TestType")
assert len(recs) == 2
assert "Test1" in [r.name for r in recs]
assert "Test2" in [r.name for r in recs]
assert dummy_status["n_inserts"] == 2
assert dummy_status["n_updates"] == 0
assert dummy_status["status"] == "OK"
assert dummy_status["n_calls"] == 1
def test_not_implemented_list_with_authorization(caplog, clear_database):
rt = db.RecordType(name="TestType").insert()
basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs"
dirlist = [basepath / "dir1", basepath / "dir2"]
# This is not implemented yet, so check log for correct error.
ret = crawler_main(
dirlist,
cfood_file_name=basepath / "cfood.yml",
identifiables_definition_file=basepath / "identifiable.yml",
securityMode=SecurityMode.RETRIEVE
)
# crawler_main hides the error, but has a non-zero return code and
# errors in the log:
assert ret != 0
err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR]
assert len(err_tuples) == 1
assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2]
# No inserts after the errors
assert len(db.execute_query("FIND TestType")) == 0
---
metadata:
crawler-version: 0.10.2
---
BaseDirElement:
type: Directory
match: ^dir(?P<dir_number>[0-9]+)$$
records:
TestType:
name: Test$dir_number
TestType:
- name
-# This file is a part of the CaosDB Project.
+# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2022 Indiscale GmbH <info@indiscale.com>
#               2022 Florian Spreckelsen <f.spreckelsen@indiscale.com>
@@ -16,20 +16,22 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
-from pytest import fixture, mark, raises
+import tempfile

import linkahead as db
-from linkahead.cached import cache_clear
+import yaml
from caosadvancedtools.models.parser import parse_model_from_string
from caoscrawler.crawl import Crawler
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
+from caoscrawler.scanner import (_load_definition_from_yaml_dict,
+                                 create_converter_registry,
+                                 scan_structure_elements)
from caoscrawler.structure_elements import DictElement
+from linkahead.cached import cache_clear
-from caoscrawler.scanner import create_converter_registry, scan_structure_elements
from linkahead.utils.register_tests import clear_database, set_test_key
+from pytest import fixture, mark, raises

set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
@@ -171,8 +173,9 @@ def test_issue_83(clear_database):
        name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1])
    referencing2 = db.Record(name="Referencing2").add_parent(
        name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target2])
-    referencing3 = db.Record(name="Referencing3").add_parent(name=referencing_type.name).add_property(
-        name=referenced_type.name, value=[ref_target1, ref_target2])
+    referencing3 = db.Record(name="Referencing3").add_parent(
+        name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1,
+                                                                                   ref_target2])

    records = db.Container().extend(
        [ref_target1, ref_target2, referencing1, referencing2, referencing3])
...@@ -329,3 +332,104 @@ def test_indiscale_87(clear_database): ...@@ -329,3 +332,104 @@ def test_indiscale_87(clear_database):
print(db.apiutils.compare_entities(rec, retrieved)) print(db.apiutils.compare_entities(rec, retrieved))
assert db.apiutils.empty_diff(rec, retrieved) assert db.apiutils.empty_diff(rec, retrieved)
print("---") print("---")
def test_issue_16(clear_database):
"""
This is another test for:
https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16

In addition to the two unit tests for recursive definitions in `test_scanner.py`, this system test
checks whether recursively defined records can be synchronized correctly using the crawler.
"""
recursive_yaml = """
FirstConverter:
type: DictElement
records:
Experiment:
subtree:
Converter:
type: DictElement
records:
Block:
name: block 1
Experiment: $Experiment
Experiment:
name: experiment 1
Block: $Block
"""
crawler_definition = _load_definition_from_yaml_dict(
[yaml.load(recursive_yaml, Loader=yaml.SafeLoader)])
converter_registry = create_converter_registry(crawler_definition)
# Nested DictElements that match the yaml structure in recursive_yaml:
data = {"data": {
}}
records = scan_structure_elements(DictElement(name="", value=data), crawler_definition,
converter_registry)
rt_exp = db.RecordType(name="Experiment").insert()
rt_block = db.RecordType(name="Block").insert()
ident = CaosDBIdentifiableAdapter()
ident.load_from_yaml_object(yaml.safe_load("""
Experiment:
- name
Block:
- name
"""))
crawler = Crawler(identifiableAdapter=ident)
crawler.synchronize(crawled_data=records)
exp_res = db.execute_query("FIND Experiment")
assert len(exp_res) == 1
exp_block = db.execute_query("FIND Block")
assert len(exp_block) == 1
assert exp_res[0].get_property("Block").value == exp_block[0].id
assert exp_block[0].get_property("Experiment").value == exp_res[0].id
def test_issue_14(clear_database):
"""
Issue title: Some parent updates are required before inserts
https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/14
"""
rt1 = db.RecordType(name="RT1")
rt2 = db.RecordType(name="RT2").insert()
rt1.add_property(rt2, importance=db.OBLIGATORY)
rt1.insert()
r = db.Record()
r.add_parent(rt1)
with tempfile.NamedTemporaryFile() as tmpf:
f = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name)
f.insert()
# We create a clean new file object here:
f2 = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name)
f2.add_parent(rt2)
r.add_property(name="RT2", value=f2)
# Current state in the database: File without parents
f_test_base = db.File(name="test_parent").retrieve()
assert len(f_test_base.parents) == 0
assert len(db.execute_query("FIND Record")) == 0
ident = CaosDBIdentifiableAdapter()
ident.register_identifiable("RT1", db.RecordType().add_parent(
name="RT1").add_property(name="RT2"))
crawler = Crawler(identifiableAdapter=ident)
crawler.synchronize(crawled_data=[f2, r])
f_test = db.File(name="test_parent").retrieve()
assert len(f_test.parents) == 1
assert f_test.parents[0].name == "RT2"
records = db.execute_query("FIND Record")
assert len(records) == 1
assert records[0].get_property("RT2").value == f_test.id
@@ -24,24 +24,22 @@
"""
an integration test module that runs a test against a (close to) real world example
"""
-from caosdb.utils.register_tests import clear_database, set_test_key
-import logging
import json
+import logging
import os
+import pytest
+import sys

-import caosdb as db
+import linkahead as db
-from caosdb.cached import cache_clear
+from linkahead.cached import cache_clear
+from linkahead.utils.register_tests import clear_database, set_test_key

+from caosadvancedtools.loadFiles import loadpath
+from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
from caoscrawler.crawl import Crawler, crawler_main
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
-from caoscrawler.structure_elements import Directory
-import pytest
-from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml
-from caosadvancedtools.loadFiles import loadpath
from caoscrawler.scanner import load_definition, scan_structure_elements, create_converter_registry
+from caoscrawler.structure_elements import Directory
-import sys

set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
@@ -91,15 +89,6 @@ def usemodel():
    dataset_inherits.sync_data_model(noquestion=True)

-@pytest.fixture
-def clear_database():
-    # TODO(fspreck): Remove once the corresponding advancedtools function can
-    # be used.
-    ents = db.execute_query("FIND ENTITY WITH ID>99")
-    if ents:
-        ents.delete()

def create_identifiable_adapter():
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(os.path.join(DATADIR, "identifiables.yml"))
......
@@ -27,12 +27,12 @@ import os
import pytest
from subprocess import run

-import caosdb as db
+import linkahead as db
from caosadvancedtools.loadFiles import loadpath
-from caosdb.cached import cache_clear
+from linkahead.cached import cache_clear
from caosadvancedtools.models import parser as parser
from caoscrawler.crawl import crawler_main
-from caosdb.utils.register_tests import clear_database, set_test_key
+from linkahead.utils.register_tests import clear_database, set_test_key

set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
......
[metadata]
name = caoscrawler
-version = 0.6.1
+version = 0.10.2
author = Alexander Schlemmer
author_email = alexander.schlemmer@ds.mpg.de
-description = A new crawler for caosdb
+description = A new crawler for LinkAhead
long_description = file: README.md
long_description_content_type = text/markdown
# url
@@ -17,17 +17,18 @@ classifiers =
package_dir =
    = src
packages = find:
-python_requires = >=3.7
+python_requires = >=3.9
install_requires =
-    importlib-resources
    caosadvancedtools >= 0.7.0
-    linkahead > 0.13.2
+    importlib-resources
-    yaml-header-tools >= 0.2.1
+    linkahead >= 0.16.0
-    pyyaml
    odfpy #make optional
    jinja2 #make optional?
+    packaging
    pandas
-    importlib_metadata;python_version<'3.8'
+    pyarrow # Will be required by Pandas >= 3.0.
+    pyyaml
+    yaml-header-tools >= 0.2.1

[options.packages.find]
where = src
@@ -39,9 +40,16 @@ per-file-ignores = __init__.py:F401
[options.entry_points]
console_scripts =
+    linkahead-crawler = caoscrawler.crawl:main
    caosdb-crawler = caoscrawler.crawl:main
+    spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main
+    csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main

[options.extras_require]
h5-crawler =
    h5py >= 3.8
    numpy
+spss =
+    pandas[spss]
+rocrate =
+    rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids
+from . import converters, utils
from .crawl import Crawler, SecurityMode
from .version import CfoodRequiredVersionError, get_caoscrawler_version
......
@@ -19,10 +19,10 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#

-from caosadvancedtools.crawler import Crawler as OldCrawler
import argparse

+from caosadvancedtools.crawler import Crawler as OldCrawler


def parse_args():
    parser = argparse.ArgumentParser()
......
cfood:
  type: object
+  properties:
+    Converters:
+      description: Definition of custom converters
+      type: object
+      additionalProperties:
+        type: object
+        properties:
+          converter:
+            type: string
+          package:
+            type: string
+        required:
+          - converter
+          - package
+    macros:
+      description: Macro definitions
+      type: array
+    Transformers:
+      description: Variable transformer definition
+      type: object
+      additionalProperties:
+        type: object
+        properties:
+          function:
+            type: string
+          package:
+            type: string
+        required:
+          - package
+          - function
  additionalProperties:
    $ref:
      "#/$defs/converter"
$defs:
+  parents:
+    description: Parents for this record are given here as a list of names.
+    type: array
+    items:
+      type: string
  converter:
    properties:
      type:
@@ -28,13 +63,21 @@ cfood:
          - Definitions
          - Dict
          - Date
+          - Datetime
          - JSONFile
+          - YAMLFile
          - CSVTableConverter
          - XLSXTableConverter
+          - SPSSFile
          - H5File
          - H5Dataset
          - H5Group
          - H5Ndarray
+          - XMLFile
+          - XMLTag
+          - XMLTextNode
+          - XMLAttributeNode
+          - PropertiesFromDictElement
        description: Type of this converter node.
      match:
        description: typically a regexp which is matched to a structure element name
@@ -45,15 +88,52 @@ cfood:
      match_value:
        description: a regexp that is matched to the value of a key-value pair
        type: string
-      records:
-        description: This field is used to define new records or to modify records which have been defined on a higher level.
+      match_newer_than_file:
+        description: |
+          Only relevant for Directory. A path to a file containing
+          an ISO-formatted datetime. Only match if the contents of the
+          Directory have been modified after that datetime.
+        type: string
+      record_from_dict:
+        description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary.
        type: object
+        required:
+          - variable_name
        properties:
-          parents:
-            description: Parents for this record are given here as a list of names.
+          variable_name:
+            description: |
+              Name of the record by which it can be accessed in the
+              cfood definition. Can also be the name of an existing
+              record in which case that record will be updated by
+              the PropertiesFromDictConverter.
+            type: string
+          properties_blacklist:
+            description: List of keys to be ignored in the automatic treatment. They will be ignored on all levels of the dictionary.
            type: array
            items:
              type: string
+          references:
+            description: List of keys that will be transformed into named reference properties.
+            type: object
+            additionalProperties:
+              type: object
+              properties:
+                parents:
+                  $ref:
+                    "#/$defs/parents"
+          name:
+            description: Name of this record. If none is given, variable_name is used.
+            type: string
+      parents:
+        $ref:
+          "#/$defs/parents"
+      records:
+        description: This field is used to define new records or to modify records which have been defined on a higher level.
+        type: object
+        properties:
+          parents:
+            $ref:
+              "#/$defs/parents"
        additionalProperties:
          oneOf:
            - type: object
@@ -61,6 +141,9 @@ cfood:
              value:
                description: Dictionary notation for variable values. Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign.
                type: string
+              unit:
+                description: The unit of this property. Units can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign.
+                type: string
              collection_mode:
                description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty.
                enum:
@@ -75,3 +158,15 @@ cfood:
      additionalProperties:
        $ref:
          "#/$defs/converter"
+    if:
+      properties:
+        type:
+          const:
+            "PropertiesFromDictElement"
+    then:
+      required:
+        - type
+        - record_from_dict
+    else:
+      required:
+        - type
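To make the `if/then/else` clause above concrete, here is a minimal, hypothetical converter definition that satisfies the schema: for the `PropertiesFromDictElement` type, both `type` and `record_from_dict` (with its required `variable_name`) must be present. All names are invented for illustration:

```yaml
PersonElement:
  type: PropertiesFromDictElement
  match: .*
  record_from_dict:
    variable_name: Person     # required by the schema above
    parents:
      - Person
    properties_blacklist:
      - internal_notes        # ignored on all levels of the dictionary
    references:
      Affiliation:            # this key becomes a named reference property
        parents:
          - Institution
```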
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Submodule containing all default and optional converters."""
from .. import utils
from .converters import *
from .xml_converter import *
from .zipfile_converter import ZipFileConverter
try:
from .spss import SPSSConverter
except ImportError as err:
SPSSConverter: type = utils.MissingImport(
name="SPSSConverter", hint="Try installing with the `spss` extra option.",
err=err)
try:
from .rocrate import (ELNFileConverter, ROCrateConverter,
ROCrateEntityConverter)
except ImportError as err:
ROCrateEntityConverter: type = utils.MissingImport(
name="ROCrateEntityConverter", hint="Try installing with the `rocrate` extra option.",
err=err)
ROCrateConverter: type = utils.MissingImport(
name="ROCrateConverter", hint="Try installing with the `rocrate` extra option.",
err=err)
ELNFileConverter: type = utils.MissingImport(
name="ELNFileConverter", hint="Try installing with the `rocrate` extra option.",
err=err)