diff --git a/.docker/Dockerfile b/.docker/Dockerfile index f7353e059d8cd027f08403d6f6527ffbcaabc965..14c3c1efc5b3974f6952b5ed439723c58b4627a5 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -1,34 +1,58 @@ -FROM debian:latest +############################### +###### Temporary Image ######## +############################### +FROM debian:bookworm as git_base + +# Check for availability of DNS +RUN if getent hosts indiscale.com > /dev/null; \ + then echo "Connected to the internet and DNS available"; \ + else echo "No internet connection or DNS not available"; \ + fi + +COPY . /git + +# Delete .git because it is huge. +RUN rm -r /git/.git + +# Install pycaosdb.ini for the tests +RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini + +############################### +###### Main Image Build ####### +############################### + +FROM debian:bookworm RUN apt-get update && \ apt-get install \ curl \ git \ - openjdk-11-jdk-headless \ + openjdk-17-jdk-headless \ python3-autopep8 \ python3-pip \ python3-pytest \ + python3-sphinx \ tox \ -y +RUN pip3 install --break-system-packages \ + pylint \ + recommonmark \ + sphinx-rtd-theme \ + ; COPY .docker/wait-for-it.sh /wait-for-it.sh ARG PYLIB ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \ pylib_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \ - cd caosdb-pylib && git checkout ${PYLIB} && pip3 install . + cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages . ARG ADVANCED ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \ advanced_version.json RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \ - cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler] -COPY . /git - -# Delete .git because it is huge. -RUN rm -r /git/.git + cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler] -# Install pycaosdb.ini for the tests -RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini +COPY --from=git_base /git /git -RUN cd /git/ && pip3 install . 
+RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss,rocrate] WORKDIR /git/integrationtests # wait for server, diff --git a/.docker/cert.sh b/.docker/cert.sh index e22cfba2995b5fd9d812232f562b7254233fe5b0..628ba8dd9cc19f85a515a75cebd03b8981337bfd 100755 --- a/.docker/cert.sh +++ b/.docker/cert.sh @@ -43,7 +43,10 @@ function cert() { # Certificate is for localhost KEYPW="${KEYPW}" openssl req -new -x509 -key caosdb.key.pem \ -out caosdb.cert.pem -passin env:KEYPW \ - -subj "/C=/ST=/L=/O=/OU=/CN=${CAOSHOSTNAME}" + -subj "/C=/ST=/L=/O=example/OU=example/CN=${CAOSHOSTNAME}" \ + -days 365 \ + -addext "subjectAltName = DNS:${CAOSHOSTNAME}" \ + -addext "certificatePolicies = 1.2.3.4" KEYPW="${KEYPW}" KEYSTOREPW="$KEYSTOREPW" openssl pkcs12 -export \ -inkey caosdb.key.pem -in caosdb.cert.pem -out all-certs.pkcs12 \ -passin env:KEYPW -passout env:KEYPW diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml index 02ccac5c48e039a3374a0d169f3b355f897e45fc..97f70320e37b1c1d8623e5fc1d98b6d72916e2b8 100644 --- a/.docker/docker-compose.yml +++ b/.docker/docker-compose.yml @@ -1,7 +1,7 @@ version: '3.7' services: sqldb: - image: mariadb:10.4 + image: mariadb:11.4 environment: MYSQL_ROOT_PASSWORD: caosdb1234 networks: diff --git a/.gitignore b/.gitignore index 182ed05e1404483ecb553c8a4e469a86a77ba27c..7ad8606171fd21b94ffd8b15b972f956d4dfc1a1 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ start_caosdb_docker.sh src/doc/_apidoc /dist/ *.egg-info +venv/ +.backups \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index aeb0c11863a4cd1ba52396dbbbad441545366390..f295e0b9480d56359ec1f1d30bc3be5bd54aea57 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -104,6 +104,27 @@ stages: CAOSDB_TAG=${REFTAG}; fi - echo $CAOSDB_TAG + - if [ -z "$PYLIB" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ; + if wget -O /dev/null https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME}>/dev/null ; then + PYLIB=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - PYLIB=${PYLIB:-dev} + - echo $PYLIB + + - if [ -z "$ADVANCED" ]; then + if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then + echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; + if wget -O /dev/null https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then + ADVANCED=$CI_COMMIT_REF_NAME ; + fi; + fi; + fi; + - ADVANCED=${ADVANCED:-dev} + - echo $ADVANCED info: tags: [cached-dind] @@ -116,43 +137,55 @@ info: unittest_py3.9: tags: [cached-dind] stage: test - image: $CI_REGISTRY_IMAGE - script: - - tox - -unittest_py3.7: - tags: [cached-dind] - stage: test - image: python:3.7 + variables: + PYVER: "3.9" + image: python:3.9 script: &python_test_script # install dependencies + - *env - pip install pytest pytest-cov - # TODO: Use f-branch logic here - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev - - pip install . 
+ - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@${PYLIB} + - pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@${ADVANCED} + - pip install .[h5-crawler,spss,rocrate] + - echo "import sys; assert sys.version.startswith('$PYVER')" + - python3 -c "import sys; assert sys.version.startswith('$PYVER')" # actual test - caosdb-crawler --help - - pytest --cov=caosdb -vv ./unittests - -unittest_py3.8: - tags: [cached-dind] - stage: test - image: python:3.8 - script: *python_test_script + - make unittest unittest_py3.10: - tags: [cached-dind] + variables: + PYVER: "3.10" stage: test + tags: [cached-dind] image: python:3.10 script: *python_test_script unittest_py3.11: + variables: + PYVER: "3.11" tags: [cached-dind] stage: test image: python:3.11 script: *python_test_script - + +unittest_py3.12: + variables: + PYVER: "3.12" + stage: test + tags: [cached-dind] + image: python:3.12 + script: *python_test_script + +unittest_py3.13: + variables: + PYVER: "3.13" + tags: [cached-dind] + stage: test + image: python:3.13 + script: *python_test_script + + inttest: tags: [docker] services: @@ -169,6 +202,8 @@ inttest: - *env - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY - echo $CAOSDB_TAG + - echo $PYLIB + - echo $ADVANCED - cd .docker # Store mariadb version @@ -230,34 +265,14 @@ build-testenv: - pushes needs: [] script: + - *env - df -h - command -v wget - - if [ -z "$PYLIB" ]; then - if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then - echo "Check if pylib has branch $CI_COMMIT_REF_NAME" ; - if wget https://gitlab.indiscale.com/api/v4/projects/97/repository/branches/${CI_COMMIT_REF_NAME} ; then - PYLIB=$CI_COMMIT_REF_NAME ; - fi; - fi; - fi; - - PYLIB=${PYLIB:-dev} - - echo $PYLIB - - - if [ -z "$ADVANCED" ]; then - if echo "$CI_COMMIT_REF_NAME" | grep -c "^f-" ; then - echo "Check if advanced user tools have branch $CI_COMMIT_REF_NAME" ; - if wget https://gitlab.indiscale.com/api/v4/projects/104/repository/branches/${CI_COMMIT_REF_NAME} ; then - ADVANCED=$CI_COMMIT_REF_NAME ; - fi; - fi; - fi; - - ADVANCED=${ADVANCED:-dev} - - echo $ADVANCED - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY # use here general latest or specific branch latest... - docker build - --build-arg PYLIB=${PYLIB} + --build-arg PYLIB=${PYLIB:-dev} + --build-arg ADVANCED=${ADVANCED:-dev} --file .docker/Dockerfile -t $CI_REGISTRY_IMAGE . @@ -279,7 +294,7 @@ cert: - cd .docker - CAOSHOSTNAME=caosdb-server ./cert.sh -style: +code-style: tags: [docker] stage: style image: $CI_REGISTRY_IMAGE @@ -287,5 +302,42 @@ style: - job: build-testenv optional: true script: - - autopep8 -r --diff --exit-code . + - autopep8 --version + - autopep8 -r --diff --exit-code . allow_failure: true + +pylint: + tags: [docker] + stage: style + image: $CI_REGISTRY_IMAGE + needs: + - job: build-testenv + optional: true + allow_failure: true + script: + - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler + +# Build the sphinx documentation and make it ready for deployment by Gitlab Pages +# Special job for serving a static website.
See https://docs.gitlab.com/ee/ci/yaml/README.html#pages +# Based on: https://gitlab.indiscale.com/caosdb/src/caosdb-pylib/-/ci/editor?branch_name=main +pages_prepare: &pages_prepare + tags: [ cached-dind ] + stage: deploy + needs: + - job: build-testenv + image: $CI_REGISTRY_IMAGE + only: + refs: + - /^release-.*$/i + script: + - echo "Deploying documentation" + - make doc + - cp -r build/doc/html public + artifacts: + paths: + - public +pages: + <<: *pages_prepare + only: + refs: + - main diff --git a/.gitlab/issue_templates/Default.md b/.gitlab/issue_templates/Default.md new file mode 100644 index 0000000000000000000000000000000000000000..aa1a65aca363b87aff50280e1a86824009d2098b --- /dev/null +++ b/.gitlab/issue_templates/Default.md @@ -0,0 +1,28 @@ +## Summary + +*Please give a short summary of what the issue is.* + +## Expected Behavior + +*What did you expect how the software should behave?* + +## Actual Behavior + +*What did the software actually do?* + +## Steps to Reproduce the Problem + +*Please describe, step by step, how others can reproduce the problem. Please try these steps for yourself on a clean system.* + +1. +2. +3. + +## Specifications + +- Version: *Which version of this software?* +- Platform: *Which operating system, which other relevant software versions?* + +## Possible fixes + +*Do you have ideas how the issue can be resolved?* diff --git a/.gitlab/merge_request_templates/Default.md b/.gitlab/merge_request_templates/Default.md new file mode 100644 index 0000000000000000000000000000000000000000..b3eec01c595a461beec1b0a50fb598bdf8108c77 --- /dev/null +++ b/.gitlab/merge_request_templates/Default.md @@ -0,0 +1,55 @@ +# Summary + +*Insert a meaningful description for this merge request here: What is the new/changed behavior? +Which bug has been fixed? Are there related issues?* + + +# Focus + +*Point the reviewer to the core of the code change. Where should they start reading? What should +they focus on (e.g. security, performance, maintainability, user-friendliness, compliance with the +specs, finding more corner cases, concrete questions)?* + + +# Test Environment + +*How to set up a test environment for manual testing?* + + +# Check List for the Author + +Please, prepare your MR for a review. Be sure to write a summary and a focus and create gitlab +comments for the reviewer. They should guide the reviewer through the changes, explain your changes +and also point out open questions. For further good practices have a look at [our review +guidelines](https://gitlab.com/caosdb/caosdb/-/blob/dev/REVIEW_GUIDELINES.md) + +- [ ] All automated tests pass +- [ ] Reference related issues +- [ ] Up-to-date CHANGELOG.md (or not necessary) +- [ ] Appropriate user and developer documentation (or not necessary) + - Update / write published documentation (`make doc`). + - How do I use the software? Assume "stupid" users. + - How do I develop or debug the software? Assume novice developers. +- [ ] Annotations in code (Gitlab comments) + - Intent of new code + - Problems with old code + - Why this implementation? + + +# Check List for the Reviewer + +- [ ] I understand the intent of this MR +- [ ] All automated tests pass +- [ ] Up-to-date CHANGELOG.md (or not necessary) +- [ ] Appropriate user and developer documentation (or not necessary), also in published + documentation. +- [ ] The test environment setup works and the intended behavior is reproducible in the test + environment +- [ ] In-code documentation and comments are up-to-date. +- [ ] Check: Are there specifications? 
Are they satisfied? + +For further good practices have a look at [our review guidelines](https://gitlab.com/caosdb/caosdb/-/blob/dev/REVIEW_GUIDELINES.md). + + +/assign me +/target_branch dev diff --git a/CHANGELOG.md b/CHANGELOG.md index 263e58dbed69ee6ffb0f576e467f5f0329bc1a48..33fdff70f9af8d1c2174dc0ec297b08762fdeb63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,384 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +### Changed ### + +### Deprecated ### + +### Removed ### + +### Fixed ### +- A RecordType with multiple Parents no longer causes an error during + collection of identifiables + +### Security ### + +### Documentation ### + +## [0.11.0] - 2025-03-05 ## + +### Added ### + +- Validation module for checking a list of generated records against a list of json schemas + that can be generated from a yaml data model file. +- DictElementConverters can now make use of `match_properties` which + works analogously to `match_properties` in ROCrateEntityConverter and + `match_attrib` in XMLConverter. +- `match_properties` is a method of class Converter and can for + example be used by CustomConverters. +- ZipFileConverter that opens zip files and exposes their contents as + File and Directory structure elements. +- `linkahead-crawler` script as alias for `caosdb-crawler`. +- New transformers of the form `cast_to_*` which allow casting + variables to `int`, `float`, `str` and `bool`. +- Transformer function definitions in the cfood now support variable + substitutions. +- `crawler_main` and `scanner.scan_directory` now support lists of + directories to be crawled, too. Note that giving a list of + directories is currently incompatible with + `securityMode=SecurityMode.RETRIEVE` or + `securityMode=SecurityMode.INSERT` since the functionality to + authorize pending inserts or updates doesn't support path lists yet + and will raise a NotImplementedError for now. +- `match_newer_than_file` option for `DirectoryConverter`: A reference + file containing (only) an ISO-formatted datetime string can be + specified here. Directories with this option won't match if all + their contents were last modified before that datetime. + +### Changed ### + +- Registered identifiables can also be used by children of the given RecordType + if no registered identifiable is defined for them. +- ROCrate converter supports dereferencing property values with a single "@id"-property during + subtree generation. +- ROCrate converter supports the special property "variablesMeasured" in addition to "hasPart". +- `None` and other NA values (i.e., values where `pandas.isna` is + `True`) are now interpreted as empty strings in + `converters.match_name_and_value` instead of being cast to string naïvely + +### Fixed ### + +- `spss_to_datamodel` script works again. +- The cfood now supports bi-directional references when defining records on the same level. + (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) +- [#112](https://gitlab.com/linkahead/linkahead-crawler/-/issues/112) + Children of CSVTableConverter match despite match_value: ".+" and + empty cell. This has been fixed by treating None and NA values in + `converters.match_name_and_value` (see above).
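+  For illustration, a cfood snippet of the kind affected by this fix
+  could look like the following (a hypothetical example; the converter
+  name and the named group are made up):
+  ```yaml
+  quantity_cell:
+    type: TextElement
+    match_name: quantity
+    match_value: (?P<quantity>.+)
+  ```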
+ +### Documentation ### + +- Added documentation for ROCrateConverter, ELNFileConverter, and ROCrateEntityConverter + +## [0.10.1] - 2024-11-13 ## + +### Fixed ### + +* Removed optional rocrate dependency which prevented package + publication on PyPI for a violation of PEP 440 (see + https://github.com/pypi/warehouse/issues/7136). It will be + re-activated once + https://github.com/ResearchObject/ro-crate-py/issues/203 has been + resolved upstream. For now, if you want to use the ROCrate or ELN + converters, manually install the fix from + https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + ```sh + pip install git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + ``` + +## [0.10.0] - 2024-11-13 ## + +### Added ### + +- XMLTextNodeConverter for converting text nodes created by XMLTagConverter +- XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter +- Units for properties. They can be specified by giving the property as a dict in the form + ```yaml + MyRecord: + my_prop: + value: 5 + unit: m + ``` +- Support for Python 3.13 +- ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files +- `max_log_level` parameter to `logging.configure_server_side_logging` + to control the server-side debug log's verbosity, and an optional + `sss_max_log_level` parameter to `crawler_main` to control the SSS + loglevel separately from the global `debug` option. + +### Changed ### + +- Property values specified by dicts do not have to contain a + `collection_mode` key anymore. If none is given, the + `collection_mode` is determined from the `value` as it is done for + values specified by strings: + - if `value` starts with '+', collection mode is "list". + - if `value` starts with '*', collection mode is "multiproperty". + - in all other cases, collection mode is "single". +- The default server-side scripting debug level is now controlled by + the global `debug` option and set to log level `INFO` in + case of `debug=False`. The previous behavior can be restored by + calling `crawler_main` with `sss_max_log_level=logging.DEBUG`. + +### Removed ### + +* Support for Python 3.8 (end of life) + +### Fixed ### + +- Added better error message for some cases of broken converter and + record definitions. +- [#108](https://gitlab.com/linkahead/linkahead-crawler/-/issues/108) + Too verbose server-side scripting logs that could lead to high disk + usage. + +### Documentation ### + +- Tutorial on crawling a simple CSV file + +## [0.9.1] - 2024-09-26 ## + +### Fixed ### + +* ImpossibleMergeErrors now correctly include the problematic property + and its values in their string representation. + +## [0.9.0] - 2024-09-05 ## + +### Added ### + +* New converters for XML documents/trees/tags: XMLFile, XMLTag, XMLTextNode + +### Changed ### + +* Moved the optional `hdf5_converter` to the `converters` + submodule. When updating from 0.8 or below, this means that you have + to adapt the converter package path in your cfood definition from + `caoscrawler.hdf5_converter` to + `caoscrawler.converters.hdf5_converter`. + +### Fixed ### + +* Use `urllib.parse.urljoin` to generate link addresses in status + mails, preventing wrong addresses, e.g., due to superfluous `/`. + +## [0.8.0] - 2024-08-23 ## + +### Added ### + +* Support for Python 3.12 and experimental support for 3.13 +* CFood macros now accept complex objects as values, not just strings.
+* More options for the `CSVTableConverter` +* New converters: + * `DatetimeElementConverter` + * `SPSSConverter` +* New scripts: + * `spss_to_datamodel` + * `csv_to_datamodel` +* New transformer functions: + * `date_parse` + * `datetime_parse` +* New ``PropertiesFromDictConverter`` which allows automatically + creating property values from dictionary keys. + +### Changed ### + +* CFood macros no longer render everything into strings. +* Better internal handling of identifiable/reference resolving and merging of entities. This also + includes more understandable output for users. +* Better handling of missing imports, with nice messages for users. +* No longer use configuration of advancedtools to set to and from email addresses + +### Removed ### + +* Support for Python 3.7 + +### Fixed ### + +* [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression +* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors + +### Security ### + +### Documentation ### + +* General improvement of the documentation, in many small places. +* The API documentation should now also include documentation of the constructors. + +## [0.7.1] - 2024-03-21 ## + +### Fixed ### + +* `crawler_main` doesn't need the deprecated `debug=True` anymore to put out a + provenance file if the `provenance_file` parameter is provided. +* [indiscale#129](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/129) + missing packaging dependency. + +## [0.7.0] - 2024-03-04 ## + +### Added ### + +* `transform` sections can be added to a CFood to apply functions to values stored in variables. +* default transform functions: submatch, split and replace. +* `*` can now be used as a wildcard in the identifiables parameter file to denote + that any Record may reference the identified one. +* `crawl.TreatedRecordLookUp` class replacing the old (and slow) + `identified_cache` module. The new class now handles all records identified by + id, path, or identifiable simultaneously. See API docs for more info on how to + add to and get from the new lookup class. +* `identifiable_adapters.IdentifiableAdapter.get_identifying_referencing_entities` + and + `identifiable_adapters.IdentifiableAdapter.get_identifying_referenced_entities` + static methods to return the referencing or referenced entities belonging to a + registered identifiable, respectively. +* [#70](https://gitlab.com/linkahead/linkahead-crawler/-/issues/70): Optional + converters for HDF5 files. They require this package to be installed with its + ``h5-crawler`` dependency. + +### Changed ### + +* If the `parents` key is used in a cfood at a lower level for a Record that + already has a Parent (because it was explicitly given or the default Parent), + the old Parent(s) are now overwritten with the value belonging to the + `parents` key. +* If a registered identifiable states that a reference by a Record with parent + RT1 is needed, then now also references from Records that have a child of RT1 + as parent are accepted. +* More aggressive caching. +* The `identifiable_adapters.IdentifiableAdapter` now creates (possibly empty) + reference lists for all records in `create_reference_mapping`. This allows + functions like `get_identifiable` to be called only with the subset of the + referencing entities belonging to a specific Record.
+* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for + entities that don't exist remotely) instead of entity objects for keeping + track of references. +* Log output is either written to $SHARED_DIR/ (when this variable is set) or just to the terminal. + +### Deprecated ### + +* `IdentifiableAdapter.get_file` + +### Removed ### + +* `identified_cache` module which was replaced by the `crawl.TreatedRecordLookUp` class. + +### Fixed ### + +* Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27) +* [#58](https://gitlab.com/caosdb/caosdb-crawler/-/issues/58) Documentation builds API docs in pipeline now. +* [#117](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/117) + `replace_variable` no longer unnecessarily changes the type. Values stored + in variables in a CFood can now have other types. +* [indiscale#113](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113) + Resolving referenced entities fails in some corner cases. The crawler now + handles cases correctly in which entities retrieved from the server have to be + merged with local entities that both reference another, already existing + entity +* A corner case in `split_into_inserts_and_updates` whereby two records created + in different places in the cfood definition would not be merged if both were + identified by the same LinkAhead id +* [#87](https://gitlab.com/linkahead/linkahead-crawler/-/issues/87) Handle long strings more gracefully. The crawler sometimes runs into + [linkahead-server#101](https://gitlab.com/linkahead/linkahead-server/-/issues/101), this is now mitigated. +* [indiscale#128](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/128) Yet another corner case of referencing resolution resolved. + +## [0.6.0] - 2023-06-23 ## +(Florian Spreckelsen) + +### Added ### + +- Standard logging for server side execution +- Email notification if the `pycaosdb.ini` contains a `[caoscrawler]` with + `send_crawler_notifications=True`. +- Creation of CrawlerRun Records that contain status information about data + integration of the crawler if the `pycaosdb.ini` contains a `[caoscrawler]` + with `create_crawler_status_records=True`. +- The Crawler `synchronize` function now takes a list of RecordType names. + Records that have the given names as parents are excluded from inserts or + updates +- `Crawler.synchronize` now takes an optional `path_for_authorized_run` argument + that specifies the path with which the crawler can be rerun to authorize + pending changes. + +### Fixed ### + +- Query generation when there are only backrefs or backrefs and a name +- Query generation when there are spaces or `'` in RecordType or Identifiable + names +- usage of ID when looking for identified records +- [#41](https://gitlab.com/caosdb/caosdb-crawler/-/issues/41) + +### Documentation ### + +- Expanded documentation, also has (better) tutorials now. + +## [0.5.0] - 2023-03-28 ## +(Florian Spreckelsen) + +### Changed ### + +- Refactoring of the crawl.py module: Now there is a separate scanner module handling the + collection of information that is independent of CaosDB itself. +- The signature of the function ``save_debug_data`` was changed to explicitly + take the ``debug_tree`` as its first argument. This change was necessary, as + the ``debug_tree`` is no longer saved as a member field of the Crawler class. + + +### Deprecated ### + +- The functions ``load_definition``, ``initialize_converters`` and + ``load_converters`` are deprecated.
Please use the functions + ``load_definition``, ``initialize_converters`` and + ``create_converter_registry`` from the scanner module instead. +- The function ``start_crawling`` is deprecated. The function + ``scan_structure_elements`` in the scanner module mostly covers its + functionality. + +## [0.4.0] - 2023-03-22 ## +(Florian Spreckelsen) + +### Added ### + +- DateElementConverter: allows interpreting text as a date object +- the restricted_path argument allows crawling only a subtree +- logging that provides a summary of what is inserted and updated +- You can now access the file system path of a structure element (if it has one) using the variable + name ``<converter name>.path`` +- ``add_prefix`` and ``remove_prefix`` arguments for the command line interface + and the ``crawler_main`` function for the adding/removal of path prefixes when + creating file entities. +- More strict checking of `identifiables.yaml`. +- Better error messages when server does not conform to expected data model. + +### Changed ### + +- The definitions for the default converters were removed from crawl.py and placed into + a separate yaml file called `default_converters.yml`. There is a new test testing for + the correct loading behavior of that file. +- JSONFileConverter, YAMLFileConverter and MarkdownFileConverter now inherit from + SimpleFileConverter. Behavior is unchanged, except that the MarkdownFileConverter now raises a + ConverterValidationError when the YAML header cannot be read instead of silently not matching. + +### Deprecated ### + +- The ``prefix`` argument of `crawler_main` is deprecated. Use the new argument + ``remove_prefix`` instead. + +### Removed ### +- The command line argument ``--prefix``. Use the new argument ``--remove-prefix`` instead. + +### Fixed ### + +- an empty string as name is treated as no name (as does the server). This fixes + queries for identifiables since it would contain "WITH name=''" otherwise + which is an impossible condition. If your cfoods contained this case, they are ill-defined. + +## [0.3.0] - 2022-01-30 ## +(Florian Spreckelsen) + +### Added ### + - Identifiable class to represent the information used to identify Records. -- Added some StructureElements: BooleanElement, FloatElement, IntegerElement, +- Added some StructureElements: BooleanElement, FloatElement, IntegerElement, ListElement, DictElement - String representation for Identifiables - [#43](https://gitlab.com/caosdb/caosdb-crawler/-/issues/43) the crawler @@ -28,19 +404,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Some StructureElements changed (see "How to upgrade" in the docs): - Dict, DictElement and DictDictElement were merged into DictElement. - DictTextElement and TextElement were merged into TextElement. The "match" - keyword is now invalid for TextElements. + keyword is now invalid for TextElements. - JSONFileConverter creates another level of StructureElements (see "How to upgrade" in the docs) - create_flat_list function now collects entities in a set and also adds the entities contained in the given list directly - ### Deprecated ### - The DictXYElements are now deprecated and are now synonyms for the XYElements. -### Removed ### - ### Fixed ### - [#39](https://gitlab.com/caosdb/caosdb-crawler/-/issues/39) Merge conflicts in @@ -48,10 +421,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 without id - Queries for identifiables with boolean properties are now created correctly.
-### Security ### - -### Documentation ### - ## [0.2.0] - 2022-11-18 ## (Florian Spreckelsen) diff --git a/CITATION.cff b/CITATION.cff index c5f9711f2b312a638fe6c8df7c95ad6a83e5ebcf..8f4e22a4f8b56c8640e7d0a9a5ccae93010b4847 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,25 +1,22 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." authors: - - family-names: Fitschen - given-names: Timm - orcid: https://orcid.org/0000-0002-4022-432X - - family-names: Schlemmer - given-names: Alexander - orcid: https://orcid.org/0000-0003-4124-9649 - - family-names: Hornung - given-names: Daniel - orcid: https://orcid.org/0000-0002-7846-6375 - family-names: tom Wörden given-names: Henrik orcid: https://orcid.org/0000-0002-5549-578X - - family-names: Parlitz - given-names: Ulrich - orcid: https://orcid.org/0000-0003-3058-1435 + - family-names: Spreckelsen + given-names: Florian + orcid: https://orcid.org/0000-0002-6856-2910 - family-names: Luther given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 + - family-names: Parlitz + given-names: Ulrich + orcid: https://orcid.org/0000-0003-3058-1435 + - family-names: Schlemmer + given-names: Alexander + orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.2.0 -doi: 10.3390/data4020083 -date-released: 2022-11-18 \ No newline at end of file +version: 0.11.0 +doi: 10.3390/data9020024 +date-released: 2025-03-05 \ No newline at end of file diff --git a/src/doc/README_SETUP.md b/INSTALL.md similarity index 58% rename from src/doc/README_SETUP.md rename to INSTALL.md index 1f6e15d408e10e38bce0d9b9fe9b6197ec69bfc3..ba220626460c559aeded69d360c85917e0c78066 100644 --- a/src/doc/README_SETUP.md +++ b/INSTALL.md @@ -1,17 +1,14 @@ -# Getting started with the CaosDB Crawler # +# Installation # -## Installation ## -### How to install ### - -#### Linux #### +## Linux ## Make sure that Python (at least version 3.8) and pip is installed, using your system tools and documentation. Then open a terminal and continue in the [Generic installation](#generic-installation) section. -#### Windows #### +## Windows ## If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of all packages you will ever need out of the box. If you prefer, you may also install After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic installation](#generic-installation) section. -#### MacOS #### +## MacOS ## If there is no Python 3 installed yet, there are two main ways to obtain it: Either get the binary package from sudo ./Install\ Certificates.command After these steps, you may continue with the [Generic installation](#generic-installation). -#### Generic installation #### +## Generic installation ## + +The CaosDB crawler is available as [PyPI +package](https://pypi.org/project/caoscrawler/) and can simply be installed via ---- +```sh +pip3 install caoscrawler +``` -Obtain the sources from GitLab and install from there (`git` must be installed for -this option): +Alternatively, obtain the sources from GitLab and install from there (`git` must +be installed for this option): ```sh git clone https://gitlab.com/caosdb/caosdb-crawler cd caosdb-crawler pip3 install --user . ``` - -**Note**: In the near future, this package will also be made available on PyPi.
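+
+To quickly verify the installation, you can print the crawler's help text
+(the same smoke test that the CI pipeline runs):
+
+```sh
+caosdb-crawler --help
+```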
- - ## Run Unit Tests - -## Documentation ## -We use sphinx to create the documentation. Docstrings in the code should comply -with the Googly style (see link below). - -Build documentation in `src/doc` with `make html`. - -### Requirements ### - -- `sphinx` -- `sphinx-autoapi` -- `recommonmark` - -### How to contribute ### - -- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) -- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) -- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external) - - diff --git a/Makefile b/Makefile index 95fc2bf61473b94decfb43d0c5ba0d3fda535a07..7167ebfdf106f5129ce7941706b9e871d51e551f 100644 --- a/Makefile +++ b/Makefile @@ -44,5 +44,5 @@ lint: .PHONY: lint unittest: - tox -r + pytest --cov=caoscrawler -vv ./unittests .PHONY: unittest diff --git a/README.md b/README.md index 6c94473c066439b1645712c0046cd890b6b38715..39f8d36769a520f35e717d180537a4cce704180c 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,11 @@ -# CaosDB-Crawler ## Welcome -This is the repository of the CaosDB-Crawler, a tool for automatic data -insertion into [CaosDB](https://gitlab.com/caosdb/caosdb-meta). +This is the repository of the LinkAhead Crawler, a tool for automatic data +insertion into [LinkAhead](https://gitlab.com/linkahead/linkahead). This is a new implementation resolving problems of the original implementation -in [caosdb-advancedtools](https://gitlab.com/caosdb/caosdb-advanced-user-tools) +in [LinkAhead Python Advanced User Tools](https://gitlab.com/caosdb/caosdb-advanced-user-tools) ## Setup @@ -16,20 +15,23 @@ setup this code. ## Further Reading -Please refer to the [official documentation](https://docs.indiscale.com/caosdb-crawler/) of the CaosDB-Crawler for more information. +Please refer to the [official documentation](https://docs.indiscale.com/caosdb-crawler/) of the LinkAhead Crawler for more information. ## Contributing -Thank you very much to all contributers—[past, present](https://gitlab.com/caosdb/caosdb/-/blob/dev/HUMANS.md), and prospective ones. +Thank you very much to all contributors—[past, +present](https://gitlab.com/linkahead/linkahead/-/blob/main/HUMANS.md), and prospective +ones. ### Code of Conduct -By participating, you are expected to uphold our [Code of Conduct](https://gitlab.com/caosdb/caosdb/-/blob/dev/CODE_OF_CONDUCT.md). +By participating, you are expected to uphold our [Code of +Conduct](https://gitlab.com/linkahead/linkahead/-/blob/main/CODE_OF_CONDUCT.md). ### How to Contribute * You found a bug, have a question, or want to request a feature? Please -[create an issue](https://gitlab.com/caosdb/caosdb-crawler). +[create an issue](https://gitlab.com/linkahead/linkahead-crawler/-/issues). * You want to contribute code? * **Forking:** Please fork the repository and create a merge request in GitLab and choose this repository as target. Make sure to select "Allow commits from members who can merge the target branch" under * **Code style:** This project adheres to the PEP8 recommendations, you can test your code style using the `autopep8` tool (`autopep8 -i -r ./`). Please write your doc strings following the [NumpyDoc](https://numpydoc.readthedocs.io/en/latest/format.html) conventions.
-* You can also contact us at **info (AT) caosdb.de** and join the - CaosDB community on - [#caosdb:matrix.org](https://matrix.to/#/!unwwlTfOznjEnMMXxf:matrix.org). +* You can also join the LinkAhead community on + [#linkahead:matrix.org](https://matrix.to/#/!unwwlTfOznjEnMMXxf:matrix.org). There is the file `unittests/records.xml` that serves as a dummy for a server state with files. diff --git a/README_SETUP.md b/README_SETUP.md deleted file mode 120000 index d478016ecde09dab8820d398b15df325f4159380..0000000000000000000000000000000000000000 --- a/README_SETUP.md +++ /dev/null @@ -1 +0,0 @@ -src/doc/README_SETUP.md \ No newline at end of file diff --git a/README_SETUP.md b/README_SETUP.md new file mode 100644 index 0000000000000000000000000000000000000000..32f0bb89a6051bc2ec4be0bae6cf06cd1a540f8b --- /dev/null +++ b/README_SETUP.md @@ -0,0 +1,34 @@ +# Getting started with the CaosDB Crawler # + +## Installation +see INSTALL.md + +## Run Unit Tests + +1. Install additional dependencies: + - h5py +2. Run `pytest unittests`. + +## Documentation ## +We use sphinx to create the documentation. Docstrings in the code should comply +with the Google style (see link below). + +Build documentation in `src/doc` with `make doc`. Note that for the +automatic generation of the complete API documentation, it is +necessary to first install this library with all its optional +dependencies, i.e., `pip install .[h5-crawler,spss]`. + +### Requirements ### + +- `sphinx` +- `sphinx-autoapi` +- `recommonmark` +- `sphinx-rtd-theme` + +### How to contribute ### + +- [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) +- [Google Style Python Docstrings 2nd reference](https://github.com/google/styleguide/blob/gh-pages/pyguide.md#38-comments-and-docstrings) +- [References to other documentation](https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#role-external) + + diff --git a/crawler_run_model.yml b/crawler_run_model.yml new file mode 100644 index 0000000000000000000000000000000000000000..e80244b5f449206410f25504b4d3e8bdbcfe0416 --- /dev/null +++ b/crawler_run_model.yml @@ -0,0 +1,22 @@ + +CrawlerRun: + recommended_properties: + logfile: + datatype: TEXT + description: 'A log file.' + status: + datatype: TEXT + description: 'RUNNING, FAILED or SUCCESS' + run_id: + datatype: TEXT + description: 'Unique crawler run identifier' + started: + datatype: DATETIME + description: 'Time when the crawler started.' + finished: + datatype: DATETIME + description: 'Time when the crawler finished.'
+ number_of_inserted_entities: + datatype: INTEGER + number_of_updated_entities: + datatype: INTEGER diff --git a/integrationtests/basic_example/test_basic.py b/integrationtests/basic_example/test_basic.py index 0c847b08a729f3b112cbdf3c38bac31309cda125..6fd322e5f6425e9bce25b970d6de7d99892762a5 100755 --- a/integrationtests/basic_example/test_basic.py +++ b/integrationtests/basic_example/test_basic.py @@ -26,35 +26,36 @@ an integration test module that does basic integration tests """ -from caosadvancedtools.crawler import Crawler as OldCrawler -import os -from caosdb import EmptyUniqueQueryError import argparse +import os import sys from argparse import RawTextHelpFormatter +from pathlib import Path + +import linkahead as db +import pytest +import yaml +from caosadvancedtools.crawler import Crawler as OldCrawler +from caosadvancedtools.models.parser import parse_model_from_yaml from caoscrawler import Crawler, SecurityMode +from caoscrawler.debug_tree import DebugTree from caoscrawler.identifiable import Identifiable -import caosdb as db from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter -import pytest -from caosadvancedtools.models.parser import parse_model_from_yaml -import yaml +from caoscrawler.scanner import scan_directory +from linkahead import EmptyUniqueQueryError +from linkahead.utils.register_tests import clear_database, set_test_key -from caosdb.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") +# TODO move test related stuff here and remove it from unittests -def rfp(*pathcomponents): - """ - Return full path. - Shorthand convenience function. - """ - return os.path.join(os.path.dirname(__file__), *pathcomponents) +UNITTESTDIR = Path(__file__).parent.parent.parent / "unittests" +BASICTESTDIR = Path(__file__).parent @pytest.fixture def usemodel(): - model = parse_model_from_yaml(rfp("model.yml")) + model = parse_model_from_yaml(BASICTESTDIR / "model.yml") model.sync_data_model(noquestion=True, verbose=False) @@ -82,42 +83,46 @@ def ident(): return ident -def crawl_standard_test_directory(cr: Crawler, - subdir: str = "examples_article", - cfood: str = "scifolder_cfood.yml"): - cr.crawl_directory(rfp("..", "..", "unittests", "test_directories", subdir), - rfp("..", "..", "unittests", cfood)) +def crawl_standard_test_directory(subdir: str = "examples_article", + cfood: str = "scifolder_cfood.yml", + debug_tree=None): + return scan_directory(UNITTESTDIR / "test_directories" / subdir, + UNITTESTDIR / cfood, + debug_tree=debug_tree) @pytest.fixture def crawler(ident): - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - return cr + cr = Crawler(identifiableAdapter=ident) + debug_tree = DebugTree() + crawled_data = crawl_standard_test_directory(debug_tree=debug_tree) + return cr, crawled_data, debug_tree @pytest.fixture def crawler_extended(ident): - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") + cr = Crawler(identifiableAdapter=ident) + debug_tree = DebugTree() + crawled_data = crawl_standard_test_directory( + cfood="scifolder_extended.yml", debug_tree=debug_tree) # correct paths for current working directory - file_list = [r for r in cr.crawled_data if r.role == "File"] + file_list = [r for r in crawled_data if r.role == "File"] for f in file_list: - f.file = rfp("..", "..", "unittests", "test_directories", f.file) - return cr + f.file = UNITTESTDIR / 
"test_directories" / f.file + return cr, crawled_data, debug_tree -def test_ambigious_lookup(clear_database, usemodel, crawler, ident): - ins, ups = crawler.synchronize() +def test_ambiguous_lookup(clear_database, usemodel, crawler, ident): + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True) - with pytest.raises(RuntimeError, match=".*unambigiously.*"): - print(crawler.identifiableAdapter.retrieve_identified_record_for_identifiable( + with pytest.raises(RuntimeError, match=".*unambiguously.*"): + print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable( Identifiable(properties={'project': proj.id}))) def test_single_insertion(clear_database, usemodel, crawler, ident): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # This test also generates the file records.xml used in some of the unittesets: res = db.execute_query("FIND Record") @@ -125,7 +130,7 @@ def test_single_insertion(clear_database, usemodel, crawler, ident): if res[i].parents[0].name == "PyTestInfo": del res[i] # uncomment this to recreate the `records.xml` file - # filename = rfp("..", "..", "unittests", "records.xml") + # filename = UNITTESTDIR/ "records.xml" # with open(filename, "w") as f: # xml = res.to_xml() # # Remove noscript and transaction benchmark: @@ -138,94 +143,92 @@ def test_single_insertion(clear_database, usemodel, crawler, ident): assert len(ups) == 0 # Do a second run on the same data, there should be no changes: - crawler = Crawler(debug=True, identifiableAdapter=ident) - crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"), - rfp("../../unittests/scifolder_cfood.yml")) - ins, ups = crawler.synchronize() + crawler = Crawler(identifiableAdapter=ident) + crawled_data = scan_directory(UNITTESTDIR / "test_directories" / "examples_article", + UNITTESTDIR / "scifolder_cfood.yml") + ins, ups = crawler.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_multiple_insertions(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data, there should be no changes: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_insertion(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data, there should a new insert: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 1 assert len(ups) == 0 # Do it again to check whether nothing is changed: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = 
crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_insert_auth(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data, there should a new insert: - cr = Crawler(debug=True, identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 1 assert not ins[0].is_valid() nins, nups = OldCrawler.update_authorized_changes(cr.run_id) assert nins == 1 # Do it again to check whether nothing is changed: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 0 def test_insertion_and_update(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() - - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_insert") - ins, ups = cr.synchronize() - - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, "example_overwrite_1") - # print(cr.crawled_data) - # cr.save_debug_data(rfp("provenance.yml")) - assert len(cr.crawled_data) == 3 - ins, ups = cr.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) + + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_insert") + ins, ups = cr.synchronize(crawled_data=crawled_data) + + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory("example_overwrite_1") + assert len(crawled_data) == 3 + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 1 def test_identifiable_update(clear_database, usemodel, ident, crawler): - ins, ups = crawler.synchronize() + ins, ups = crawler[0].synchronize(crawled_data=crawler[1]) # Do a second run on the same data with a change in one # of the identifiables: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() # Test the addition of a single property: - l = cr.crawled_data + l = crawled_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): @@ -234,28 +237,28 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): name="email", value="testperson@testaccount.test") print("one change") break - ins, ups = cr.synchronize() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 1 # Test the change within one property: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - l = cr.crawled_data + cr = Crawler(identifiableAdapter=ident) + crawled_data = 
crawl_standard_test_directory() + l = crawled_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): record.add_property(name="email", value="testperson@coolmail.test") print("one change") break - ins, ups = cr.synchronize() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 0 assert len(ups) == 1 # Changing the date should result in a new insertion: - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr) - l = cr.crawled_data + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory() + l = crawled_data for record in l: if (record.parents[0].name == "Measurement" and record.get_property("date").value == "2020-01-03"): @@ -263,30 +266,31 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler): record.get_property("date").value = "2012-01-02" print("one change") break - ins, ups = cr.synchronize() + ins, ups = cr.synchronize(crawled_data=crawled_data) assert len(ins) == 1 assert len(ups) == 0 def test_file_insertion_dry(clear_database, usemodel, ident): - crawler_extended = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory( - crawler_extended, cfood="scifolder_extended.yml") - file_list = [r for r in crawler_extended.crawled_data if r.role == "File"] + crawler_extended = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory( + cfood="scifolder_extended.yml") + file_list = [r for r in crawled_data if r.role == "File"] assert len(file_list) == 11 for f in file_list: assert f.path.endswith("README.md") assert f.path[1:] == f.file - ins, ups = crawler_extended.synchronize(commit_changes=False) + ins, ups = crawler_extended.synchronize(crawled_data=crawled_data, commit_changes=False) assert len(ups) == 0 file_list_ins = [r for r in ins if r.role == "File"] assert len(file_list_ins) == 11 def test_file_insertion(clear_database, usemodel, ident, crawler_extended): - ins, ups = crawler_extended.synchronize(commit_changes=True) + ins, ups = crawler_extended[0].synchronize( + crawled_data=crawler_extended[1], commit_changes=True) file_list_ins = [r for r in ins if r.role == "File"] assert len(file_list_ins) == 11 @@ -302,16 +306,17 @@ def test_file_insertion(clear_database, usemodel, ident, crawler_extended): def test_file_update(clear_database, usemodel, ident, crawler_extended): - ins1, ups1 = crawler_extended.synchronize(commit_changes=True) + ins1, ups1 = crawler_extended[0].synchronize( + crawled_data=crawler_extended[1], commit_changes=True) file_list_ins = [r for r in ins1 if r.role == "File"] - cr = Crawler(debug=True, identifiableAdapter=ident) - crawl_standard_test_directory(cr, cfood="scifolder_extended.yml") + cr = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory(cfood="scifolder_extended.yml") - file_list = [r for r in cr.crawled_data if r.role == "File"] + file_list = [r for r in crawled_data if r.role == "File"] for f in file_list: - f.file = rfp("..", "..", "unittests", "test_directories", f.file) - ins2, ups2 = cr.synchronize(commit_changes=True) + f.file = UNITTESTDIR / "test_directories" / f.file + ins2, ups2 = cr.synchronize(crawled_data=crawled_data, commit_changes=True) assert len(ups1) == 0 assert len(ups2) == 0 @@ -320,13 +325,13 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended): assert len(res) == 11 assert len(res[0].parents) == 0 - cr2 = Crawler(debug=True, identifiableAdapter=ident) - 
crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml") + cr2 = Crawler(identifiableAdapter=ident) + crawled_data = crawl_standard_test_directory(cfood="scifolder_extended2.yml") - file_list = [r for r in cr2.crawled_data if r.role == "File"] + file_list = [r for r in crawled_data if r.role == "File"] for f in file_list: - f.file = rfp("..", "..", "unittests", "test_directories", f.file) - ins3, ups3 = cr2.synchronize(commit_changes=True) + f.file = UNITTESTDIR / "test_directories" / f.file + ins3, ups3 = cr2.synchronize(crawled_data=crawled_data, commit_changes=True) assert len(ups3) == 11 res = db.execute_query("Find File") diff --git a/integrationtests/test_crawler_main.py b/integrationtests/test_crawler_main.py new file mode 100644 index 0000000000000000000000000000000000000000..a2eebf4f04e195754eaf71dc5e829b6a77a4cc4b --- /dev/null +++ b/integrationtests/test_crawler_main.py @@ -0,0 +1,95 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +import logging +import tempfile + +from pathlib import Path + +import linkahead as db + +from caoscrawler import crawl +from caoscrawler.crawl import (crawler_main, SecurityMode) +from linkahead.utils.register_tests import clear_database, set_test_key + +set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") + +INTTESTDIR = Path(__file__).parent + + +def test_list_of_paths(clear_database, monkeypatch): + + # Mock the status record + dummy_status = { + "n_calls": 0 + } + + def _mock_update_status_record(run_id, n_inserts, n_updates, status): + print("Update mocked status") + dummy_status["run_id"] = run_id + dummy_status["n_inserts"] = n_inserts + dummy_status["n_updates"] = n_updates + dummy_status["status"] = status + dummy_status["n_calls"] += 1 + monkeypatch.setattr(crawl, "_update_status_record", _mock_update_status_record) + + # mock SSS environment + monkeypatch.setenv("SHARED_DIR", tempfile.gettempdir()) + + # We need only one dummy RT + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / "crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml" + ) + recs = db.execute_query("FIND TestType") + assert len(recs) == 2 + assert "Test1" in [r.name for r in recs] + assert "Test2" in [r.name for r in recs] + + assert dummy_status["n_inserts"] == 2 + assert dummy_status["n_updates"] == 0 + assert dummy_status["status"] == "OK" + assert dummy_status["n_calls"] == 1 + + +def test_not_implemented_list_with_authorization(caplog, clear_database): + + rt = db.RecordType(name="TestType").insert() + basepath = INTTESTDIR / "test_data" / 
"crawler_main_with_list_of_dirs" + dirlist = [basepath / "dir1", basepath / "dir2"] + + # This is not implemented yet, so check log for correct error. + ret = crawler_main( + dirlist, + cfood_file_name=basepath / "cfood.yml", + identifiables_definition_file=basepath / "identifiable.yml", + securityMode=SecurityMode.RETRIEVE + ) + # crawler_main hides the error, but has a non-zero return code and + # errors in the log: + assert ret != 0 + err_tuples = [t for t in caplog.record_tuples if t[1] == logging.ERROR] + assert len(err_tuples) == 1 + assert "currently implemented only for single paths, not for lists of paths" in err_tuples[0][2] + # No inserts after the errors + assert len(db.execute_query("FIND TestType")) == 0 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..c7f22ce07e9b401915aefde3bf7e3a78d92e2bd6 --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/cfood.yml @@ -0,0 +1,10 @@ +--- +metadata: + crawler-version: 0.10.2 +--- +BaseDirElement: + type: Directory + match: ^dir(?P<dir_number>[0-9]+)$$ + records: + TestType: + name: Test$dir_number diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir1/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep b/integrationtests/test_data/crawler_main_with_list_of_dirs/dir2/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml new file mode 100644 index 0000000000000000000000000000000000000000..6d608cece0ae7c2aa6461fb56025a8ac8e4faf6f --- /dev/null +++ b/integrationtests/test_data/crawler_main_with_list_of_dirs/identifiable.yml @@ -0,0 +1,2 @@ +TestType: + - name diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml index 7a64d708667182b80b739812e5fdf3369fc5b462..37a34d125dcff1d121b1bded2fe959c4d30ff403 100644 --- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -153,6 +153,13 @@ Data: metadata_json: &metadata_json_template type: JSONFile match: metadata.json + records: + JSONFile: + parents: + - JSONFile + role: File + path: ${metadata_json.path} + file: ${metadata_json.path} validate: schema/dataset.schema.json subtree: jsondict: diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json index 01653bfa821e0a0acbb5a481bfd458e2ed784fb9..36233230ae05f9df58ae4e492ff1f709322f6e51 100644 --- a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json @@ -9,6 +9,7 @@ "minimum": 20000 }, "archived": { "type": "boolean" }, + "JSONFile": { "type": "object" }, "url": { "type": "string", "description": "link to folder on file system (CaosDB or cloud folder)" 
diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml index 6495e1828dc56e99459c162f7751951f880ea55c..c55be2157a1f079ecfb5809c3658586f9114fad1 100644 --- a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml @@ -25,8 +25,8 @@ extroot: parents: - mdfile role: File - path: $DataFile - file: $DataFile + path: ${DataFile.path} + file: ${DataFile.path} Experiment: mdfile: $mdfile @@ -68,8 +68,8 @@ extroot: parents: - mdfile role: File - path: $DataFile - file: $DataFile + path: ${DataFile.path} + file: ${DataFile.path} Experiment: {} diff --git a/integrationtests/test_issues.py b/integrationtests/test_issues.py index 527b4c0cf67f483d5b61972a0104ff4fb673402d..0506fa4db03e9b3638051e6ec4fa132bd348a988 100644 --- a/integrationtests/test_issues.py +++ b/integrationtests/test_issues.py @@ -1,4 +1,4 @@ -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Indiscale GmbH <info@indiscale.com> # 2022 Florian Spreckelsen <f.spreckelsen@indiscale.com> @@ -16,18 +16,31 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import fixture, mark - -import caosdb as db +import tempfile +import linkahead as db +import yaml +from caosadvancedtools.models.parser import parse_model_from_string from caoscrawler.crawl import Crawler +from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, + scan_structure_elements) from caoscrawler.structure_elements import DictElement +from linkahead.cached import cache_clear +from linkahead.utils.register_tests import clear_database, set_test_key +from pytest import fixture, mark, raises -from caosdb.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") +@fixture(autouse=True) +def clear_cache(): + """Clear the LinkAhead cache.""" + cache_clear() + + def test_issue_23(clear_database): """Test that an update leaves existing properties, that were not found by the crawler, unchanged. @@ -86,8 +99,8 @@ def test_issue_23(clear_database): ident.register_identifiable("TestType", db.RecordType().add_parent( name="TestType").add_property(name="identifying_prop")) - crawler = Crawler(debug=True, identifiableAdapter=ident) - converter_registry = crawler.load_converters(crawler_definition) + crawler = Crawler(identifiableAdapter=ident) + converter_registry = create_converter_registry(crawler_definition) # the dictionary to be crawled... 
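+    # A note on the API migration visible in this test (a sketch, based only
+    # on calls that appear in this changeset): the former Crawler methods
+    # load_definition(), load_converters() and start_crawling() are replaced
+    # by the module-level functions load_definition(),
+    # create_converter_registry() and scan_structure_elements() from
+    # caoscrawler.scanner, and synchronize() now receives the crawled data
+    # explicitly:
+    #
+    #     crawler = Crawler(identifiableAdapter=ident)
+    #     crawler.generate_run_id()
+    #     converter_registry = create_converter_registry(crawler_definition)
+    #     records = scan_structure_elements(
+    #         DictElement("TestDict", test_dict), crawler_definition, converter_registry)
+    #     ins, ups = crawler.synchronize(crawled_data=records)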
test_dict = { @@ -95,7 +108,8 @@ def test_issue_23(clear_database): "prop_b": "something_else" } - records = crawler.start_crawling( + crawler.generate_run_id() + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry) assert len(records) == 1 @@ -105,11 +119,11 @@ def test_issue_23(clear_database): assert rec_crawled.get_property("identifying_prop").value == "identifier" assert rec_crawled.get_property("prop_b") is not None assert rec_crawled.get_property("prop_b").value == "something_else" - # no interaction with the database yet, so the rrecord shouldn't have a prop_a yet + # no interaction with the database yet, so the record shouldn't have a prop_a yet assert rec_crawled.get_property("prop_a") is None # synchronize with database and update the record - ins, ups = crawler.synchronize() + ins, ups = crawler.synchronize(crawled_data=records) assert len(ins) == 0 assert len(ups) == 1 @@ -124,3 +138,298 @@ def test_issue_23(clear_database): "identifying_prop").value == rec_crawled.get_property("identifying_prop").value assert rec_retrieved.get_property( "prop_b").value == rec_crawled.get_property("prop_b").value + + +def test_issue_83(clear_database): + """https://gitlab.com/linkahead/linkahead-crawler/-/issues/83. Test that + names don't need to be unique for referenced entities if they are not part + of the identifiable. + + """ + + # Very simple data model + identifying_prop = db.Property(name="IdentifyingProp", datatype=db.INTEGER).insert() + referenced_type = db.RecordType(name="ReferencedType").add_property( + name=identifying_prop.name, importance=db.OBLIGATORY).insert() + referencing_type = db.RecordType(name="ReferencingType").add_property( + name=referenced_type.name, datatype=db.LIST(referenced_type.name)).insert() + + # Define identifiables. ReferencingType by name, ReferencedType by + # IdentifyingProp and not by name. 
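+    # Expressed in the YAML format accepted by load_from_yaml_definition /
+    # load_from_yaml_object (used elsewhere in this changeset), the two
+    # registrations below would read roughly:
+    #
+    #     ReferencedType:
+    #     - IdentifyingProp
+    #     ReferencingType:
+    #     - name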
+ ident = CaosDBIdentifiableAdapter() + ident.register_identifiable(referenced_type.name, db.RecordType().add_parent( + name=referenced_type.name).add_property(name=identifying_prop.name)) + ident.register_identifiable(referencing_type.name, db.RecordType().add_parent( + name=referencing_type.name).add_property(name="name")) + + crawler = Crawler(identifiableAdapter=ident) + + ref_target1 = db.Record(name="RefTarget").add_parent( + name=referenced_type.name).add_property(name=identifying_prop.name, value=1) + ref_target2 = db.Record(name="RefTarget").add_parent( + name=referenced_type.name).add_property(name=identifying_prop.name, value=2) + + referencing1 = db.Record(name="Referencing1").add_parent( + name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1]) + referencing2 = db.Record(name="Referencing2").add_parent( + name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target2]) + referencing3 = db.Record(name="Referencing3").add_parent( + name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1, + ref_target2]) + + records = db.Container().extend( + [ref_target1, ref_target2, referencing1, referencing2, referencing3]) + + ins, ups = crawler.synchronize(crawled_data=records, unique_names=False) + assert len(ins) == len(records) + assert len(ups) == 0 + + retrieved_target1 = db.execute_query( + f"FIND {referenced_type.name} WITH {identifying_prop.name}=1", unique=True) + retrieved_target2 = db.execute_query( + f"FIND {referenced_type.name} WITH {identifying_prop.name}=2", unique=True) + assert retrieved_target2.name == retrieved_target1.name + assert retrieved_target1.name == ref_target1.name + assert retrieved_target1.id != retrieved_target2.id + + retrieved_referencing1 = db.execute_query( + f"FIND {referencing_type.name} WITH name={referencing1.name}", unique=True) + assert retrieved_referencing1.get_property(referenced_type.name) is not None + assert retrieved_referencing1.get_property(referenced_type.name).value == [ + retrieved_target1.id] + assert retrieved_referencing1.get_property(referenced_type.name).value != [ + retrieved_target2.id] + + retrieved_referencing2 = db.execute_query( + f"FIND {referencing_type.name} WITH name={referencing2.name}", unique=True) + assert retrieved_referencing2.get_property(referenced_type.name) is not None + assert retrieved_referencing2.get_property(referenced_type.name).value == [ + retrieved_target2.id] + assert retrieved_referencing2.get_property(referenced_type.name).value != [ + retrieved_target1.id] + + retrieved_referencing3 = db.execute_query( + f"FIND {referencing_type.name} WITH name={referencing3.name}", unique=True) + assert retrieved_referencing3.get_property(referenced_type.name) is not None + assert len(retrieved_referencing3.get_property(referenced_type.name).value) == 2 + assert retrieved_target1.id in retrieved_referencing3.get_property(referenced_type.name).value + assert retrieved_target2.id in retrieved_referencing3.get_property(referenced_type.name).value + + +def test_indiscale_113(clear_database): + """Somewhat mysterious failures to resolve references in + split_into_inserts_and_updates, see + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/113 + + """ + + # Create and insert minimal datamodel + datamodel_str = """ +Event: + recommended_properties: + Basis: + Campaign: +Basis: +Campaign: + recommended_properties: + Basis: +""" + model = parse_model_from_string(datamodel_str) + model.sync_data_model(noquestion=True) + + # 
Register identifiables, everything is identified by name + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("Event", db.RecordType().add_parent( + name="Event").add_property(name="name")) + ident.register_identifiable("Basis", db.RecordType().add_parent( + name="Basis").add_property(name="name")) + ident.register_identifiable("Campaign", db.RecordType().add_parent( + name="Campaign").add_property(name="name")) + + crawler = Crawler(identifiableAdapter=ident) + + # Add records: event references basis and campaign, campaign references + # basis. + basis = db.Record(name="Poseidon").add_parent(name="Basis") + campaign = db.Record(name="POS386").add_parent( + name="Campaign").add_property(name="Basis", value=basis) + event = db.Record(name="GeoB13952").add_parent(name="Event") + event.add_property(name="Basis", value=basis) + event.add_property(name="Campaign", value=campaign) + + # basis and campaign already exist in the db + db.Container().extend([basis, campaign]).insert() + # redefine to trigger resolving + basis = db.Record(name="Poseidon").add_parent(name="Basis") + campaign = db.Record(name="POS386").add_parent( + name="Campaign").add_property(name="Basis", value=basis) + recs = [event, basis, campaign] + + ins, ups = crawler.synchronize(crawled_data=recs, unique_names=False) + # There is only one event to be inserted + assert len(ins) == 1 + # Nothing to do for the existing ents + assert len(ups) == 0 + assert ins[0].name == event.name + + +def test_indiscale_87(clear_database): + """Handle long string queries gracefully. + + https://gitlab.com/linkahead/linkahead-crawler/-/issues/87 + """ + + prop = db.Property(name="str", datatype=db.TEXT).insert() + rt = db.RecordType(name="RT1").add_property(prop).insert() + strings = [ + "X123456789" * 26, + "X" * 260, + "X123456789" * 25 + "9876543210", + ] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).insert() + for string in strings + ] + idents = [ + Identifiable(record_type="RT1", properties={"str": string}) + for string in strings + ] + adapter = CaosDBIdentifiableAdapter() + for rec, ident in zip(recs, idents): + print(f"Testing: ...{rec.get_property('str').value[-10:]}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + # add another, harmless, property + prop2 = db.Property(name="someint", datatype=db.INTEGER).insert() + rt.add_property(prop2).update() + string = "Y123456789" * 26 + numbers = [23, 42] + recs = [ + db.Record().add_parent(rt).add_property(name="str", value=string).add_property( + name="someint", value=number).insert() + for number in numbers + ] + idents = [Identifiable(record_type="RT1", properties={"str": string})] + # Ambiguous result + with raises(RuntimeError, match=".*unambiguously.*"): + retrieved = adapter.retrieve_identified_record_for_identifiable(idents[0]) + + # Upgrade new property to be identifying + idents = [ + Identifiable(record_type="RT1", properties={"str": string, "someint": number}) + for number in numbers + ] + for rec, ident in zip(recs, idents): + print(f"Testing: someint={rec.get_property('someint').value}") + retrieved = adapter.retrieve_identified_record_for_identifiable(ident) + # print(rec) + # print(retrieved) + print(db.apiutils.compare_entities(rec, retrieved)) + assert db.apiutils.empty_diff(rec, retrieved) + print("---") + + +def test_issue_16(clear_database): + 
""" + This is another a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + + In addition to the two unit tests for recursive definition in `test_scanner.py` this system test + tests whether recursively defined records can be synchronized correctly using the crawler. + """ + recursive_yaml = """ +FirstConverter: + type: DictElement + records: + Experiment: + subtree: + Converter: + type: DictElement + records: + Block: + name: block 1 + Experiment: $Experiment + Experiment: + name: experiment 1 + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + # Nested DictElements that match the yaml structure in recursive_yaml: + data = {"data": { + }} + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + rt_exp = db.RecordType(name="Experiment").insert() + rt_block = db.RecordType(name="Block").insert() + + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object(yaml.safe_load(""" +Experiment: +- name +Block: +- name +""")) + + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=records) + + exp_res = db.execute_query("FIND Experiment") + assert len(exp_res) == 1 + exp_block = db.execute_query("FIND Block") + assert len(exp_block) == 1 + + assert exp_res[0].get_property("Block").value == exp_block[0].id + assert exp_block[0].get_property("Experiment").value == exp_res[0].id + + +def test_issue_14(clear_database): + """ + Issue title: Some parent updates are required before inserts + + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/14 + """ + + rt1 = db.RecordType(name="RT1") + rt2 = db.RecordType(name="RT2").insert() + rt1.add_property(rt2, importance=db.OBLIGATORY) + rt1.insert() + + r = db.Record() + r.add_parent(rt1) + with tempfile.NamedTemporaryFile() as tmpf: + f = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + f.insert() + + # We create a clean new file object here: + f2 = db.File(name="test_parent", path="parent_test/file.txt", file=tmpf.name) + + f2.add_parent(rt2) + r.add_property(name="RT2", value=f2) + + # Current state in the database: File without parents + f_test_base = db.File(name="test_parent").retrieve() + assert len(f_test_base.parents) == 0 + assert len(db.execute_query("FIND Record")) == 0 + + ident = CaosDBIdentifiableAdapter() + ident.register_identifiable("RT1", db.RecordType().add_parent( + name="RT1").add_property(name="RT2")) + crawler = Crawler(identifiableAdapter=ident) + crawler.synchronize(crawled_data=[f2, r]) + + f_test = db.File(name="test_parent").retrieve() + assert len(f_test.parents) == 1 + assert f_test.parents[0].name == "RT2" + records = db.execute_query("FIND Record") + assert len(records) == 1 + assert records[0].get_property("RT2").value == f_test.id diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 4158ed22278ef5c871a22d45885e58fbfa84ea3b..fbbf25643e1c1cf928aa9599c92d3d6e94a88974 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -24,19 +24,22 @@ """ an integration test module that runs a test against a (close to) real world example """ -from caosdb.utils.register_tests import clear_database, set_test_key import json +import logging import os +import pytest +import sys -import caosdb as db +import linkahead as db +from 
linkahead.cached import cache_clear +from linkahead.utils.register_tests import clear_database, set_test_key +from caosadvancedtools.loadFiles import loadpath +from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml from caoscrawler.crawl import Crawler, crawler_main from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.scanner import load_definition, scan_structure_elements, create_converter_registry from caoscrawler.structure_elements import Directory -import pytest -from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml - -import sys set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -52,6 +55,22 @@ def rfp(*pathcomponents): DATADIR = rfp("test_data", "extroot", "realworld_example") +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + +@pytest.fixture +def addfiles(): + loadpath(path='/opt/caosdb/mnt/extroot/', + include=None, + exclude=None, + prefix="", + dryrun=False, + forceAllowSymlinks=True, + ) + + @pytest.fixture def usemodel(): # First load dataspace data model @@ -70,37 +89,26 @@ def usemodel(): dataset_inherits.sync_data_model(noquestion=True) -@pytest.fixture -def clear_database(): - # TODO(fspreck): Remove once the corresponding advancedtools function can - # be used. - ents = db.execute_query("FIND ENTITY WITH ID>99") - if ents: - ents.delete() - - def create_identifiable_adapter(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(os.path.join(DATADIR, "identifiables.yml")) return ident -def test_dataset(clear_database, usemodel): - ident = create_identifiable_adapter() - crawler = Crawler(identifiableAdapter=ident) - crawler_definition = crawler.load_definition( - os.path.join(DATADIR, "dataset_cfoods.yml")) - # print(json.dumps(crawler_definition, indent=3)) - # Load and register converter packages: - converter_registry = crawler.load_converters(crawler_definition) - # print("DictIntegerElement" in converter_registry) - - records = crawler.start_crawling( - Directory("data", os.path.join(DATADIR, 'data')), - crawler_definition, - converter_registry +def test_dataset(clear_database, usemodel, addfiles, caplog): + caplog.set_level(logging.DEBUG, logger="caoscrawler") + identifiable_path = os.path.join(DATADIR, "identifiables.yml") + crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") + crawler_main( + crawled_directory_path=os.path.join(DATADIR, 'data'), + cfood_file_name=crawler_definition_path, + identifiables_definition_file=identifiable_path, + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, + remove_prefix=DATADIR, + # this test will fail without this prefix since the crawler would try to create new files + add_prefix="/extroot/realworld_example" ) - crawler.synchronize() dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND " "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'" @@ -119,21 +127,26 @@ def test_dataset(clear_database, usemodel): "start_datetime='2022-02-10T16:36:48+01:00'") == 1 assert db.execute_query(f"FIND Event WITH latitude=53", unique=True) + # test logging + assert "Executed inserts" in caplog.text + assert "Going to insert" in caplog.text + assert "Executed updates" in caplog.text + -def test_event_update(clear_database, usemodel): +def test_event_update(clear_database, usemodel, addfiles): identifiable_path = os.path.join(DATADIR, "identifiables.yml") 
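+    # How the two prefix arguments in these crawler_main() calls fit together
+    # (illustrative path, not taken from the test data): remove_prefix strips
+    # the local directory prefix from each crawled file path, and add_prefix
+    # then prepends the location under which the same files are visible to
+    # the server:
+    #
+    #     <DATADIR>/data/35/metadata.json
+    #       remove_prefix=DATADIR                    ->  /data/35/metadata.json
+    #       add_prefix="/extroot/realworld_example"  ->  /extroot/realworld_example/data/35/metadata.json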
crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") - # TODO(fspreck): Use crawler_main crawler_main( - os.path.join(DATADIR, 'data'), - crawler_definition_path, - identifiable_path, - True, - os.path.join(DATADIR, "provenance.yml"), - False, - "" + crawled_directory_path=os.path.join(DATADIR, 'data'), + cfood_file_name=crawler_definition_path, + identifiables_definition_file=identifiable_path, + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, + remove_prefix=DATADIR, + # this test will fail without this prefix since the crawler would try to create new files + add_prefix="/extroot/realworld_example" ) old_dataset_rec = db.execute_query( @@ -151,10 +164,11 @@ def test_event_update(clear_database, usemodel): ident.load_from_yaml_definition(identifiable_path) second_crawler = Crawler(identifiableAdapter=ident) - crawler_definition = second_crawler.load_definition( + second_crawler.generate_run_id() + crawler_definition = load_definition( crawler_definition_path) - converter_registry = second_crawler.load_converters(crawler_definition) - records = second_crawler.start_crawling( + converter_registry = create_converter_registry(crawler_definition) + records = scan_structure_elements( Directory("data", os.path.join(DATADIR, "data")), crawler_definition, converter_registry @@ -172,7 +186,7 @@ def test_event_update(clear_database, usemodel): "latitude").value = 0.0 rec.get_property("Event").value[0].get_property( "location").value = "Origin" - second_crawler.synchronize() + second_crawler.synchronize(crawled_data=records) # Dataset is still the same Record, but with an updated event new_dataset_rec = db.Record(id=old_dataset_rec.id).retrieve() diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index 91c523be90a4d0117a7cc54217cae0b911511957..05b0a543deb03eb524d40d6a386876812e6b54e2 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -22,15 +22,17 @@ # ** end header # +import logging import os import pytest from subprocess import run -import caosdb as db +import linkahead as db from caosadvancedtools.loadFiles import loadpath +from linkahead.cached import cache_clear from caosadvancedtools.models import parser as parser from caoscrawler.crawl import crawler_main -from caosdb.utils.register_tests import clear_database, set_test_key +from linkahead.utils.register_tests import clear_database, set_test_key set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2") @@ -38,9 +40,12 @@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "extroot", "use_case_simple_presentation") -def test_complete_crawler( - clear_database -): +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + +def test_complete_crawler(clear_database, caplog): # Setup the data model: model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml")) model.sync_data_model(noquestion=True, verbose=False) @@ -57,13 +62,27 @@ def test_complete_crawler( dryrun=False, forceAllowSymlinks=False) - crawler_main(DATADIR, - os.path.join(DATADIR, "cfood.yml"), - os.path.join(DATADIR, "identifiables.yml"), - True, - os.path.join(DATADIR, "provenance.yml"), - False, - "/use_case_simple_presentation") + # test that a bad value for "remove_prefix" leads to runtime error + caplog.set_level(logging.DEBUG, logger="caoscrawler.crawl") + assert 1 == crawler_main( + crawled_directory_path=os.path.join(DATADIR), + 
cfood_file_name=os.path.join(DATADIR, "cfood.yml"), + identifiables_definition_file=os.path.join(DATADIR, "identifiables.yml"), + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, + remove_prefix="sldkfjsldf", + ) + assert "path does not start with the prefix" in caplog.text + caplog.clear() + + crawler_main( + crawled_directory_path=os.path.join(DATADIR), + cfood_file_name=os.path.join(DATADIR, "cfood.yml"), + identifiables_definition_file=os.path.join(DATADIR, "identifiables.yml"), + provenance_file=os.path.join(DATADIR, "provenance.yml"), + dry_run=False, + remove_prefix=os.path.abspath(DATADIR), + ) res = db.execute_query("FIND Record Experiment") assert len(res) == 1 diff --git a/release.sh b/release.sh index 1af097f014de6cd9eb3d3e8ba5da34aea0fe1671..f6335ae20d0c29e760b508aac831a35460a59ef3 100755 --- a/release.sh +++ b/release.sh @@ -1,4 +1,4 @@ #!/bin/bash rm -rf dist/ build/ .eggs/ python setup.py sdist bdist_wheel -python -m twine upload -s dist/* +python -m twine upload dist/* diff --git a/setup.cfg b/setup.cfg index 433ea0abad33edc3998465809a5a97c0bc47d75a..f6f95d6de8bdea2d7620eb05d7654cb9600fabb5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,9 +1,9 @@ [metadata] name = caoscrawler -version = 0.2.1 +version = 0.11.1 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de -description = A new crawler for caosdb +description = A new crawler for LinkAhead long_description = file: README.md long_description_content_type = text/markdown # url @@ -17,16 +17,17 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.7 +python_requires = >=3.9 install_requires = - importlib-resources - caosdb > 0.10.0 - caosadvancedtools >= 0.6.0 - yaml-header-tools >= 0.2.1 - pyyaml + caosadvancedtools >= 0.7.0 + importlib-resources + linkahead >= 0.16.0 odfpy #make optional + packaging pandas - importlib_metadata;python_version<'3.8' + pyarrow # Will be required by Pandas >= 3.0. + pyyaml + yaml-header-tools >= 0.2.1 [options.packages.find] where = src @@ -38,4 +39,16 @@ per-file-ignores = __init__.py:F401 [options.entry_points] console_scripts = + linkahead-crawler = caoscrawler.crawl:main caosdb-crawler = caoscrawler.crawl:main + spss_to_datamodel = caoscrawler.converters.spss:spss_to_datamodel_main + csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main + +[options.extras_require] +h5-crawler = + h5py >= 3.8 + numpy +spss = + pandas[spss] +rocrate = + rocrate diff --git a/src/caoscrawler/__init__.py b/src/caoscrawler/__init__.py index 044d8f0bf53c4c80dab9b492919fa64ab321a60d..ba4844e15387cd13aa15db88521b2022fa52bfd6 100644 --- a/src/caoscrawler/__init__.py +++ b/src/caoscrawler/__init__.py @@ -1,2 +1,5 @@ +from . import converters, utils from .crawl import Crawler, SecurityMode -from .version import CfoodRequiredVersionError, version as __version__ +from .version import CfoodRequiredVersionError, get_caoscrawler_version + +__version__ = get_caoscrawler_version() diff --git a/src/caoscrawler/authorize.py b/src/caoscrawler/authorize.py index 6f1011b227881d4b73186996076abe20d94d52e5..f3deed4f8c78afa85fdd4471fe9383760b8c8b12 100644 --- a/src/caoscrawler/authorize.py +++ b/src/caoscrawler/authorize.py @@ -19,10 +19,10 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. 
 #
-from caosadvancedtools.crawler import Crawler as OldCrawler
-
 import argparse
 
+from caosadvancedtools.crawler import Crawler as OldCrawler
+
 
 def parse_args():
     parser = argparse.ArgumentParser()
diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml
index 5e724c83695e098ce980e1aa8e81c65ae8525e19..d2e4cea24f0f2803499116420091b36e95b2c781 100644
--- a/src/caoscrawler/cfood-schema.yml
+++ b/src/caoscrawler/cfood-schema.yml
@@ -1,9 +1,44 @@
 cfood:
   type: object
+  properties:
+    Converters:
+      description: Definition of custom converters
+      type: object
+      additionalProperties:
+        type: object
+        properties:
+          converter:
+            type: string
+          package:
+            type: string
+        required:
+          - converter
+          - package
+    macros:
+      description: Macro definitions
+      type: array
+    Transformers:
+      description: Variable transformer definition
+      type: object
+      additionalProperties:
+        type: object
+        properties:
+          function:
+            type: string
+          package:
+            type: string
+        required:
+          - package
+          - function
   additionalProperties:
     $ref:
       "#/$defs/converter"
 $defs:
+  parents:
+    description: Parents for this record are given here as a list of names.
+    type: array
+    items:
+      type: string
   converter:
     properties:
       type:
@@ -27,9 +62,22 @@
         - BooleanElement
         - Definitions
         - Dict
+        - Date
+        - Datetime
         - JSONFile
+        - YAMLFile
         - CSVTableConverter
         - XLSXTableConverter
+        - SPSSFile
+        - H5File
+        - H5Dataset
+        - H5Group
+        - H5Ndarray
+        - XMLFile
+        - XMLTag
+        - XMLTextNode
+        - XMLAttributeNode
+        - PropertiesFromDictElement
         description: Type of this converter node.
       match:
         description: typically a regexp which is matched to a structure element name
@@ -40,15 +88,52 @@
       match_value:
         description: a regexp that is matched to the value of a key-value pair
         type: string
-      records:
-        description: This field is used to define new records or to modify records which have been defined on a higher level.
+      match_newer_than_file:
+        description: |
+          Only relevant for Directory. A path to a file containing
+          an ISO-formatted datetime. Only match if the contents of the
+          Directory have been modified after that datetime.
+        type: string
+      record_from_dict:
+        description: Only relevant for PropertiesFromDictElement. Specify the root record which is generated from the contained dictionary.
         type: object
+        required:
+          - variable_name
         properties:
-          parents:
-            description: Parents for this record are given here as a list of names.
+          variable_name:
+            description: |
+              Name of the record by which it can be accessed in the
+              cfood definition. Can also be the name of an existing
+              record in which case that record will be updated by
+              the PropertiesFromDictConverter.
+            type: string
+          properties_blacklist:
+            description: List of keys to be ignored in the automatic treatment. They will be ignored on all levels of the dictionary.
             type: array
             items:
               type: string
+          references:
+            description: List of keys that will be transformed into named reference properties.
+            type: object
+            additionalProperties:
+              type: object
+              properties:
+                parents:
+                  $ref:
+                    "#/$defs/parents"
+          name:
+            description: Name of this record. If none is given, variable_name is used.
+            type: string
+      parents:
+        $ref:
+          "#/$defs/parents"
+      records:
+        description: This field is used to define new records or to modify records which have been defined on a higher level.
+        type: object
+        properties:
+          parents:
+            $ref:
+              "#/$defs/parents"
         additionalProperties:
           oneOf:
             - type: object
@@ -56,6 +141,9 @@
               properties:
                 value:
                   description: Dictionary notation for variable values. 
Values can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. type: string + unit: + description: The unit of this property. Units can be given by a variable which is indicated by an initial "$". Use "$$" for setting values actually starting with a dollar sign. + type: string collection_mode: description: The collection mode defines whether the resulting property will be a single property or whether the values of multiple structure elements will be collected either into a list or a multiproperty. enum: @@ -70,3 +158,15 @@ cfood: additionalProperties: $ref: "#/$defs/converter" + if: + properties: + type: + const: + "PropertiesFromDictElement" + then: + required: + - type + - record_from_dict + else: + required: + - type diff --git a/src/caoscrawler/config.py b/src/caoscrawler/config.py new file mode 100644 index 0000000000000000000000000000000000000000..8a5a2c48e714f721855d05d6ae6df2412c27836e --- /dev/null +++ b/src/caoscrawler/config.py @@ -0,0 +1,34 @@ +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +import linkahead as db + +DEFAULTS = { + "send_crawler_notifications": False, + "create_crawler_status_records": False, + "public_host_url": "/", +} + + +def get_config_setting(setting): + caosdb_config = db.configuration.get_config() + if "caoscrawler" in caosdb_config and setting in caosdb_config["caoscrawler"]: + return caosdb_config["caoscrawler"][setting] + else: + return DEFAULTS[setting] diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py deleted file mode 100644 index d4e25f73a8a9e7dad42c50d907745dfb7329bb13..0000000000000000000000000000000000000000 --- a/src/caoscrawler/converters.py +++ /dev/null @@ -1,1103 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Henrik tom Wörden -# 2021 Alexander Schlemmer -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. 
-# -# ** end header -# - -from __future__ import annotations -from jsonschema import validate, ValidationError - -import os -import re -import datetime -import caosdb as db -import json -import warnings -from .utils import has_parent -from .stores import GeneralStore, RecordStore -from .structure_elements import (StructureElement, Directory, File, DictElement, JSONFile, - IntegerElement, BooleanElement, FloatElement, NoneElement, - TextElement, TextElement, ListElement) -from typing import List, Optional, Tuple, Union -from abc import ABCMeta, abstractmethod -from string import Template -import yaml_header_tools - -import pandas as pd - -import yaml - -# These are special properties which are (currently) treated differently -# by the converters: -SPECIAL_PROPERTIES = ("description", "name", "id", "path", - "file", "checksum", "size") - - -def _only_max(children_with_keys): - - return [max(children_with_keys, key=lambda x: x[1])[0]] - - -def _only_min(children_with_keys): - - return [min(children_with_keys, key=lambda x: x[1])[0]] - - -# names of functions that can be used to filter children -FILTER_FUNCTIONS = { - "only_max": _only_max, - "only_min": _only_min, -} - - -def str_to_bool(x): - if str(x).lower() == "true": - return True - elif str(x).lower() == "false": - return False - else: - raise RuntimeError("Should be 'true' or 'false'.") - -# TODO: Comment on types and inheritance -# Currently, we often check the type of StructureElements, because serveral converters assume that -# they are called only with the appropriate class. -# Raising an Error if the type is not sufficient (e.g. TextElement instead of DictElement) means -# that the generic parent class StructureElement is actually NOT a valid type for the argument and -# type hints should reflect this. -# However, we should not narrow down the type of the arguments compared to the function definitions -# in the parent Converter class. See -# - https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides -# - https://stackoverflow.com/questions/56860/what-is-an-example-of-the-liskov-substitution-principle -# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-the-ultimate-python-guide-8fabc0c24278 -# Thus, the problem lies in the following design: -# Converter instances are supposed to be used by the Crawler in a generic way (The crawler calls -# `match` and `typecheck` etc) but the functions are not supposed to be called with generic -# StructureElements. One direction out of this would be a refactoring that makes the crawler class -# expose a generic function like `treat_element`, which can be called with any StructureElement and -# the Converter decides what to do (e.g. do nothing if the type is one that it does not care -# about). -# https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/64 - - -class ConverterValidationError(Exception): - - """To be raised if contents of an element to be converted are invalid.""" - - def __init__(self, msg): - self.message = msg - - -def replace_variables(propvalue, values: GeneralStore): - """ - This function replaces variables in property values (and possibly other locations, - where the crawler can replace cfood-internal variables). - - This function checks whether the value that is to be replaced is of type db.Entity. - In this case the entity is returned (note that this is of course only possible, if the - occurrence of the variable is directly at the beginning of the value and e.g. no string - concatenation is attempted. 
- - In any other case the variable substitution is carried out and a new string with the - replaced variables is returned. - """ - # Check if the replacement is a single variable containing a record: - match = re.match(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$", propvalue) - if match is not None: - varname = match.group("varname") - if varname in values: - if values[varname] is None: - return None - if isinstance(values[varname], db.Entity): - return values[varname] - - propvalue_template = Template(propvalue) - return propvalue_template.safe_substitute(**values.get_storage()) - - -def handle_value(value: Union[dict, str, list], values: GeneralStore): - """ - determines whether the given value needs to set a property, be added to an existing value (create a list) or - add as an additional property (multiproperty). - - Variable names (starting with a "$") are replaced by the corresponding value stored in the - `values` GeneralStore. - - Parameters: - - value: if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. - if dict, must have keys "value" and "collection_mode". The returned tuple is directly - created from the corresponding values. - if list, each element is checked for replacement and the resulting list will be used - as (list) value for the property - Returns a tuple: - - the final value of the property; variable names contained in `values` are replaced. - - the collection mode (can be single, list or multiproperty) - """ - # @review Florian Spreckelsen 2022-05-13 - - if type(value) == dict: - if "value" not in value: - # TODO: how do we handle this case? Just ignore? - # or disallow? - raise NotImplementedError() - propvalue = value["value"] - # can be "single", "list" or "multiproperty" - collection_mode = value["collection_mode"] - elif type(value) == str: - propvalue = value - collection_mode = "single" - if propvalue.startswith("+"): - collection_mode = "list" - propvalue = propvalue[1:] - elif propvalue.startswith("*"): - collection_mode = "multiproperty" - propvalue = propvalue[1:] - elif type(value) == list: - # TODO: (for review) - # This is a bit dirty right now and needed for - # being able to directly set list values. Semantics is, however, a bit - # different from the two cases above. - collection_mode = "single" - propvalue = value - - # variables replacement: - propvalue = list() - for element in value: - # Do the element-wise replacement only, when its type is string: - if type(element) == str: - propvalue.append(replace_variables(element, values)) - else: - propvalue.append(element) - - return (propvalue, collection_mode) - else: - # value is another simple type - collection_mode = "single" - propvalue = value - # Return it immediately, otherwise variable substitution would be done and fail: - return (propvalue, collection_mode) - - propvalue = replace_variables(propvalue, values) - return (propvalue, collection_mode) - - -def create_records(values: GeneralStore, records: RecordStore, def_records: dict): - # list of keys to identify, which variables have been set by which paths: - # the items are tuples: - # 0: record name - # 1: property name - keys_modified = [] - - for name, record in def_records.items(): - role = "Record" - # This allows us to create e.g. 
Files - if "role" in record: - role = record["role"] - - # whether the record already exists in the store or not are actually really - # different distinct cases for treating the setting and updating of variables: - if name not in records: - if role == "Record": - c_record = db.Record() - elif role == "File": - c_record = db.File() - else: - raise RuntimeError("Role {} not supported.".format(role)) - # add the new record to the record store: - records[name] = c_record - # additionally add the new record to the general store: - values[name] = c_record - - # add the "fallback" parent only for Records, not for Files: - if (role == "Record" and "parents" not in record): - c_record.add_parent(name) - - c_record = records[name] - - for key, value in record.items(): - if key == "parents" or key == "role": - continue - - # Allow replacing variables in keys / names of properties: - key_template = Template(key) - key = key_template.safe_substitute(**values.get_storage()) - - keys_modified.append((name, key)) - propvalue, collection_mode = handle_value(value, values) - - if key.lower() in SPECIAL_PROPERTIES: - # e.g. description, name, etc. - # list mode does not work for them - if key.lower() == "path" and not propvalue.startswith(os.path.sep): - propvalue = os.path.sep + propvalue - - # Convert relative to absolute paths: - propvalue = os.path.normpath(propvalue) - setattr(c_record, key.lower(), propvalue) - else: - - if c_record.get_property(key) is None: - - if collection_mode == "list": - c_record.add_property(name=key, value=[propvalue]) - elif (collection_mode == "multiproperty" or - collection_mode == "single"): - c_record.add_property(name=key, value=propvalue) - else: - if collection_mode == "list": - c_record.get_property(key).value.append(propvalue) - elif collection_mode == "multiproperty": - c_record.add_property(name=key, value=propvalue) - elif collection_mode == "single": - c_record.get_property(key).value = propvalue - - # no matter whether the record existed in the record store or not, - # parents will be added when they aren't present in the record yet: - if "parents" in record: - for parent in record["parents"]: - # Do the variables replacement: - var_replaced_parent = replace_variables(parent, values) - if not has_parent(c_record, var_replaced_parent): - c_record.add_parent(var_replaced_parent) - return keys_modified - - -class Converter(object, metaclass=ABCMeta): - """ - Converters treat StructureElements contained in the hierarchical sturcture. - """ - - def __init__(self, definition: dict, name: str, converter_registry: dict): - self.definition = definition - self.name = name - - # Used to store usage information for debugging: - self.metadata: dict[str, set[str]] = { - "usage": set() - } - - self.converters = [] - - if "subtree" in definition: - for converter_name in definition['subtree']: - converter_definition = definition["subtree"][converter_name] - self.converters.append(Converter.converter_factory( - converter_definition, converter_name, converter_registry)) - - @staticmethod - def converter_factory(definition: dict, name: str, converter_registry: dict): - """creates a Converter instance of the appropriate class. - - The `type` key in the `definition` defines the Converter class which is being used. 
- """ - - if "type" not in definition: - raise RuntimeError( - "Type is mandatory for converter entries in CFood definition.") - - if definition["type"] not in converter_registry: - raise RuntimeError("Unknown Type: {}".format(definition["type"])) - - if "class" not in converter_registry[definition["type"]]: - raise RuntimeError("Converter class not loaded correctly.") - - # instatiates an object of the required class, e.g. DirectoryConverter(definition, name) - converter = converter_registry[definition["type"]]["class"](definition, name, - converter_registry) - - return converter - - def create_values(self, - values: GeneralStore, - element: StructureElement): - """ - Extract information from the structure element and store them as values in the - general store. - - values: The GeneralStore to store values in. - element: The StructureElement to extract values from. - """ - m = self.match(element) - if m is None: - # this should never happen as the condition was checked before already - raise RuntimeError("Condition does not match.") - values.update(m) - - @abstractmethod - def create_children(self, values: GeneralStore, - element: StructureElement): - pass - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement): - - if "records" not in self.definition: - return [] - - return create_records(values, - records, - self.definition["records"]) - - def filter_children(self, children_with_strings: - List[Tuple[StructureElement, str]], expr: str, - group: str, rule: str): - """Filter children according to regexp `expr` and `rule`.""" - - if rule not in FILTER_FUNCTIONS: - raise RuntimeError( - f"{rule} is not a known filter rule. Only {list(FILTER_FUNCTIONS.keys())} are implemented." - ) - - to_be_filtered = [] - unmatched_children = [] - - for (child, name) in children_with_strings: - - m = re.match(expr, name) - if m is None: - unmatched_children.append(child) - else: - to_be_filtered.append((child, m.groupdict()[group])) - - filtered_children = FILTER_FUNCTIONS[rule](to_be_filtered) - - return filtered_children + unmatched_children - - @abstractmethod - def typecheck(self, element: StructureElement): - """ - Check whether the current structure element can be converted using - this converter. 
- """ - pass - - @staticmethod - def _debug_matching_template(name: str, regexp: list[str], matched: list[str], result: Optional[dict]): - """ Template for the debugging output for the match function """ - print("\n--------", name, "-----------") - for re, ma in zip(regexp, matched): - print("matching against:\n" + re) - print("matching:\n" + ma) - print("---------") - if result is None: - print("No match") - else: - print("Matched groups:") - print(result) - print("----------------------------------------") - - @staticmethod - def debug_matching(kind=None): - def debug_matching_decorator(func): - """ - decorator for the match function of Converters that implements debug for the match of - StructureElements - """ - - def inner(self, element: StructureElement): - mr = func(self, element) - if "debug_match" in self.definition and self.definition["debug_match"]: - if kind == "name" and "match" in self.definition: - self._debug_matching_template(name=self.__class__.__name__, - regexp=[self.definition["match"]], - matched=[element.name], - result=mr) - elif kind == "name_and_value": - self._debug_matching_template( - name=self.__class__.__name__, - regexp=[self.definition["match"] - if "match" in self.definition else "", - self.definition["match_name"] - if "match_name" in self.definition else "", - self.definition["match_value"] - if "match_value" in self.definition else ""], - matched=[element.name, element.name, str(element.value)], - result=mr) - else: - self._debug_matching_template(name=self.__class__.__name__, - regexp=self.definition["match"] - if "match" in self.definition else "", - matched=str(element), - result=mr) - return mr - return inner - return debug_matching_decorator - - @abstractmethod - def match(self, element: StructureElement) -> Optional[dict]: - """ - This method is used to implement detailed checks for matching compatibility - of the current structure element with this converter. - - The return value is a dictionary providing possible matched variables from the - structure elements information. - """ - pass - - -class DirectoryConverter(Converter): - def create_children(self, generalStore: GeneralStore, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, Directory): - raise RuntimeError( - "Directory converters can only create children from directories.") - - children = self.create_children_from_directory(element) - - if "filter" in self.definition: - - tuple_list = [(c, c.name) for c in children] - - return self.filter_children(tuple_list, **self.definition["filter"]) - - return children - - def typecheck(self, element: StructureElement): - return isinstance(element, Directory) - - # TODO basically all converters implement such a match function. Shouldn't this be the one - # of the parent class and subclasses can overwrite if needed? - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, Directory): - raise RuntimeError("Element must be a directory.") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - - @staticmethod - def create_children_from_directory(element: Directory): - """ - Creates a list of files (of type File) and directories (of type Directory) for a - given directory. No recursion. - - element: A directory (of type Directory) which will be traversed. 
- """ - children: List[StructureElement] = [] - - for name in sorted(os.listdir(element.path)): - path = os.path.join(element.path, name) - - if os.path.isdir(path): - children.append(Directory(name, path)) - elif os.path.isfile(path): - children.append(File(name, path)) - - return children - - -class SimpleFileConverter(Converter): - """ - Just a file, ignore the contents. - """ - - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - def create_children(self, generalStore: GeneralStore, element: StructureElement): - return list() - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("Element must be a file.") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - - -class FileConverter(SimpleFileConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use SimpleFileConverter.")) - super().__init__(*args, **kwargs) - - -class MarkdownFileConverter(Converter): - """ - reads the yaml header of markdown files (if a such a header exists). - """ - - def create_children(self, generalStore: GeneralStore, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("A markdown file is needed to create children.") - - header = yaml_header_tools.get_header_from_file( - element.path, clean=False) - children: List[StructureElement] = [] - - for name, entry in header.items(): - if type(entry) == list: - children.append(ListElement(name, entry)) - elif type(entry) == str: - children.append(TextElement(name, entry)) - else: - raise RuntimeError( - "Header entry {} has incompatible type.".format(name)) - return children - - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("Element must be a file.") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - try: - yaml_header_tools.get_header_from_file(element.path) - except yaml_header_tools.NoValidHeader: - # TODO(salexan): Raise a validation error instead of just not - # matching silently. 
- return None - return m.groupdict() - - -def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, - msg_prefix=""): - """converts basic Python objects to the corresponding StructureElements """ - if isinstance(element, list): - return ListElement(name, element) - elif isinstance(element, dict): - return DictElement(name, element) - elif isinstance(element, bool): - return BooleanElement(name, element) - elif isinstance(element, int): - return IntegerElement(name, element) - elif isinstance(element, float): - return FloatElement(name, element) - elif isinstance(element, str): - return TextElement(name, element) - elif element is None: - return NoneElement(name) - elif isinstance(element, datetime.date): - return TextElement(name, str(element)) - else: - raise NotImplementedError( - msg_prefix + f"The object that has an unexpected type: {type(element)}\n" - f"The object is:\n{str(element)}") - - -def validate_against_json_schema(instance, schema_resource: Union[dict, str]): - """validates given ``instance`` against given ``schema_resource``. - - Args: - instance: instance to be validated, typically ``dict`` but can be ``list``, ``str``, etc. - schema_resource: Either a path to the JSON file containing the schema or a ``dict`` with - the schema - """ - if isinstance(schema_resource, dict): - schema = schema_resource - elif isinstance(schema_resource, str): - with open(schema_resource, 'r') as json_file: - schema = json.load(json_file) - else: - raise ValueError("The value of 'validate' has to be a string describing the path " - "to the json schema file (relative to the cfood yml) " - "or a dict containing the schema.") - # validate instance (e.g. JSON content) against schema - try: - validate(instance=instance, schema=schema) - except ValidationError as err: - raise ConverterValidationError( - f"\nCouldn't validate {instance}:\n{err.message}") - - -class DictElementConverter(Converter): - def create_children(self, generalStore: GeneralStore, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, DictElement): - raise ValueError("create_children was called with wrong type of StructureElement") - - try: - return self._create_children_from_dict(element.value) - except ConverterValidationError as err: - path = generalStore[self.name] - raise ConverterValidationError( - "Error during the validation of the dictionary located at the following node " - "in the data structure:\n" - f"{path}\n" + err.message) - - def _create_children_from_dict(self, data): - if "validate" in self.definition and self.definition["validate"]: - validate_against_json_schema(data, self.definition["validate"]) - - children = [] - - for name, value in data.items(): - children.append(convert_basic_element( - value, name, f"The value in the dict for key:{name} has an unknown type.")) - - return children - - def typecheck(self, element: StructureElement): - return isinstance(element, DictElement) - - @Converter.debug_matching("name_and_value") - def match(self, element: StructureElement): - """ - Allways matches if the element has the right type. - """ - # TODO: See comment on types and inheritance - if not isinstance(element, DictElement): - raise RuntimeError("Element must be a DictElement.") - return match_name_and_value(self.definition, element.name, element.value) - - -class DictConverter(DictElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. 
Please use DictConverter.")) - super().__init__(*args, **kwargs) - - -class DictDictElementConverter(DictElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use DictElementConverter.")) - super().__init__(*args, **kwargs) - - -class JSONFileConverter(Converter): - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not self.typecheck(element): - raise RuntimeError("Element must be a file") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - - def create_children(self, generalStore: GeneralStore, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise ValueError("create_children was called with wrong type of StructureElement") - with open(element.path, 'r') as json_file: - json_data = json.load(json_file) - if "validate" in self.definition and self.definition["validate"]: - try: - validate_against_json_schema(json_data, self.definition["validate"]) - except ConverterValidationError as err: - raise ConverterValidationError( - "Error during the validation of the JSON file:\n" - f"{element.path}\n" + err.message) - structure_element = convert_basic_element( - json_data, - name=element.name+"_child_dict", - msg_prefix="The JSON File contained content that was parsed to a Python object" - " with an unexpected type.") - return [structure_element] - - -class YAMLFileConverter(Converter): - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not self.typecheck(element): - raise RuntimeError("Element must be a file") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - - def create_children(self, generalStore: GeneralStore, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise ValueError("create_children was called with wrong type of StructureElement") - with open(element.path, 'r') as yaml_file: - yaml_data = yaml.safe_load(yaml_file) - if "validate" in self.definition and self.definition["validate"]: - try: - validate_against_json_schema(yaml_data, self.definition["validate"]) - except ConverterValidationError as err: - raise ConverterValidationError( - "Error during the validation of the YAML file:\n" - f"{element.path}\n" + err.message) - structure_element = convert_basic_element( - yaml_data, - name=element.name+"_child_dict", - msg_prefix="The YAML File contained content that was parsed to a Python object" - " with an unexpected type.") - return [structure_element] - - -def match_name_and_value(definition, name, value): - """ - takes match definitions from the definition argument and applies regular expressiion to name - and possibly value - - one of the keys 'match_name' and "match' needs to be available in definition - 'match_value' is optional - - Returns None, if match_name or match lead to no match. 
Otherwise, returns a dictionary with the - matched groups, possibly including matches from using match_value - """ - if "match_name" in definition: - if "match" in definition: - raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.") - - m1 = re.match(definition["match_name"], name) - if m1 is None: - return None - else: - m1 = m1.groupdict() - elif "match" in definition: - m1 = re.match(definition["match"], name) - if m1 is None: - return None - else: - m1 = m1.groupdict() - else: - m1 = {} - - if "match_value" in definition: - m2 = re.match(definition["match_value"], str(value), re.DOTALL) - if m2 is None: - return None - else: - m2 = m2.groupdict() - else: - m2 = {} - - values = dict() - values.update(m1) - values.update(m2) - return values - - -class _AbstractScalarValueElementConverter(Converter): - """ - A base class for all converters that have a scalar value that can be matched using a regular - expression. - - values must have one of the following type: str, bool, int, float - """ - - default_matches = { - "accept_text": False, - "accept_bool": False, - "accept_int": False, - "accept_float": False, - } - - def create_children(self, generalStore: GeneralStore, element: StructureElement): - return [] - - def typecheck(self, element: StructureElement): - """ - returns whether the type of StructureElement is accepted by this converter instance. - """ - allowed_matches = self._merge_match_definition_with_default(self.default_matches, - self.definition) - return self._typecheck(element, allowed_matches) - - @Converter.debug_matching("name_and_value") - def match(self, element: StructureElement): - """ - Try to match the given structure element. - - If it does not match, return None. - - Else return a dictionary containing the variables from the matched regexp - as key value pairs. - """ - # TODO: See comment on types and inheritance - if (not isinstance(element, TextElement) - and not isinstance(element, BooleanElement) - and not isinstance(element, IntegerElement) - and not isinstance(element, FloatElement)): - raise ValueError("create_children was called with wrong type of StructureElement") - return match_name_and_value(self.definition, element.name, element.value) - - def _typecheck(self, element: StructureElement, allowed_matches: dict): - """ - returns whether the type of StructureElement is accepted. - - Parameters: - element: StructureElement, the element that is checked - allowed_matches: Dict, a dictionary that defines what types are allowed. It must have the - keys 'accept_text', 'accept_bool', 'accept_int', and 'accept_float'. 
- - returns: whether or not the converter allows the type of element - """ - if (bool(allowed_matches["accept_text"]) and isinstance(element, TextElement)): - return True - elif (bool(allowed_matches["accept_bool"]) and isinstance(element, BooleanElement)): - return True - elif (bool(allowed_matches["accept_int"]) and isinstance(element, IntegerElement)): - return True - elif (bool(allowed_matches["accept_float"]) and isinstance(element, FloatElement)): - return True - else: - return False - - def _merge_match_definition_with_default(self, default: dict, definition: dict): - """ - returns a dict with the same keys as default dict but with updated values from definition - where it has the same keys - """ - - result = {} - for key in default: - if key in definition: - result[key] = definition[key] - else: - result[key] = default[key] - return result - - -class BooleanElementConverter(_AbstractScalarValueElementConverter): - default_matches = { - "accept_text": False, - "accept_bool": True, - "accept_int": True, - "accept_float": False, - } - - -class DictBooleanElementConverter(BooleanElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use BooleanElementConverter.")) - super().__init__(*args, **kwargs) - - -class FloatElementConverter(_AbstractScalarValueElementConverter): - default_matches = { - "accept_text": False, - "accept_bool": False, - "accept_int": True, - "accept_float": True, - } - - -class DictFloatElementConverter(FloatElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use FloatElementConverter.")) - super().__init__(*args, **kwargs) - - -class TextElementConverter(_AbstractScalarValueElementConverter): - default_matches = { - "accept_text": True, - "accept_bool": True, - "accept_int": True, - "accept_float": True, - } - - def __init__(self, definition, *args, **kwargs): - if "match" in definition: - raise ValueError(""" -The 'match' key will in future be used to match a potential name of a TextElement. Please use -the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name. -""") - - super().__init__(definition, *args, **kwargs) - - -class DictTextElementConverter(TextElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use TextElementConverter.")) - super().__init__(*args, **kwargs) - - -class IntegerElementConverter(_AbstractScalarValueElementConverter): - default_matches = { - "accept_text": False, - "accept_bool": False, - "accept_int": True, - "accept_float": False, - } - - -class DictIntegerElementConverter(IntegerElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use IntegerElementConverter.")) - super().__init__(*args, **kwargs) - - -class ListElementConverter(Converter): - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, ListElement): - raise RuntimeError( - "This converter can only process DictListElements.") - children: list[StructureElement] = [] - for index, list_element in enumerate(element.value): - # TODO(fspreck): Refactor this and merge with DictXXXElements maybe? 
- if isinstance(list_element, str): - children.append(TextElement(str(index), list_element)) - elif isinstance(list_element, dict): - children.append(DictElement(str(index), list_element)) - elif isinstance(list_element, StructureElement): - children.append(list_element) - else: - raise NotImplementedError( - f"Unkown type {type(list_element)} in list element {list_element}.") - return children - - def typecheck(self, element: StructureElement): - return isinstance(element, ListElement) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, ListElement): - raise RuntimeError("Element must be a ListElement.") - m = re.match(self.definition["match_name"], element.name) - if m is None: - return None - if "match" in self.definition: - raise NotImplementedError( - "Match is not implemented for ListElement.") - return m.groupdict() - - -class DictListElementConverter(ListElementConverter): - def __init__(self, *args, **kwargs): - warnings.warn(DeprecationWarning( - "This class is depricated. Please use ListElementConverter.")) - super().__init__(*args, **kwargs) - - -class TableConverter(Converter): - """ - This converter reads tables in different formats line by line and - allows matching the corresponding rows. - - The subtree generated by the table converter consists of DictElements, each being - a row. The corresponding header elements will become the dictionary keys. - - The rows can be matched using a DictElementConverter. - """ - @abstractmethod - def get_options(self): - """ - This method needs to be overwritten by the specific table converter to provide - information about the possible options. - """ - pass - - def _get_options(self, possible_options): - option_dict = dict() - for opt_name, opt_conversion in possible_options: - if opt_name in self.definition: - el = self.definition[opt_name] - # The option can often either be a single value or a list of values. - # In the latter case each element of the list will be converted to the defined - # type. 
- if isinstance(el, list): - option_dict[opt_name] = [ - opt_conversion(el_el) for el_el in el] - else: - option_dict[opt_name] = opt_conversion(el) - return option_dict - - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("Element must be a File.") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - - -class XLSXTableConverter(TableConverter): - def get_options(self): - return self._get_options([ - ("sheet_name", str), - ("header", int), - ("names", str), - ("index_col", int), - ("usecols", int), - ("true_values", str), - ("false_values", str), - ("na_values", str), - ("skiprows", int), - ("nrows", int), - ("keep_default_na", str_to_bool), ] - ) - - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("Element must be a File.") - table = pd.read_excel(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements - - -class CSVTableConverter(TableConverter): - def get_options(self): - return self._get_options([ - ("sep", str), - ("delimiter", str), - ("header", int), - ("names", str), - ("index_col", int), - ("usecols", int), - ("true_values", str), - ("false_values", str), - ("na_values", str), - ("skiprows", int), - ("nrows", int), - ("keep_default_na", str_to_bool), ]) - - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("Element must be a File.") - table = pd.read_csv(element.path, **self.get_options()) - child_elements = list() - for index, row in table.iterrows(): - child_elements.append( - DictElement(str(index), row.to_dict())) - return child_elements diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..edb7b3633cea2657dc3b9638379a3e57c37c87e4 --- /dev/null +++ b/src/caoscrawler/converters/__init__.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Submodule containing all default and optional converters.""" + +from .. 
import utils +from .converters import * +from .xml_converter import * +from .zipfile_converter import ZipFileConverter + +try: + from .spss import SPSSConverter +except ImportError as err: + SPSSConverter: type = utils.MissingImport( + name="SPSSConverter", hint="Try installing with the `spss` extra option.", + err=err) + +try: + from .rocrate import (ELNFileConverter, ROCrateConverter, + ROCrateEntityConverter) +except ImportError as err: + ROCrateEntityConverter: type = utils.MissingImport( + name="ROCrateEntityConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ROCrateConverter: type = utils.MissingImport( + name="ROCrateConverter", hint="Try installing with the `rocrate` extra option.", + err=err) + ELNFileConverter: type = utils.MissingImport( + name="ELNFileConverter", hint="Try installing with the `rocrate` extra option.", + err=err) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py new file mode 100644 index 0000000000000000000000000000000000000000..e16b2c0fbaeeee419b0e3f235339dc18cd4da885 --- /dev/null +++ b/src/caoscrawler/converters/converters.py @@ -0,0 +1,1698 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden +# Copyright (C) 2021 Alexander Schlemmer +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them.""" + +from __future__ import annotations + +import datetime +import json +import logging +import os +import re +import warnings +from abc import ABCMeta, abstractmethod +from inspect import signature +from string import Template +from typing import Any, Callable, Optional, Union + +import linkahead as db +import pandas as pd +import yaml +import yaml_header_tools +from jsonschema import ValidationError, validate + +from ..stores import GeneralStore, RecordStore +from ..structure_elements import (BooleanElement, DictElement, Directory, File, + FloatElement, IntegerElement, JSONFile, + ListElement, NoneElement, StructureElement, + TextElement) +from ..utils import has_parent + +# These are special properties which are (currently) treated differently +# by the converters: +SPECIAL_PROPERTIES = ("description", "name", "id", "path", + "file", "checksum", "size") +ID_PATTERN = r"\D[.\w]*" +SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>" + ID_PATTERN + r")(\})?$") +logger = logging.getLogger(__name__) + + +class CrawlerTemplate(Template): + # This also adds a dot to the default pattern. 
+ # See: https://docs.python.org/3/library/string.html#template-strings
+ # The default flags are re.IGNORECASE
+ braceidpattern = ID_PATTERN
+
+
+def _only_max(children_with_keys):
+
+ return [max(children_with_keys, key=lambda x: x[1])[0]]
+
+
+def _only_min(children_with_keys):
+
+ return [min(children_with_keys, key=lambda x: x[1])[0]]
+
+
+# names of functions that can be used to filter children
+FILTER_FUNCTIONS = {
+ "only_max": _only_max,
+ "only_min": _only_min,
+}
+
+
+def str_to_bool(x):
+ if str(x).lower() == "true":
+ return True
+ elif str(x).lower() == "false":
+ return False
+ else:
+ raise RuntimeError("Should be 'true' or 'false'.")
+
+# TODO: Comment on types and inheritance
+# Currently, we often check the type of StructureElements, because several converters assume that
+# they are called only with the appropriate class.
+# Raising an error if the type is not sufficient (e.g. TextElement instead of DictElement) means
+# that the generic parent class StructureElement is actually NOT a valid type for the argument and
+# type hints should reflect this.
+# However, we should not narrow down the type of the arguments compared to the function definitions
+# in the parent Converter class. See
+# - https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides
+# - https://stackoverflow.com/questions/56860/what-is-an-example-of-the-liskov-substitution-principle
+# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-the-ultimate-python-guide-8fabc0c24278
+# Thus, the problem lies in the following design:
+# Converter instances are supposed to be used by the Crawler in a generic way (the crawler calls
+# `match` and `typecheck` etc.), but the functions are not supposed to be called with generic
+# StructureElements. One way out of this would be a refactoring that makes the crawler class
+# expose a generic function like `treat_element`, which can be called with any StructureElement and
+# the Converter decides what to do (e.g. do nothing if the type is one that it does not care
+# about).
+# https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/64
+
+
+class ConverterValidationError(Exception):
+
+ """To be raised if contents of an element to be converted are invalid."""
+
+ def __init__(self, msg):
+ self.message = msg
+
+
+def create_path_value(func):
+ """Decorator for create_values functions that adds a value containing the path.
+
+ Should be used for StructureElements that are associated with file system objects that have a
+ path, like File or Directory.
+
+ """
+
+ def inner(self, values: GeneralStore, element: StructureElement):
+ func(self, values=values, element=element)
+ values.update({self.name + ".path": element.path})
+ return inner
+
+
+def replace_variables(propvalue: Any, values: GeneralStore):
+ """
+ This function replaces variables in property values (and possibly other locations,
+ where the crawler can replace cfood-internal variables).
+
+ If ``propvalue`` is a single variable name preceded by a ``$`` (e.g. ``$var`` or ``${var}``),
+ then the corresponding value stored in ``values`` is returned.
+ In any other case the variable substitution is carried out as defined by string templates
+ and a new string with the replaced variables is returned.
+ """
+ # We only replace string variable names.
If it is not a string, the value stays unchanged.
+ if not isinstance(propvalue, str):
+ return propvalue
+
+ # Check if the replacement is a single variable containing a record:
+ match = SINGLE_VAR_RE.match(propvalue)
+ if match is not None:
+ varname = match.group("varname")
+ if varname in values:
+ return values[varname]
+
+ propvalue_template = CrawlerTemplate(propvalue)
+ return propvalue_template.safe_substitute(**values.get_storage())
+
+
+def handle_value(value: Union[dict, str, list], values: GeneralStore):
+ """Determine whether the given value needs to set a property,
+ be added to an existing value (create a list) or
+ be added as an additional property (multiproperty).
+
+ Variable names (starting with a "$") are replaced by the corresponding value stored in the
+ ``values`` GeneralStore.
+
+Parameters
+----------
+
+value: Union[dict, str, list]
+ - If *str*, the value to be interpreted. E.g. "4", "hello" or "$a"
+ etc. No unit is set and the collection mode is determined from the
+ first character:
+ - '+' corresponds to "list"
+ - '*' corresponds to "multiproperty"
+ - everything else is "single"
+ - If *dict*, it must have a ``value`` key and may have ``unit`` and
+ ``collection_mode`` keys. The returned tuple is directly created from
+ the corresponding values if they are given; ``unit`` defaults to
+ None and ``collection_mode`` is determined from ``value`` as
+ explained for the str case above, i.e.,
+ - if it starts with '+', collection mode is "list",
+ - in case of '*', collection mode is "multiproperty",
+ - and everything else is "single".
+ - If *list*, each element is checked for variable replacement and the
+ resulting list will be used as the (list) value of the property.
+
+Returns
+-------
+
+out: tuple
+ - the final value of the property; variable names contained in `values` are replaced.
+ - the final unit of the property; variable names contained in `values` are replaced.
+ - the collection mode (can be single, list or multiproperty)
+ """
+ # @review Florian Spreckelsen 2022-05-13
+
+ propunit = None
+ propvalue = None
+ collection_mode = None
+ if isinstance(value, dict):
+ if "value" not in value:
+ # TODO: how do we handle this case? Just ignore?
+ # or disallow?
+ raise NotImplementedError(f"This definition has no \"value\": {value}")
+ propvalue = value["value"]
+ if "unit" in value:
+ propunit = replace_variables(value["unit"], values)
+ # can be "single", "list" or "multiproperty"
+ if "collection_mode" in value:
+ collection_mode = value["collection_mode"]
+ else:
+ propvalue = value
+ if collection_mode is None:
+ if isinstance(propvalue, str):
+ # Determine collection mode from string value
+ collection_mode = "single"
+ if propvalue.startswith("+"):
+ collection_mode = "list"
+ propvalue = propvalue[1:]
+ elif propvalue.startswith("*"):
+ collection_mode = "multiproperty"
+ propvalue = propvalue[1:]
+ elif isinstance(propvalue, list):
+ # TODO: (for review)
+ # This is a bit dirty right now and needed for
+ # being able to directly set list values. The semantics are, however, a bit
+ # different from the two cases above.
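+ # For illustration (with a hypothetical variable store in which x = 5):
+ # a list value like ["a", "$x"] is treated element-wise below, so this
+ # branch returns (["a", 5], None, "single").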
+ collection_mode = "single"
+
+ # variables replacement:
+ returnvalue = list()
+ for element in propvalue:
+ # Do the element-wise replacement only, when its type is string:
+ if isinstance(element, str):
+ returnvalue.append(replace_variables(element, values))
+ else:
+ returnvalue.append(element)
+
+ return (returnvalue, propunit, collection_mode)
+ else:
+ # value is another simple type
+ collection_mode = "single"
+ # Return it immediately, otherwise variable substitution would be done and fail:
+ return (propvalue, propunit, collection_mode)
+
+ propvalue = replace_variables(propvalue, values)
+ return (propvalue, propunit, collection_mode)
+
+
+def create_records(values: GeneralStore,
+ records: RecordStore,
+ def_records: dict) -> list[tuple[str, str]]:
+ """
+ Create records in GeneralStore `values` and RecordStore `records` as given
+ by the definition in `def_records`.
+
+ This function will be called during scanning using the cfood definition.
+ It should also be used by CustomConverters to set records, so that automatic
+ substitution and other crawler features are applied.
+
+ Parameters
+ ----------
+ values: GeneralStore
+ This GeneralStore will be used to access variables that are needed during variable substitution
+ in setting the properties of records and files.
+ Furthermore, the records that are generated in this function will be stored in this GeneralStore
+ **in addition to** storing them in the RecordStore given as the second argument to this function.
+
+ records: RecordStore
+ The RecordStore where the generated records will be stored.
+
+ Returns
+ -------
+ : list[tuple[str, str]]
+ A list of tuples, each containing a record name (1st element) and a property name
+ (2nd element). This list will be used by the scanner for creating the debug tree.
+
+ """
+ keys_modified = []
+
+ for name, record in def_records.items():
+ # If only a name was given (like this:
+ # Experiment:
+ # ), set record to an empty dict / empty configuration
+ if record is None:
+ record = {}
+
+ role = "Record"
+ # This allows us to create e.g. Files
+ if "role" in record:
+ role = record["role"]
+
+ # Whether or not the record already exists in the store makes for two distinct
+ # cases when treating the setting and updating of variables:
+ if name not in records:
+ if role == "Record":
+ c_record = db.Record()
+ elif role == "File":
+ c_record = db.File()
+ else:
+ raise RuntimeError("Role {} not supported.".format(role))
+ # add the new record to the record store:
+ records[name] = c_record
+ # additionally add the new record to the general store:
+ values[name] = c_record
+
+ # add the "fallback" parent only for Records, not for Files:
+ if (role == "Record" and "parents" not in record):
+ c_record.add_parent(name)
+
+ if isinstance(record, str):
+ raise RuntimeError(
+ "dict expected, but found str: {}".format(record))
+
+ # We do a second run over def_records here. Having finished the first run
+ # for creating the records (in the variable and record stores) makes sure that
+ # records that are defined on this level can already be accessed during variable substitution
+ # in the properties that will be set in the next block.
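+ # For illustration, a (hypothetical) def_records dict resulting from a cfood
+ # definition might look like this:
+ # {"Experiment": {"parents": ["Experiment"], "date": "$date"}}
+ # The first run above created the empty "Experiment" record; the loop below
+ # sets its parents and its "date" property, with variable substitution applied.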
+ for name, record in def_records.items():
+ # See above:
+ if record is None:
+ record = {}
+
+ c_record = records[name]
+
+ # Set the properties:
+ for key, value in record.items():
+ if key == "parents" or key == "role":
+ continue
+
+ # Allow replacing variables in keys / names of properties:
+ key_template = CrawlerTemplate(key)
+ key = key_template.safe_substitute(**values.get_storage())
+
+ keys_modified.append((name, key))
+ propvalue, propunit, collection_mode = handle_value(value, values)
+
+ if key.lower() in SPECIAL_PROPERTIES:
+ # e.g. description, name, etc.
+ # list mode does not work for them
+ if key.lower() == "path" and not propvalue.startswith(os.path.sep):
+ propvalue = os.path.sep + propvalue
+
+ # Convert relative to absolute paths:
+ propvalue = os.path.normpath(propvalue)
+ setattr(c_record, key.lower(), propvalue)
+ else:
+ if c_record.get_property(key) is None:
+ if collection_mode == "list":
+ c_record.add_property(name=key, value=[propvalue], unit=propunit)
+ elif (collection_mode == "multiproperty" or
+ collection_mode == "single"):
+ c_record.add_property(name=key, value=propvalue, unit=propunit)
+ else:
+ if collection_mode == "list":
+ if (propunit and c_record.get_property(key).unit
+ and propunit != c_record.get_property(key).unit):
+ raise RuntimeError(
+ f"Property '{key}' has contradictory units: "
+ f"{propunit} and {c_record.get_property(key).unit}"
+ )
+ c_record.get_property(key).value.append(propvalue)
+ if propunit and not c_record.get_property(key).unit:
+ c_record.get_property(key).unit = propunit
+ elif collection_mode == "multiproperty":
+ c_record.add_property(name=key, value=propvalue, unit=propunit)
+ elif collection_mode == "single":
+ c_record.get_property(key).value = propvalue
+ if propunit:
+ c_record.get_property(key).unit = propunit
+
+ # No matter whether the record existed in the record store or not,
+ # parents will be added when they aren't present in the record yet:
+ if "parents" in record:
+ c_record.parents.clear()
+ for parent in record["parents"]:
+ # Do the variable replacement:
+ var_replaced_parent = replace_variables(parent, values)
+ if not has_parent(c_record, var_replaced_parent):
+ c_record.add_parent(var_replaced_parent)
+ return keys_modified
+
+
+class Converter(object, metaclass=ABCMeta):
+ """Converters treat StructureElements contained in the hierarchical structure.
+
+ This is the abstract super class for all Converters.
+ """
+
+ def __init__(self, definition: dict, name: str, converter_registry: dict):
+ """
+
+ Parameters
+ ----------
+ definition: dict
+ Please refer to ``src/doc/converters.rst`` to learn about the structure that the
+ definition dict must have.
+ converter_registry: dict
+ A dictionary that contains converter names as keys and dicts as values. Those value dicts
+ have the keys 'converter', 'package' and 'class'. 'converter' is the class name,
+ 'package' the module, and 'class' the converter class itself.
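+
+ For illustration, a single (hypothetical) registry entry might look like this::
+
+ {"Directory": {"converter": "DirectoryConverter",
+ "package": "caoscrawler.converters",
+ "class": DirectoryConverter}}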
+ """ + + self.definition = definition + self.name = name + + # Used to store usage information for debugging: + self.metadata: dict[str, set[str]] = { + "usage": set() + } + + self.converters = [] + if "transform" in self.definition: + if not isinstance(self.definition["transform"], dict): + raise RuntimeError("The value corresponding to the 'transform' key in the " + "converter definition must be a dict") + for transformer_key, transformer in self.definition["transform"].items(): + if "in" not in transformer: + raise RuntimeError("In-variable not defined!") + if "out" not in transformer: + raise RuntimeError("Out-variable not defined!") + if "functions" not in transformer: + raise RuntimeError("No functions given for transformer!") + if not isinstance(transformer["functions"], list): + raise RuntimeError("The value corresponding to the 'functions' key in the " + "transform section must be a list") + + if not isinstance(transformer["in"], str): + raise RuntimeError("You should provide the variable name as string") + + if "subtree" in definition: + for converter_name in definition['subtree']: + converter_definition = definition["subtree"][converter_name] + self.converters.append(Converter.converter_factory( + converter_definition, converter_name, converter_registry)) + + self.setup() + + def setup(self): + """ + Analogous to `cleanup`. Can be used to set up variables that are permanently + stored in this converter. + """ + pass + + @staticmethod + def converter_factory(definition: dict, name: str, converter_registry: dict): + """Create a Converter instance of the appropriate class. + + The `type` key in the `definition` defines the Converter class which is being used. + """ + + if definition is None: + raise RuntimeError("Definition of converter \"{}\" is " + "empty".format(name)) + + if "type" not in definition: + raise RuntimeError( + "Type is mandatory for converter entries in CFood definition.") + + if definition["type"] not in converter_registry: + raise RuntimeError("Unknown Type: {}".format(definition["type"])) + + if "class" not in converter_registry[definition["type"]]: + raise RuntimeError("Converter class not loaded correctly.") + + # instatiates an object of the required class, e.g. DirectoryConverter(definition, name) + converter = converter_registry[definition["type"]]["class"](definition, name, + converter_registry) + + return converter + + def create_values(self, values: GeneralStore, element: StructureElement): + """ + Extract information from the structure element and store them as values in the + general store. + + Parameters + ---------- + + values: GeneralStore + The GeneralStore to store values in. + + element: StructureElement + The StructureElement to extract values from. + """ + m = self.match(element) + if m is None: + # this should never happen as the condition was checked before already + raise RuntimeError("Condition does not match.") + values.update(m) + + def match_properties(self, properties: dict, vardict: dict, label: str = "match_properties"): + """This method can be used to generically match 'match_properties' from the cfood definition + with the behavior described as follows: + + 'match_properties' is a dictionary of key-regexps and value-regexp pairs. Each key matches + a property name and the corresponding value matches its property value. 
+ + What a property means in the context of the respective converter can be different, examples: + + * XMLTag: attributes of the node + * ROCrate: properties of the ROCrateEntity + * DictElement: properties of the dict + + label can be used to customize the name of the dictionary in the definition. + + This method is not called by default, but can be called from child classes. + + Typically it would be used like this from methods overwriting `match`:: + + if not self.match_properties(<properties>, vardict): + return None + + vardict will be updated in place when there are + matches. <properties> is a dictionary taken from the structure + element that contains the properties in the context of this + converter. + + + Parameters + ---------- + + properties: dict + The dictionary containing the properties to be matched. + + vardict: dict + This dictionary will be used to store the variables created during the matching. + + label: str + Default "match_properties". Can be used to change the name + of the property in the definition. E.g. the xml converter + uses "match_attrib" which makes more sense in the context + of xml trees. + + Returns + ------- + + : bool + Returns True when properties match and False + otherwise. The vardict dictionary is updated in place. + + """ + if label in self.definition: + # This matcher works analogously to the attributes matcher in the XMLConverter + for prop_def_key, prop_def_value in self.definition[label].items(): + match_counter = 0 + matched_m_prop = None + matched_m_prop_value = None + for prop_key, prop_value in properties.items(): + # print("{} = {}".format(prop_key, prop_value)) + # TODO: automatic conversion to str ok? + m_prop = re.match(prop_def_key, str(prop_key)) + if m_prop is not None: + match_counter += 1 + matched_m_prop = m_prop + # TODO: automatic conversion to str ok? + m_prop_value = re.match(prop_def_value, str(prop_value)) + if m_prop_value is None: + return False + matched_m_prop_value = m_prop_value + # TODO: How to deal with multiple matches? + # There are multiple options: + # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Require unique attribute-key and attribute-value matches: Very complex + # - Only allow one single attribute-key to match and run attribute-value match separately. + # Currently the latter option is implemented. + # TODO: The ROCrateEntityConverter implements a very similar behavior. + if match_counter == 0: + return False + elif match_counter > 1: + raise RuntimeError("Multiple properties match the same {} entry.".format(label)) + vardict.update(matched_m_prop.groupdict()) + vardict.update(matched_m_prop_value.groupdict()) + return True + + def apply_transformers(self, values: GeneralStore, transformer_functions: dict): + """ + Check if transformers are defined using the "transform" keyword. + Then apply the transformers to the variables defined in GeneralStore "values". + + Parameters + ---------- + + values: GeneralStore + The GeneralStore to store values in. + + transformer_functions: dict + A dictionary of registered functions that can be used within this transformer block. 
+ The keys of the dict are the function keys and the values are the callable
+ functions, of the form:
+
+ def func(in_value: Any, in_parameters: dict) -> Any:
+ pass
+ """
+
+ if "transform" not in self.definition:
+ return
+ for transformer_key, transformer in self.definition["transform"].items():
+ in_value = replace_variables(transformer["in"], values)
+ out_value = in_value
+
+ for tr_func_el in transformer["functions"]:
+ if not isinstance(tr_func_el, dict):
+ raise RuntimeError("Elements of the list of the functions key "
+ "must be dictionaries!")
+ if len(tr_func_el) != 1:
+ raise RuntimeError("List element dictionaries must have exactly"
+ " one element with the key being the name"
+ " of the function!")
+ tr_func_key = list(tr_func_el.keys())[0]
+
+ if tr_func_key not in transformer_functions:
+ raise RuntimeError("Unknown transformer function: {}".format(tr_func_key))
+
+ # Do variable replacement on function parameters:
+ if tr_func_el[tr_func_key] is not None:
+ # Create a copy of the function parameters:
+ tr_func_params = dict(tr_func_el[tr_func_key])
+ for key in tr_func_params:
+ tr_func_params[key] = replace_variables(tr_func_params[key], values)
+ else:
+ tr_func_params = None
+
+ # Retrieve the function from the dictionary:
+ tr_func = transformer_functions[tr_func_key]
+ # Call the function:
+ sig = signature(tr_func)
+ # tr_func_params may be None when no parameters were given:
+ if len(sig.parameters) == 1 and not tr_func_params:
+ out_value = tr_func(in_value)
+ else:
+ out_value = tr_func(in_value, tr_func_params)
+ # The next in_value is the current out_value:
+ in_value = out_value
+ # If everything succeeded, store the final value in the general store:
+ match = SINGLE_VAR_RE.match(transformer["out"])
+ if match is None:
+ raise RuntimeError("'out' of the transformer definition must specify a single"
+ f" variable name. It was {transformer['out']}")
+ values[match.group('varname')] = out_value
+
+ @abstractmethod
+ def create_children(self, values: GeneralStore, element: StructureElement):
+ pass
+
+ def create_records(self, values: GeneralStore, records: RecordStore,
+ element: StructureElement):
+ # TODO why is element passed but not used???
+ # ANSWER: because it might be used by overriding child classes.
+
+ if "records" not in self.definition:
+ return []
+
+ # TODO please rename due to conflict
+ return create_records(values,
+ records,
+ self.definition["records"])
+
+ def filter_children(self, children_with_strings:
+ list[tuple[StructureElement, str]], expr: str,
+ group: str, rule: str):
+ """Filter children according to regexp `expr` and `rule`."""
+
+ if rule not in FILTER_FUNCTIONS:
+ raise RuntimeError(
+ f"{rule} is not a known filter rule. Only "
+ f"{list(FILTER_FUNCTIONS.keys())} are implemented."
+ )
+
+ to_be_filtered = []
+ unmatched_children = []
+
+ for (child, name) in children_with_strings:
+
+ m = re.match(expr, name)
+ if m is None:
+ unmatched_children.append(child)
+ else:
+ to_be_filtered.append((child, m.groupdict()[group]))
+
+ filtered_children = FILTER_FUNCTIONS[rule](to_be_filtered)
+
+ return filtered_children + unmatched_children
+
+ @abstractmethod
+ def typecheck(self, element: StructureElement):
+ """
+ Check whether the current structure element can be converted using
+ this converter.
+ """ + pass + + @staticmethod + def _debug_matching_template(name: str, regexp: list[str], matched: list[str], + result: Optional[dict]): + """ Template for the debugging output for the match function """ + msg = "\n--------" + name + "-----------\n" + for exp, ma in zip(regexp, matched): + msg += "matching reg:\t" + exp + "\n" + msg += "matching val:\t" + ma + "\n" + msg += "---------\n" + if result is None: + msg += "No match\n" + else: + msg += "Matched groups:\n" + msg += str(result)+'\n' + msg += "----------------------------------------\n" + logger.debug(msg) + + @staticmethod + def debug_matching(kind=None): + def debug_matching_decorator(func): + """ + decorator for the match function of Converters that implements debug for the match of + StructureElements + """ + + def inner(self, element: StructureElement): + mr = func(self, element) + if "debug_match" in self.definition and self.definition["debug_match"]: + if kind == "name" and "match" in self.definition: + self._debug_matching_template(name=self.__class__.__name__, + regexp=[self.definition["match"]], + matched=[element.name], + result=mr) + elif kind == "name_and_value": + self._debug_matching_template( + name=self.__class__.__name__, + regexp=[self.definition["match"] + if "match" in self.definition else "", + self.definition["match_name"] + if "match_name" in self.definition else "", + self.definition["match_value"] + if "match_value" in self.definition else ""], + matched=[element.name, element.name, str(element.value)], + result=mr) + else: + self._debug_matching_template(name=self.__class__.__name__, + regexp=self.definition["match"] + if "match" in self.definition else "", + matched=str(element), + result=mr) + return mr + return inner + return debug_matching_decorator + + @abstractmethod + def match(self, element: StructureElement) -> Optional[dict]: + """ + This method is used to implement detailed checks for matching compatibility + of the current structure element with this converter. + + The return value is a dictionary providing possible matched variables from the + structure elements information. + """ + pass + + def cleanup(self): + """ + This function is called when the converter runs out of scope and can be used to + clean up objects that were needed in the converter or its children. + """ + pass + + +class DirectoryConverter(Converter): + """ + Converter that matches and handles structure elements of type directory. + + This is one typical starting point of a crawling procedure. + """ + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + # TODO: See comment on types and inheritance + if not isinstance(element, Directory): + raise RuntimeError( + "Directory converters can only create children from directories.") + + children = self.create_children_from_directory(element) + + if "filter" in self.definition: + + tuple_list = [(c, c.name) for c in children] + + return self.filter_children(tuple_list, **self.definition["filter"]) + + return children + + @create_path_value + def create_values(self, values: GeneralStore, element: StructureElement): + super().create_values(values=values, element=element) + + def typecheck(self, element: StructureElement): + return isinstance(element, Directory) + + # TODO basically all converters implement such a match function. Shouldn't this be the one + # of the parent class and subclasses can overwrite if needed? 
+ @Converter.debug_matching("name")
+ def match(self, element: StructureElement):
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, Directory):
+ raise RuntimeError("Element must be a directory.")
+ m = re.match(self.definition["match"], element.name)
+ if m is None:
+ return None
+ if "match_newer_than_file" in self.definition:
+ last_modified = self._get_most_recent_change_in_dir(element)
+ reference = self._get_reference_file_timestamp()
+ if last_modified < reference:
+ return None
+ return m.groupdict()
+
+ @staticmethod
+ def create_children_from_directory(element: Directory):
+ """
+ Creates a list of files (of type File) and directories (of type Directory) for a
+ given directory. No recursion.
+
+ element: A directory (of type Directory) which will be traversed.
+ """
+ children: list[StructureElement] = []
+
+ for name in sorted(os.listdir(element.path)):
+ path = os.path.join(element.path, name)
+
+ if os.path.isdir(path):
+ children.append(Directory(name, path))
+ elif os.path.isfile(path):
+ children.append(File(name, path))
+
+ return children
+
+ @staticmethod
+ def _get_most_recent_change_in_dir(element: Directory) -> datetime.datetime:
+ """Return the datetime of the most recent change of any file
+ or directory in the given Directory element.
+
+ """
+ most_recent = os.path.getmtime(element.path)
+
+ for root, _, files in os.walk(element.path):
+ mtimes = [os.path.getmtime(root)] + \
+ [os.path.getmtime(os.path.join(root, fname)) for fname in files]
+ if max(mtimes) > most_recent:
+ most_recent = max(mtimes)
+
+ return datetime.datetime.fromtimestamp(most_recent)
+
+ def _get_reference_file_timestamp(self) -> datetime.datetime:
+ """Return a time stamp read from a reference file if it
+ exists. Otherwise return datetime.datetime.min, i.e., the
+ earliest datetime known to datetime.
+
+ """
+
+ if "match_newer_than_file" not in self.definition:
+ logger.debug("No reference file specified.")
+ return datetime.datetime.min
+
+ elif not os.path.isfile(self.definition["match_newer_than_file"]):
+ logger.debug("Reference file doesn't exist.")
+ return datetime.datetime.min
+
+ with open(self.definition["match_newer_than_file"]) as ref_file:
+ stamp_str = ref_file.readline().strip()
+ try:
+ return datetime.datetime.fromisoformat(stamp_str)
+ except ValueError as e:
+ logger.error(
+ f"Reference file in {self.definition['match_newer_than_file']} "
+ "doesn't contain an ISO formatted datetime in its first line. "
+ "Match regardless of modification times."
+ )
+ raise e
+
+
+class SimpleFileConverter(Converter):
+ """Just a file, ignore the contents."""
+
+ def typecheck(self, element: StructureElement):
+ return isinstance(element, File)
+
+ def create_children(self, generalStore: GeneralStore, element: StructureElement):
+ return list()
+
+ @create_path_value
+ def create_values(self, values: GeneralStore, element: StructureElement):
+ super().create_values(values=values, element=element)
+
+ @Converter.debug_matching("name")
+ def match(self, element: StructureElement):
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, File):
+ raise RuntimeError("Element must be a file.")
+ m = re.match(self.definition["match"], element.name)
+ if m is None:
+ return None
+ return m.groupdict()
+
+
+class FileConverter(SimpleFileConverter):
+ def __init__(self, *args, **kwargs):
+ warnings.warn(DeprecationWarning(
+ "This class is deprecated.
Please use SimpleFileConverter."))
+ super().__init__(*args, **kwargs)
+
+
+class MarkdownFileConverter(SimpleFileConverter):
+ """Read the yaml header of markdown files (if such a header exists)."""
+
+ def create_children(self, generalStore: GeneralStore, element: StructureElement):
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, File):
+ raise RuntimeError("A markdown file is needed to create children.")
+
+ try:
+ header = yaml_header_tools.get_header_from_file(
+ element.path, clean=False)
+ except yaml_header_tools.NoValidHeader:
+ if generalStore is not None and self.name in generalStore:
+ path = generalStore[self.name]
+ else:
+ path = "<path not set>"
+ raise ConverterValidationError(
+ "Error during the validation (yaml header cannot be read) of the markdown file "
+ "located at the following node in the data structure:\n"
+ f"{path}")
+ except yaml_header_tools.ParseErrorsInHeader as err:
+ if generalStore is not None and self.name in generalStore:
+ path = generalStore[self.name]
+ else:
+ path = "<path not set>"
+ raise ConverterValidationError(
+ "Error during the validation (yaml header cannot be read) of the markdown file "
+ "located at the following node in the data structure:\n"
+ "{}\nError:\n{}".format(path, err))
+ children: list[StructureElement] = []
+
+ for name, entry in header.items():
+ if isinstance(entry, list):
+ children.append(ListElement(name, entry))
+ elif isinstance(entry, str):
+ children.append(TextElement(name, entry))
+ else:
+ if generalStore is not None and self.name in generalStore:
+ path = generalStore[self.name]
+ else:
+ path = "<path not set>"
+ raise RuntimeError(
+ "Header entry {} has incompatible type.\nFilename: {}".format(name, path))
+ return children
+
+
+def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None,
+ msg_prefix=""):
+ """Convert basic Python objects to the corresponding StructureElements"""
+ if isinstance(element, StructureElement):
+ return element
+ elif isinstance(element, list):
+ return ListElement(name, element)
+ elif isinstance(element, dict):
+ return DictElement(name, element)
+ elif isinstance(element, bool):
+ return BooleanElement(name, element)
+ elif isinstance(element, int):
+ return IntegerElement(name, element)
+ elif isinstance(element, float):
+ return FloatElement(name, element)
+ elif isinstance(element, str):
+ return TextElement(name, element)
+ elif element is None:
+ return NoneElement(name)
+ elif isinstance(element, datetime.date):
+ return TextElement(name, str(element))
+ else:
+ raise NotImplementedError(
+ msg_prefix + f"The object has an unexpected type: {type(element)}\n"
+ f"The object is:\n{str(element)}")
+
+
+def validate_against_json_schema(instance, schema_resource: Union[dict, str]):
+ """Validate given ``instance`` against given ``schema_resource``.
+
+Parameters
+----------
+
+instance:
+ Instance to be validated, typically ``dict`` but can be ``list``, ``str``, etc.
+
+schema_resource:
+ Either a path to the JSON file containing the schema or a ``dict`` with the schema.
+ """
+ if isinstance(schema_resource, dict):
+ schema = schema_resource
+ elif isinstance(schema_resource, str):
+ with open(schema_resource, 'r') as json_file:
+ schema = json.load(json_file)
+ else:
+ raise ValueError("The value of 'validate' has to be a string describing the path "
+ "to the json schema file (relative to the cfood yml) "
+ "or a dict containing the schema.")
+ # validate instance (e.g.
JSON content) against schema
+ try:
+ validate(instance=instance, schema=schema)
+ except ValidationError as err:
+ raise ConverterValidationError(
+ f"\nCouldn't validate {instance}:\n{err.message}")
+
+
+class DictElementConverter(Converter):
+ """
+**Operates on:** :py:class:`caoscrawler.structure_elements.DictElement`
+
+**Generates:** :py:class:`caoscrawler.structure_elements.StructureElement`
+ """
+
+ def create_children(self, generalStore: GeneralStore, element: StructureElement):
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, DictElement):
+ raise ValueError("create_children was called with wrong type of StructureElement")
+
+ try:
+ return self._create_children_from_dict(element.value)
+ except ConverterValidationError as err:
+ path = generalStore[self.name]
+ raise ConverterValidationError(
+ "Error during the validation of the dictionary located at the following node "
+ "in the data structure:\n"
+ f"{path}\n" + err.message)
+
+ def _create_children_from_dict(self, data):
+ if "validate" in self.definition and self.definition["validate"]:
+ validate_against_json_schema(data, self.definition["validate"])
+
+ children = []
+
+ for name, value in data.items():
+ children.append(convert_basic_element(
+ value, name, f"The value in the dict for key:{name} has an unknown type."))
+
+ return children
+
+ def typecheck(self, element: StructureElement):
+ return isinstance(element, DictElement)
+
+ @Converter.debug_matching("name_and_value")
+ def match(self, element: StructureElement):
+ """
+ Matches if the element has the right type and the name, value and (optional)
+ properties match the definition.
+ """
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, DictElement):
+ raise RuntimeError("Element must be a DictElement.")
+ vardict = match_name_and_value(self.definition, element.name, element.value)
+ if vardict is None:
+ # Name or value did not match.
+ return None
+
+ if not self.match_properties(element.value, vardict):
+ return None
+
+ return vardict
+
+
+class PropertiesFromDictConverter(DictElementConverter):
+ """Extend the :py:class:`DictElementConverter` by a heuristic to set
+ property values from the dictionary keys.
+
+ """
+
+ def __init__(self, definition: dict, name: str, converter_registry: dict,
+ referenced_record_callback: Optional[callable] = None):
+
+ super().__init__(definition, name, converter_registry)
+ self.referenced_record_callback = referenced_record_callback
+
+ def _recursively_create_records(self, subdict: dict, root_record: db.Record,
+ root_rec_name: str,
+ values: GeneralStore, records: RecordStore,
+ referenced_record_callback: callable,
+ keys_modified: list = []
+ ):
+ """Create a record from the given `subdict` and recursively create referenced records."""
+
+ blacklisted_keys = self.definition["record_from_dict"][
+ "properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else []
+ special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [
+ ]
+
+ for key, value in subdict.items():
+
+ if key in blacklisted_keys:
+ # We ignore this in the automated property generation
+ continue
+ if isinstance(value, list):
+ if not any([isinstance(val, dict) for val in value]):
+ # no dict in list, i.e., no references, so this is simple
+ root_record.add_property(name=key, value=value)
+ else:
+ if not all([isinstance(val, dict) for val in value]):
+ # if this is not an error (most probably it is), this
+ # needs to be handled manually for now.
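+ # (For illustration, a hypothetical value like
+ # ["scalar", {"name": "referenced"}] would end up here.)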
+ raise ValueError(
+ f"{key} in {subdict} contains a mixed list of references and scalars.")
+ ref_recs = []
+ for ii, ref_dict in enumerate(value):
+ ref_var_name = f"{root_rec_name}.{key}.{ii+1}"
+ ref_rec, keys_modified = self._create_ref_rec(
+ ref_var_name,
+ key,
+ ref_dict,
+ special_references,
+ records,
+ values,
+ keys_modified,
+ referenced_record_callback
+ )
+ ref_recs.append(ref_rec)
+ root_record.add_property(name=key, value=ref_recs)
+
+ elif isinstance(value, dict):
+ # Treat scalar reference
+ ref_var_name = f"{root_rec_name}.{key}"
+ ref_rec, keys_modified = self._create_ref_rec(
+ ref_var_name,
+ key,
+ value,
+ special_references,
+ records,
+ values,
+ keys_modified,
+ referenced_record_callback
+ )
+ root_record.add_property(key, ref_rec)
+ else:
+ # All that remains are scalar properties which may or
+ # may not be special attributes like name.
+ if key.lower() in SPECIAL_PROPERTIES:
+ setattr(root_record, key.lower(), value)
+ else:
+ root_record.add_property(name=key, value=value)
+ keys_modified.append((root_rec_name, key))
+
+ if referenced_record_callback:
+ root_record = referenced_record_callback(root_record, records, values)
+
+ return keys_modified
+
+ def _create_ref_rec(
+ self,
+ name: str,
+ key: str,
+ subdict: dict,
+ special_references: dict,
+ records: RecordStore,
+ values: GeneralStore,
+ keys_modified: list,
+ referenced_record_callback: callable
+ ):
+ """Create the referenced Record and forward the stores etc. to
+ ``_recursively_create_records``.
+
+ Parameters
+ ----------
+ name : str
+ Name of the referenced record to be created in the record store and value store.
+ key : str
+ Name of the key this record's definition had in the original dict.
+ subdict : dict
+ Subdict containing this record's definition from the original dict.
+ special_references : dict
+ Special treatment of referenced records from the converter definition.
+ records : RecordStore
+ RecordStore for entering new Records.
+ values : GeneralStore
+ Value store for entering new Records.
+ keys_modified : list
+ List for keeping track of changes.
+ referenced_record_callback : callable
+ Advanced treatment of referenced records as given in the
+ converter initialization.
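+
+ For illustration, ``special_references`` might look like this (hypothetical)::
+
+ {"sample": {"parents": ["Sample"]}}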
+ """ + ref_rec = db.Record() + if key in special_references: + for par in special_references[key]["parents"]: + ref_rec.add_parent(par) + else: + ref_rec.add_parent(key) + records[name] = ref_rec + values[name] = ref_rec + keys_modified = self._recursively_create_records( + subdict=subdict, + root_record=ref_rec, + root_rec_name=name, + values=values, + records=records, + referenced_record_callback=referenced_record_callback, + keys_modified=keys_modified + ) + return ref_rec, keys_modified + + def create_records(self, values: GeneralStore, records: RecordStore, + element: StructureElement): + + keys_modified = [] + + rfd = self.definition["record_from_dict"] + if rfd["variable_name"] not in records: + rec = db.Record() + if "name" in rfd: + rec.name = rfd["name"] + if "parents" in rfd: + for par in rfd["parents"]: + rec.add_parent(par) + else: + rec.add_parent(rfd["variable_name"]) + records[rfd["variable_name"]] = rec + values[rfd["variable_name"]] = rec + + else: + rec = records[rfd["variable_name"]] + + keys_modified = self._recursively_create_records( + subdict=element.value, + root_record=rec, + root_rec_name=rfd["variable_name"], + values=values, + records=records, + referenced_record_callback=self.referenced_record_callback, + keys_modified=keys_modified, + ) + + keys_modified.extend(super().create_records( + values=values, records=records, element=element)) + + return keys_modified + + +class DictConverter(DictElementConverter): + def __init__(self, *args, **kwargs): + warnings.warn(DeprecationWarning( + "This class is deprecated. Please use DictElementConverter.")) + super().__init__(*args, **kwargs) + + +class DictDictElementConverter(DictElementConverter): + def __init__(self, *args, **kwargs): + warnings.warn(DeprecationWarning( + "This class is deprecated. 
Please use DictElementConverter."))
+ super().__init__(*args, **kwargs)
+
+
+class JSONFileConverter(SimpleFileConverter):
+ def create_children(self, generalStore: GeneralStore, element: StructureElement):
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, File):
+ raise ValueError("create_children was called with wrong type of StructureElement")
+ with open(element.path, 'r') as json_file:
+ json_data = json.load(json_file)
+ if "validate" in self.definition and self.definition["validate"]:
+ try:
+ validate_against_json_schema(json_data, self.definition["validate"])
+ except ConverterValidationError as err:
+ raise ConverterValidationError(
+ "Error during the validation of the JSON file:\n"
+ f"{element.path}\n" + err.message)
+ structure_element = convert_basic_element(
+ json_data,
+ name=element.name + "_child_dict",
+ msg_prefix="The JSON File contained content that was parsed to a Python object"
+ " with an unexpected type.")
+ return [structure_element]
+
+
+class YAMLFileConverter(SimpleFileConverter):
+ def create_children(self, generalStore: GeneralStore, element: StructureElement):
+ # TODO: See comment on types and inheritance
+ if not isinstance(element, File):
+ raise ValueError("create_children was called with wrong type of StructureElement")
+ with open(element.path, 'r') as yaml_file:
+ yaml_data = yaml.safe_load(yaml_file)
+ if "validate" in self.definition and self.definition["validate"]:
+ try:
+ validate_against_json_schema(yaml_data, self.definition["validate"])
+ except ConverterValidationError as err:
+ raise ConverterValidationError(
+ "Error during the validation of the YAML file:\n"
+ f"{element.path}\n" + err.message)
+ structure_element = convert_basic_element(
+ yaml_data,
+ name=element.name + "_child_dict",
+ msg_prefix="The YAML File contained content that was parsed to a Python object"
+ " with an unexpected type.")
+ return [structure_element]
+
+
+def match_name_and_value(definition, name, value):
+ """Take match definitions from the definition argument and apply regular expressions to the
+ name and possibly the value.
+
+ At most one of the keys ``match_name`` and ``match`` may exist in ``definition``;
+ ``match_value`` is optional.
+
+Returns
+-------
+
+out:
+ None, if match_name or match leads to no match. Otherwise, returns a dictionary with the
+ matched groups, possibly including matches from using `definition["match_value"]`
+
+ """
+ if "match_name" in definition:
+ if "match" in definition:
+ raise RuntimeError("Do not supply both 'match_name' and 'match'.")
+
+ m1 = re.match(definition["match_name"], name)
+ if m1 is None:
+ return None
+ else:
+ m1 = m1.groupdict()
+ elif "match" in definition:
+ m1 = re.match(definition["match"], name)
+ if m1 is None:
+ return None
+ else:
+ m1 = m1.groupdict()
+ else:
+ m1 = {}
+
+ if "match_value" in definition:
+ # None values will be interpreted as empty strings for the
+ # matcher.
+ m_value = str(value) if (value is not None and not pd.isna(value)) else ""
+ m2 = re.match(definition["match_value"], m_value, re.DOTALL)
+ if m2 is None:
+ return None
+ else:
+ m2 = m2.groupdict()
+ else:
+ m2 = {}
+
+ values = dict()
+ values.update(m1)
+ values.update(m2)
+ return values
+
+
+class _AbstractScalarValueElementConverter(Converter):
+ """A base class for all converters that have a scalar value that can be matched using a regular
+ expression.
+
+    values must have one of the following types: str, bool, int, float
+
+    """
+
+    default_matches = {
+        "accept_text": False,
+        "accept_bool": False,
+        "accept_int": False,
+        "accept_float": False,
+    }
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        return []
+
+    def typecheck(self, element: StructureElement):
+        """
+        Return whether the type of StructureElement is accepted by this converter instance.
+        """
+        allowed_matches = self._merge_match_definition_with_default(self.default_matches,
+                                                                    self.definition)
+        return self._typecheck(element, allowed_matches)
+
+    @Converter.debug_matching("name_and_value")
+    def match(self, element: StructureElement):
+        """
+        Try to match the given structure element.
+
+        If it does not match, return None.
+
+        Else return a dictionary containing the variables from the matched regexp
+        as key value pairs.
+        """
+        # TODO: See comment on types and inheritance
+        if (not isinstance(element, TextElement)
+                and not isinstance(element, BooleanElement)
+                and not isinstance(element, IntegerElement)
+                and not isinstance(element, FloatElement)):
+            raise ValueError("match was called with wrong type of StructureElement")
+        return match_name_and_value(self.definition, element.name, element.value)
+
+    def _typecheck(self, element: StructureElement, allowed_matches: dict):
+        """Return whether the type of StructureElement is accepted.
+
+        Parameters:
+        element: StructureElement, the element that is checked
+        allowed_matches: Dict, a dictionary that defines which types are allowed. It must
+            have the keys 'accept_text', 'accept_bool', 'accept_int', and 'accept_float'.
+
+        Returns: whether or not the converter allows the type of element
+
+        """
+        if (bool(allowed_matches["accept_text"]) and isinstance(element, TextElement)):
+            return True
+        elif (bool(allowed_matches["accept_bool"]) and isinstance(element, BooleanElement)):
+            return True
+        elif (bool(allowed_matches["accept_int"]) and isinstance(element, IntegerElement)):
+            return True
+        elif (bool(allowed_matches["accept_float"]) and isinstance(element, FloatElement)):
+            return True
+        else:
+            return False
+
+    def _merge_match_definition_with_default(self, default: dict, definition: dict):
+        """
+        Return a dict with the same keys as the default dict, but with values updated from
+        the definition where it has the same keys.
+        """
+
+        result = {}
+        for key in default:
+            if key in definition:
+                result[key] = definition[key]
+            else:
+                result[key] = default[key]
+        return result
+
+
+class BooleanElementConverter(_AbstractScalarValueElementConverter):
+    default_matches = {
+        "accept_text": False,
+        "accept_bool": True,
+        "accept_int": True,
+        "accept_float": False,
+    }
+
+
+class DictBooleanElementConverter(BooleanElementConverter):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(DeprecationWarning(
+            "This class is deprecated. Please use BooleanElementConverter."))
+        super().__init__(*args, **kwargs)
+
+
+class FloatElementConverter(_AbstractScalarValueElementConverter):
+    default_matches = {
+        "accept_text": False,
+        "accept_bool": False,
+        "accept_int": True,
+        "accept_float": True,
+    }
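+
+
+# The ``accept_*`` flags can be overridden per converter in the cfood, because
+# ``typecheck`` merges ``self.definition`` over ``default_matches``. A minimal
+# sketch (illustrative names) that lets a FloatElement converter also accept
+# text values:
+#
+#   temperature:
+#     type: FloatElement
+#     accept_text: true
+#     match_name: ^temperature$
+#     match_value: (?P<temp>.*)
+
+
+class DictFloatElementConverter(FloatElementConverter):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(DeprecationWarning(
+            "This class is deprecated. 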
Please use FloatElementConverter."))
+        super().__init__(*args, **kwargs)
+
+
+class TextElementConverter(_AbstractScalarValueElementConverter):
+    default_matches = {
+        "accept_text": True,
+        "accept_bool": True,
+        "accept_int": True,
+        "accept_float": True,
+    }
+
+    def __init__(self, definition, *args, **kwargs):
+        if "match" in definition:
+            raise ValueError("""
+The 'match' key is used to match a potential name of a TextElement. Please use
+the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name.
+""")
+
+        super().__init__(definition, *args, **kwargs)
+
+
+class DictTextElementConverter(TextElementConverter):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(DeprecationWarning(
+            "This class is deprecated. Please use TextElementConverter."))
+        super().__init__(*args, **kwargs)
+
+
+class IntegerElementConverter(_AbstractScalarValueElementConverter):
+    default_matches = {
+        "accept_text": False,
+        "accept_bool": False,
+        "accept_int": True,
+        "accept_float": False,
+    }
+
+
+class DictIntegerElementConverter(IntegerElementConverter):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(DeprecationWarning(
+            "This class is deprecated. Please use IntegerElementConverter."))
+        super().__init__(*args, **kwargs)
+
+
+class ListElementConverter(Converter):
+    def create_children(self, generalStore: GeneralStore,
+                        element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, ListElement):
+            raise RuntimeError(
+                "This converter can only process ListElements.")
+        children: list[StructureElement] = []
+        for index, list_element in enumerate(element.value):
+            children.append(
+                convert_basic_element(
+                    list_element,
+                    name=f"{index}",
+                    msg_prefix=f"The value at index {index} in the list has an unknown type."
+                )
+            )
+        return children
+
+    def typecheck(self, element: StructureElement):
+        return isinstance(element, ListElement)
+
+    @Converter.debug_matching("name")
+    def match(self, element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, ListElement):
+            raise RuntimeError("Element must be a ListElement.")
+        m = re.match(self.definition["match_name"], element.name)
+        if m is None:
+            return None
+        if "match" in self.definition:
+            raise NotImplementedError(
+                "Match is not implemented for ListElement.")
+        return m.groupdict()
+
+
+class DictListElementConverter(ListElementConverter):
+    def __init__(self, *args, **kwargs):
+        warnings.warn(DeprecationWarning(
+            "This class is deprecated. Please use ListElementConverter."))
+        super().__init__(*args, **kwargs)
+
+
+class TableConverter(Converter):
+    """This converter reads tables in different formats line by line and
+    allows matching the corresponding rows.
+
+    The subtree generated by the table converter consists of DictElements, each being
+    a row. The corresponding header elements will become the dictionary keys.
+
+    The rows can be matched using a DictElementConverter.
+
+    """
+
+    def get_options(self) -> dict:
+        """Get specific options, e.g. from ``self.definition``.
+
+This method may be overwritten by the specific table converter to provide information about the
+possible options. Implementors may use ``TableConverter._get_options(...)`` to get (and convert)
+options from ``self.definition``.
+
+Returns
+-------
+out: dict
+    An options dict. 
+        """
+        return {}
+
+    def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict:
+        option_dict = dict()
+        for opt_name, opt_conversion in possible_options:
+            if opt_name in self.definition:
+                el = self.definition[opt_name]
+                # The option can often either be a single value or a list of values.
+                # In the latter case each element of the list will be converted to the defined
+                # type.
+                if isinstance(el, list):
+                    option_dict[opt_name] = [
+                        opt_conversion(el_el) for el_el in el]
+                else:
+                    option_dict[opt_name] = opt_conversion(el)
+        return option_dict
+
+    def typecheck(self, element: StructureElement):
+        return isinstance(element, File)
+
+    @Converter.debug_matching("name")
+    def match(self, element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, File):
+            raise RuntimeError("Element must be a File.")
+        m = re.match(self.definition["match"], element.name)
+        if m is None:
+            return None
+        return m.groupdict()
+
+    @staticmethod
+    def _children_from_dataframe(dataframe: pd.DataFrame):
+        child_elements = list()
+        for index, row in dataframe.iterrows():
+            child_elements.append(
+                DictElement(str(index), row.to_dict()))
+        return child_elements
+
+
+class XLSXTableConverter(TableConverter):
+    """
+**Operates on:** :py:class:`caoscrawler.structure_elements.File`
+
+**Generates:** :py:class:`caoscrawler.structure_elements.DictElement`
+    """
+
+    def get_options(self):
+        return self._get_options([
+            ("sheet_name", str),
+            ("header", int),
+            ("names", str),
+            ("index_col", int),
+            ("usecols", int),
+            ("true_values", str),
+            ("false_values", str),
+            ("na_values", str),
+            ("skiprows", int),
+            ("nrows", int),
+            ("keep_default_na", str_to_bool),
+        ])
+
+    def create_children(self, generalStore: GeneralStore,
+                        element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, File):
+            raise RuntimeError("Element must be a File.")
+        table = pd.read_excel(element.path, **self.get_options())
+        return self._children_from_dataframe(table)
+
+
+class CSVTableConverter(TableConverter):
+    def get_options(self):
+        return self._get_options([
+            ("sep", str),
+            ("delimiter", str),
+            ("header", int),
+            ("names", str),
+            ("index_col", int),
+            ("usecols", int),
+            ("true_values", str),
+            ("false_values", str),
+            ("na_values", str),
+            ("skiprows", int),
+            ("nrows", int),
+            ("keep_default_na", str_to_bool),
+        ])
+
+    def create_children(self, generalStore: GeneralStore,
+                        element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, File):
+            raise RuntimeError("Element must be a File.")
+        table = pd.read_csv(element.path, **self.get_options())
+        return self._children_from_dataframe(table)
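+
+
+# A minimal cfood sketch (illustrative names) showing how the options above are
+# passed through to ``pandas.read_csv``; each key is converted by
+# ``_get_options`` before being handed over:
+#
+#   measurements:
+#     type: CSVTableConverter
+#     match: measurements\.csv$
+#     sep: ";"
+#     header: 0
+#     skiprows: 1
+
+
+class DateElementConverter(TextElementConverter):
+    """Converts different text formats of dates to Python date objects.
+
+    The text to be parsed must be contained in the "date" group. The format string can be supplied
+    under "date_format" in the Converter definition. The library used is ``datetime``, so see its
+    documentation for information on how to create the format string. 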
+
+    """
+
+    # TODO make `date` parameter name configurable
+    def match(self, element: StructureElement):
+        matches = super().match(element)
+        if matches is not None and "date" in matches:
+            matches.update({"date": datetime.datetime.strptime(
+                matches["date"],
+                self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d"
+            ).date()})
+        return matches
+
+
+class DatetimeElementConverter(TextElementConverter):
+    """Convert text so that it is formatted in a way that LinkAhead can understand it.
+
+The text to be parsed must be in the ``val`` parameter. The format string can be supplied in the
+``datetime_format`` node. This class uses the ``datetime`` module, so ``datetime_format`` must
+follow this specification:
+https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
+
+    """
+
+    # TODO make `val` parameter name configurable
+    def match(self, element: StructureElement):
+        matches = super().match(element)
+        if matches is not None and "val" in matches:
+            fmt_default = "%Y-%m-%dT%H:%M:%S"
+            fmt = self.definition.get("datetime_format", fmt_default)
+            dt_str = datetime.datetime.strptime(matches["val"], fmt).strftime(fmt_default)
+            matches.update({"val": dt_str})
+        return matches
diff --git a/src/caoscrawler/converters/hdf5_converter.py b/src/caoscrawler/converters/hdf5_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..97dac53d053dbcb87c48f0cfb59d4f09770b9710
--- /dev/null
+++ b/src/caoscrawler/converters/hdf5_converter.py
@@ -0,0 +1,338 @@
+#
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+from typing import Optional
+
+try:
+    import h5py
+except ModuleNotFoundError:
+    raise ModuleNotFoundError(
+        "Couldn't find module h5py. Did you install the crawler package with "
+        "its optional `h5-crawler` dependency?"
+    )
+
+from typing import Union
+
+import linkahead as db
+import numpy as np
+
+from ..stores import GeneralStore, RecordStore
+from ..structure_elements import (DictElement, File, FloatElement,
+                                  IntegerElement, StructureElement)
+from .converters import (Converter, DictElementConverter, SimpleFileConverter,
+                         convert_basic_element, match_name_and_value)
+
+
+def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]):
+    """Convert hdf5 attributes to a list of either basic scalar structure elements or ndarrays.
+
+    Parameters
+    ----------
+    elt : Union[h5py.File, h5py.Group, h5py.Dataset]
+        The hdf5 element the attributes of which will be converted to structure
+        elements.
+
+    Returns
+    -------
+    converted : list[StructureElement]
+        A list of the attributes converted to StructureElements (either basic
+        scalar elements or ndarray). 
+ """ + + converted = [] + for name, value in elt.attrs.items(): + converted.append(convert_basic_element_with_nd_array( + value, name, f"The value of attribute {name} has an unknown type: {type(value)}.")) + + return converted + + +def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str): + """Convert a given HDF5 element to the corresponding StructureElement. + + Parameters + ---------- + elt : Union[h5py.Group, h5py.Dataset] + The hdf5 element to be converted. + name : str + The name of the StructureElement that the hdf5 element is converted to. + + Raises + ------ + ValueError + In case of anything that is not convertible to a HDF5 structure element. + + Returns + ------- + StructureElement + The converted StructureElement. + """ + + if isinstance(elt, h5py.Group): + + return H5GroupElement(name, elt) + + if isinstance(elt, h5py.Dataset): + + return H5DatasetElement(name, elt) + + raise ValueError("The given element must be either a HDF5 Group or Dataset object.") + + +def convert_basic_element_with_nd_array(value, name: Optional[str] = None, + internal_path: Optional[str] = None, msg_prefix: str = ""): + """Convert a given object either to an ndarray structure element or to a + basic scalar structure element. + + This function extends :func:`~caoscrawler.converters.convert_basic_element` + by a special treatment for certain numpy objects, most importantly + ndarrays. They are converted to a scalar in case of a size-1 array, to a + list in case of a 1-d array, and to a ``H5NdarrayElement`` in all other + cases. In addition, numpy integers and floats are also converted to + IntegerElements and FloatElements, respectively. + + Parameters + ---------- + value + The object to be converted. + name : str, optional + The name of the structure element ``value`` is being converted + to. Default is None. + internal_path : str, optional + The internal path of ``value`` within the HDF5 file. Default is None. + msg_prefix : str, optional + The prefix of the error message that will be raised. Default is ``""``. + + Returns + ------- + StructureElement + The StructureElement ``value`` was converted to. + + """ + + if isinstance(value, np.ndarray): + + if value.size == 1: + # this is a scalar stacked in a numpy array. We don't know its + # actual shape, so we reshape first, then use the actual value + # inside. + value = value.reshape((1,))[0] + + elif np.squeeze(value).ndim == 1: + # If the array is one-dimensional we can save it as a list + value = list(np.squeeze(value)) + + else: + # real multi-dimensional array + return H5NdarrayElement(name, value, internal_path) + + elif isinstance(value, np.int32) or isinstance(value, np.int64): + + return IntegerElement(name, value) + + elif isinstance(value, np.float64): + + return FloatElement(name, value) + + return convert_basic_element(value, name, msg_prefix) + + +class H5GroupElement(DictElement): + """StructureElement specific for HDF5 groups""" + + def __init__(self, name: str, value: h5py.Group): + super().__init__(name, value) + + +class H5DatasetElement(DictElement): + """StructureElement specific for HDF5 datasets.""" + + def __init__(self, name: str, value: h5py.Dataset): + super().__init__(name, value) + + +class H5NdarrayElement(DictElement): + """StructureElement specific for NDArrays within HDF5 files. + + Also store the internal path of the array within the HDF5 file in its + ``internal_path`` attribute. 
+
+    """
+
+    def __init__(self, name: str, value, internal_path: str):
+        super().__init__(name, value)
+        self.internal_path = internal_path
+
+
+class H5FileConverter(SimpleFileConverter):
+    """Converter for HDF5 files that creates children for the contained
+    attributes, groups, and datasets.
+
+    """
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from root-level file attributes and contained hdf5
+        elements.
+
+        """
+
+        if not isinstance(element, File):
+
+            raise ValueError("create_children should have been called with a File object.")
+
+        ff = h5py.File(element.path, 'r')
+
+        children = []
+
+        for name, value in ff.items():
+
+            children.append(convert_h5_element(value, name))
+
+        children.extend(convert_attributes(ff))
+
+        return children
+
+
+class H5GroupConverter(DictElementConverter):
+    """Converter for HDF5 groups that creates children from the group-level
+    attributes and the contained subgroups and datasets.
+
+    """
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5GroupElement)
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from group attributes and hdf5 elements contained in
+        this group.
+
+        """
+
+        if not isinstance(element.value, h5py.Group):
+
+            raise ValueError("create_children should have been called with a HDF5 Group object.")
+
+        children = []
+
+        for name, value in element.value.items():
+
+            children.append(convert_h5_element(value, name))
+
+        # `convert_attributes` returns a list of StructureElements, so extend
+        # (not append) here, analogously to H5FileConverter above.
+        children.extend(convert_attributes(element.value))
+
+        return children
+
+
+class H5DatasetConverter(DictElementConverter):
+    """Converter for HDF5 datasets that creates children from the dataset
+    attributes and the contained array data.
+
+    """
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5DatasetElement)
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Create children from the dataset attributes and append the array data
+        contained in this dataset.
+
+        """
+
+        if not isinstance(element.value, h5py.Dataset):
+
+            raise ValueError("create_children should have been called with a HDF5 Dataset object")
+
+        children = convert_attributes(element.value)
+
+        children.append(
+            H5NdarrayElement(
+                name=self.name+"_ndarray",
+                value=element.value,
+                internal_path=element.value.name
+            )
+        )
+        return children
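+
+
+# Since these converters ship with the optional `h5-crawler` dependency, they
+# are not part of the default converter registry. A minimal registration
+# sketch for a cfood (the registry names are illustrative; the
+# ``converter``/``package`` keys follow the registry format used by the
+# crawler):
+#
+#   Converters:
+#     H5File:
+#       converter: H5FileConverter
+#       package: caoscrawler.converters.hdf5_converter
+#     H5Group:
+#       converter: H5GroupConverter
+#       package: caoscrawler.converters.hdf5_converter
+#     H5Dataset:
+#       converter: H5DatasetConverter
+#       package: caoscrawler.converters.hdf5_converter
+#     H5Ndarray:
+#       converter: H5NdarrayConverter
+#       package: caoscrawler.converters.hdf5_converter
+
+
+class H5NdarrayConverter(Converter):
+    """Converter for ndarrays contained in HDF5 files. Creates the wrapper
+    record for this ndarray.
+
+    """
+
+    def __init__(self, definition: dict, name: str, converter_registry: dict):
+
+        # Check that a non-empty name for the record that will be created for
+        # the ndarray Record (within the cfood) is given
+        if not ("recordname" in definition and definition["recordname"]):
+
+            raise RuntimeError(f"Converter {name} lacks the `recordname` definition.")
+
+        super().__init__(definition, name, converter_registry)
+
+    def create_children(self, values: GeneralStore, element: StructureElement):
+        """The ndarray doesn't have any further children."""
+
+        return []
+
+    def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement):
+        """Create a wrapper record with name ``recordname``, type
+        ``array_recordtype_name`` (default ``H5Ndarray``) and the internal path
+        stored in a property with name ``internal_path_property_name`` (default
+        ``internal_hdf5_path``). 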
+
+        """
+
+        rname = self.definition["recordname"]
+        if "array_recordtype_name" in self.definition:
+            rtname = self.definition["array_recordtype_name"]
+        else:
+            rtname = "H5Ndarray"
+
+        if "internal_path_property_name" in self.definition:
+            propname = self.definition["internal_path_property_name"]
+        else:
+            propname = "internal_hdf5_path"
+
+        rec = db.Record().add_parent(rtname)
+        records[rname] = rec
+        values[rname] = rec
+
+        rec.add_property(name=propname, value=element.internal_path)
+        keys_modified = [(rname, propname)]
+
+        keys_modified.extend(super().create_records(values, records, element))
+
+        return keys_modified
+
+    def typecheck(self, element: StructureElement):
+
+        return isinstance(element, H5NdarrayElement)
+
+    @Converter.debug_matching("name")
+    def match(self, element: StructureElement):
+
+        if not isinstance(element, H5NdarrayElement):
+
+            raise RuntimeError("This converter can only be called with H5NdarrayElements.")
+
+        return match_name_and_value(self.definition, element.name, element.value)
diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dcad86589961f03f1e755ddbc0b60742cf4ed4a
--- /dev/null
+++ b/src/caoscrawler/converters/rocrate.py
@@ -0,0 +1,229 @@
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Alexander Schlemmer
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converters take structure elements and create Records and new structure elements from them.
+
+This converter converts ro-crate files which may also be .eln-files.
+
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import tempfile
+from typing import Optional
+from zipfile import ZipFile
+
+import rocrate
+from rocrate.rocrate import ROCrate
+
+from ..stores import GeneralStore
+from ..structure_elements import (Directory, File, ROCrateEntity,
+                                  StructureElement)
+from .converters import Converter, SimpleFileConverter, convert_basic_element
+
+
+class ROCrateConverter(SimpleFileConverter):
+
+    """Convert ro-crate files / directories.
+    """
+
+    def setup(self):
+        self._tempdir = None
+
+    def cleanup(self):
+        self._tempdir.cleanup()
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, File) or isinstance(element, Directory)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        m = re.match(self.definition["match"], element.name)
+        if m is None:
+            return None
+        return m.groupdict()
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        Loads an ROCrate from an RO-Crate file or directory.
+
+        Arguments:
+        ----------
+        element must be a File or Directory (structure element). 
+
+        Returns:
+        --------
+        A list of ROCrateEntity objects representing the contents of the RO-Crate, or None
+        in case of errors.
+        """
+
+        if isinstance(element, File):
+            self._tempdir = tempfile.TemporaryDirectory()
+            with ZipFile(element.path) as zipf:
+                zipf.extractall(self._tempdir.name)
+            crate_path = self._tempdir.name
+            crate = ROCrate(crate_path)
+            entity_ls = []
+            for ent in crate.get_entities():
+                entity_ls.append(ROCrateEntity(crate_path, ent))
+            return entity_ls
+        elif isinstance(element, Directory):
+            # This would be an unzipped .eln file
+            # As this is possible for rocrate files, I think it is reasonable
+            # to support it as well.
+            raise NotImplementedError()
+        else:
+            raise ValueError("create_children was called with wrong type of StructureElement")
+        return None
+
+
+class ELNFileConverter(ROCrateConverter):
+
+    """Convert .eln-files
+    See: https://github.com/TheELNConsortium/TheELNFileFormat
+
+    These files are basically RO-Crates with some minor differences:
+    - The ro-crate metadata file is not on top-level within the .eln-zip-container,
+      but in a top-level subdirectory.
+    """
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        Loads an ROCrate from an .eln-file or directory.
+
+        This involves unzipping the .eln-file to a temporary folder and creating an ROCrate object
+        from its contents.
+
+        Arguments:
+        ----------
+        element must be a File or Directory (structure element).
+
+        Returns:
+        --------
+        A list of ROCrateEntity objects representing the contents of the .eln-file, or None
+        in case of errors.
+        """
+
+        if isinstance(element, File):
+            self._tempdir = tempfile.TemporaryDirectory()
+            with ZipFile(element.path) as zipf:
+                zipf.extractall(self._tempdir.name)
+            cratep = os.listdir(self._tempdir.name)
+            if len(cratep) != 1:
+                raise RuntimeError(".eln file must contain exactly one folder")
+            crate_path = os.path.join(self._tempdir.name, cratep[0])
+            crate = ROCrate(crate_path)
+            entity_ls = []
+            for ent in crate.get_entities():
+                entity_ls.append(ROCrateEntity(crate_path, ent))
+            return entity_ls
+        elif isinstance(element, Directory):
+            # This would be an unzipped .eln file
+            # As this is possible for rocrate files, I think it is reasonable
+            # to support it as well.
+            raise NotImplementedError()
+        else:
+            raise ValueError("create_children was called with wrong type of StructureElement")
+        return None
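+
+
+# A minimal cfood sketch for drilling into an .eln file (illustrative names; it
+# assumes the converters are registered as ``ELNFile`` and ``ROCrateEntity``):
+#
+#   my_eln:
+#     type: ELNFile
+#     match: ^.*\.eln$
+#     subtree:
+#       dataset:
+#         type: ROCrateEntity
+#         match_entity_type: Dataset
+
+
+class ROCrateEntityConverter(Converter):
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, ROCrateEntity)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, ROCrateEntity):
+            raise TypeError("Element must be an instance of ROCrateEntity.")
+
+        # Store the results of all individual regexp matches:
+        vardict = {}
+
+        # TODO: I accidentally used "match_type" instead
+        # of "match_entity_type". This was completely
+        # unnoticed. So add it to schema and adapt tests. 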
+ + if "match_entity_type" in self.definition: + entity_type = element.entity.type + if isinstance(entity_type, list): + # TODO: this seems to be a bug in kadi4mat RO-Crates + # ./ has type ['Dataset'] + # instead of type 'Dataset' + entity_type = entity_type[0] + m_type = re.match(self.definition["match_entity_type"], entity_type) + if m_type is None: + return None + vardict.update(m_type.groupdict()) + + if not self.match_properties(element.entity.properties(), vardict): + return None + + return vardict + + def create_children(self, generalStore: GeneralStore, element: StructureElement): + + children = [] + + eprops = element.entity.properties() + + # Add the properties: + for name, value in eprops.items(): + if isinstance(value, dict): + # This is - according to the standard - only allowed, if it's flat, i.e. + # it contains a single element with key == "@id" and the id as value which + # is supposed to be dereferenced: + if not (len(value) == 1 and "@id" in value): + raise RuntimeError("The JSON-LD is not flat.") + dereferenced = element.entity.crate.dereference(value["@id"]) + if dereferenced is not None: + children.append( + ROCrateEntity(element.folder, dereferenced)) + else: + # This is just an external ID and will be added as simple DictElement + children.append(convert_basic_element(value, name)) + else: + children.append(convert_basic_element(value, name)) + + # Add the files: + if isinstance(element.entity, rocrate.model.file.File): + path, name = os.path.split(eprops["@id"]) + children.append(File(name, os.path.join(element.folder, path, name))) + + # Parts of this entity are added as child entities: + for sublist in ("hasPart", "variableMeasured"): + if sublist in eprops: + for p in eprops[sublist]: + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + p["@id"]))) + # TODO: See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/195 for discussion. + + return children diff --git a/src/caoscrawler/converters/spss.py b/src/caoscrawler/converters/spss.py new file mode 100644 index 0000000000000000000000000000000000000000..00742e91506245435ed0c590f68ea9ffce65717a --- /dev/null +++ b/src/caoscrawler/converters/spss.py @@ -0,0 +1,302 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converter for SAV files (stored by SPSS).""" + +from __future__ import annotations # Can be removed with 3.10. + +import argparse +from collections import OrderedDict +from typing import Any, Optional + +import numpy as np +import pandas as pd +import pyreadstat +import yaml + +from ..stores import GeneralStore +from ..structure_elements import File, StructureElement +from . 
import converters
+
+READSTAT_TYPES = {
+    "double": "DOUBLE",
+    "string": "TEXT",
+}
+ORIGINAL_TYPES = {
+    "EDATE8": "DATETIME",
+}
+
+
+class SPSSConverter(converters.TableConverter):
+    """Converter for SAV files (stored by SPSS)."""
+
+    def create_children(self, values: GeneralStore, element: StructureElement) -> list:
+        assert isinstance(element, File)
+        # The default dtype backend "numpy_nullable" does not handle dates well.
+        # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
+        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
+        dtypes = read_column_types(element.path)
+
+        # Fix datetime columns
+        for name, dtype in dtypes.items():
+            if dtype != "DATETIME":
+                continue
+            col = df.loc[:, name]
+            col.fillna(np.nan, inplace=True)
+            col.replace([np.nan], [None], inplace=True)
+
+        return self._children_from_dataframe(df)
+
+
+def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
+    """Read SAV file and return the column types.
+
+Optionally, take data from a previous reading.
+
+Parameters
+----------
+savfile : Optional[str]
+    The SAV file to read.
+
+meta : Optional
+    The meta data result from `pyreadstat.read_sav(...)`.
+
+Returns
+-------
+out : dict[str, str]
+    The column names and types.
+    """
+    if not meta:
+        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    elif savfile is not None:
+        raise ValueError("Only one of `savfile` and `meta` may be given.")
+    dtypes: dict[str, str] = {}
+    for name in meta.column_names:
+        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
+                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
+        dtypes[name] = datatype
+    return dtypes
+
+
+def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
+    """Parse the ``*.sav`` file and create a basic datamodel in ``yamlfile``.
+
+Parameters
+----------
+cfood: str
+    If given, also create a cfood skeleton.
+    """
+    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
+    dtypes = read_column_types(meta=meta)
+
+    cfood_str = """
+---
+metadata:
+  macros:
+  - !defmacro
+    # Simple column value -> property rule
+    name: ColumnValue
+    params:
+      name: null
+      belongsto: BaseElement
+      type: TextElement
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${belongsto}:
+            ${name}: $$val
+  - !defmacro
+    # column value -> reference property
+    name: ColumnValueReference
+    params:
+      name: null
+      reftype: null # RecordType of the reference
+      belongsto: BaseElement
+      type: TextElement # References are always text, right?
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${reftype}:
+            name: $$val
+          ${belongsto}:
+            ${name}: $$${reftype}
+  - !defmacro
+    # Same as "ColumnValue", but also give name of property. 
+    name: ColumnValuePropname
+    params:
+      name: null
+      propname: null
+      belongsto: BaseElement
+      type: TextElement
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${belongsto}:
+            ${propname}: $$val
+---
+directory: # corresponds to the directory given to the crawler
+  type: Directory
+  match: .* # we do not care how it is named here
+  subtree:
+    # This is the file
+    thisfile:
+      type: SPSSFile
+      match: ".*sav"
+      subtree:
+        entry:
+          type: Dict
+          match: .* # Name is irrelevant
+          records:
+            MyParent:
+          subtree: !macro
+"""
+
+    enums: dict[str, list[str]] = {}
+    properties = OrderedDict()
+
+    for name in meta.column_names:
+        prop = {
+            "datatype": dtypes[name],
+        }
+        desc = meta.column_names_to_labels.get(name)
+        if desc and desc != name:
+            prop["description"] = desc
+        # Handle categorical variables
+        if var_label := meta.variable_to_label.get(name):
+            vvl = meta.variable_value_labels[name]
+            # reproducible (and sensible) order
+            label_values = [vvl[key] for key in sorted(vvl.keys())]
+            if label_values not in enums.values():
+                enums[var_label] = label_values
+            else:
+                var_label = [key for key, value in enums.items() if value == label_values][0]
+            prop["datatype"] = var_label
+        properties[name] = prop
+
+    output = f"""# auto-generated data model from file "{savfile}".
+# To insert a datamodel into LinkAhead, run:
+#
+# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
+
+"""
+
+    # Actual datamodel
+    output += """
+#########
+# Enums #
+#########
+
+"""
+    for name, values in enums.items():
+        output += f"""{name}:
+  description:
+  # possible values: {values}\n"""
+
+    output += ("""
+###############
+# RecordTypes #
+###############
+
+DummyRT:
+  description: Note: Change name and enter description.
+  recommended_properties:
+    """
+               + "    ".join(yaml.dump(dict(properties),  # from OrderedDict to dict
+                                       allow_unicode=True,
+                                       sort_keys=False).splitlines(keepends=True)))
+
+    # Experimental: Enum creation
+    output += """
+###############
+# Enum values #
+###############
+"""
+    for name, values in enums.items():
+        output += f"\n# ### {name} ###\n"
+        for value in values:
+            output += f"""
+{value}:
+  role: Record
+  inherit_from_suggested:
+  - {name}
+"""
+
+    with open(yamlfile, encoding="utf-8", mode="w") as myfile:
+        myfile.write(output)
+
+    if cfood:
+        defs_col_value: list[str] = []
+        defs_col_value_ref: list[str] = []
+        prefix = " " * 14
+        for name, propdef in properties.items():
+            def_str = prefix + f"- name: {name}\n"
+            dtype = None
+            reftype = None
+            defs = defs_col_value
+            # Which type? 
+ if propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] in ("TEXT", "DATETIME"): + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-i', '--input', help="The *.sav file.", required=True) + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def spss_to_datamodel_main(): + """The main function of this script.""" + args = _parse_arguments() + spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood) + print(f"Written datamodel to: {args.outfile}") + if args.cfood: + print(f"Written cfood to: {args.cfood}") diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..60d7b49431fb011a06b7105a16471b0b3c7b2268 --- /dev/null +++ b/src/caoscrawler/converters/xml_converter.py @@ -0,0 +1,234 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Converters take structure elements and create Records and new structure elements from them.""" + +from __future__ import annotations + +import re +from typing import Optional + +import lxml.etree + +from ..stores import GeneralStore +from ..structure_elements import (File, StructureElement, XMLAttributeNode, + XMLTagElement, XMLTextNode) +from .converters import (Converter, ConverterValidationError, + SimpleFileConverter) + + +class XMLFileConverter(SimpleFileConverter): + + """Convert XML files. See + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145 + for the current suggestion for the specification. 
+
+    """
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        # TODO: See comment on types and inheritance
+        if not isinstance(element, File):
+            raise ValueError("create_children was called with wrong type of StructureElement")
+        with open(element.path, 'r') as xml_file:
+            xml = lxml.etree.parse(xml_file)
+        if "validate" in self.definition and self.definition["validate"]:
+            try:
+                raise NotImplementedError("XML validation not implemented yet.")
+            except ConverterValidationError as err:
+                raise ConverterValidationError(
+                    "Error during the validation of the XML file:\n"
+                    f"{element.path}\n" + err.message)
+
+        return [XMLTagElement(xml.getroot())]
+
+
+class XMLTagConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """Children that are generated by this function are the
+        result of the xpath query given in the yaml property
+        ``xpath``. Its default (when not given) is ``child::*``, so the
+        direct children of the current xml node. The xpath expression
+        must be designed in a way that it returns xml tags (and no
+        attributes or texts). That means that the axis ``attribute::``
+        and the function ``text()`` must not be used.
+
+        The following yaml properties can be used to generate other
+        types of nodes (text nodes and attribute nodes) as subtree
+        structure elements:
+
+        ::
+
+            # _*_ marks the default:
+            attribs_as_children: true # true / _false_
+            text_as_children: true # true / _false_
+            tags_as_children: true # _true_ / false
+
+        The default is to generate the tags matched by the xpath expression only.
+
+        - When text_as_children is set to true, text nodes will be generated that contain the text
+          contained in the matched tags.
+        - When attribs_as_children is set to true, attribute nodes will be generated from the attributes
+          of the matched tags.
+
+        Notes
+        -----
+        The default is to take the namespace map from the current node and use it in xpath queries.
+        Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace
+        using the key ``default_namespace``.
+        The key ``nsmap`` can be used to define additional nsmap entries.
+
+        """
+        if not isinstance(element, XMLTagElement):
+            raise TypeError("Element must be an instance of XMLTagElement.")
+
+        # Get the namespace map from the element:
+        nsmap = element.tag.nsmap
+        # The default name of the default namespace is "default". 
+        # You can overwrite it using the attribute "default_namespace" in the converter definition:
+        default_namespace = self.definition.get("default_namespace", "default")
+        if None in nsmap:
+            nsmap[default_namespace] = nsmap[None]
+            del nsmap[None]
+
+        # Set additional nsmap entries from the converter definition:
+        if "nsmap" in self.definition:
+            for key, value in self.definition["nsmap"].items():
+                nsmap[key] = value
+
+        xpath = self.definition.get("xpath", "child::*")
+        children = element.tag.xpath(xpath, namespaces=nsmap)
+        el_lst = []
+        for el in children:
+            if isinstance(el, str):
+                raise RuntimeError(
+                    "Only standard xml nodes are supported as results of xpath queries.")
+            elif isinstance(el, lxml.etree._Element):
+                if self.definition.get("tags_as_children", True):
+                    el_lst.append(XMLTagElement(el))
+                if self.definition.get("attribs_as_children", False):
+                    for attrib in el.attrib:
+                        el_lst.append(XMLAttributeNode(el, attrib))
+                if self.definition.get("text_as_children", False):
+                    el_lst.append(XMLTextNode(el))
+            else:
+                raise RuntimeError("Unsupported child type.")
+        return el_lst
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, XMLTagElement)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLTagElement):
+            raise TypeError("Element must be an instance of XMLTagElement.")
+
+        # Store the results of all individual regexp matches:
+        vardict = {}
+
+        if "match_tag" in self.definition:
+            m_tag = re.match(self.definition["match_tag"], element.tag.tag)
+            if m_tag is None:
+                return None
+            vardict.update(m_tag.groupdict())
+
+        if "match_text" in self.definition:
+            tagtext = element.tag.text
+            if element.tag.text is None:
+                tagtext = ""
+            m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL)
+            if m_text is None:
+                return None
+            vardict.update(m_text.groupdict())
+
+        if not self.match_properties(element.tag.attrib, vardict, "match_attrib"):
+            return None
+
+        return vardict
+
+
+class XMLTextNodeConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        This converter does not create children.
+        """
+        return []
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter.
+        """
+        return isinstance(element, XMLTextNode)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLTextNode):
+            raise TypeError("Element must be an instance of XMLTextNode.")
+
+        vardict = {}
+
+        m_text = re.match(self.definition["match_text"], element.value,
+                          re.DOTALL)
+        if m_text is None:
+            return None
+        vardict.update(m_text.groupdict())
+
+        return vardict
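+
+
+# A minimal cfood sketch (illustrative names; it assumes the converters are
+# registered as ``XMLFile``, ``XMLTag`` and ``XMLAttributeNode``) that matches
+# an ``id`` attribute of a ``sample`` tag:
+#
+#   my_xml:
+#     type: XMLFile
+#     match: ^.*\.xml$
+#     subtree:
+#       sample_tag:
+#         type: XMLTag
+#         xpath: "child::sample"
+#         attribs_as_children: true
+#         subtree:
+#           sample_id:
+#             type: XMLAttributeNode
+#             match_name: ^id$
+#             match_value: (?P<sample_id>.*)
+
+
+class XMLAttributeNodeConverter(Converter):
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        This converter does not create children.
+        """
+        return []
+
+    def typecheck(self, element: StructureElement):
+        """
+        Check whether the current structure element can be converted using
+        this converter. 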
+        """
+        return isinstance(element, XMLAttributeNode)
+
+    def match(self, element: StructureElement) -> Optional[dict]:
+        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
+        # for a suggestion for the design of the matching algorithm.
+        if not isinstance(element, XMLAttributeNode):
+            raise TypeError("Element must be an instance of XMLAttributeNode.")
+
+        vardict = {}
+
+        m_name = re.match(self.definition["match_name"], element.key)
+        if m_name is None:
+            return None
+        vardict.update(m_name.groupdict())
+
+        m_value = re.match(self.definition["match_value"], element.value)
+        if m_value is None:
+            return None
+        vardict.update(m_value.groupdict())
+
+        return vardict
diff --git a/src/caoscrawler/converters/zipfile_converter.py b/src/caoscrawler/converters/zipfile_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7073e66a266168e17eb9b6143e7dc6292b5149dc
--- /dev/null
+++ b/src/caoscrawler/converters/zipfile_converter.py
@@ -0,0 +1,82 @@
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Alexander Schlemmer
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Converters take structure elements and create Records and new structure elements from them.
+
+This converter opens zip files, unzips them into a temporary directory and
+exposes their contents as File structure elements.
+
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from os.path import isdir, join
+from zipfile import ZipFile
+
+from ..stores import GeneralStore
+from ..structure_elements import Directory, File, StructureElement
+from .converters import SimpleFileConverter
+
+
+class ZipFileConverter(SimpleFileConverter):
+
+    """Convert zipfiles.
+    """
+
+    def setup(self):
+        self._tempdir = None
+
+    def cleanup(self):
+        self._tempdir.cleanup()
+
+    def create_children(self, generalStore: GeneralStore, element: StructureElement):
+        """
+        Unzip the zip file into a temporary directory and expose its contents
+        as structure elements.
+
+        Arguments:
+        ----------
+        element must be a File (structure element).
+
+        Returns:
+        --------
+        A list of File and Directory structure elements representing the
+        top-level contents of the zip file.
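+
+        A minimal cfood sketch (it assumes this converter is registered as
+        ``ZipFile``; ``SimpleFile`` is part of the default converter
+        registry)::
+
+            my_zip:
+              type: ZipFile
+              match: .*zip$
+              subtree:
+                readme:
+                  type: SimpleFile
+                  match: ^README.md$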
+        """
+
+        if isinstance(element, File):
+            self._tempdir = tempfile.TemporaryDirectory()
+            unzd_path = self._tempdir.name
+            with ZipFile(element.path) as zipf:
+                zipf.extractall(unzd_path)
+
+            entity_ls = []
+            for el in os.listdir(unzd_path):
+                path = join(unzd_path, el)
+                if isdir(path):
+                    entity_ls.append(Directory(el, path))
+                else:
+                    entity_ls.append(File(el, path))
+
+            return entity_ls
+        else:
+            raise ValueError("create_children was called with wrong type of StructureElement")
+        return None
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index 6cf025a024e8cc392a7175421d47fb69059302a4..e0d243979faee8f44cdcee3b0e49c15af640c378 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python3
 # encoding: utf-8
 #
-# ** header v3.0
 # This file is a part of the CaosDB Project.
 #
-# Copyright (C) 2021 Henrik tom Wörden
-#               2021 Alexander Schlemmer
+# Copyright (C) 2021-2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2021-2023 Research Group Biomedical Physics, MPI-DS Göttingen
+# Copyright (C) 2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -25,46 +27,50 @@
 
 """
 Crawl a file structure using a yaml cfood definition and synchronize
-the acuired data with CaosDB.
+the acquired data with LinkAhead.
 """
 
 from __future__ import annotations
 
 import argparse
-import importlib
 import logging
 import os
 import sys
+import traceback
 import uuid
 import warnings
-
-import yaml
-
 from argparse import RawTextHelpFormatter
-from collections import defaultdict
 from copy import deepcopy
+from datetime import datetime
 from enum import Enum
-from importlib_resources import files
-from jsonschema import validate
-from typing import Any, Optional, Type, Union
-
-import caosdb as db
+from typing import Any, List, Optional, Union
 
-from caosadvancedtools.cache import UpdateCache, Cache
+import linkahead as db
+import yaml
+from caosadvancedtools.cache import UpdateCache
 from caosadvancedtools.crawler import Crawler as OldCrawler
-from caosdb.apiutils import (compare_entities, EntityMergeConflictError,
-                             merge_entities)
-from caosdb.common.datatype import is_reference
-
-from .converters import Converter, DirectoryConverter, ConverterValidationError
-from .identifiable import Identifiable
-from .identifiable_adapters import (IdentifiableAdapter,
-                                    LocalStorageIdentifiableAdapter,
-                                    CaosDBIdentifiableAdapter)
-from .identified_cache import IdentifiedCache
+from caosadvancedtools.serverside.helper import send_mail
+from caosadvancedtools.utils import create_entity_link
+from linkahead.apiutils import compare_entities, merge_entities
+from linkahead.cached import cache_clear, cached_get_entity_by
+from linkahead.common.datatype import get_list_datatype, is_reference
+from linkahead.exceptions import TransactionError
+from linkahead.utils.escape import escape_squoted_text
+
+from .config import get_config_setting
+from .converters import Converter, ConverterValidationError
+from .debug_tree import DebugTree
+from .exceptions import ImpossibleMergeError
+from .identifiable_adapters import (CaosDBIdentifiableAdapter,
+                                    IdentifiableAdapter)
+from .logging import configure_server_side_logging
 from .macros import defmacro_constructor, macro_constructor
-from .stores import 
GeneralStore, RecordStore
-from .structure_elements import StructureElement, Directory, NoneElement
-from .version import check_cfood_version
+from .scanner import (create_converter_registry, initialize_converters,
+                      load_definition, scan_directory, scan_structure_elements)
+from .stores import GeneralStore
+from .structure_elements import StructureElement
+from .sync_graph import SyncGraph
+from .utils import get_shared_resource_link
 
 logger = logging.getLogger(__name__)
 
@@ -76,21 +82,27 @@ yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
 yaml.SafeLoader.add_constructor("!macro", macro_constructor)
 
 
+class ForbiddenTransaction(Exception):
+    pass
+
+
 def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
-    """
-    This function uses compare_entities to check whether to entities are identical
-    in a quite complex fashion:
-    - If one of the entities has additional parents or additional properties -> not identical
-    - If the value of one of the properties differs -> not identical
-    - If datatype, importance or unit are reported different for a property by compare_entities
-      return "not_identical" only if these attributes are set explicitely by record1.
-      Ignore the difference otherwise.
-    - If description, name, id or path appear in list of differences -> not identical.
-    - If file, checksum, size appear -> Only different, if explicitely set by record1.
-
-    record1 serves as the reference, so datatype, importance and unit checks are carried
-    out using the attributes from record1. In that respect, the function is not symmetrical
-    in its arguments.
+    """Check whether two entities are identical.
+
+This function uses compare_entities to check whether two entities are identical
+in a quite complex fashion:
+
+- If one of the entities has additional parents or additional properties -> not identical
+- If the value of one of the properties differs -> not identical
+- If datatype, importance or unit are reported different for a property by compare_entities
+  return False only if these attributes are set explicitly by record1.
+  Ignore the difference otherwise.
+- If description, name, id or path appear in list of differences -> not identical.
+- If file, checksum, size appear -> Only different, if explicitly set by record1.
+
+record1 serves as the reference, so datatype, importance and unit checks are carried
+out using the attributes from record1. In that respect, the function is not symmetrical
+in its arguments.
     """
     comp = compare_entities(record1, record2)
 
@@ -175,26 +187,19 @@ class Crawler(object):
 
     def __init__(self,
                  generalStore: Optional[GeneralStore] = None,
-                 debug: bool = False,
-                 identifiableAdapter: IdentifiableAdapter = None,
-                 securityMode: SecurityMode = SecurityMode.UPDATE
-                 ):
+                 debug: Optional[bool] = None,
+                 identifiableAdapter: Optional[IdentifiableAdapter] = None,
+                 securityMode: SecurityMode = SecurityMode.UPDATE):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.
 
+        Deprecated arguments:
+        - The debug argument does not have an effect anymore.
+        - generalStore: This argument does not have an effect anymore. It might be added to the scanning
+          functions in the scanner module in the future, if needed.
+
         Parameters
         ----------
-        recordStore : GeneralStore
-            An initial GeneralStore which might store e.g. environment variables.
-        debug : bool
-            Create a debugging information tree when set to True.
-            The debugging information tree is a variable stored in
-            self.debug_tree. 
It is a dictionary mapping directory entries - to a tuple of general stores and record stores which are valid for - the directory scope. - Furthermore, it is stored in a second tree named self.debug_copied whether the - objects in debug_tree had been copied from a higher level in the hierarchy - of the structureelements. identifiableAdapter : IdentifiableAdapter TODO describe securityMode : int @@ -202,659 +207,149 @@ class Crawler(object): Please use SecurityMode Enum """ + # Remove this once the property `crawled_data` is no longer needed for compatibility + # reasons + self._crawled_data = None + # The following caches store records, where we checked whether they exist on the remote # server. Since, it is important to know whether they exist or not, we store them into two # different caches. - self.remote_existing_cache = IdentifiedCache() - self.remote_missing_cache = IdentifiedCache() - self.recordStore = RecordStore() - self.securityMode = securityMode - self.generalStore = generalStore - if generalStore is None: - self.generalStore = GeneralStore() + # TODO does it make sense to have this as member variable? + self.securityMode = securityMode + # TODO does it make sense to have this as member variable(run_id)? + self.generate_run_id() - self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter() + self.identifiableAdapter: IdentifiableAdapter = CaosDBIdentifiableAdapter() if identifiableAdapter is not None: self.identifiableAdapter = identifiableAdapter - # If a directory is crawled this may hold the path to that directory - self.crawled_directory: Optional[str] = None - self.debug = debug - if self.debug: - # order in the tuple: - # 0: generalStore - # 1: recordStore - self.debug_tree: dict[str, tuple] = dict() - self.debug_metadata: dict[str, dict] = dict() - self.debug_metadata["copied"] = dict() - self.debug_metadata["provenance"] = defaultdict(lambda: dict()) - self.debug_metadata["usage"] = defaultdict(lambda: set()) - - def load_definition(self, crawler_definition_path: str): - """ - Load a cfood from a crawler definition defined by - crawler definition path and validate it using cfood-schema.yml. - """ - - # Load the cfood from a yaml file: - with open(crawler_definition_path, "r") as f: - crawler_definitions = list(yaml.safe_load_all(f)) - - crawler_definition = self._load_definition_from_yaml_dict( - crawler_definitions) - - return self._resolve_validator_paths(crawler_definition, crawler_definition_path) - - def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]): - """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which - contains either one or two documents. - - Doesn't resolve the validator paths in the cfood definition, so for - internal and testing use only. 
- - """ - if len(crawler_definitions) == 1: - # Simple case, just one document: - crawler_definition = crawler_definitions[0] - metadata = {} - elif len(crawler_definitions) == 2: - metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { - } - crawler_definition = crawler_definitions[1] - else: - raise RuntimeError( - "Crawler definition must not contain more than two documents.") - - check_cfood_version(metadata) - - # TODO: at this point this function can already load the cfood schema extensions - # from the crawler definition and add them to the yaml schema that will be - # tested in the next lines of code: - # Load the cfood schema: - with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: - schema = yaml.safe_load(f) - - # Add custom converters to converter enum in schema: - if "Converters" in crawler_definition: - for key in crawler_definition["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - if len(crawler_definitions) == 2: - if "Converters" in metadata: - for key in metadata["Converters"]: - schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( - key) - - # Validate the cfood schema: - validate(instance=crawler_definition, schema=schema["cfood"]) - - return crawler_definition - - def _resolve_validator_paths(self, definition: dict, definition_path: str): - """Resolve path to validation files with respect to the file in which - the crawler was defined. - - """ + if debug is not None: + warnings.warn(DeprecationWarning( + "The debug argument of the Crawler class is deprecated and has no effect.")) - for key, value in definition.items(): - - if key == "validate" and isinstance(value, str): - # Validator is given by a path - if not value.startswith('/'): - # Not an absolute path - definition[key] = os.path.join( - os.path.dirname(definition_path), value) - if not os.path.isfile(definition[key]): - # TODO(henrik) capture this in `crawler_main` similar to - # `ConverterValidationError`. 
- raise FileNotFoundError( - f"Couldn't find validation file {definition[key]}") - elif isinstance(value, dict): - # Recursively resolve all validators - definition[key] = self._resolve_validator_paths( - value, definition_path) - - return definition + if generalStore is not None: + warnings.warn(DeprecationWarning( + "The generalStore argument of the Crawler class is deprecated and has no effect.")) def load_converters(self, definition: dict): - """ - Currently the converter registry is a dictionary containing for each converter: - - key is the short code, abbreviation for the converter class name - - module is the name of the module to be imported which must be installed - - class is the converter class to load and associate with this converter entry - - all other info for the converter needs to be included in the converter plugin - directory: - schema.yml file - README.md documentation - """ - - # Defaults for the converter registry: - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "SimpleFile": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "File": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - "YAMLFile": { - "converter": "YAMLFileConverter", - "package": "caoscrawler.converters"}, - "CSVTableConverter": { - "converter": "CSVTableConverter", - "package": "caoscrawler.converters"}, - "XLSXTableConverter": { - "converter": "XLSXTableConverter", - "package": "caoscrawler.converters"}, - "DictBooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "BooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "DictFloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "FloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "DictTextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "DictIntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "IntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "DictListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "DictDictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "Dict": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - } - - # More converters from definition file: - if "Converters" in definition: - for key, entry in definition["Converters"].items(): - if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", - "DictDictElement", "DictListElement", "DictFloatElement"]: - warnings.warn(DeprecationWarning(f"{key} is deprecated. 
Please use the new" - " variant; without 'Dict' prefix or " - "'DictElement' in case of 'Dict'")) - - converter_registry[key] = { - "converter": entry["converter"], - "package": entry["package"] - } - - # Load modules and associate classes: - for key, value in converter_registry.items(): - module = importlib.import_module(value["package"]) - value["class"] = getattr(module, value["converter"]) - return converter_registry - - def crawl_directory(self, dirname: str, crawler_definition_path: str): - """ Crawl a single directory. - - Convenience function that starts the crawler (calls start_crawling) - with a single directory as the StructureElement. - """ + warnings.warn(DeprecationWarning( + "The function load_converters in the crawl module is deprecated. " + "Please use create_converter_registry from the scanner module.")) + return create_converter_registry(definition) - crawler_definition = self.load_definition(crawler_definition_path) - # Load and register converter packages: - converter_registry = self.load_converters(crawler_definition) - - if not dirname: - raise ValueError( - "You have to provide a non-empty path for crawling.") - dir_structure_name = os.path.basename(dirname) - self.crawled_directory = dirname - if not dir_structure_name and dirname.endswith('/'): - if dirname == '/': - # Crawling the entire file system - dir_structure_name = "root" - else: - # dirname had a trailing '/' - dir_structure_name = os.path.basename(dirname[:-1]) - - self.start_crawling(Directory(dir_structure_name, - dirname), - crawler_definition, - converter_registry) - - @staticmethod - def initialize_converters(crawler_definition: dict, converter_registry: dict): - """ - takes the cfood as dict (`crawler_definition`) and creates the converter objects that - are defined on the highest level. Child Converters will in turn be created during the - initialization of the Converters. - """ - converters = [] - - for key, value in crawler_definition.items(): - # Definitions and Converters are reserved keywords - # on the top level of the yaml file. - # TODO: there should also be a top level keyword for the actual - # CFood to avoid confusion between top level keywords - # and the CFood. - if key == "Definitions": - continue - elif key == "Converters": - continue - converters.append(Converter.converter_factory( - value, key, converter_registry)) - - return converters + def load_definition(self, crawler_definition_path: str): + warnings.warn(DeprecationWarning( + "The function load_definition in the crawl module is deprecated. " + "Please use load_definition from the scanner module.")) + return load_definition(crawler_definition_path) + + def initialize_converters(self, crawler_definition: dict, converter_registry: dict): + warnings.warn(DeprecationWarning( + "The function initialize_converters in the crawl module is deprecated. " + "Please use initialize_converters from the scanner module.")) + return initialize_converters(crawler_definition, converter_registry) + + def generate_run_id(self): + self.run_id = uuid.uuid1() def start_crawling(self, items: Union[list[StructureElement], StructureElement], crawler_definition: dict, - converter_registry: dict): + converter_registry: dict, + restricted_path: Optional[list[str]] = None): + + warnings.warn(DeprecationWarning( + "The function start_crawling in the crawl module is deprecated. 
" + "Please use scan_structure_elements from the scanner module.")) + + data = scan_structure_elements( + items, crawler_definition, converter_registry, restricted_path) + self.crawled_data = data + return data + + @property + def crawled_data(self): + warnings.warn(DeprecationWarning( + "The use of self.crawled_data is depricated. You should not access this variable. " + "Instead, create the data with the scanner and then pass it as argument to Crawler " + "functions")) + return self._crawled_data + + @crawled_data.setter + def crawled_data(self, arg): + self._crawled_data = arg + + def crawl_directory(self, + crawled_directory: str, + crawler_definition_path: str, + restricted_path: Optional[list[str]] = None): """ - Start point of the crawler recursion. - - Parameters - ---------- - items: list - A list of structure elements (or a single StructureElement) that is used for - generating the initial items for the crawler. This could e.g. be a Directory. - crawler_definition : dict - A dictionary representing the crawler definition, possibly from a yaml - file. - - Returns - ------- - crawled_data : list - the final list with the target state of Records. + The new main function to run the crawler on a directory. """ - # This function builds the tree of converters out of the crawler definition. + warnings.warn(DeprecationWarning( + "The function crawl_directory in the crawl module is deprecated. " + "Please use scan_directory from the scanner module.")) - if self.generalStore is None: - raise RuntimeError("Should not happen.") - - if not isinstance(items, list): - items = [items] - - self.run_id = uuid.uuid1() - local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) + data = scan_directory(crawled_directory, + crawler_definition_path, + restricted_path) + self.crawled_data = data + return data - # This recursive crawling procedure generates the update list: - self.crawled_data: list[db.Record] = [] - self._crawl(items, local_converters, self.generalStore, self.recordStore, [], []) + def _split_into_inserts_and_updates(self, st: SyncGraph): + """Classify nodes in the SyncGraph ``st`` with respect to their state on the server. - if self.debug: - self.debug_converters = local_converters +This method iteratively checks whether those nodes exist on the remote server and creates two lists, +one with the entities that need to be updated and the other with entities to be inserted. - return self.crawled_data - - def synchronize(self, commit_changes: bool = True, unique_names=True): - """ - Carry out the actual synchronization. - """ - - # After the crawling, the actual synchronization with the database, based on the - # update list is carried out: - - return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names) - - def _has_reference_value_without_id(self, ident: Identifiable) -> bool: - """ - Returns True if there is at least one value in the properties attribute of ``ident`` which: - - a) is a reference property AND - b) where the value is set to a - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` - (instead of an ID) AND - c) where the ID of the value (the - :external+caosdb-pylib:py:class:`db.Entity <caosdb.common.models.Entity>` object in b)) - is not set (to an integer) - - Returns - ------- - bool - True if there is a value without id (see above) +.. todo:: - Raises - ------ - ValueError - If no Identifiable is given. 
- """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - if isinstance(pvalue, list): - for el in pvalue: - if isinstance(el, db.Entity) and el.id is None: - return True - elif isinstance(pvalue, db.Entity) and pvalue.id is None: - return True - return False - - @staticmethod - def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): - """ - Recursively adds entities and all their properties contained in ent_list to - the output list flat. - - TODO: This function will be moved to pylib as it is also needed by the - high level API. - """ - # Note: A set would be useful here, but we do not want a random order. - if flat is None: - flat = list() - for el in ent_list: - if el not in flat: - flat.append(el) - for ent in ent_list: - for p in ent.properties: - # For lists append each element that is of type Entity to flat: - if isinstance(p.value, list): - for el in p.value: - if isinstance(el, db.Entity): - if el not in flat: - flat.append(el) - Crawler.create_flat_list([el], flat) - elif isinstance(p.value, db.Entity): - if p.value not in flat: - flat.append(p.value) - Crawler.create_flat_list([p.value], flat) - return flat - - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: list): - """ - returns False if any value in the properties attribute is a db.Entity object that - is contained in the `remote_missing_cache`. If ident has such an object in - properties, it means that it references another Entity, where we checked - whether it exists remotely and it was not found. - """ - if ident is None: - raise ValueError("Identifiable has to be given as argument") - for pvalue in list(ident.properties.values()) + ident.backrefs: - # Entity instead of ID and not cached locally - if (isinstance(pvalue, list)): - for el in pvalue: - if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None): - return True - if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache( - self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None): - # might be checked when reference is resolved - return True - return False - - def replace_references_with_cached(self, record: db.Record, referencing_entities: list): - """ - Replace all references with the versions stored in the cache. - - If the cache version is not identical, raise an error. 
- """ - for p in record.properties: - if (isinstance(p.value, list)): - lst = [] - for el in p.value: - if (isinstance(el, db.Entity) and el.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(el, referencing_entities)) - if cached is None: - raise RuntimeError("Not in cache.") - if not check_identical(cached, el, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError("Not identical.") - else: - raise RuntimeError("Not identical.") - lst.append(cached) - else: - lst.append(el) - p.value = lst - if (isinstance(p.value, db.Entity) and p.value.id is None): - cached = self.get_from_any_cache( - self.identifiableAdapter.get_identifiable(p.value, referencing_entities)) - if cached is None: - raise RuntimeError("Not in cache.") - if not check_identical(cached, p.value, True): - if isinstance(p.value, db.File): - if p.value.path != cached.path: - raise RuntimeError("Not identical.") - else: - raise RuntimeError("Not identical.") - p.value = cached - - def get_from_remote_missing_cache(self, identifiable: Identifiable): - """ - returns the identified record if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def get_from_any_cache(self, identifiable: Identifiable): - """ - returns the identifiable if an identifiable with the same values already exists locally - (Each identifiable that is not found on the remote server, is 'cached' locally to prevent - that the same identifiable exists twice) - """ - if identifiable is None: - raise ValueError("Identifiable has to be given as argument") - - if identifiable in self.remote_existing_cache: - return self.remote_existing_cache[identifiable] - elif identifiable in self.remote_missing_cache: - return self.remote_missing_cache[identifiable] - else: - return None - - def add_to_remote_missing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_missing_cache. + Should this be made into a public method of SyncGraph instead? At the moment, this is a + purely static method that only operates on the state of ``st``. - If identifiable is None, the Record is NOT stored. """ - self.add_to_cache(record=record, cache=self.remote_missing_cache, - identifiable=identifiable) + entity_was_treated = True + # st.unchecked contains Entities which could not yet be checked against the remote server + while entity_was_treated and len(st.unchecked) > 0: + entity_was_treated = False + + for se in st.unchecked: + if se.identifiable is None: # we cannot yet identify this node + continue + + # check remote server + identified_record = ( + st.identifiableAdapter.retrieve_identified_record_for_identifiable( + se.identifiable)) + remote_id = None + if identified_record is not None: + remote_id = identified_record.id + # set id of node. if node is missing, remote_id is None and the SyncGraph marks it + # as missing + st.set_id_of_node(se, remote_id) + entity_was_treated = True + break # one or more nodes were just removed from st.unchecked -> back to start + + # This only might add properties of the postponed records to the already used ones. 
+ if len(st.unchecked) > 0: + # circle = st.unchecked_contains_circular_dependency() + # if circle is None: + # logger.error("Failed, but found NO circular dependency. The data is as follows:" + # + "\n".join([str(el) for el in st.unchecked]) + + # ) + # else: + # logger.error("Found circular dependency (Note that this might include references " + # "that are not identifying properties): " + # + "\n".join([str(el) for el in st.unchecked]) + # ) - def add_to_remote_existing_cache(self, record: db.Record, identifiable: Identifiable): - """ - stores the given Record in the remote_existing_cache. - - If identifiable is None, the Record is NOT stored. - """ - self.add_to_cache(record=record, cache=self.remote_existing_cache, - identifiable=identifiable) - - def add_to_cache(self, record: db.Record, cache: IdentifiedCache, - identifiable: Identifiable) -> None: - """ - stores the given Record in the given cache. - - If identifiable is None, the Record is NOT stored. - """ - if identifiable is not None: - cache.add(identifiable=identifiable, record=record) - - @staticmethod - def bend_references_to_new_object(old, new, entities): - """ Bend references to the other object - Iterate over all entities in `entities` and check the values of all properties of - occurances of old Entity and replace them with new Entity - """ - for el in entities: - for p in el.properties: - if isinstance(p.value, list): - for index, val in enumerate(p.value): - if val is old: - p.value[index] = new - else: - if p.value is old: - p.value = new - - @staticmethod - def create_reference_mapping(flat: list[db.Entity]): - """ - Create a dictionary of dictionaries of the form: - dict[int, dict[str, list[db.Entity]]] - - - The integer index is the Python id of the value object. - - The string is the name of the first parent of the referencing object. - - Each value objects is taken from the values of all properties from the list flat. - - So the returned mapping maps ids of entities to the objects which are referring - to them. - """ - # TODO we need to treat children of RecordTypes somehow. - references: dict[int, dict[str, list[db.Entity]]] = {} - for ent in flat: - for p in ent.properties: - val = p.value - if not isinstance(val, list): - val = [val] - for v in val: - if isinstance(v, db.Entity): - if id(v) not in references: - references[id(v)] = {} - if ent.parents[0].name not in references[id(v)]: - references[id(v)][ent.parents[0].name] = [] - references[id(v)][ent.parents[0].name].append(ent) - - return references - - def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - to_be_inserted: list[db.Entity] = [] - to_be_updated: list[db.Entity] = [] - flat = Crawler.create_flat_list(ent_list) - - # TODO: can the following be removed at some point - for ent in flat: - if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError("Records must have a parent.") - - resolved_references = True - # flat contains Entities which could not yet be checked against the remote server - while resolved_references and len(flat) > 0: - resolved_references = False - referencing_entities = self.create_reference_mapping( - flat + to_be_updated + to_be_inserted) - - # For each element we try to find out whether we can find it in the server or whether - # it does not yet exist. Since a Record may reference other unkown Records it might not - # be possible to answer this right away. - # The following checks are done on each Record: - # 1. Can it be identified via an ID? - # 2. Can it be identified via a path? - # 3. 
Is it in the cache of already checked Records? - # 4. Can it be checked on the remote server? - # 5. Does it have to be new since a needed reference is missing? - for i in reversed(range(len(flat))): - record = flat[i] - identifiable = self.identifiableAdapter.get_identifiable( - record, - referencing_entities=referencing_entities) - - # TODO remove if the exception is never raised - if record in to_be_inserted: - raise RuntimeError("This should not be reached since treated elements" - "are removed from the list") - # 1. Can it be identified via an ID? - elif record.id is not None: - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 2. Can it be identified via a path? - elif record.path is not None: - existing = self._get_entity_by_path(record.path) - if existing is None: - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - else: - record.id = existing.id - # TODO check the following copying of _size and _checksum - # Copy over checksum and size too if it is a file - record._size = existing._size - record._checksum = existing._checksum - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - # 3. Is it in the cache of already checked Records? - elif self.get_from_any_cache(identifiable) is not None: - # We merge the two in order to prevent loss of information - newrecord = self.get_from_any_cache(identifiable) - try: - merge_entities(newrecord, record) - except EntityMergeConflictError: - continue - Crawler.bend_references_to_new_object( - old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted) - - del flat[i] - resolved_references = True - - # 4. Can it be checked on the remote server? - elif not self._has_reference_value_without_id(identifiable): - identified_record = ( - self.identifiableAdapter.retrieve_identified_record_for_identifiable( - identifiable)) - if identified_record is None: - # identifiable does not exist remotely -> record needs to be inserted - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - else: - # side effect - record.id = identified_record.id - to_be_updated.append(record) - self.add_to_remote_existing_cache(record, identifiable) - del flat[i] - resolved_references = True - - # 5. Does it have to be new since a needed reference is missing? - # (Is it impossible to check this record because an identifiable references a - # missing record?) - elif self._has_missing_object_in_references(identifiable, referencing_entities): - to_be_inserted.append(record) - self.add_to_remote_missing_cache(record, identifiable) - del flat[i] - resolved_references = True - - for record in flat: - self.replace_references_with_cached(record, referencing_entities) - - if len(flat) > 0: raise RuntimeError( - "Could not resolve all Entity references. Circular Dependency?") + "Could not finish _split_into_inserts_and_updates. 
" + "It might be due to a circular dependency") - return to_be_inserted, to_be_updated + return st.export_record_lists() def replace_entities_with_ids(self, rec: db.Record): for el in rec.properties: @@ -867,6 +362,38 @@ class Crawler(object): if val.id is not None: el.value[index] = val.id + @staticmethod + def compact_entity_list_representation(entities, referencing_entities: List) -> str: + """ a more readable representation than the standard xml representation + + TODO this can be removed once the yaml format representation is in pylib + """ + text = "\n--------\n" + + grouped = {"": []} + for ent in entities: + if not ent.parents: + grouped[""].append(ent) + for parent in ent.parents: + if parent.name not in grouped: + grouped[parent.name] = [] + grouped[parent.name].append(ent) + if not grouped[""]: + del grouped[""] + for parent, group in grouped.items(): + text += f"\n> Parent: {parent}\n" + for ent in group: + if ent.name is not None: + text += f"\n>> Name: {ent.name}\n" + else: + text += "\n>> name: # No name" + text += f"{[ent.name for ent in ent.parents]}\n" + props = {p.name: p.value for p in ent.properties} + text += f"{props}\n" + text += f"is_referenced_by:\n{referencing_entities[id(ent)]}\n" + + return text + "--------\n" + @staticmethod def _merge_properties_from_remote( crawled_data: list[db.Record], @@ -929,6 +456,8 @@ class Crawler(object): for i in reversed(range(len(crawled_data))): if not check_identical(crawled_data[i], identified_records[i]): + logger.debug("Sheduled update because of the folllowing diff:\n" + + str(compare_entities(crawled_data[i], identified_records[i]))) actual_updates.append(crawled_data[i]) return actual_updates @@ -942,16 +471,17 @@ class Crawler(object): because some changes in parents (e.g. of Files) might fail if they are not updated first. """ + logger.debug("=== Going to execute parent updates ===") Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) parent_updates = db.Container() - for record in to_be_updated: - old_entity = Crawler._get_entity_by_id(record.id) + for entity in to_be_updated: + old_entity = cached_get_entity_by(eid=entity.id) # Check whether the parents have been changed and add them if missing # in the old entity: changes_made = False - for parent in record.parents: + for parent in entity.parents: found = False for old_parent in old_entity.parents: if old_parent.id == parent.id: @@ -976,27 +506,67 @@ class Crawler(object): logger.info(parent_updates) @staticmethod - def _get_entity_by_name(name): - return db.Entity(name=name).retrieve() + def _get_property_id_for_datatype(rtname: str, name: str): + return cached_get_entity_by( + query=f"FIND Entity '{escape_squoted_text(rtname)}' " + f"with name='{escape_squoted_text(name)}'").id @staticmethod - def _get_entity_by_path(path): - try: - return db.execute_query(f"FIND FILE WHICH IS STORED AT '{path}'", unique=True) - except db.exceptions.EmptyUniqueQueryError: - return None - - @staticmethod - def _get_entity_by_id(id): - return db.Entity(id=id).retrieve() + def replace_name_with_referenced_entity_id(prop: db.Property): + """changes the given property in place if it is a reference property that has a name as + value + + If the Property has a List datatype, each element is treated separately. + If the datatype is generic, i.e. FILE or REFERENCE, values stay unchanged. + If the value is not a string, the value stays unchanged. + If the query using the datatype and the string value does not uniquely identify an Entity, + the value stays unchanged. 
+ If an Entity is identified, then the string value is replaced by the ID. + """ + if get_list_datatype(prop.datatype) is None: # not a list + if (isinstance(prop.value, str) and is_reference(prop.datatype) and + prop.datatype != db.FILE and prop.datatype != db.REFERENCE): # datatype is a non-generic reference and value is a string + try: + # the get_entity function will raise an error if not unique + prop.value = Crawler._get_property_id_for_datatype( + rtname=prop.datatype, name=prop.value) + except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): + logger.error(f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " + "a name.") + raise + else: + dt = get_list_datatype(prop.datatype) + if not (is_reference(dt) and dt != db.FILE and dt != db.REFERENCE): + return + propval = [] + for el in prop.value: + if isinstance(el, str): + try: + # the get_entity function will raise an error if not unique + propval.append(Crawler._get_property_id_for_datatype(rtname=dt, + name=el)) + except (db.EmptyUniqueQueryError, db.QueryNotUniqueError): + logger.error( + f"The Property {prop.name} with datatype={prop.datatype} has the " + f"value {prop.value} and there is no appropriate Entity with such " + "a name.") + raise + else: + propval.append(el) + prop.value = propval @staticmethod - def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None, + def execute_inserts_in_list(to_be_inserted, securityMode, + run_id: Optional[uuid.UUID] = None, unique_names=True): for record in to_be_inserted: for prop in record.properties: - entity = Crawler._get_entity_by_name(prop.name) + if prop.name == "name": + raise Exception('Cannot search for the property with name "name"') + entity = cached_get_entity_by(name=prop.name) _resolve_datatype(prop, entity) + Crawler.replace_name_with_referenced_entity_id(prop) logger.debug("INSERT") logger.debug(to_be_inserted) if len(to_be_inserted) > 0: @@ -1011,18 +581,22 @@ class Crawler(object): for record in rec_list: for parent in record.parents: if parent.id is None: - parent.id = Crawler._get_entity_by_name(parent.name).id + parent.id = cached_get_entity_by(name=parent.name).id for prop in record.properties: if prop.id is None: - entity = Crawler._get_entity_by_name(prop.name) + entity = cached_get_entity_by(name=prop.name) prop.id = entity.id _resolve_datatype(prop, entity) @staticmethod - def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None, + def execute_updates_in_list(to_be_updated, securityMode, + run_id: Optional[uuid.UUID] = None, unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") + # Here, it's probably much more reasonable to show a diff of the update: + # from linkahead.apiutils import compare_entities + # [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated] logger.debug(to_be_updated) if len(to_be_updated) > 0: if securityMode.value > SecurityMode.INSERT.value: @@ -1031,8 +605,24 @@ class Crawler(object): update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True, - unique_names=True): + @staticmethod + def check_whether_parent_exists(records: list[db.Entity], parents: list[str]): + """ returns a list of all records in `records` that have a parent that is in `parents`""" + problems = [] + for rec in records: + for parent in rec.parents: + if parent.name 
in parents:
+                    problems.append(rec)
+        return problems
+
+    def synchronize(self,
+                    commit_changes: bool = True,
+                    unique_names: bool = True,
+                    crawled_data: Optional[list[db.Record]] = None,
+                    no_insert_RTs: Optional[list[str]] = None,
+                    no_update_RTs: Optional[list[str]] = None,
+                    path_for_authorized_run: Optional[Union[str, list[str]]] = "",
+                    ):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
@@ -1045,59 +635,154 @@ class Crawler(object):
         if commit_changes is True, the changes are synchronized to the CaosDB server.
         For debugging it can be useful to set this to False.

-        Return the final to_be_inserted and to_be_updated as tuple.
+        Parameters
+        ----------
+        no_insert_RTs : list[str], optional
+            list of RecordType names. Records that have one of those RecordTypes
+            as parent will not be inserted
+        no_update_RTs : list[str], optional
+            list of RecordType names. Records that have one of those RecordTypes
+            as parent will not be updated
+        path_for_authorized_run : str or list[str], optional
+            only used if there are changes that need authorization before being
+            applied. The form for rerunning the crawler with the authorization
+            of these changes will be generated with this path. See
+            ``caosadvancedtools.crawler.Crawler.save_form`` for more info about
+            the authorization form.
+
+        Returns
+        -------
+        inserts and updates
+            the final to_be_inserted and to_be_updated as tuple.
         """
+        if crawled_data is None:
+            warnings.warn(DeprecationWarning(
+                "Calling synchronize without the data to be synchronized is deprecated. Please "
+                "use for example the Scanner to create this data."))
+            crawled_data = self.crawled_data

-        to_be_inserted, to_be_updated = self.split_into_inserts_and_updates(crawled_data)
-        referencing_entities = self.create_reference_mapping(to_be_updated + to_be_inserted)
+        if isinstance(path_for_authorized_run, list) and self.securityMode != SecurityMode.UPDATE:
+            raise NotImplementedError(
+                "Authorization of inserts and updates is currently implemented only "
+                "for single paths, not for lists of paths."
+            )
+
+        to_be_inserted, to_be_updated = self._split_into_inserts_and_updates(
+            SyncGraph(crawled_data, self.identifiableAdapter))

-        # TODO: refactoring of typo
         for el in to_be_updated:
             # all entity objects are replaced by their IDs except for the not yet inserted ones
             self.replace_entities_with_ids(el)

-        identified_records = [
-            self.identifiableAdapter.retrieve_identified_record_for_record(record,
-                                                                           referencing_entities)
-            for record in to_be_updated]
+        identified_records = []
+        for record in to_be_updated:
+            if record.id is not None:
+                # TODO: use cache here?
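+                # Note: _split_into_inserts_and_updates only classifies a record as an
+                # update after its remote counterpart was found, so its ID must be set here.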
+                identified_records.append(cached_get_entity_by(eid=record.id))
+            else:
+                raise Exception("Please report a bug: At this stage all records to be updated"
+                                " should have an ID")

         # Merge with existing data to prevent unwanted overwrites
-        to_be_updated = self._merge_properties_from_remote(to_be_updated,
-                                                           identified_records)
+        to_be_updated = self._merge_properties_from_remote(to_be_updated, identified_records)

         # remove unnecessary updates from list by comparing the target records
         # to the existing ones
-        to_be_updated = self.remove_unnecessary_updates(
-            to_be_updated, identified_records)
+        to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records)

+        if no_insert_RTs:
+            ins_problems = self.check_whether_parent_exists(to_be_inserted, no_insert_RTs)
+        else:
+            ins_problems = []
+        if no_update_RTs:
+            upd_problems = self.check_whether_parent_exists(to_be_updated, no_update_RTs)
+        else:
+            upd_problems = []
+        if len(ins_problems) > 0 or len(upd_problems) > 0:
+            raise ForbiddenTransaction(
+                "One or more Records have a parent which is excluded from inserts or updates."
+                f"\nRecords excluded from inserts have the following RecordTypes:\n"
+                f"{[el.parents[0].name for el in ins_problems]}"
+                f"\nRecords excluded from updates have the following RecordTypes:\n"
+                f"{[el.parents[0].name for el in upd_problems]}"
+            )
+
+        logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
+                    f"{len(to_be_updated)} Entities.")
         if commit_changes:
+            cache_clear()
             self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode,
                                                 run_id=self.run_id, unique_names=unique_names)
+            logger.info("Added parent RecordTypes where necessary.")
             self.execute_inserts_in_list(
                 to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
+            logger.info("Executed inserts:\n"
+                        + self.create_entity_summary(to_be_inserted))
             self.execute_updates_in_list(
                 to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
+            logger.info("Executed updates:\n"
+                        + self.create_entity_summary(to_be_updated))

         update_cache = UpdateCache()
         pending_inserts = update_cache.get_inserts(self.run_id)
         if pending_inserts:
             Crawler.inform_about_pending_changes(
-                pending_inserts, self.run_id, self.crawled_directory)
+                pending_inserts, self.run_id, path_for_authorized_run)

         pending_updates = update_cache.get_updates(self.run_id)
         if pending_updates:
             Crawler.inform_about_pending_changes(
-                pending_updates, self.run_id, self.crawled_directory)
+                pending_updates, self.run_id, path_for_authorized_run)

         return (to_be_inserted, to_be_updated)

+    @staticmethod
+    def create_entity_summary(entities: list[db.Entity]):
+        """ Creates a summary string representation of a list of entities."""
+        parents = {}
+        for el in entities:
+            for pp in el.parents:
+                if pp.name not in parents:
+                    parents[pp.name] = [el]
+                else:
+                    parents[pp.name].append(el)
+        output = ""
+        for key, value in parents.items():
+            output += f"{key}:\n"
+            for el in value:
+                output += create_entity_link(el) + ", "
+
+            output = output[:-2] + "\n"
+        return output
+
     @staticmethod
     def inform_about_pending_changes(pending_changes, run_id, path, inserts=False):
         # Sending an Email with a link to a form to authorize updates is
-        # only done in SSS mode

-        if "SHARED_DIR" in os.environ:
-            filename = OldCrawler.save_form(
-                [el[3] for el in pending_changes], path, run_id)
-            OldCrawler.send_mail([el[3] for el in pending_changes], filename)
+        if get_config_setting("send_crawler_notifications"):
+            filename = OldCrawler.save_form([el[3] for el in
pending_changes], path, run_id)
+            link_address = get_shared_resource_link(db.configuration.get_config()[
+                "Connection"]["url"], filename)
+            changes = "\n".join([el[3] for el in pending_changes])
+            text = f"""Dear Curator,
+    there were changes that need your authorization. Please check the following
+    carefully and if the changes are ok, click on the following link:
+
+    {link_address}
+
+    {changes}
+            """
+            try:
+                fro = get_config_setting("sendmail_from_address")
+                to = get_config_setting("sendmail_to_address")
+            except KeyError:
+                logger.error("Server Configuration is missing a setting for "
+                             "sending mails. The administrator should check "
+                             "'sendmail_from_address' and 'sendmail_to_address'.")
+                return
+
+            send_mail(
+                from_addr=fro,
+                to=to,
+                subject="Crawler Update",
+                body=text)

         for i, el in enumerate(pending_changes):
@@ -1130,11 +815,15 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
             res[converter.name]["subtree"][k[0]] = d[k[0]]
         return res

-    def save_debug_data(self, filename: str):
+    def save_debug_data(self, filename: str, debug_tree: Optional[DebugTree] = None):
+        """
+        Save the information contained in a debug_tree to a file named filename.
+        """
+
         paths: dict[str, Union[dict, list]] = dict()

         def flatten_debug_info(key):
-            mod_info = self.debug_metadata[key]
+            mod_info = debug_tree.debug_metadata[key]
             paths[key] = dict()
             for record_name in mod_info:
                 if key == "provenance":
@@ -1150,195 +839,312 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         for key in ("provenance", "usage"):
             flatten_debug_info(key)

-        paths["converters_usage"] = [self.debug_build_usage_tree(
-            cv) for cv in self.debug_converters]
+        # TODO: clarify what this was used for
+        # paths["converters_usage"] = [self.debug_build_usage_tree(
+        #     cv) for cv in self.debug_converters]

         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))

-    def _crawl(self, items: list[StructureElement],
-               local_converters: list[Converter],
-               generalStore: GeneralStore,
-               recordStore: RecordStore,
-               structure_elements_path: list[str], converters_path: list[str]):
-        """
-        Crawl a list of StructureElements and apply any matching converters.
-
-        items: structure_elements (e.g. files and folders on one level on the hierarchy)
-        local_converters: locally defined converters for
-             treating structure elements. A locally defined converter could be
-             one that is only valid for a specific subtree of the originally
-             cralwed StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on copies of the
-            global stores of the Crawler object.
-        """
-        for element in items:
-            for converter in local_converters:
-
-                # type is something like "matches files", replace isinstance with "type_matches"
-                # match function tests regexp for example
-                if (converter.typecheck(element) and
-                        converter.match(element) is not None):
-                    generalStore_copy = generalStore.create_scoped_copy()
-                    recordStore_copy = recordStore.create_scoped_copy()
-
-                    # Create an entry for this matched structure element:
-                    generalStore_copy[converter.name] = (
-                        os.path.join(*(structure_elements_path + [element.get_name()])))
-
-                    # extracts values from structure element and stores them in the
-                    # variable store
-                    converter.create_values(generalStore_copy, element)
-
-                    keys_modified = converter.create_records(
-                        generalStore_copy, recordStore_copy, element)
-
-                    children = converter.create_children(generalStore_copy, element)
-
-                    if self.debug:
-                        # add provenance information for each varaible
-                        self.debug_tree[str(element)] = (
-                            generalStore_copy.get_storage(), recordStore_copy.get_storage())
-                        self.debug_metadata["copied"][str(element)] = (
-                            generalStore_copy.get_dict_copied(),
-                            recordStore_copy.get_dict_copied())
-                        self.debug_metadata["usage"][str(element)].add(
-                            "/".join(converters_path + [converter.name]))
-                        mod_info = self.debug_metadata["provenance"]
-                        for record_name, prop_name in keys_modified:
-                            # TODO: check
-                            internal_id = recordStore_copy.get_internal_id(
-                                record_name)
-                            record_identifier = record_name + \
-                                "_" + str(internal_id)
-                            converter.metadata["usage"].add(record_identifier)
-                            mod_info[record_identifier][prop_name] = (
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
-
-                    self._crawl(children, converter.converters,
-                                generalStore_copy, recordStore_copy,
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
-        # if the crawler is running out of scope, copy all records in
-        # the recordStore, that were created in this scope
-        # to the general update container.
-        scoped_records = recordStore.get_records_current_scope()
-        for record in scoped_records:
-            self.crawled_data.append(record)
-
-        # TODO: the scoped variables should be cleaned up as soon if the variables
-        # are no longer in the current scope. This can be implemented as follows,
-        # but this breaks the test "test_record_structure_generation", because
-        # some debug info is also deleted. This implementation can be used as soon
-        # as the remaining problems with the debug_tree are fixed.
-        # Delete the variables that are no longer needed:
-        # scoped_names = recordStore.get_names_current_scope()
-        # for name in scoped_names:
-        #     del recordStore[name]
-        #     del generalStore[name]
-
-        return self.crawled_data
-
-
-def crawler_main(crawled_directory_path: str,
+
+def _create_status_record(logfile_url, run_id):
+    """Insert a CrawlerRun Record
+
+    CrawlerRun Records are used to have a (somewhat) persistent feedback from crawler runs that
+    are easily accessible by users.
+    """
+    if get_config_setting("create_crawler_status_records"):
+        (db.Record()
+            .add_parent('CrawlerRun')
+            .add_property('logfile', logfile_url)
+            .add_property('status', "RUNNING")
+            .add_property('run_id', run_id)
+            .add_property('started', datetime.now().isoformat())
+            .insert())
+
+
+def _update_status_record(run_id, n_inserts, n_updates, status):
+    """Update the CrawlerRun Record
+
+    The Record is identified using the run_id. The status is changed and some information about the
+    run is added.
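+
+    Called from ``crawler_main`` at the end of a crawler run, with status "OK" on
+    success or "FAILED" if an error occurred.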
+    """
+    if get_config_setting("create_crawler_status_records"):
+        cr_rec = db.execute_query(f"FIND RECORD CrawlerRun WITH run_id={run_id}", unique=True)
+        cr_rec.get_property('status').value = status
+        (cr_rec
+            .add_property(db.execute_query(
+                "FIND Property with name='number_of_inserted_entities'", unique=True).id,
+                n_inserts)
+            .add_property(
+                db.execute_query("FIND Property with name='number_of_updated_entities'",
+                                 unique=True).id, n_updates)
+            .add_property(
+                db.execute_query("FIND Property with name='finished'",
+                                 unique=True).id, datetime.now().isoformat()))
+        cr_rec.update()
+
+
+def _notify_about_inserts_and_updates(n_inserts, n_updates, logfile, run_id):
+    """send an email notification
+
+    Only if there were inserts or updates.
+
+    The email contains some basic information and a link to the log and the CrawlerRun Record.
+    """
+    if not get_config_setting("send_crawler_notifications"):
+        logger.debug("Crawler email notifications are disabled.")
+        return
+    if n_inserts == 0 and n_updates == 0:
+        return
+    text = f"""Dear Curator,
+the CaosDB Crawler successfully crawled the data and
+- inserted {n_inserts} new Entities and
+- updated {n_updates} existing Entities.
+
+"""
+
+    domain = get_config_setting("public_host_url")
+    if get_config_setting("create_crawler_status_records"):
+        text += ("You can check out the CrawlerRun Record for more information:\n"
+                 f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n")
+    text += (f"You can download the logfile here:\n{get_shared_resource_link(domain, logfile)}")
+    send_mail(
+        from_addr=get_config_setting("sendmail_from_address"),
+        to=get_config_setting("sendmail_to_address"),
+        subject="Crawler Update",
+        body=text)
+
+
+def _treat_deprecated_prefix(prefix, remove_prefix):
+    """notify about deprecation and use given value"""
+    if prefix != "":
+        warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed "
+                                         "in the future. Please use `remove_prefix` instead."))
+        if remove_prefix is not None:
+            raise ValueError("Please do not supply the (deprecated) `prefix` and the "
+                             "`remove_prefix` argument at the same time. Only use "
+                             "`remove_prefix` instead.")
+        return prefix
+    return remove_prefix
+
+
+def _fix_file_paths(crawled_data: list[db.Entity],
+                    add_prefix: Optional[str],
+                    remove_prefix: Optional[str]):
+    """
+    Adjust the path according to add_/remove_prefix
+
+    Also remove the `file` attribute from File entities (because inserts currently need to be
+    done via loadFiles).
+
+    Arguments:
+    ------------
+
+    crawled_data: list[db.Entity]
+            A list of entities. This list will be searched for instances of db.File.
+
+    add_prefix: Optional[str]
+            If add_prefix is not None, the given prefix will be added in front of elem.path.
+
+    remove_prefix: Optional[str]
+            If remove_prefix is not None the given prefix will be removed from the front of
+            elem.path. In this case a RuntimeError will be raised if any path of a file does
+            not begin with "remove_prefix".
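+
+    Example (illustrative): with remove_prefix="/data" and add_prefix="/mnt", a File
+    with path "/data/exp1/a.dat" ends up with path "/mnt/exp1/a.dat".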
+
+    """
+    for elem in crawled_data:
+        if isinstance(elem, db.File):
+            # correct the file path:
+            # elem.file = os.path.join(args.path, elem.file)
+            if remove_prefix:
+                if elem.path.startswith(remove_prefix):
+                    elem.path = elem.path[len(remove_prefix):]
+                else:
+                    raise RuntimeError("Prefix shall be removed from file path but the path "
+                                       "does not start with the prefix:"
+                                       f"\n{remove_prefix}\n{elem.path}")
+            if add_prefix:
+                elem.path = add_prefix + elem.path
+            elem.file = None
+            # TODO: as long as the new file backend is not finished
+            #       we are using the loadFiles function to insert symlinks.
+            #       Therefore, I am setting the files to None here.
+            #       Otherwise, the symlinks in the database would be replaced
+            #       by uploads of the files which we currently do not want to happen.
+
+
+def _check_record_types(crawled_data):
+    """Check for all parents in crawled_data whether they exist
+
+    raise an Error if they do not
+    """
+    rtsfinder = dict()
+
+    for elem in crawled_data:
+        # Check whether all needed RecordTypes exist:
+        if len(elem.parents) > 0:
+            for parent in elem.parents:
+                if parent.name in rtsfinder:
+                    continue
+
+                rt = db.RecordType(name=parent.name)
+                try:
+                    rt.retrieve()
+                    rtsfinder[parent.name] = True
+                except db.TransactionError:
+                    rtsfinder[parent.name] = False
+
+    notfound = [k for k, v in rtsfinder.items() if not v]
+    if len(notfound) > 0:
+        raise RuntimeError("Missing RecordTypes: {}". format(", ".join(notfound)))
+
+
+def _store_dry_run_data(ins, upd):
+    """Write inserts and updates to a file."""
+    inserts = [str(i) for i in ins]
+    updates = [str(i) for i in upd]
+    with open("dry.yml", "w") as f:
+        f.write(yaml.dump({
+            "insert": inserts,
+            "update": updates}))
+
+
+def crawler_main(crawled_directory_path: Union[str, list[str]],
                  cfood_file_name: str,
-                 identifiables_definition_file: str = None,
+                 identifiables_definition_file: Optional[str] = None,
                  debug: bool = False,
-                 provenance_file: str = None,
+                 provenance_file: Optional[str] = None,
                  dry_run: bool = False,
                  prefix: str = "",
                  securityMode: SecurityMode = SecurityMode.UPDATE,
-                 unique_names=True,
+                 unique_names: bool = True,
+                 restricted_path: Optional[list[str]] = None,
+                 remove_prefix: Optional[str] = None,
+                 add_prefix: Optional[str] = None,
+                 sss_max_log_level: Optional[int] = None,
                  ):
     """

    Parameters
    ----------
-    crawled_directory_path : str
-        path to be crawled
+    crawled_directory_path : str or list[str]
+        path(s) to be crawled
     cfood_file_name : str
         filename of the cfood to be used
     identifiables_definition_file : str
         filename of an identifiable definition yaml file
     debug : bool
-        whether or not to run in debug mode
+        DEPRECATED, use a provenance file instead.
     provenance_file : str
-        provenance information will be stored in a file with given filename
+        Provenance information will be stored in a file with given filename
     dry_run : bool
         do not commit any changes to the server
     prefix : str
-        remove the given prefix from file paths
+        DEPRECATED, remove the given prefix from file paths
     securityMode : int
         securityMode of Crawler
     unique_names : bool
         whether or not to update or insert entities in spite of name conflicts
+    restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal. See docstring of 'scanner' in
+            module 'scanner' for more details.
+    remove_prefix : Optional[str]
+        Remove the given prefix from file paths.
+        See docstring of '_fix_file_paths' for more details.
+ add_prefix : Optional[str] + Add the given prefix to file paths. + See docstring of '_fix_file_paths' for more details. + sss_max_log_level : Optional[int] + If given, set the maximum log level of the server-side + scripting log separately from the general ``debug`` option. If + None is given, the maximum sss log level will be determined + from the value of ``debug``: ``logging.INFO`` if ``debug`` is + False, ``logging.DEBUG`` if ``debug`` is True. Returns ------- return_value : int 0 if successful """ - crawler = Crawler(debug=debug, securityMode=securityMode) try: - crawler.crawl_directory(crawled_directory_path, cfood_file_name) - except ConverterValidationError as err: - print(err) - return 1 - if provenance_file is not None: - crawler.save_debug_data(provenance_file) - - if identifiables_definition_file is not None: - - ident = CaosDBIdentifiableAdapter() - ident.load_from_yaml_definition(identifiables_definition_file) - crawler.identifiableAdapter = ident - - if dry_run: - ins, upd = crawler.synchronize(commit_changes=False) - inserts = [str(i) for i in ins] - updates = [str(i) for i in upd] - with open("dry.yml", "w") as f: - f.write(yaml.dump({ - "insert": inserts, - "update": updates})) - else: - rtsfinder = dict() - for elem in crawler.crawled_data: - if isinstance(elem, db.File): - # correct the file path: - # elem.file = os.path.join(args.path, elem.file) - if prefix is None: - raise RuntimeError( - "No prefix set. Prefix must be set if files are used.") - if elem.path.startswith(prefix): - elem.path = elem.path[len(prefix):] - elem.file = None - # TODO: as long as the new file backend is not finished - # we are using the loadFiles function to insert symlinks. - # Therefore, I am setting the files to None here. - # Otherwise, the symlinks in the database would be replaced - # by uploads of the files which we currently do not want to happen. - - # Check whether all needed RecordTypes exist: - if len(elem.parents) > 0: - for parent in elem.parents: - if parent.name in rtsfinder: - continue - - rt = db.RecordType(name=parent.name) - try: - rt.retrieve() - rtsfinder[parent.name] = True - except db.TransactionError: - rtsfinder[parent.name] = False - notfound = [k for k, v in rtsfinder.items() if not v] - if len(notfound) > 0: - raise RuntimeError("Missing RecordTypes: {}". 
format(", ".join(notfound)))
+        crawler = Crawler(securityMode=securityMode)
+
+        if "SHARED_DIR" in os.environ:  # setup logging and reporting for server-side execution
+            if sss_max_log_level is None:
+                sss_max_log_level = logging.DEBUG if debug else logging.INFO
+            userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging(
+                max_log_level=sss_max_log_level
+            )
+            # TODO make this optional
+            _create_status_record(
+                get_shared_resource_link(get_config_setting("public_host_url"), htmluserlog_public),
+                crawler.run_id)
+        else:  # setup stdout logging for other cases
+            root_logger = logging.getLogger()
+            root_logger.setLevel(level=(logging.DEBUG if debug else logging.INFO))
+            handler = logging.StreamHandler(stream=sys.stdout)
+            handler.setLevel(logging.DEBUG if debug else logging.INFO)
+            root_logger.addHandler(handler)
+            logger.handlers.clear()
+
+        debug_tree = DebugTree()
+        crawled_data = scan_directory(
+            crawled_directory_path, cfood_file_name, restricted_path, debug_tree=debug_tree)
+        _fix_file_paths(crawled_data, add_prefix, remove_prefix)
+        _check_record_types(crawled_data)
+
+        if provenance_file is not None:
+            crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file)
+
+        if identifiables_definition_file is not None:
+            ident = CaosDBIdentifiableAdapter()
+            ident.load_from_yaml_definition(identifiables_definition_file)
+            crawler.identifiableAdapter = ident
+        else:
+            # TODO
+            # raise ValueError("An identifiable file is needed.")
+            pass
+
+        remove_prefix = _treat_deprecated_prefix(prefix, remove_prefix)

-        crawler.synchronize(commit_changes=True, unique_names=unique_names)
-    return 0
+        if dry_run:
+            inserts, updates = crawler.synchronize(commit_changes=False, crawled_data=crawled_data)
+            _store_dry_run_data(inserts, updates)
+        else:
+            inserts, updates = crawler.synchronize(commit_changes=True, unique_names=unique_names,
+                                                   crawled_data=crawled_data,
+                                                   path_for_authorized_run=crawled_directory_path)
+            if "SHARED_DIR" in os.environ:
+                _notify_about_inserts_and_updates(len(inserts), len(updates), userlog_public,
+                                                  crawler.run_id)
+            _update_status_record(crawler.run_id, len(inserts), len(updates), status="OK")
+        return 0
+    except Exception as err:
+        logger.debug(traceback.format_exc())
+        logger.error(err)
+        # Special treatment for known error types
+        if isinstance(err, ImpossibleMergeError):
+            logger.error(
+                "Encountered conflicting information when creating Records from the crawled "
+                f"data:\n\n{err}"
+            )
+        elif isinstance(err, TransactionError):
+            logger.error("Transaction error details:")
+            for suberr in err.errors:
+                logger.error("---")
+                logger.error(suberr.msg)
+                logger.error(suberr.entity)
+        # Unknown errors get a special message
+        elif not isinstance(err, (ConverterValidationError, ForbiddenTransaction)):
+            if "SHARED_DIR" in os.environ:
+                # pylint: disable=E0601
+                domain = get_config_setting("public_host_url")
+                logger.error("Unexpected Error: Please tell your administrator about this and provide "
+                             f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}")
+        _update_status_record(crawler.run_id, 0, 0, status="FAILED")
+        return 1


 def parse_args():
@@ -1346,11 +1152,21 @@ def parse_args():
                                     formatter_class=RawTextHelpFormatter)
     parser.add_argument("cfood_file_name",
                         help="Path name of the cfood yaml file to be used.")
+    mg = parser.add_mutually_exclusive_group()
+    mg.add_argument("-r", "--restrict", nargs="*",
+                    help="Restrict the crawling to the subtree at the end of the given path. "
+                         "I.e. 
for each level that is given the crawler only treats the element "
+                         "with the given name.")
+    mg.add_argument("--restrict-path", help="same as restrict; instead of a list, this takes a "
+                    "single string that is interpreted as a file system path. Note that a trailing "
+                    "separator (e.g. '/') will be ignored. Use --restrict if you need to have "
+                    "empty strings.")
     parser.add_argument("--provenance", required=False,
                         help="Path name of the provenance yaml file. "
                              "This file will only be generated if this option is set.")
     parser.add_argument("--debug", required=False, action="store_true",
                         help="Run in debug mode.")
+    # TODO allow to provide multiple directories to be crawled on the commandline
     parser.add_argument("crawled_directory_path",
                         help="The subtree of files below the given path will "
                              "be considered. Use '/' for everything.")
@@ -1371,18 +1187,41 @@ def parse_args():
     parser.add_argument("-u", "--unique-names",
                         help="Insert or updates entities even if name conflicts exist.")
     parser.add_argument("-p", "--prefix",
-                        help="Remove the given prefix from the paths "
-                        "of all file objects.")
+                        help="DEPRECATED, use --remove-prefix instead. Remove the given prefix "
+                        "from the paths of all file objects.")
+    parser.add_argument("--remove-prefix",
+                        help="Remove the given prefix from the paths of all file objects.")
+    parser.add_argument("--add-prefix",
+                        help="Add the given prefix to the paths of all file objects.")

     return parser.parse_args()


+def split_restricted_path(path):
+    """
+    Split a path string into components separated by os.path.sep (e.g. slashes).
+    Empty elements will be removed.
+    """
+    # This implementation leads to infinite loops
+    # for "ill-posed" paths (see test_utilities.py):
+    #   elements = []
+    #   while path != "/":
+    #       path, el = os.path.split(path)
+    #       if el != "":
+    #           elements.insert(0, el)
+    return [i for i in path.split(os.path.sep) if i != ""]
+
+
 def main():
     args = parse_args()

     conlogger = logging.getLogger("connection")
     conlogger.setLevel(level=logging.ERROR)

+    if args.prefix:
+        print("Please use '--remove-prefix' option instead of '--prefix' or '-p'.")
+        return -1
+
     # logging config for local execution
     logger.addHandler(logging.StreamHandler(sys.stdout))
     if args.debug:
@@ -1392,6 +1231,12 @@ def main():
     if args.add_cwd_to_path:
         sys.path.append(os.path.abspath("."))

+    restricted_path = None
+    if args.restrict_path:
+        restricted_path = split_restricted_path(args.restrict_path)
+    if args.restrict:
+        restricted_path = args.restrict
+
     sys.exit(crawler_main(
         crawled_directory_path=args.crawled_directory_path,
         cfood_file_name=args.cfood_file_name,
@@ -1399,11 +1244,13 @@ def main():
         debug=args.debug,
         provenance_file=args.provenance,
         dry_run=args.dry_run,
-        prefix=args.prefix,
         securityMode={"retrieve": SecurityMode.RETRIEVE,
                       "insert": SecurityMode.INSERT,
                       "update": SecurityMode.UPDATE}[args.security_mode],
         unique_names=args.unique_names,
+        restricted_path=restricted_path,
+        remove_prefix=args.remove_prefix,
+        add_prefix=args.add_prefix,
     ))


diff --git a/src/caoscrawler/debug_tree.py b/src/caoscrawler/debug_tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..c154f5b91d850476be0c0610e5bb1dfcbf9866ab
--- /dev/null
+++ b/src/caoscrawler/debug_tree.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+# +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +A structure containing debug tree information. +""" + +from __future__ import annotations + +from collections import defaultdict + +import linkahead as db +import yaml +from importlib_resources import files +from jsonschema import validate +from linkahead.apiutils import (EntityMergeConflictError, compare_entities, + merge_entities) +from linkahead.common.datatype import is_reference + +from .converters import Converter, ConverterValidationError, DirectoryConverter +from .macros import defmacro_constructor, macro_constructor +from .stores import GeneralStore, RecordStore, Store +from .structure_elements import Directory, NoneElement, StructureElement +from .version import check_cfood_version + + +class DebugTree(object): + + def __init__(self): + # order in the tuple: + # 0: general_store + # 1: record_store + self.debug_tree: dict[str, tuple] = dict() + self.debug_metadata: dict[str, dict] = dict() + self.debug_metadata["copied"] = dict() + self.debug_metadata["provenance"] = defaultdict(lambda: dict()) + self.debug_metadata["usage"] = defaultdict(lambda: set()) + + # TODO: turn the tuple into two individual elements diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml new file mode 100644 index 0000000000000000000000000000000000000000..656b0ba0f1f76007266cc8b2e75f5bd7046f1206 --- /dev/null +++ b/src/caoscrawler/default_converters.yml @@ -0,0 +1,117 @@ +# ------------------------- +# Base Types +# ------------------------- + +BooleanElement: + converter: BooleanElementConverter + package: caoscrawler.converters +Date: + converter: DateElementConverter + package: caoscrawler.converters +Datetime: + converter: DatetimeElementConverter + package: caoscrawler.converters +Dict: + converter: DictElementConverter + package: caoscrawler.converters +PropertiesFromDictElement: + converter: PropertiesFromDictConverter + package: caoscrawler.converters +FloatElement: + converter: FloatElementConverter + package: caoscrawler.converters +IntegerElement: + converter: IntegerElementConverter + package: caoscrawler.converters +ListElement: + converter: ListElementConverter + package: caoscrawler.converters +TextElement: + converter: TextElementConverter + package: caoscrawler.converters + + +DictDictElement: # deprecated + converter: DictElementConverter + package: caoscrawler.converters +DictElement: # deprecated + converter: DictElementConverter + package: caoscrawler.converters +DictBooleanElement: # deprecated + converter: BooleanElementConverter + package: caoscrawler.converters +DictFloatElement: # deprecated + converter: FloatElementConverter + package: caoscrawler.converters +DictIntegerElement: # deprecated + converter: IntegerElementConverter + package: caoscrawler.converters 
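The YAML entries in this registry file are resolved to Python classes at runtime; the resolution logic appears in `create_converter_registry` in `scanner.py` further below. A minimal sketch of that lookup, assuming `caoscrawler` is installed:

```python
import importlib

import yaml

# One registry entry, in the same shape as in default_converters.yml.
registry = yaml.safe_load("""
Directory:
  converter: DirectoryConverter
  package: caoscrawler.converters
""")

# Resolve each entry to its class, mirroring create_converter_registry below.
for key, value in registry.items():
    module = importlib.import_module(value["package"])
    value["class"] = getattr(module, value["converter"])
```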
+DictListElement: # deprecated + converter: ListElementConverter + package: caoscrawler.converters +DictTextElement: # deprecated + converter: TextElementConverter + package: caoscrawler.converters + +# ------------------------- +# Directories and Files +# ------------------------- + +Directory: + converter: DirectoryConverter + package: caoscrawler.converters + + +File: # deprecated + converter: SimpleFileConverter + package: caoscrawler.converters + + +SimpleFile: + converter: SimpleFileConverter + package: caoscrawler.converters + +MarkdownFile: + converter: MarkdownFileConverter + package: caoscrawler.converters + +YAMLFile: + converter: YAMLFileConverter + package: caoscrawler.converters + +JSONFile: + converter: JSONFileConverter + package: caoscrawler.converters + +CSVTableConverter: + converter: CSVTableConverter + package: caoscrawler.converters + +SPSSFile: + converter: SPSSConverter + package: caoscrawler.converters + +XLSXTableConverter: + converter: XLSXTableConverter + package: caoscrawler.converters + + +# ------------------------- +# XML +# ------------------------- + +XMLFile: + converter: XMLFileConverter + package: caoscrawler.converters + +XMLTag: + converter: XMLTagConverter + package: caoscrawler.converters + +XMLTextNode: + converter: XMLTextNodeConverter + package: caoscrawler.converters + +XMLAttributeNode: + converter: XMLAttributeNodeConverter + package: caoscrawler.converters diff --git a/src/caoscrawler/default_transformers.yml b/src/caoscrawler/default_transformers.yml new file mode 100644 index 0000000000000000000000000000000000000000..0de9a6e0585c5246fa5a21ffcbdfc37cfdc2b88d --- /dev/null +++ b/src/caoscrawler/default_transformers.yml @@ -0,0 +1,29 @@ +# Lookup table for matching functions and cfood yaml node names. + +submatch: + package: caoscrawler.transformer_functions + function: submatch +split: + package: caoscrawler.transformer_functions + function: split +replace: + package: caoscrawler.transformer_functions + function: replace +date_parse: + package: caoscrawler.transformer_functions + function: date_parse +datetime_parse: + package: caoscrawler.transformer_functions + function: datetime_parse +cast_to_int: + package: caoscrawler.transformer_functions + function: cast_to_int +cast_to_float: + package: caoscrawler.transformer_functions + function: cast_to_float +cast_to_bool: + package: caoscrawler.transformer_functions + function: cast_to_bool +cast_to_str: + package: caoscrawler.transformer_functions + function: cast_to_str diff --git a/src/caoscrawler/exceptions.py b/src/caoscrawler/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..b9b94e1d4f9064701e8e05e22f5a0d3c6d3291a9 --- /dev/null +++ b/src/caoscrawler/exceptions.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
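Each entry in `default_transformers.yml` points at a plain function. According to the `scanner` docstring further below, such functions take the input value and a parameter dict and return the transformed value. A hypothetical custom transformer could look like this (the name `prefix_with` and the registration snippet in the comment are made up, not part of the shipped defaults):

```python
from typing import Any


def prefix_with(in_value: Any, in_parameters: dict) -> Any:
    """Illustrative custom transformer: prepend a fixed string to the value."""
    return in_parameters.get("prefix", "") + str(in_value)


# A cfood would register it analogously to the entries above, e.g.:
# Transformers:
#   prefix_with:
#     package: my_package.transformers
#     function: prefix_with
```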
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from typing import Any + + +class ForbiddenTransaction(Exception): + """Thrown if a transaction is needed that is not allowed, + for example an update of an entity if the security level is INSERT. + """ + pass + + +class ImpossibleMergeError(Exception): + """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes + should be merged, but there is conflicting information that prevents this. + + Parameters + ---------- + msg : str + A case-specific error message describing where the merge error occurred. + pname : str + The name of the property the values of which caused the merge error. + value_a, value_b : Any + The two values that couldn't be merged. + + Attributes + ---------- + message : str + A case-specific error message describing where the merge error occurred. + values : tuple[Any] + The two values that couldn't be merged. + pname : str + The name of the property the values of which caused the merge error. + """ + + def __init__(self, msg: str, pname: str, value_a: Any, value_b: Any): + self.pname = pname + self.values = (value_a, value_b) + self.message = msg + super().__init__(msg) + + def __str__(self): + return ( + f"{self.message}\n\nThe problematic property is '{self.pname}' with " + f"values '{self.values[0]}' and '{self.values[1]}'." + ) + + def __repr__(self): + return self.__str__() + + +class InvalidIdentifiableYAML(Exception): + """Thrown if the identifiable definition is invalid.""" + pass + + +class MissingIdentifyingProperty(Exception): + """Thrown if a SyncNode does not have the properties required by the corresponding registered + identifiable. + """ + pass + + +class MissingRecordType(Exception): + """Thrown if a record type cannot be found although it is expected that it exists on the + server. + """ + pass + + +class MissingReferencingEntityError(Exception): + """Thrown if the identifiable requires that some entity references the given entity but there + is no such reference. """ + + def __init__(self, *args, rts=None, **kwargs): + self.rts = rts + super().__init__(*args, **kwargs) diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 7ff7172576be08e068ba412f319b059fb349bbeb..cd52effb954d66bcc69b7296de77ddaf7b2b8394 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2022 Henrik tom Wörden # @@ -20,20 +20,27 @@ # from __future__ import annotations -import caosdb as db -from datetime import datetime + import json +import logging +from datetime import datetime from hashlib import sha256 -from typing import Union +from typing import Optional, Union + +import linkahead as db + +from .exceptions import MissingIdentifyingProperty +from .sync_node import SyncNode + +logger = logging.getLogger(__name__) class Identifiable(): """ - The fingerprint of a Record in CaosDB. + The fingerprint of a Record in LinkAhead. - This class contains the information that is used by the CaosDB Crawler to identify Records. - On one hand, this can be the ID or a Record or the path of a File. 
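The exception types introduced above are exactly what the `crawler_main` error handling shown earlier special-cases. A minimal sketch of raising and reporting an `ImpossibleMergeError` (the property name and values are made up):

```python
from caoscrawler.exceptions import ImpossibleMergeError

try:
    # Hypothetical merge step: two SyncNodes disagree on the value of "date".
    raise ImpossibleMergeError(
        "Conflict while merging two SyncNodes.",
        pname="date", value_a="2023-01-01", value_b="2024-01-01",
    )
except ImpossibleMergeError as err:
    # __str__ appends the property name and both conflicting values.
    print(err)
    print(err.pname, err.values)
```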
- On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can + This class contains the information that is used by the LinkAhead Crawler to identify Records. + In order to check whether a Record exists in the LinkAhead Server, a query can be created using the information contained in the Identifiable. Parameters @@ -43,25 +50,26 @@ class Identifiable(): properties: dict, keys are names of Properties; values are Property values Note, that lists are not checked for equality but are interpreted as multiple conditions for a single Property. - path: str, In case of files: The path where the file is stored. backrefs: list, TODO future """ - def __init__(self, record_id: int = None, path: str = None, record_type: str = None, - name: str = None, properties: dict = None, - backrefs: list[Union[int, str]] = None): - if (record_id is None and path is None and name is None + def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None, + name: Optional[str] = None, properties: Optional[dict] = None, + backrefs: Optional[list[Union[int, str]]] = None): + if (record_id is None and name is None and (backrefs is None or len(backrefs) == 0) and (properties is None or len(properties) == 0)): - raise ValueError("There is no identifying information. You need to add a path or " - "properties or other identifying attributes.") + raise ValueError( + "There is no identifying information. You need to add " + "properties or other identifying attributes.") if properties is not None and 'name' in [k.lower() for k in properties.keys()]: raise ValueError("Please use the separete 'name' keyword instead of the properties " "dict for name") self.record_id = record_id - self.path = path self.record_type = record_type self.name = name + if name == "": + self.name = None self.properties: dict = {} if properties is not None: self.properties = properties @@ -76,20 +84,17 @@ class Identifiable(): def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function - The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities - (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int - and str. + The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID) + and the string representation of None, bool, float, int, datetime and str. """ if value is None: return "None" - elif isinstance(value, db.File): - return str(value.path) - elif isinstance(value, db.Entity): + elif isinstance(value, SyncNode): if value.id is not None: return str(value.id) else: - return "PyID=" + str(id(value)) + raise RuntimeError("Python Entity (SyncNode) without ID not allowed") elif isinstance(value, list): return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]" elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float) @@ -115,27 +120,20 @@ class Identifiable(): return rec_string def __eq__(self, other) -> bool: - """ - Identifiables are equal if they belong to the same Record. Since ID and path are on their - own enough to identify the Record it is sufficient if those attributes are equal. - 1. both IDs are set (not None) -> equal if IDs are equal - 2. both paths are set (not None) -> equal if paths are equal - 3. 
equal if attribute representations are equal - """ + """ Identifiables are equal if they share the same ID or if the representation is equal """ if not isinstance(other, Identifiable): raise ValueError("Identifiable can only be compared to other Identifiable objects.") - elif self.record_id is not None and other.record_id is not None: + if self.record_id is not None and other.record_id is not None: return self.record_id == other.record_id - elif self.path is not None and other.path is not None: - return self.path == other.path elif self.get_representation() == other.get_representation(): return True else: return False def __repr__(self): - pstring = json.dumps(self.properties) + """ deterministic text representation of the identifiable """ + pstring = json.dumps({k: str(v) for k, v in self.properties.items()}) return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; " - f"name={self.name}\n\tpath={self.path}\n" + f"name={self.name}\n" f"\tproperties:\n{pstring}\n" f"\tbackrefs:\n{self.backrefs}") diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 40c801547a85afaf32e1ab6a668bc47d98d60b66..444b73f5d9a42cf8ec23eec7cb718b1fc183dd30 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -2,7 +2,7 @@ # encoding: utf-8 # # ** header v3.0 -# This file is a part of the CaosDB Project. +# This file is a part of the LinkAhead Project. # # Copyright (C) 2021-2022 Henrik tom Wörden # 2021-2022 Alexander Schlemmer @@ -24,30 +24,58 @@ # from __future__ import annotations -import yaml +import logging +import warnings +from abc import ABCMeta, abstractmethod from datetime import datetime from typing import Any + +import linkahead as db +import yaml +from linkahead.cached import cached_get_entity_by, cached_query +from linkahead.utils.escape import escape_squoted_text + +from .exceptions import (InvalidIdentifiableYAML, MissingIdentifyingProperty, + MissingRecordType, MissingReferencingEntityError) from .identifiable import Identifiable -import caosdb as db -import logging -from abc import abstractmethod, ABCMeta +from .sync_node import SyncNode from .utils import has_parent + logger = logging.getLogger(__name__) -def convert_value(value: Any): - """ Returns a string representation of the value that is suitable - to be used in the query - looking for the identified record. +def _retrieve_RecordType(id=None, name=None): + """ + Retrieve the RecordType from LinkAhead. For mocking purposes. + """ + return db.RecordType(name=name, id=id).retrieve() + + +def get_children_of_rt(rtname): + """Supply the name of a recordtype. This name and the names of all child RTs are returned in + a list""" + escaped = escape_squoted_text(rtname) + recordtypes = [p.name for p in cached_query(f"FIND RECORDTYPE '{escaped}'")] + if not recordtypes: + raise MissingRecordType(f"Record type could not be found on server: {rtname}") + return recordtypes + + +def convert_value(value: Any) -> str: + """Return a string representation of the value suitable for the search query. + + This is for search queries looking for the identified record. Parameters ---------- - value : Any type, the value that shall be returned and potentially converted. + value: Any + The value to be converted. Returns ------- - out : the string reprensentation of the value + out: str + the string representation of the value. 
""" @@ -58,77 +86,174 @@ def convert_value(value: Any): elif isinstance(value, bool): return str(value).upper() elif isinstance(value, str): - # replace single quotes, otherwise they may break the queries - return value.replace("\'", "\\'") + return escape_squoted_text(value) else: return str(value) class IdentifiableAdapter(metaclass=ABCMeta): - """ - Base class for identifiable adapters. + """Base class for identifiable adapters. Some terms: - - Registered identifiable is the definition of an identifiable which is: - - A record type as the parent - - A list of properties - - A list of referenced by statements - - - Identifiable is the concrete identifiable, e.g. the Record based on - the registered identifiable with all the values filled in. - - Identified record is the result of retrieving a record based on the - identifiable from the database. + - A *registered identifiable* defines an identifiable template, for example by specifying: + - Parent record types + - Properties + - ``is_referenced_by`` statements + - An *identifiable* belongs to a concrete record. It consists of identifying attributes which + "fill in" the *registered identifiable*. In code, it can be represented as a Record based on + the *registered identifiable* with all the values filled in. + - An *identified record* is the result of retrieving a record from the database, based on the + *identifiable* (and its values). General question to clarify: - Do we want to support multiple identifiables per RecordType? - Current implementation supports only one identifiable per RecordType. + + - Do we want to support multiple identifiables per RecordType? + - Current implementation supports only one identifiable per RecordType. The list of referenced by statements is currently not implemented. - The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred + The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered identifiabel, identifiable and identified record) for a Record. + """ @staticmethod - def create_query_for_identifiable(ident: Identifiable): + def create_query_for_identifiable(ident: Identifiable, startswith: bool = False): """ This function is taken from the old crawler: caosdb-advanced-user-tools/src/caosadvancedtools/crawler.py uses the properties of ident to create a query that can determine whether the required record already exists. + + If ``startswith`` is True, use ``LIKE`` for long string values to test if the strings starts + with the first 200 characters of the value. """ query_string = "FIND RECORD " if ident.record_type is not None: - query_string += ident.record_type + escaped_rt = escape_squoted_text(ident.record_type) + query_string += f"'{escaped_rt}'" for ref in ident.backrefs: eid = ref if isinstance(ref, db.Entity): eid = ref.id - query_string += (" WHICH IS REFERENCED BY " + str(eid) + " AND") + query_string += " WHICH IS REFERENCED BY " + str(eid) + " AND" query_string += " WITH " if ident.name is not None: - query_string += "name='{}'".format(ident.name) + query_string += "name='{}'".format(escape_squoted_text(ident.name)) if len(ident.properties) > 0: query_string += " AND " - query_string += IdentifiableAdapter.create_property_query(ident) + query_string += IdentifiableAdapter.create_property_query( + ident, startswith=startswith + ) + + # TODO Can these cases happen at all with the current code? 
+ if query_string.endswith(" AND WITH "): + query_string = query_string[: -len(" AND WITH ")] + if query_string.endswith(" AND "): + query_string = query_string[: -len(" AND ")] return query_string + def all_identifying_properties_exist( + self, node: SyncNode, raise_exception: bool = True + ): + """checks whether all identifying properties exist and raises an error if + that's not the case. It furthermore raises an error if "name" is part of + the identifiable, but the node does not have a name. + + If raise_exception is False, the function returns False instead of raising an error. + + Backreferences are not checked. + + Returns True if all identifying properties exist. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + if node.registered_identifiable is None: + if raise_exception: + parents = [p.name for p in node.parents] + parents_str = "\n".join(f"- {p}" for p in parents) + raise RuntimeError("No registered identifiable for node with these parents:\n" + + parents_str) + else: + return False + for prop in node.registered_identifiable.properties: + if prop.name.lower() == "is_referenced_by": + continue + if prop.name.lower() == "name": + if node.name is None: + if raise_exception: + i = MissingIdentifyingProperty("The node has no name.") + i.prop = "name" + raise i + else: + return False + else: + continue + + # multiple occurrences are ok here. We deal with that when actually creating an + # identifiable (IDs of referenced Entities might need to get resolved first). + if ( + len( + [ + el + for el in node.properties + if el.name.lower() == prop.name.lower() + ] + ) + == 0 + ): + if raise_exception: + i = MissingIdentifyingProperty( + f"The property {prop.name} is missing." + ) + i.prop = prop.name + raise i + else: + return False + + return True + @staticmethod - def create_property_query(entity: Identifiable): + def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): + """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. + + If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` + operator will be used to find entities matching at the beginning. + """ + if startswith and isinstance(pvalue, str) and len(pvalue) > 200: + operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" + else: + operator_value_str = "='" + convert_value(pvalue) + "'" + result = "'" + escape_squoted_text(pname) + "'" + operator_value_str + return result + + @staticmethod + def create_property_query(entity: Identifiable, startswith: bool = False): + """Create a POV query part with the entity's properties. + + Parameters + ---------- + + entity: Identifiable + The Identifiable whose properties shall be used. + + startswith: bool, optional + If True, check string typed properties against the first 200 characters only. Default is False. 
+ """ query_string = "" + pov = IdentifiableAdapter.__create_pov_snippet # Shortcut for pname, pvalue in entity.properties.items(): if pvalue is None: - query_string += "'" + pname + "' IS NULL AND " + query_string += "'" + escape_squoted_text(pname) + "' IS NULL AND " elif isinstance(pvalue, list): for v in pvalue: - query_string += ("'" + pname + "'='" + - convert_value(v) + "' AND ") + query_string += pov(pname, v, startswith=startswith) + " AND " # TODO: (for review) # This code would allow for more complex identifiables with @@ -141,111 +266,148 @@ class IdentifiableAdapter(metaclass=ABCMeta): # IdentifiableAdapter.create_property_query(p.value) + # ") AND ") else: - query_string += ("'" + pname + "'='" + - convert_value(pvalue) + "' AND ") + query_string += pov(pname, pvalue, startswith=startswith) + " AND " # remove the last AND return query_string[:-4] @abstractmethod - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ Check whether an identifiable is registered for this record and return its definition. If there is no identifiable registered, return None. """ pass - @abstractmethod - def resolve_reference(self, record: db.Record): - pass - @abstractmethod def get_file(self, identifiable: db.File): + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) """ Retrieve the file object for a (File) identifiable. """ pass - def get_identifiable(self, record: db.Record, referencing_entities=None): + @staticmethod + def get_identifying_referenced_entities(record, registered_identifiable): + """Create a list of all entities that are referenced by record + and that are used as identying properties of the identifiable. + + Last review by Alexander Schlemmer on 2024-05-29. """ - retrieve the registred identifiable and fill the property values to create an - identifiable + refs = [] + for prop in registered_identifiable.properties: + pname = prop.name.lower() + if pname == "name" or pname == "is_referenced_by": + continue + if record.get_property(prop.name) is None: + logger.error(f"Record with missing identifying property:\n{record}\n" + f"This property is missing: {prop.name}\n") + raise RuntimeError("Missing identifying Property") + pval = record.get_property(prop.name).value + if not isinstance(prop.value, list): + pval = [prop.value] + for val in pval: + if isinstance(val, db.Entity): + refs.append(val) + return refs + + def get_identifiable(self, se: SyncNode, identifiable_backrefs: set[SyncNode]) -> Identifiable: + """ + Take the registered identifiable of given SyncNode ``se`` and fill the property values to + create an identifiable. Args: - record: the record for which the Identifiable shall be created. - referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that - allows to look up entities with a certain RecordType, that reference ``record`` + se: the SyncNode for which the Identifiable shall be created. + identifiable_backrefs: a set (Type: set[SyncNode]), that contains SyncNodes + with a certain RecordType, that reference ``se`` Returns: Identifiable, the identifiable for record. - """ - registered_identifiable = self.get_registered_identifiable(record) - - if referencing_entities is None: - referencing_entities = {} + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ property_name_list_A = [] - property_name_list_B = [] identifiable_props = {} - identifiable_backrefs = [] + name = None + + if se.registered_identifiable is None: + raise ValueError("no registered_identifiable") + + # fill the values: + for prop in se.registered_identifiable.properties: + # TDOO: + # If there are multiproperties in the registered_identifiable, then only the LAST is + # taken into account (later properties overwrite previous one in the dict below). + if prop.name == "name": + name = se.name + continue + + if prop.name.lower() == "is_referenced_by": + for el in identifiable_backrefs: + if not isinstance(el, SyncNode): + raise ValueError("Elements of `identifiable_backrefs` must be SyncNodes") + if len(identifiable_backrefs) == 0: + raise MissingReferencingEntityError( + f"Could not find referencing entities of type(s): {prop.value}\n" + f"for registered identifiable:\n{se.registered_identifiable}\n" + f"There were {len(identifiable_backrefs)} referencing entities to " + "choose from.\n" + f"This error can also occur in case of merge conflicts in the referencing" + " entities." + ) + elif len([e.id for e in identifiable_backrefs if el.id is None]) > 0: + raise RuntimeError("Referencing entity has no id") + # At this point we know that there is at least one referencing SyncNode + # with an ID. We do not need to set any property value (the reference will be used + # in the backrefs argument below) and can thus continue with the next identifying + # property + continue + + options = [p.value for p in se.properties if p.name.lower() == prop.name.lower()] + if len(options) == 0: + raise MissingIdentifyingProperty( + f"The following record is missing an identifying property:\n" + f"RECORD\n{se}\nIdentifying PROPERTY\n{prop.name}" + ) + for ii, el in enumerate(options): + if isinstance(el, SyncNode): + options[ii] = el.id + if el.id is None: + raise RuntimeError( + "Reference to unchecked in identifiable:\n" + f"{prop.name}:\n{el}" + ) + else: + options[ii] = el + if not all([f == options[0] for f in options]): + raise RuntimeError("differing prop values ") - if registered_identifiable is not None: - # fill the values: - for prop in registered_identifiable.properties: - if prop.name == "name": - # The name can be an identifiable, but it isn't a property - continue - # problem: what happens with multi properties? - # case A: in the registered identifiable - # case B: in the identifiable - - # TODO: similar to the Identifiable class, Registred Identifiable should be a - # separate class too - if prop.name.lower() == "is_referenced_by": - for rtname in prop.value: - if (id(record) in referencing_entities - and rtname in referencing_entities[id(record)]): - identifiable_backrefs.extend(referencing_entities[id(record)][rtname]) - else: - # TODO: is this the appropriate error? - raise NotImplementedError( - f"The following record is missing an identifying property:" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) - continue + identifiable_props[prop.name] = options[0] + property_name_list_A.append(prop.name) - record_prop = record.get_property(prop.name) - if record_prop is None: - # TODO: how to handle missing values in identifiables - # raise an exception? - # TODO: is this the appropriate error? 
- raise NotImplementedError( - f"The following record is missing an identifying property:" - f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}" - ) - identifiable_props[record_prop.name] = record_prop.value - property_name_list_A.append(prop.name) - - # check for multi properties in the record: - for prop in property_name_list_A: - property_name_list_B.append(prop) - if (len(set(property_name_list_B)) != len(property_name_list_B) or len( - set(property_name_list_A)) != len(property_name_list_A)): - raise RuntimeError( - "Multi properties used in identifiables could cause unpredictable results and " - "are not allowed. You might want to consider a Property with a list as value.") - - # use the RecordType of the registred Identifiable if it exists + # check for multi properties in the record: + if len(set(property_name_list_A)) != len(property_name_list_A): + raise RuntimeError( + "Multi properties used in identifiables could cause unpredictable results and " + "are not allowed. You might want to consider a Property with a list as value." + ) + + # use the RecordType of the registered Identifiable if it exists # We do not use parents of Record because it might have multiple - return Identifiable( - record_id=record.id, - record_type=(registered_identifiable.parents[0].name - if registered_identifiable else None), - name=record.name, - properties=identifiable_props, - path=record.path, - backrefs=identifiable_backrefs - ) + try: + return Identifiable( + record_id=se.id, + record_type=se.registered_identifiable.parents[0].name, + name=name, + properties=identifiable_props, + backrefs=[e.id for e in identifiable_backrefs], + ) + except Exception as exc: + logger.error(exc) + logger.error(f"Error while creating identifiable for this record:\n{se}") + raise @abstractmethod def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): @@ -259,23 +421,29 @@ class IdentifiableAdapter(metaclass=ABCMeta): """ pass - # TODO: remove side effect - # TODO: use ID if record has one? - def retrieve_identified_record_for_record(self, record: db.Record, referencing_entities=None): - """ - This function combines all functionality of the IdentifierAdapter by - returning the identifiable after having checked for an appropriate - registered identifiable. + @staticmethod + def referencing_entity_has_appropriate_type(parents, register_identifiable): + """returns true if one of the parents is listed by the 'is_referenced_by' property - In case there was no appropriate registered identifiable or no identifiable could - be found return value is None. - """ - identifiable = self.get_identifiable(record, referencing_entities=referencing_entities) + This function also returns True if 'is_referenced_by' contains the wildcard '*'. - if identifiable.path is not None: - return self.get_file(identifiable) + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + if register_identifiable.get_property("is_referenced_by") is None: + return False + if register_identifiable.get_property("is_referenced_by").value is None: + return False - return self.retrieve_identified_record_for_identifiable(identifiable) + appropriate_types = [] + for rt in register_identifiable.get_property("is_referenced_by").value: + appropriate_types.extend(get_children_of_rt(rt)) + appropriate_types = [el.lower() for el in appropriate_types] + if "*" in appropriate_types: + return True + for parent in parents: + if parent.name.lower() in appropriate_types: + return True + return False class LocalStorageIdentifiableAdapter(IdentifiableAdapter): @@ -284,6 +452,11 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): """ def __init__(self): + warnings.warn( + DeprecationWarning( + "This class is deprecated. Please use the CaosDBIdentifiableAdapter." + ) + ) self._registered_identifiables = dict() self._records = [] @@ -298,6 +471,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Just look in records for a file with the same path. """ candidates = [] + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) for record in self._records: if record.role == "File" and record.path == identifiable.path: candidates.append(record) @@ -309,15 +485,18 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): def store_state(self, filename): with open(filename, "w") as f: - f.write(db.common.utils.xml2str( - db.Container().extend(self._records).to_xml())) + f.write( + db.common.utils.xml2str(db.Container().extend(self._records).to_xml()) + ) def restore_state(self, filename): with open(filename, "r") as f: self._records = db.Container().from_xml(f.read()) # TODO: move to super class? - def is_identifiable_for_record(self, registered_identifiable: db.RecordType, record: db.Record): + def is_identifiable_for_record( + self, registered_identifiable: db.RecordType, record: db.Record + ): """ Check whether this registered_identifiable is an identifiable for the record. @@ -328,8 +507,7 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): Return True in that case and False otherwise. """ if len(registered_identifiable.parents) != 1: - raise RuntimeError( - "Multiple parents for identifiables not supported.") + raise RuntimeError("Multiple parents for identifiables not supported.") if not has_parent(record, registered_identifiable.parents[0].name): return False @@ -339,14 +517,13 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): return False return True - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): identifiable_candidates = [] for _, definition in self._registered_identifiables.items(): if self.is_identifiable_for_record(definition, record): identifiable_candidates.append(definition) if len(identifiable_candidates) > 1: - raise RuntimeError( - "Multiple candidates for an identifiable found.") + raise RuntimeError("Multiple candidates for an identifiable found.") if len(identifiable_candidates) == 0: return None return identifiable_candidates[0] @@ -361,8 +538,9 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): record is the record from the local database to check against. identifiable is the record that was created during the crawler run. 
""" - if (identifiable.record_type is not None - and not has_parent(record, identifiable.record_type)): + if identifiable.record_type is not None and not has_parent( + record, identifiable.record_type + ): return False for propname, propvalue in identifiable.properties.items(): prop_record = record.get_property(propname) @@ -391,27 +569,12 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter): candidates.append(record) if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously. Possible candidates are {candidates}") + f"Identifiable was not defined unambigiously. Possible candidates are {candidates}" + ) if len(candidates) == 0: return None return candidates[0] - def resolve_reference(self, value: db.Record): - if self.get_registered_identifiable(value) is None: - raise NotImplementedError("Non-identifiable references cannot" - " be used as properties in identifiables.") - # TODO: just resolve the entity - - value_identifiable = self.retrieve_identified_record_for_record(value) - if value_identifiable is None: - raise RuntimeError("The identifiable which is used as property" - " here has to be inserted first.") - - if value_identifiable.id is None: - raise RuntimeError("The entity has not been assigned an ID.") - - return value_identifiable.id - class CaosDBIdentifiableAdapter(IdentifiableAdapter): """ @@ -421,72 +584,139 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter): # TODO: don't store registered identifiables locally def __init__(self): - self._registered_identifiables = dict() + self._registered_identifiables = {} def load_from_yaml_definition(self, path: str): """Load identifiables defined in a yaml file""" - with open(path, 'r') as yaml_f: + with open(path, "r", encoding="utf-8") as yaml_f: identifiable_data = yaml.safe_load(yaml_f) + self.load_from_yaml_object(identifiable_data) - for key, value in identifiable_data.items(): - rt = db.RecordType().add_parent(key) - for prop_name in value: + def load_from_yaml_object(self, identifiable_data): + """Load identifiables defined in a yaml object. """ + + for rt_name, id_list in identifiable_data.items(): + rt = db.RecordType().add_parent(rt_name) + if not isinstance(id_list, list): + raise InvalidIdentifiableYAML( + f"Identifiable contents must be lists, but this was not: {rt_name}") + for prop_name in id_list: if isinstance(prop_name, str): rt.add_property(name=prop_name) elif isinstance(prop_name, dict): for k, v in prop_name.items(): + if k == "is_referenced_by" and not isinstance(v, list): + raise InvalidIdentifiableYAML( + f"'is_referenced_by' must be a list. Found in: {rt_name}") rt.add_property(name=k, value=v) else: - NotImplementedError("YAML is not structured correctly") + raise InvalidIdentifiableYAML( + "Identifiable properties must be str or dict, but this one was not:\n" + f" {rt_name}/{prop_name}") - self.register_identifiable(key, rt) + self.register_identifiable(rt_name, rt) def register_identifiable(self, name: str, definition: db.RecordType): - self._registered_identifiables[name] = definition + self._registered_identifiables[name.lower()] = definition def get_file(self, identifiable: Identifiable): + warnings.warn( + DeprecationWarning("This function is deprecated. Please do not use it.") + ) + # TODO is this needed for Identifiable? + # or can we get rid of this function? 
+ if isinstance(identifiable, db.Entity): + return cached_get_entity_by(path=identifiable) if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") - candidates = db.execute_query("FIND File which is stored at {}".format( - identifiable.path)) + candidates = cached_get_entity_by(path=identifiable.path) if len(candidates) > 1: raise RuntimeError("Identifiable was not defined unambigiously.") if len(candidates) == 0: return None return candidates[0] - def get_registered_identifiable(self, record: db.Record): + def get_registered_identifiable(self, record: db.Entity): """ - returns the registred identifiable for the given Record + returns the registered identifiable for the given Record It is assumed, that there is exactly one identifiable for each RecordType. Only the first parent of the given Record is considered; others are ignored """ if len(record.parents) == 0: return None - # TODO We need to treat the case where multiple parents exist properly. - rt_name = record.parents[0].name - for name, definition in self._registered_identifiables.items(): - if definition.parents[0].name.lower() == rt_name.lower(): - return definition + registered = [] + for parent in record.parents: + prt = _retrieve_RecordType(id=parent.id, name=parent.name) + reg = self._get_registered_for_rt(prt) + if reg is not None: + registered.append(reg) + # TODO we might in future want to check whether the registered identifiables are the same + if len(registered) > 1: + raise RuntimeError("Multiple registered identifiables found for a Record " + f"with the following parents: {record.parents}") + elif len(registered) == 1: + return registered[0] + else: + return None - def resolve_reference(self, record: db.Record): + def _get_registered_for_rt(self, rt: db.RecordType): """ - Current implementation just sets the id for this record - as a value. It needs to be verified that references all contain an ID. 
+ returns the registered identifiable for the given RecordType or the + registered identifiable of the first parent """ - if record.id is None: - return record - return record.id + if rt.name.lower() in self._registered_identifiables: + return self._registered_identifiables[rt.name.lower()] + if len(rt.parents) == 0: + return None + registered = [] + for parent in rt.parents: + prt = _retrieve_RecordType(id=parent.id, name=parent.name) + reg = self._get_registered_for_rt(prt) + if reg is not None: + registered.append(reg) + # TODO we might in future want to check whether the registered identifiables are the same + if len(registered) > 1: + ri_names = [i.name for i in registered] + raise RuntimeError(f"Multiple registered identifiables found for the RecordType " + f" {rt.name} with the following parents: {rt.parents}\n" + f"Registered identifiables: {', '.join(ri_names)}") + elif len(registered) == 1: + return registered[0] + else: + return None def retrieve_identified_record_for_identifiable(self, identifiable: Identifiable): query_string = self.create_query_for_identifiable(identifiable) - candidates = db.execute_query(query_string) + try: + candidates = cached_query(query_string) + except db.exceptions.HTTPServerError: + query_string = self.create_query_for_identifiable( + identifiable, startswith=True + ) + candidates = cached_query( + query_string + ).copy() # Copy against cache poisoning + + # Test if the candidates really match all properties + for pname, pvalue in identifiable.properties.items(): + popme = [] + for i in range(len(candidates)): + this_prop = candidates[i].get_property(pname) + if this_prop is None: + popme.append(i) + continue + if not this_prop.value == pvalue: + popme.append(i) + for i in reversed(popme): + candidates.pop(i) + if len(candidates) > 1: raise RuntimeError( - f"Identifiable was not defined unambigiously.\n{query_string}\nReturned the " + f"Identifiable was not defined unambiguously.\n{query_string}\nReturned the " f"following {candidates}." - f"Identifiable:\n{identifiable.record_type}{identifiable.properties}") + f"Identifiable:\n{identifiable.record_type}{identifiable.properties}" + ) if len(candidates) == 0: return None return candidates[0] diff --git a/src/caoscrawler/identified_cache.py b/src/caoscrawler/identified_cache.py deleted file mode 100644 index aa2d82f8e66c738e737c62f3cc68eaf60127e28b..0000000000000000000000000000000000000000 --- a/src/caoscrawler/identified_cache.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. 
-# -# ** end header -# - - -""" -see class docstring -""" - -from .identifiable import Identifiable -import caosdb as db - - -class IdentifiedCache(object): - """ - This class is like a dictionary where the keys are Identifiables. When you check whether an - Identifiable exists as key this class returns True not only if that exact Python object is - used as a key, but if an Identifiable is used as key that is **equal** to the one being - considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]` - you get the Record where the key is an Identifiable that is equal to the one in the rectangular - brackets. - - This class is used for Records where we checked the existence in a remote server using - identifiables. If the Record was found, this means that we identified the corresponding Record - in the remote server and the ID of the local object can be set. - To prevent querying the server again and again for the same objects, this cache allows storing - Records that were found on a remote server and those that were not (typically in separate - caches). - """ - - def __init__(self): - self._cache = {} - self._identifiables = [] - - def __contains__(self, identifiable: Identifiable): - return identifiable in self._identifiables - - def __getitem__(self, identifiable: db.Record): - index = self._identifiables.index(identifiable) - return self._cache[id(self._identifiables[index])] - - def add(self, record: db.Record, identifiable: Identifiable): - self._cache[id(identifiable)] = record - self._identifiables.append(identifiable) diff --git a/src/caoscrawler/logging.py b/src/caoscrawler/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..b57a067d8635a468df7345365fabbfae9ee0b22f --- /dev/null +++ b/src/caoscrawler/logging.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import logging +import sys + +from caosadvancedtools.serverside.helper import get_shared_filename +from caosadvancedtools.webui_formatter import WebUI_Formatter + + +def configure_server_side_logging(max_log_level: int = logging.INFO): + """ + Set logging up to save one plain debugging log file, one plain info log + file (for users) and a stdout stream with messages wrapped in html elements. + + Returns the public paths of the log files described below. + + Parameters + ---------- + max_log_level : int, optional + The maximum log level to use for SSS-logs. Default is + ``logging.INFO``. + + Returns + ------- + userlog_public, htmluserlog_public, debuglog_public: str + Public paths of the respective log files. 
+ """ + adv_logger = logging.getLogger("caosadvancedtools") + # The max_<level> variables will be used to set the logger levels + # to the respective maximum of intended level and max_log_level, + # effectively cutting off logging above the specified + # max_log_level. + max_info = max(logging.INFO, max_log_level) + max_debug = max(logging.DEBUG, max_log_level) + adv_logger.setLevel(level=max_debug) + + cr_logger = logging.getLogger("caoscrawler") + cr_logger.setLevel(level=max_debug) + + userlog_public, userlog_internal = get_shared_filename("userlog.txt") + + root_logger = logging.getLogger() + root_logger.setLevel(level=max_info) + + # this is a log file with INFO level for the user + user_file_handler = logging.FileHandler(filename=userlog_internal) + user_file_handler.setLevel(logging.INFO) + root_logger.addHandler(user_file_handler) + + # The output shall be printed in the webui. Thus wrap it in html elements. + formatter = WebUI_Formatter(full_file="/Shared/{}".format(userlog_public)) + web_handler = logging.StreamHandler(stream=sys.stdout) + web_handler.setFormatter(formatter) + web_handler.setLevel(logging.INFO) + root_logger.addHandler(web_handler) + + # Also create an HTML version for later use. + htmluserlog_public, htmluserlog_internal = get_shared_filename("userlog.html") + formatter = WebUI_Formatter(full_file="/Shared/{}".format(userlog_public)) + lweb_handler = logging.FileHandler(filename=htmluserlog_internal) + lweb_handler.setFormatter(formatter) + lweb_handler.setLevel(logging.INFO) + root_logger.addHandler(lweb_handler) + + # one log file with debug level output + debuglog_public, debuglog_internal = get_shared_filename("debuglog.txt") + debug_handler = logging.FileHandler(filename=debuglog_internal) + debug_handler.setLevel(logging.DEBUG) + root_logger.addHandler(debug_handler) + + return userlog_public, htmluserlog_public, debuglog_public diff --git a/src/caoscrawler/macros/macro_yaml_object.py b/src/caoscrawler/macros/macro_yaml_object.py index c6b5de27d7f498d9b1db6b6a90d986487340a880..5d2bc1fe0775499fa8b40a65e115fb4569892e38 100644 --- a/src/caoscrawler/macros/macro_yaml_object.py +++ b/src/caoscrawler/macros/macro_yaml_object.py @@ -25,10 +25,14 @@ # Function to expand a macro in yaml # A. Schlemmer, 05/2022 -from dataclasses import dataclass -from typing import Any, Dict +import re from copy import deepcopy +from dataclasses import dataclass from string import Template +from typing import Any, Dict + +_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$") +_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$") @dataclass @@ -53,6 +57,12 @@ def substitute(propvalue, values: dict): Substitution of variables in strings using the variable substitution library from python's standard library. """ + # Simple matches are simply replaced by the raw dict entry. + if match := (_SAFE_SUBST_PAT.fullmatch(propvalue) + or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)): + key = match.group("key") + if key in values: + return values[key] propvalue_template = Template(propvalue) return propvalue_template.safe_substitute(**values) diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py new file mode 100644 index 0000000000000000000000000000000000000000..af1f4173e95827606a02979ddd6d7fcd9f133271 --- /dev/null +++ b/src/caoscrawler/scanner.py @@ -0,0 +1,532 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. 
+# +# Copyright (C) 2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +# + +""" +This is the scanner. + +This is where the ``_crawl(...)`` function from ``crawl.py`` was formerly located. + +This is just the functionality that extracts data from the file system. +""" + +from __future__ import annotations + +import importlib +import logging +import os +import warnings +from collections.abc import Callable +from typing import Any, Optional, Union + +import linkahead as db +import yaml +from importlib_resources import files +from jsonschema import validate + +from .converters import Converter +from .debug_tree import DebugTree +from .stores import GeneralStore, RecordStore +from .structure_elements import Directory, StructureElement +from .version import check_cfood_version + +logger = logging.getLogger(__name__) + + +def load_definition(crawler_definition_path: str) -> dict: + """ + Load a cfood from the crawler definition file given by + ``crawler_definition_path`` and validate it using cfood-schema.yml. + + Arguments: + ---------- + crawler_definition_path: str + Path to the crawler definition file in yaml format. + + Returns: + -------- + dict containing the crawler definition. + """ + + # Load the cfood from a yaml file: + with open(crawler_definition_path, encoding="utf-8") as f: + crawler_definitions = list(yaml.safe_load_all(f)) + + crawler_definition = _load_definition_from_yaml_dict(crawler_definitions) + + return _resolve_validator_paths(crawler_definition, crawler_definition_path) + + +def _load_definition_from_yaml_dict(crawler_definitions: list[dict]) -> dict: + """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which + contains either one or two documents. + + Doesn't resolve the validator paths in the cfood definition, so for + internal and testing use only. + + Arguments: + ---------- + crawler_definitions: list[dict] + List of one or two dicts containing (optionally) metadata and the crawler definition. + + Returns: + -------- + dict containing the crawler definition. 
+ """ + if len(crawler_definitions) == 1: + # Simple case, just one document: + crawler_definition = crawler_definitions[0] + metadata = {} + elif len(crawler_definitions) == 2: + metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else { + } + crawler_definition = crawler_definitions[1] + else: + raise RuntimeError( + "Crawler definition must not contain more than two documents.") + + check_cfood_version(metadata) + + # TODO: at this point this function can already load the cfood schema extensions + # from the crawler definition and add them to the yaml schema that will be + # tested in the next lines of code: + + # Load the cfood schema: + with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f: + schema = yaml.safe_load(f) + + # Add custom converters to converter enum in schema: + if "Converters" in crawler_definition: + for key in crawler_definition["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + if len(crawler_definitions) == 2: + if "Converters" in metadata: + for key in metadata["Converters"]: + schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append( + key) + # TODO: We need a similar thing for "Transformers". + + # Validate the cfood schema: + validate(instance=crawler_definition, schema=schema["cfood"]) + + return crawler_definition + + +def _resolve_validator_paths(definition: dict, definition_path: str): + """Resolve path to validation files with respect to the file in which + the crawler was defined. + + """ + + for key, value in definition.items(): + + if key == "validate" and isinstance(value, str): + # Validator is given by a path + if not value.startswith('/'): + # Not an absolute path + definition[key] = os.path.join(os.path.dirname(definition_path), value) + if not os.path.isfile(definition[key]): + # TODO(henrik) capture this in `crawler_main` similar to + # `ConverterValidationError`. + raise FileNotFoundError( + f"Couldn't find validation file {definition[key]}") + elif isinstance(value, dict): + # Recursively resolve all validators + definition[key] = _resolve_validator_paths(value, definition_path) + + return definition + + +def create_converter_registry(definition: dict): + """ + Currently the converter registry is a dictionary containing for each converter: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the converter class to load and associate with this converter entry + + Formerly known as "load_converters". + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + """ + + # Defaults for the converter registry: + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) + + # More converters from definition file: + if "Converters" in definition: + for key, entry in definition["Converters"].items(): + if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement", + "DictDictElement", "DictListElement", "DictFloatElement"]: + warnings.warn(DeprecationWarning(f"{key} is deprecated. 
Please use the new" + " variant; without 'Dict' prefix or " + "'DictElement' in case of 'Dict'")) + + converter_registry[key] = { + "converter": entry["converter"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def create_transformer_registry(definition: dict[str, dict[str, str]]): + """ + Currently the transformer registry is a dictionary containing for each transformer: + - key is the short code, abbreviation for the converter class name + - module is the name of the module to be imported which must be installed + - class is the transformer function to load and associate with this converter entry + + all other info for the converter needs to be included in the converter plugin + directory: + schema.yml file + README.md documentation + + Please refer to the docstring of function "scanner" for more information about the + detailed structure of the transformer functions. + """ + + # Defaults for the transformer registry: + with open(str(files('caoscrawler').joinpath('default_transformers.yml')), "r") as f: + transformer_def: dict[str, dict[str, str]] = yaml.safe_load(f) + + registry: dict[str, Callable[[Any, dict], Any]] = {} + # More transformers from definition file: + if "Transformers" in definition: + for key, entry in definition["Transformers"].items(): + transformer_def[key] = { + "function": entry["function"], + "package": entry["package"] + } + + # Load modules and associate classes: + for key, value in transformer_def.items(): + module = importlib.import_module(value["package"]) + registry[key] = getattr(module, value["function"]) + return registry + + +def initialize_converters(crawler_definition: dict, converter_registry: dict): + """ + takes the cfood as dict (`crawler_definition`) and creates the converter objects that + are defined on the highest level. Child Converters will in turn be created during the + initialization of the Converters. + """ + converters = [] + + for key, value in crawler_definition.items(): + # Definitions and Converters are reserved keywords + # on the top level of the yaml file. + # TODO: there should also be a top level keyword for the actual + # CFood to avoid confusion between top level keywords + # and the CFood. + if key == "Definitions": + continue + elif key == "Converters": + continue + elif key == "Transformers": + continue + converters.append(Converter.converter_factory( + value, key, converter_registry)) + + return converters + +# -------------------------------------------------------------------------------- +# Main scanner function: +# -------------------------------------------------------------------------------- + + +def scanner(items: list[StructureElement], + converters: list[Converter], + general_store: Optional[GeneralStore] = None, + record_store: Optional[RecordStore] = None, + structure_elements_path: Optional[list[str]] = None, + converters_path: Optional[list[str]] = None, + restricted_path: Optional[list[str]] = None, + crawled_data: Optional[list[db.Record]] = None, + debug_tree: Optional[DebugTree] = None, + registered_transformer_functions: Optional[dict] = None) -> list[db.Record]: + """Crawl a list of StructureElements and apply any matching converters. + + Formerly known as ``_crawl(...)``. + + Parameters + ---------- + items: list[StructureElement] + structure_elements (e.g. 
files and folders on one level on the hierarchy) + + converters: list[Converter] + locally defined converters for treating structure elements. A locally + defined converter could be one that is only valid for a specific subtree + of the originally cralwed StructureElement structure. + + general_store, record_store: GeneralStore, RecordStore, optional + This recursion of the crawl function should only operate on copies of + the global stores of the Crawler object. + + restricted_path : list[str], optional + traverse the data tree only along the given path. For example, when a + directory contains files a, b and c, and b is given as ``restricted_path``, a + and c will be ignored by the crawler. When the end of the given path is + reached, traverse the full tree as normal. The first element of the list + provided by ``restricted_path`` should be the name of the StructureElement + at this level, i.e. denoting the respective element in the items + argument. + + registered_transformer_functions : dict, optional + A dictionary of transformer functions that can be used in the "transform" block + of a converter and that allows to apply simple transformations to variables extracted + either by the current converter or to other variables found in the current variable store. + + Each function is a dictionary: + + - The key is the name of the function to be looked up in the dictionary of registered + transformer functions. + - The value is the function which needs to be of the form: + def func(in_value: Any, in_parameters: dict) -> Any: + pass + + """ + # This path_found variable stores wether the path given by restricted_path was found in the + # data tree + path_found = False + if restricted_path is not None and len(restricted_path) == 0: + restricted_path = None + + if crawled_data is None: + crawled_data = [] + + if general_store is None: + general_store = GeneralStore() + + if record_store is None: + record_store = RecordStore() + + if structure_elements_path is None: + structure_elements_path = [] + + if converters_path is None: + converters_path = [] + + for element in items: + element_path = os.path.join(*(structure_elements_path + [str(element.get_name())])) + logger.debug(f"Dealing with {element_path}") + for converter in converters: + + # type is something like "matches files", replace isinstance with "type_matches" + # match function tests regexp for example + if (converter.typecheck(element) and ( + restricted_path is None or element.name == restricted_path[0]) + and converter.match(element) is not None): + path_found = True + general_store_copy = general_store.create_scoped_copy() + record_store_copy = record_store.create_scoped_copy() + + # Create an entry for this matched structure element that contains the path: + general_store_copy[converter.name] = element_path + + # extracts values from structure element and stores them in the + # variable store. 
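+                # (Typically these are the values captured by named groups in the
+                # converter's match expressions, cf. ``match_value`` in cfood files.)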
+                converter.create_values(general_store_copy, element)
+
+                # Apply transformers if there are any:
+                converter.apply_transformers(general_store_copy,
+                                             registered_transformer_functions)
+
+                keys_modified = converter.create_records(
+                    general_store_copy, record_store_copy, element)
+
+                children = converter.create_children(general_store_copy, element)
+
+                if debug_tree is not None:
+                    # add provenance information for each variable
+                    debug_tree.debug_tree[str(element)] = (
+                        general_store_copy.get_storage(), record_store_copy.get_storage())
+                    debug_tree.debug_metadata["copied"][str(element)] = (
+                        general_store_copy.get_dict_copied(),
+                        record_store_copy.get_dict_copied())
+                    debug_tree.debug_metadata["usage"][str(element)].add(
+                        "/".join(converters_path + [converter.name]))
+                    mod_info = debug_tree.debug_metadata["provenance"]
+                    # TODO: actually keys_modified must not be None. create_records should
+                    # always return a list.
+                    if keys_modified is not None:
+                        for record_name, prop_name in keys_modified:
+                            # TODO: check
+                            internal_id = record_store_copy.get_internal_id(
+                                record_name)
+                            record_identifier = record_name + \
+                                "_" + str(internal_id)
+                            converter.metadata["usage"].add(record_identifier)
+                            mod_info[record_identifier][prop_name] = (
+                                structure_elements_path + [element.get_name()],
+                                converters_path + [converter.name])
+
+                scanner(children, converter.converters,
+                        general_store_copy, record_store_copy,
+                        structure_elements_path + [element.get_name()],
+                        converters_path + [converter.name],
+                        restricted_path[1:] if restricted_path is not None else None,
+                        crawled_data, debug_tree,
+                        registered_transformer_functions)
+
+            # Clean up converter:
+            converter.cleanup()
+
+    if restricted_path and not path_found:
+        raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                           "the data tree")
+    # When the crawler runs out of scope, copy all records in the record_store
+    # that were created in this scope to the general update container.
+    scoped_records = record_store.get_records_current_scope()
+    for record in scoped_records:
+        crawled_data.append(record)
+
+    return crawled_data
+
+
+# --------------------------------------------------------------------------------
+# Main scanning interface functions:
+# --------------------------------------------------------------------------------
+
+
+def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
+                   restricted_path: Optional[list[str]] = None,
+                   debug_tree: Optional[DebugTree] = None):
+    """ Crawl a single directory (or a list of directories).
+
+    Formerly known as "crawl_directory".
+
+    Convenience function that starts the crawler (calls scan_structure_elements)
+    with a single directory as the StructureElement.
+
+    Parameters
+    ----------
+
+    dirname: str or list[str]
+        directory or list of directories to be scanned
+    restricted_path: optional, list of strings
+        Traverse the data tree only along the given path. When the end
+        of the given path is reached, traverse the full tree as
+        normal. See docstring of 'scanner' for more details.
+
+    Returns
+    -------
+    crawled_data : list
+        the final list with the target state of Records.
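+
+    Example (a sketch; the directory and cfood names are made up)::
+
+        records = scan_directory("/data/experiments", "cfood.yml")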
+ """ + + crawler_definition = load_definition(crawler_definition_path) + # Load and register converter packages: + converter_registry = create_converter_registry(crawler_definition) + + # Load and register transformer functions: + registered_transformer_functions = create_transformer_registry(crawler_definition) + + if not dirname: + raise ValueError( + "You have to provide a non-empty path for crawling.") + if not isinstance(dirname, list): + dirname = [dirname] + dir_element_list = [] + for dname in dirname: + dir_structure_name = os.path.basename(dname) + + # TODO: needs to be covered somewhere else + crawled_directory = dname + if not dir_structure_name and dname.endswith(os.path.sep): + if dname == os.path.sep: + # Crawling the entire file system + dir_structure_name = "root" + else: + # dirname had a trailing '/' + dir_structure_name = os.path.basename(dname[:-1]) + dir_element_list.append(Directory(dir_structure_name, dname)) + + return scan_structure_elements( + dir_element_list, + crawler_definition, + converter_registry, + restricted_path=restricted_path, + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions + ) + + +def scan_structure_elements(items: Union[list[StructureElement], StructureElement], + crawler_definition: dict, + converter_registry: dict, + restricted_path: Optional[list[str]] = None, + debug_tree: Optional[DebugTree] = None, + registered_transformer_functions: Optional[dict] = None) -> ( + list[db.Record]): + """ + Start point of the crawler recursion. + + Formerly known as "start_crawling". + + Parameters + ---------- + items: list + A list of structure elements (or a single StructureElement) that is used for + generating the initial items for the crawler. This could e.g. be a Directory. + crawler_definition : dict + A dictionary representing the crawler definition, possibly from a yaml + file. + restricted_path: list[str], optional + Traverse the data tree only along the given path. When the end of the + given path is reached, traverse the full tree as normal. See docstring + of 'scanner' for more details. + + Returns + ------- + crawled_data : list[db.Record] + the final list with the target state of Records. + """ + + # This function builds the tree of converters out of the crawler definition. + if not isinstance(items, list): + items = [items] + + # TODO: needs to be covered somewhere else + # self.run_id = uuid.uuid1() + converters = initialize_converters(crawler_definition, converter_registry) + + return scanner( + items=items, + converters=converters, + restricted_path=restricted_path, + debug_tree=debug_tree, + registered_transformer_functions=registered_transformer_functions + ) diff --git a/src/caoscrawler/scripts/__init__.py b/src/caoscrawler/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/caoscrawler/scripts/generators.py b/src/caoscrawler/scripts/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..2bf8a90f5af5086e23b7e7cc35d21a50d8cd511a --- /dev/null +++ b/src/caoscrawler/scripts/generators.py @@ -0,0 +1,246 @@ +# This file is a part of the LinkAhead Project. 
+#
+# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Scripts and functions to generate datamodel yaml files and cfood skeletons.
+
+For example, from actual data files.
+"""
+
+import argparse
+import csv
+from collections import OrderedDict
+from string import Template
+from typing import Optional
+
+import pandas as pd
+import yaml
+
+DM_TEMPLATE = """# auto-generated data model from file "[]{infile}".
+# To insert a datamodel into LinkAhead, run:
+#
+# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
+"""
+
+HEADER_RT = """
+###############
+# RecordTypes #
+###############
+
+DummyRT:
+  description: 'Note: Change name and enter description.'
+  recommended_properties:
+    """
+
+CFOOD_TEMPLATE = """
+---
+metadata:
+  macros:
+  - !defmacro
+    # Simple column value -> property rule
+    name: ColumnValue
+    params:
+      name: null
+      belongsto: BaseElement
+      type: TextElement
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${belongsto}:
+            ${name}: $$val
+  - !defmacro
+    # column value -> reference property
+    name: ColumnValueReference
+    params:
+      name: null
+      reftype: null  # RecordType of the reference
+      belongsto: BaseElement
+      type: TextElement  # References are always text, right?
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${reftype}:
+            name: $$val
+          ${belongsto}:
+            ${name}: $$${reftype}
+  - !defmacro
+    # Same as "ColumnValue", but also give the name of the property.
+    name: ColumnValuePropname
+    params:
+      name: null
+      propname: null
+      belongsto: BaseElement
+      type: TextElement
+    definition:
+      ${name}:
+        type: ${type}
+        match_name: ^${name}$$
+        match_value: (?P<val>.*)
+        records:
+          ${belongsto}:
+            ${propname}: $$val
+---
+directory: # corresponds to the directory given to the crawler
+  type: Directory
+  match: .* # we do not care how it is named here
+  records:
+    DirRecord:    # One record for each directory.
+  subtree:
+    # This is the file
+    thisfile:
+      type: []{file}
+      match: []{match}
+      records:
+        DatFileRecord:    # One record for each matching file
+          role: File
+          path: $thisfile
+          file: $thisfile
+      subtree:
+        entry:
+          type: Dict
+          match: .* # Name is irrelevant
+          records:
+            BaseElement:    # One BaseElement record for each row in the CSV/TSV file
+              DatFileRecord: $DatFileRecord
+            DirRecord:
+              BaseElement: +$BaseElement
+          subtree: !macro
+"""
+
+
+class _CustomTemplate(Template):
+    delimiter = "[]"  # "$" is used too much by the yaml template.
+
+
+def csv_to_datamodel(infile: str, outfile: str, cfood: Optional[str] = None):
+    """Parse the input csv and create basic datamodel in ``outfile``.
+
+Parameters
+----------
+cfood: str
+    If given, also create a cfood skeleton.
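+
+Example (a sketch; the file names are made up)::
+
+    csv_to_datamodel("data.csv", "datamodel.yaml", cfood="cfood.yml")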
+ """ + sniffer = csv.Sniffer() + with open(infile, encoding="utf-8") as f_infile: + max_sniff = 50000 + sniffed = sniffer.sniff(f_infile.read(max_sniff)) + df = pd.read_table(infile, sep=sniffed.delimiter, quotechar=sniffed.quotechar, + escapechar=sniffed.escapechar) + + properties = OrderedDict() + for colname in df.columns: + column = df[colname] + dtype: Optional[str] = "TEXT" + if pd.api.types.is_bool_dtype(column.dtype): + dtype = "BOOLEAN" + if pd.api.types.is_float_dtype(column.dtype): + dtype = "DOUBLE" + elif pd.api.types.is_integer_dtype(column.dtype): + dtype = "INTEGER" + properties[colname] = { + "datatype": dtype + } + + result = (_CustomTemplate(DM_TEMPLATE).substitute({"infile": infile}) + + HEADER_RT + + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict + allow_unicode=True, + sort_keys=False).splitlines(keepends=True)) + ) + with open(outfile, encoding="utf-8", mode="w") as myfile: + myfile.write(result) + + ################# + # cfood section # + ################# + if cfood: + defs_col_value: list[str] = [] + defs_col_value_ref: list[str] = [] + prefix = " " * 14 + for name, propdef in properties.items(): + def_str = prefix + f"- name: {name}\n" + dtype = None + reftype = None + defs = defs_col_value + # Which type? + if propdef["datatype"] == "BOOLEAN": + dtype = "BooleanElement" + elif propdef["datatype"] == "INTEGER": + dtype = "IntegerElement" + elif propdef["datatype"] == "DOUBLE": + dtype = "FloatElement" + elif propdef["datatype"] == "TEXT": + dtype = None + else: + reftype = propdef["datatype"] + defs = defs_col_value_ref + + # Append according to types: + if reftype: + def_str += prefix + f" reftype: {reftype}\n" + if dtype: + def_str += prefix + f" type: {dtype}\n" + + # Store result + defs.append(def_str) + del defs + + sep = repr(sniffed.delimiter) + sep = f'"{sep[1:-1]}"' + match_str = f"""'.*[ct]sv' + sep: {sep} + # "header": [int] + # "names": [str] + # "index_col": [int] + # "usecols": [int] + # "true_values": [str] + # "false_values": [str] + # "na_values": [str] + # "skiprows": [int] + # "nrows": [int] + # "keep_default_na": [bool] + """ + + cfood_str = (_CustomTemplate(CFOOD_TEMPLATE).substitute({"file": "CSVTableConverter", + "match": match_str}) + + prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value) + + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref) + ) + with open(cfood, encoding="utf-8", mode="w") as myfile: + myfile.write(cfood_str) + + +def _parse_args_csv(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description="Create datamodel and cfood from CSV files.") + parser.add_argument('-i', '--input', help="The input file.", required=True, dest="infile") + parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) + parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + + return parser.parse_args() + + +def csv_to_datamodel_main(): + """The main function for csv data handling.""" + args = _parse_args_csv() + csv_to_datamodel(**vars(args)) diff --git a/src/caoscrawler/structure_elements/__init__.py b/src/caoscrawler/structure_elements/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..351f1069708ec94c0dd27313b6329d89858d4330 --- /dev/null +++ b/src/caoscrawler/structure_elements/__init__.py @@ -0,0 +1,31 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Submodule containing all default and optional structure elements."""
+
+from .. import utils
+from .structure_elements import *
+
+try:
+    from .rocrate_structure_elements import ROCrateEntity
+except ImportError as err:
+    ROCrateEntity: type = utils.MissingImport(
+        name="ROCrateEntity", hint="Try installing with the `rocrate` extra option.",
+        err=err)
diff --git a/src/caoscrawler/structure_elements/rocrate_structure_elements.py b/src/caoscrawler/structure_elements/rocrate_structure_elements.py
new file mode 100644
index 0000000000000000000000000000000000000000..66768ad800128297a27f47d672352f21310703e9
--- /dev/null
+++ b/src/caoscrawler/structure_elements/rocrate_structure_elements.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2024 Alexander Schlemmer
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+from rocrate.model.entity import Entity
+
+from .structure_elements import StructureElement
+
+
+class ROCrateEntity(StructureElement):
+    """
+    Store entities contained in ROCrates.
+    """
+
+    def __init__(self, folder: str, entity: Entity):
+        """
+        Initializes this ROCrateEntity.
+
+        Arguments:
+        ----------
+        folder: str
+            The folder that contains the ROCrate data. In case of a zipped ROCrate, this
+            is a temporary folder that the ROCrate was unzipped to. In either case, it
+            is the folder that contains the ro-crate-metadata.json.
+
+        entity: Entity
+            The ROCrate entity that is stored in this structure element.
+            The entity automatically contains an attribute ".crate"
+            that stores the ROCrate that this entity belongs to. It can be used
+            e.g. to look up links to other entities (ROCrate.dereference).
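+
+            For example (a sketch), a linked entity could be resolved via
+            ``entity.crate.dereference(other_id)``, where ``other_id`` would be
+            the ``@id`` of the referenced entity.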
+ """ + super().__init__(entity.properties()["@id"]) + self.folder = folder + self.entity = entity diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements/structure_elements.py similarity index 64% rename from src/caoscrawler/structure_elements.py rename to src/caoscrawler/structure_elements/structure_elements.py index 952f29d012f8373062ed9dfe8a830bd18c4b0baa..3b4c6e9b9d13c61a5924a12d23b11b62edff6924 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements/structure_elements.py @@ -23,16 +23,24 @@ # ** end header # -from typing import Dict as tDict import warnings +import lxml.etree + class StructureElement(object): - """ base class for elements in the hierarchical data structure """ + """Base class for elements in the hierarchical data structure. + +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + """ - def __init__(self, name): + def __init__(self, name: str): # Used to store usage information for debugging: - self.metadata: tDict[str, set[str]] = { + self.metadata: dict[str, set[str]] = { "usage": set() } @@ -46,6 +54,18 @@ class StructureElement(object): class FileSystemStructureElement(StructureElement): + """StructureElement representing an element of a file system, like a directory or a simple file. + +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + +path: str + The path to the file or directory. + """ + def __init__(self, name: str, path: str): super().__init__(name) self.path = path @@ -65,6 +85,7 @@ class Directory(FileSystemStructureElement): class File(FileSystemStructureElement): + """StrutureElement representing a file.""" pass @@ -148,3 +169,53 @@ class DictDictElement(DictElement): def __init__(self, *args, **kwargs): warnings.warn(DeprecationWarning("This class is depricated. Please use DictElement.")) super().__init__(*args, **kwargs) + + +class XMLTagElement(StructureElement): + """ + Stores elements of an XML tree. + """ + + def __init__(self, element: lxml.etree.Element): + super().__init__(element.getroottree().getelementpath(element)) + self.tag = element + + +class XMLTextNode(StructureElement): + """ + Stores text nodes of XML trees. + """ + + def __init__(self, element: lxml.etree.Element): + """ + Initializes this XML text node. + + Please note that, although syntactically similar, it is semantically + different from TextElement: + - TextElements have a meaningful name, e.g. a key in a key-value pair. This name can + be matched using the match_name entry. + - XMLTextNodes just have a text and the name is just for identifying the structure element. + They can only be matched using the match entry in the XMLTextNodeConverter. + """ + super().__init__(element.getroottree().getelementpath(element) + "/text()") + self.tag = element + self.value = element.text + + +class XMLAttributeNode(StructureElement): + """ + Stores text nodes of XML trees. + """ + + def __init__(self, element: lxml.etree.Element, + key: str): + """ + Initializes this XML attribute node. + + element: The xml tree element containing the attribute. + key: The key which identifies the attribute in the list of attributes. 
+ """ + super().__init__(element.getroottree().getelementpath(element) + "@" + key) + self.value = element.attrib[key] + self.key = key + self.tag = element diff --git a/src/caoscrawler/sync_graph.py b/src/caoscrawler/sync_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..a05e6320892239cbe8d7f1d9fbd7949a57f9bccb --- /dev/null +++ b/src/caoscrawler/sync_graph.py @@ -0,0 +1,718 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +A data model class for the graph of entities that shall be created during synchronization of the +crawler. +""" + +from __future__ import annotations + +import logging +import re +from typing import Any, Callable, Optional, Union + +import linkahead as db +from linkahead.cached import cached_get_entity_by +from linkahead.exceptions import EmptyUniqueQueryError + +from .identifiable import Identifiable +from .identifiable_adapters import IdentifiableAdapter +from .sync_node import SyncNode, TempID + +logger = logging.getLogger(__name__) + + +def _set_each_scalar_value( + node: SyncNode, condition: Callable[[Any], bool], value: Any +): + """helper function that conditionally replaces each value element of each property of a node + + If the property value is a list, the replacement is done for each list entry. + The replacement is only performed if the condition that + is provided is fulfilled, i.e. the callable ``condition`` returns True. The callable + ``condition`` must take the property value (or list element) as the sole argument. + + Args: + node (SyncNode): The node which provides the properties (and their values) to operate on. + condition (Callable): A function with one argument which is interpreted as a condition: + Only if it returns True for the property value, the action is + executed. + value (Callable): A function returning a new value that is set as the property value. This + function receives the old value as the single argument. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for p in node.properties: + if isinstance(p.value, list): + for ii, el in enumerate(p.value): + if condition(el): + p.value[ii] = value(el) + elif condition(p.value): + p.value = value(p.value) + + +class SyncGraph: + """ + A data model class for the graph of entities that shall be created during synchronization of + the crawler. + + The SyncGraph combines nodes in the graph based on their identity in order to create a graph of + objects that can either be inserted or updated in(to) the remote server. This combination of + SyncNodes happens during initialization and later on when the ID of SyncNodes is set. + + When the SyncGraph is initialized, the properties of given entities are scanned and used to + create multiple reference maps that track how SyncNodes reference each other. 
+    These maps are kept up to date when SyncNodes are merged because they are identified with each
+    other. During initialization, SyncNodes are first merged based on their ID, path or
+    identifiable.
+
+    When additional information is added to the graph by setting the ID of a node
+    (via `set_id_of_node`) then the graph is updated accordingly:
+    - if this information implies that the node is equivalent to another node (e.g. has same ID),
+      then they are merged.
+    - if one node is known not to exist on the remote server, this might imply that other nodes
+      whose identity relies on it do not exist either.
+    - The new ID might make it possible to create the identifiables of connected nodes and thus
+      might trigger further merging of nodes based on the new identifiables.
+
+    A SyncGraph should only be manipulated via one function:
+    - set_id_of_node: a positive integer means the Entity exists, None means it is missing
+    TODO what about String IDs
+
+    The SyncGraph can be converted back to lists of entities which make it possible to perform
+    the desired inserts and updates.
+
+    Usage:
+    - Initialize the Graph with a list of entities. Those will be converted to the SyncNodes of the
+      graph.
+    - SyncNodes that can be merged are automatically merged and SyncNodes whose existence can
+      be determined are automatically removed from the list of unchecked SyncNodes:
+      graph.unchecked.
+    - You manipulate the graph by setting the ID of a SyncNode (either to a valid ID or to None).
+      For example, you can check whether a SyncNode has an identifiable and then query the remote
+      server and use the result to set the ID.
+    - After each manipulation, the graph updates accordingly (see above).
+    - Ideally, the unchecked list is empty after some manipulation.
+    - You can export a list of entities to be inserted and one of entities to be updated with
+      export_record_lists.
+
+    Last review by Alexander Schlemmer on 2024-05-24.
+    """
+
+    # General implementation remark:
+    # There are three cases where an update of one SyncNode can affect other nodes:
+    # - mark existing (add identifiables)
+    # - mark missing (add identifiables and add (negative) IDs)
+    # - merge (add identifiables)
+    #
+    # We cannot get an infinite recursion where one update triggers another update and so on
+    # because updates are conditional:
+    # Setting an ID removes the node (immediately) from the unchecked list, and an ID is only
+    # set in _mark_missing if the node is still in the unchecked list. Thus, setting the ID once
+    # prevents future attempts to set the ID of the same node.
+    # Also, setting an identifiable is only done when needed, i.e. when there is none yet.
+    # Note that whenever one node is changed, we check for all dependent nodes (see usage of
+    # `_get_nodes_whose_identity_relies_on`) whether something should be updated. Thus, we cannot
+    # miss a necessary update.
+    def __init__(
+        self, entities: list[db.Entity], identifiableAdapter: IdentifiableAdapter
+    ):
+        self.identifiableAdapter = identifiableAdapter
+        # A dictionary allowing for quick lookup of sync nodes using their (possibly negative) IDs.
+        # This dictionary is initially set using _mark_entities_with_path_or_id and later updated
+        # using set_id_of_node or during merges of nodes.
+        self._id_look_up: dict[Union[int, TempID, str], SyncNode] = {}
+        # Same as above, for looking up nodes using paths
+        self._path_look_up: dict[str, SyncNode] = {}
+        # Same as above, for looking up nodes using identifiables. This dictionary uses the
+        # text representation generated by the get_representation method of Identifiable as keys.
+        self._identifiable_look_up: dict[str, SyncNode] = {}
+        # look up for the nodes that were marked as being missing (on the remote server)
+        self._missing: dict[int, SyncNode] = {}
+        # same for existing
+        self._existing: dict[int, SyncNode] = {}
+        # entities that are missing get negative IDs to allow identifiable creation
+        self._remote_missing_counter = -1
+
+        self.nodes: list[SyncNode] = []
+        self._initialize_nodes(entities)  # list of all SyncNodes
+        # list of all SyncNodes that have not yet been checked
+        self.unchecked = list(self.nodes)
+
+        # initialize reference mappings (see _create_reference_mapping)
+        (
+            self.forward_references,  # id(node) -> full set of nodes referenced by the given node
+            self.backward_references,  # id(node) -> full set of nodes referencing the given node
+            # as above, subset where the reference properties are part of identifiables
+            self.forward_references_id_props,
+            self.backward_references_id_props,
+            # as above, subset where references are part of identifiables due to "referenced_by"
+            self.forward_references_backref,
+            self.backward_references_backref,
+        ) = self._create_reference_mapping(self.nodes)
+
+        # remove entities with path or ID from unchecked list
+        self._mark_entities_with_path_or_id()
+
+        # add identifiables where possible
+        for node in list(self.nodes):
+            if self._identifiable_is_needed(node):
+                self._set_identifiable_of_node(node)
+
+        # Everything in unchecked has neither an ID nor a path.
+        # Thus, it must be possible to create an
+        # identifiable, which is checked using the following function:
+        for node in self.unchecked:
+            self.identifiableAdapter.all_identifying_properties_exist(node)
+
+    def set_id_of_node(self, node: SyncNode, node_id: Optional[str] = None):
+        """sets the ID attribute of the given SyncNode to node_id.
+
+        If node_id is None, a negative ID will be
+        given, indicating that the node does not exist on the remote server.
+        Furthermore, it will be marked as missing using _mark_missing.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
+        """
+        if node.id is not None:
+            raise RuntimeError(
+                "Cannot update ID.\n"
+                f"It already is {node.id} and shall be set to {node_id}."
+            )
+        if node_id is None:
+            node_id = TempID(self._get_new_id())
+        node.id = node_id
+        if node_id in self._id_look_up:
+            self._merge_into(node, self._id_look_up[node.id])
+        else:
+            self._id_look_up[node.id] = node
+            if isinstance(node.id, TempID):
+                self._mark_missing(node)
+            else:
+                self._mark_existing(node)
+
+    def export_record_lists(self):
+        """exports the SyncGraph in the form of db.Entities
+
+        All nodes are converted to db.Entity objects and reference values that are SyncNodes are
+        replaced by their corresponding (newly created) db.Entity objects.
+
+        Since the result is returned in the form of two lists, one with Entities that have a
+        valid ID and one with those that do not, an error is raised if there are any SyncNodes
+        without a (possibly negative) ID.
+
+        Last review by Alexander Schlemmer on 2024-05-24.
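+
+        Usage sketch (assuming ``graph`` is a fully checked SyncGraph)::
+
+            missing, existing = graph.export_record_lists()
+            # `missing` entities would typically be inserted, `existing` ones updated.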
+ """ + # TODO reactivate once the implementation is appropriate + # if len(self.unchecked) > 1: + # self.unchecked_contains_circular_dependency() + + for el in self.nodes: + if el.id is None: + raise RuntimeError("Exporting unchecked entities is not supported") + + entities = [] + node_map = {} + for el in self.nodes: + entities.append(el.export_entity()) + node_map[id(el)] = entities[-1] + + for ent in entities: + _set_each_scalar_value( + ent, + condition=lambda val: isinstance(val, SyncNode), + value=lambda val: node_map[id(val)], + ) + + missing = [el for el in entities if el.id < 0] + existing = [el for el in entities if el.id > 0] + # remove negative IDs + for el in missing: + el.id = None + + return (missing, existing) + + def _identity_relies_on_unchecked_entity(self, node: SyncNode): + """ + If a record for which it could not yet be verified whether it exists in LA or not is part + of the identifying properties, this returns True, otherwise False + + Last review by Alexander Schlemmer on 2024-05-27. + """ + + return any( + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.forward_references_id_props[id(node)] + ] + + [ + id(ent) not in self._missing and id(ent) not in self._existing + for ent in self.backward_references_backref[id(node)] + ] + ) + + def unchecked_contains_circular_dependency(self): + """ + Detects whether there are circular references in the given entity list and returns a list + where the entities are ordered according to the chain of references (and only the entities + contained in the circle are included. Returns None if no circular dependency is found. + + TODO: for the sake of detecting problems for split_into_inserts_and_updates we should only + consider references that are identifying properties. + """ + raise NotImplementedError("This function is not yet properly implemented") + # TODO if the first element is not part of the circle, then + # this will not work + # We must created a better implementation (see also TODO in docstring) + circle = [self.unchecked[0]] + closed = False + while not closed: + added_to_circle = False + for referenced in self.forward_references[id(circle[-1])]: + if referenced in self.unchecked: + if referenced in circle: + closed = True + circle.append(referenced) + added_to_circle = True + if not added_to_circle: + return None + return circle + + def get_equivalent(self, entity: SyncNode) -> Optional[SyncNode]: + """ + Return an equivalent SyncNode. + + Equivalent means that ID, path or identifiable are the same. + If a new information was added to the given SyncNode (e.g. the ID), it might be possible + then to identify an equivalent node (i.e. one with the same ID in this example). + There might be more than one equivalent node in the graph. However, simply the first that + is found is being returned. (When an equivalent node is found, the given node is + typically merged, into the one that was found and after the merge the graph is again + checked for equivalent nodes.) + + Returns None if no equivalent node is found. + + Last review by Alexander Schlemmer on 2024-05-28. 
+ """ + if entity.id is not None and entity.id in self._id_look_up: + candidate = self._id_look_up[entity.id] + if candidate is not entity: + return candidate + if entity.path is not None and entity.path in self._path_look_up: + candidate = self._path_look_up[entity.path] + if candidate is not entity: + return candidate + if ( + entity.identifiable is not None + and entity.identifiable.get_representation() in self._identifiable_look_up + ): + candidate = self._identifiable_look_up[ + entity.identifiable.get_representation() + ] + if candidate is not entity: + return candidate + return None + + def _get_new_id(self): + """returns the next unused temporary ID + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._remote_missing_counter -= 1 + return self._remote_missing_counter + + def _set_identifiable_of_node( + self, node: SyncNode, identifiable: Optional[Identifiable] = None + ): + """sets the identifiable and checks whether an equivalent node can be found with that new + information. If an equivalent node is found, 'node' is merged into that node. + + if no identifiable is given, the identifiable is retrieved from the identifiable adapter + + Raises a ValueError if the equivalent node found does not have an identifiable. + Raises a RuntimeError if there is no equivalent node found and + the (unique) string representation of the identifiable of node is already contained in + the identifiable_look_up. + + Last review by Alexander Schlemmer on 2024-05-29. + """ + if identifiable is None: + self.identifiableAdapter.all_identifying_properties_exist(node) + identifiable = self.identifiableAdapter.get_identifiable( + node, self.backward_references_backref[id(node)] + ) + node.identifiable = identifiable + equivalent_se = self.get_equivalent(node) + if equivalent_se is not None: + self._merge_into(node, equivalent_se) + else: + if node.identifiable.get_representation() in self._identifiable_look_up: + raise RuntimeError("Identifiable is already in the look up") + self._identifiable_look_up[node.identifiable.get_representation()] = node + + @staticmethod + def _sanity_check(entities: list[db.Entity]): + """ + Checks whether each record in entities has at least one parent. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + for ent in entities: + if ent.role == "Record" and len(ent.parents) == 0: + raise ValueError(f"Records must have a parent.\n{ent}") + if isinstance(ent.id, int) and ent.id < 0: + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + if isinstance(ent.id, str) and re.match(r"^-\d+$", ent.id): + raise ValueError( + f"Records must not have negative integers as IDs.\n{ent}" + ) + + def _get_nodes_whose_identity_relies_on(self, node: SyncNode): + """returns a set of nodes that reference the given node as identifying property or are + referenced by the given node and the parent of the given node is listed as + "is_referenced_by" + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return self.backward_references_id_props[id(node)].union( + self.forward_references_backref[id(node)] + ) + + @staticmethod + def _create_flat_list( + ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None + ): + """ + Recursively adds entities and all their properties contained in ent_list to + the output list flat. + + TODO: This function will be moved to pylib as it is also needed by the + high level API. + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + # Note: A set would be useful here, but we do not want a random order. + if flat is None: + flat = list() + for el in ent_list: + if el not in flat: + flat.append(el) + for ent in ent_list: + for p in ent.properties: + # For lists append each element that is of type Entity to flat: + if isinstance(p.value, list): + for el in p.value: + if isinstance(el, db.Entity): + if el not in flat: + flat.append(el) + SyncGraph._create_flat_list([el], flat) + elif isinstance(p.value, db.Entity): + if p.value not in flat: + flat.append(p.value) + SyncGraph._create_flat_list([p.value], flat) + return flat + + @staticmethod + def _create_reference_mapping(flat: list[SyncNode]): + """ + Create six dictionaries that describe references among SyncNodes. All dictionaries use the + Python ID of SyncNodes as keys. + There is always one dictionary to describe the direction of the reference, i.e. + map[id(node)] -> other where other is a set of SyncNodes that are being referenced by node. + And then there is always one dictionary for the inverse direction. The two dictionaries are + named "forward_" and "backward_", respectively. + + Then there are three kinds of maps being generated: One includes all references + ("_references"), one includes references that are values of identifying properties + ("_references_id_props") and one includes references that are relevant for identifying + backreferences/"is_referenced_by" ("_references_backref"). I.e. the two latter are subesets + of the former reference map. + + Arguments: + ---------- + flat: list[SyncNode] + all SyncNodes that span the graph for which the reference map shall be created + + Last review by Alexander Schlemmer on 2024-05-29. + """ + # TODO we need to treat children of RecordTypes somehow. + forward_references: dict[int, set[SyncNode]] = {} + backward_references: dict[int, set[SyncNode]] = {} + forward_references_id_props: dict[int, set[SyncNode]] = {} + backward_references_id_props: dict[int, set[SyncNode]] = {} + forward_references_backref: dict[int, set[SyncNode]] = {} + backward_references_backref: dict[int, set[SyncNode]] = {} + + # initialize with empty lists/dict + for node in flat: + forward_references[id(node)] = set() + backward_references[id(node)] = set() + forward_references_id_props[id(node)] = set() + backward_references_id_props[id(node)] = set() + forward_references_backref[id(node)] = set() + backward_references_backref[id(node)] = set() + for node in flat: + for p in node.properties: + val = p.value + if not isinstance(val, list): + val = [val] + for v in val: + if isinstance(v, SyncNode): + forward_references[id(node)].add(v) + backward_references[id(v)].add(node) + if ( + node.registered_identifiable is not None + and len( + [ + el.name + for el in node.registered_identifiable.properties + if el.name == p.name + ] + ) + > 0 + ): + forward_references_id_props[id(node)].add(v) + backward_references_id_props[id(v)].add(node) + if ( + v.registered_identifiable is not None + and IdentifiableAdapter.referencing_entity_has_appropriate_type( + node.parents, v.registered_identifiable + ) + ): + forward_references_backref[id(node)].add(v) + backward_references_backref[id(v)].add(node) + + return ( + forward_references, + backward_references, + forward_references_id_props, + backward_references_id_props, + forward_references_backref, + backward_references_backref, + ) + + def _mark_entities_with_path_or_id(self): + """A path or an ID is sufficiently identifying. 
Thus, those entities can be marked as + checked + + When this function returns, there is only one node for each ID (i.e. no two nodes with the + same ID). The same is true for paths. + + This function also updates _id_look_up and _path_look_up + + Last review by Alexander Schlemmer on 2024-05-29. + """ + for node in list(self.nodes): + if node.id is not None: + eq_node = self.get_equivalent(node) + if eq_node is not None: + self._basic_merge_into(node, eq_node) + else: + self._id_look_up[node.id] = node + self._mark_existing(node) + + for node in list(self.nodes): + if node.path is not None: + eq_node = self.get_equivalent(node) + if eq_node is not None: + self._basic_merge_into(node, eq_node) + else: + self._path_look_up[node.path] = node + try: + existing = cached_get_entity_by(path=node.path) + except EmptyUniqueQueryError: + existing = None + remote_id = None + if existing is not None: + remote_id = existing.id + self.set_id_of_node(node, remote_id) + + def _basic_merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and updates member variables + + - reference maps are updated + - self.nodes is updated + - self.unchecked is updated + - lookups are being updated + """ + # sanity checks + if source is target: + raise ValueError("source must not be target") + + target.update(source) + + # replace actual reference property values + for node in self.backward_references[id(source)]: + _set_each_scalar_value( + node, condition=lambda val: val is source, value=lambda val: target + ) + + # update reference mappings + for setA, setB in ( + (self.forward_references, self.backward_references), # ref: source -> other + (self.backward_references, self.forward_references), # ref: other -> source + (self.forward_references_id_props, self.backward_references_id_props), + (self.backward_references_id_props, self.forward_references_id_props), + (self.forward_references_backref, self.backward_references_backref), + (self.backward_references_backref, self.forward_references_backref), + ): + for node in setA.pop(id(source)): + setA[id(target)].add(node) + setB[id(node)].remove(source) + setB[id(node)].add(target) + + # remove unneeded SyncNode + self.nodes.remove(source) + if source in self.unchecked: + self.unchecked.remove(source) + # update look ups + if target.id is not None: + self._id_look_up[target.id] = target + if target.path is not None: + self._path_look_up[target.path] = target + if target.identifiable is not None: + self._identifiable_look_up[target.identifiable.get_representation()] = target + + def _merge_into(self, source: SyncNode, target: SyncNode): + """tries to merge source into target and performs the necessary updates: + - update the member variables of target using source (``target.update(source)``). + - replaces reference values to source by target + - updates the reference map + - updates lookup tables + - removes source from node lists + - marks target as missing/existing if source was marked that way + - adds an identifiable if now possible (e.g. merging based on ID might allow create an + identifiable when none of the two nodes had the sufficient properties on its own before) + - check whether dependent nodes can now get an identifiable (the merge might have set the + ID such that dependent nodes can now create an identifiable) + + Last review by Alexander Schlemmer on 2024-05-29. 
+ """ + self._basic_merge_into(source, target) + + if (id(source) in self._existing and id(target) in self._missing) or ( + id(target) in self._existing and id(source) in self._missing + ): + raise RuntimeError("Trying to merge missing and existing") + + if id(source) in self._missing and id(target) not in self._missing: + self._mark_missing(target) + elif id(source) in self._existing and id(target) not in self._existing: + self._mark_existing(target) + + # due to the merge it might now be possible to create an identifiable + if self._identifiable_is_needed(target): + self._set_identifiable_of_node(target) + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(target) + + eq_node = self.get_equivalent(target) + if eq_node is not None: + self._merge_into(target, eq_node) + + def _identifiable_is_needed(self, node: SyncNode): + """ + This function checks: + - the identifiable of node is None + - the node has all properties that are needed for the identifiable + - there are no unchecked entities that are needed for the identifiable of the node, + neither as forward or as backward references + + Last review by Alexander Schlemmer on 2024-05-24. + """ + return ( + node.identifiable is None + and not self._identity_relies_on_unchecked_entity(node) + and self.identifiableAdapter.all_identifying_properties_exist( + node, raise_exception=False + ) + ) + + def _initialize_nodes(self, entities: list[db.Entity]): + """create initial set of SyncNodes from provided Entity list""" + self._sanity_check(entities) + entities = self._create_flat_list(entities) + se_lookup: dict[int, SyncNode] = {} # lookup: python id -> SyncNode + + # Create new sync nodes from the list of entities, their registered identifiables + # are set from the identifiable adapter. + for el in entities: + self.nodes.append( + SyncNode(el, self.identifiableAdapter.get_registered_identifiable(el)) + ) + se_lookup[id(el)] = self.nodes[-1] + + # replace db.Entity objects with SyncNodes in references: + for node in self.nodes: + _set_each_scalar_value( + node, + condition=lambda val: id(val) in se_lookup, + value=lambda val: se_lookup[id(val)], + ) + + def _add_identifiables_to_dependent_nodes(self, node): + """For each dependent node, we check whether this allows to create an identifiable + + Last review by Alexander Schlemmer on 2024-05-29. + """ + for other_node in self._get_nodes_whose_identity_relies_on(node): + if self._identifiable_is_needed(other_node): + self._set_identifiable_of_node(other_node) + + def _mark_missing(self, node: SyncNode): + """Mark a sync node as missing and remove it from the dictionary of unchecked nodes. + + Last review by Alexander Schlemmer on 2024-05-24. + """ + self._missing[id(node)] = node + self.unchecked.remove(node) + + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(node) + # For each dependent node, we set the ID to None (missing) + # (None is the default second argument of set_id_of_node.) + for other_node in self._get_nodes_whose_identity_relies_on(node): + if other_node in self.unchecked: + self.set_id_of_node(other_node) + + def _mark_existing(self, node: SyncNode): + """Mark a sync node as existing and remove it from the dictionary of unchecked nodes. + + Last review by Alexander Schlemmer on 2024-05-24. 
+ """ + if isinstance(node.id, TempID): + raise ValueError("ID must valid existing entities, not TempID") + self._existing[id(node)] = node + self.unchecked.remove(node) + # This is one of three cases that affect other nodes: + # - mark existing + # - mark missing + # - merge + self._add_identifiables_to_dependent_nodes(node) diff --git a/src/caoscrawler/sync_node.py b/src/caoscrawler/sync_node.py new file mode 100644 index 0000000000000000000000000000000000000000..46187c0d63afd6c18de6dd1df3304f13badb1899 --- /dev/null +++ b/src/caoscrawler/sync_node.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Optional +from warnings import warn + +import linkahead as db +import yaml +from linkahead.common.models import Parent, ParentList, PropertyList + +from .exceptions import ImpossibleMergeError + +if TYPE_CHECKING: + from .identifiable import Identifiable + + +class TempID(int): + """A special kind of int for negative temporary IDs. + + This allows to identify TempIDs in the presence of String IDs. + A string ID might look like a negative integer. + """ + pass + + +class SyncNode(db.Entity): + """represents the information of an Entity as it shall be created in LinkAhead + + The following information is taken from an db.Entity object during initialization or when the + object is updated using the `update` member function: + - id + - role + - path + - file + - name + - description + - parents + - properties + + Typically, this class is used in the following way: + 1. A SyncNode is initialized with a db.Entity object. + 2. The SyncNode object is possibly updated one or more times with other SyncNode objects. + 3. A db.Entity object is created (`export_entity`) that contains the combined information. + """ + + def __init__( + self, entity: db.Entity, registered_identifiable: Optional[db.RecordType] = None, + **kwargs + ): + super().__init__(name=entity.name, + id=entity.id, + description=entity.description, + **kwargs) + # db.Entity properties + self.role = entity.role + self.path = entity.path + self.file = entity.file + self.parents = ParentList().extend(entity.parents) + self.properties = PropertyList().extend(entity.properties) + self._check_for_multiproperties() + # other members + self.identifiable: Optional[Identifiable] = None + self.registered_identifiable = registered_identifiable + + def update(self, other: SyncNode) -> None: + """Update this node with information of given ``other`` SyncNode. + + parents are added if they are not yet in the list properties + are added in any case. This may lead to duplication of + properties. 
We allow this duplication here and remove it when + we create a db.Entity (export_entity function) because if + property values are SyncNode objects, they might not be + comparable (no ID, no identifiable) yet. + + Raises + ------ + ValueError: + The `other` SyncNode doesn't share identifiables with + `this` SyncNode, so they can't be merged. + ImpossibleMergeError: + The two SyncNodes are incompatible in their attributes + like "id", "role", "path", "file", "name", or + "description". + + """ + + if other.identifiable is not None and self.identifiable is not None: + if ( + other.identifiable.get_representation() + != self.identifiable.get_representation() + ): + raise ValueError( + "The SyncNode that is used with update must have an equivalent" + f" identifiable. I.e. you cannot merge entities with differing identifiables" + "The identifiables where:\n" + f"{self.identifiable._create_hashable_string(self.identifiable)}\n" + f"and\n{other.identifiable._create_hashable_string(other.identifiable)}." + ) + + if other.identifiable: + self.identifiable = other.identifiable + for attr in ["id", "role", "path", "file", "name", "description"]: + if other.__getattribute__(attr) is not None: + if self.__getattribute__(attr) is None: + self.__setattr__(attr, other.__getattribute__(attr)) + else: + if self.__getattribute__(attr) != other.__getattribute__(attr): + raise ImpossibleMergeError( + f"Trying to update {attr} but this would lead to an " + f"override of the value '{self.__getattribute__(attr)}' " + f"by the value '{other.__getattribute__(attr)}'", + pname=attr, + value_a=self.__getattribute__(attr), + value_b=other.__getattribute__(attr) + ) + for p in other.parents: + if not parent_in_list(p, self.parents): + self.parents.append(p) + for p in other.properties: + self.properties.append(p) + + def export_entity(self) -> db.Entity: + """create a db.Entity object from this SyncNode + + Properties are only added once (based on id or name). If values do not match, an Error is + raised. If values are SyncNode objects with IDs, they are considered equal if their IDs are + equal. + + Raises + ------ + RuntimeError: + In case of a unsupported role, so no Entity can't be created. + ImpossibleMergeError: + In case of conflicting property values in this SyncNode. 
+        """
+        ent = None
+        if self.role == "Record":
+            ent = db.Record()
+        elif self.role == "File":
+            ent = db.File()
+        else:
+            raise RuntimeError("Invalid role")
+        for attr in ["id", "role", "path", "file", "name", "description"]:
+            ent.__setattr__(attr, self.__getattribute__(attr))
+        for p in self.parents:
+            ent.add_parent(p)
+        for p in self.properties:
+            entval: Any = ent.get_property(p)
+            if entval is None:
+                ent.add_property(id=p.id, name=p.name, value=p.value, description=p.description,
+                                 datatype=p.datatype, unit=p.unit)
+            else:
+                entval = entval.value
+                unequal = False
+                pval = p.value
+                if isinstance(entval, list) != isinstance(pval, list):
+                    unequal = True
+                if not isinstance(entval, list):
+                    entval = [entval]
+                if not isinstance(pval, list):
+                    pval = [pval]
+                if len(entval) != len(pval):
+                    unequal = True
+                else:
+                    for e_el, p_el in zip(entval, pval):
+                        if isinstance(e_el, SyncNode) and e_el.id is not None:
+                            e_el = e_el.id
+                        if isinstance(p_el, SyncNode) and p_el.id is not None:
+                            p_el = p_el.id
+                        if e_el != p_el:
+                            unequal = True
+
+                if unequal:
+                    ime = ImpossibleMergeError(
+                        f"The crawler is trying to create an entity \n\n{self}\n\nbut there are "
+                        "conflicting property values.",
+                        pname=p.name, value_a=entval, value_b=pval
+                    )
+                    raise ime
+        return ent
+
+    def __repr__(self) -> str:
+        """Somewhat concise text representation of the SyncNode."""
+        res = f"\n=====================================================\n{self.role}\n"
+        res += yaml.dump(
+            {
+                "id": self.id,
+                "name": self.name,
+                "path": self.path,
+                "parents": [el.name for el in self.parents],
+            },
+            allow_unicode=True,
+        )
+        res += "---------------------------------------------------\n"
+        res += "properties:\n"
+        d: dict[str, Any] = {}
+        for p in self.properties:
+            v = p.value
+            d[p.name] = []
+            if not isinstance(p.value, list):
+                v = [v]
+            for el in v:
+                if isinstance(el, SyncNode):
+                    d[p.name].append(
+                        {
+                            "id": el.id,
+                            "name": el.name,
+                            "path": el.path,
+                            "parents": [e.name for e in el.parents],
+                        }
+                    )
+                else:
+                    d[p.name].append(el)
+
+        return (
+            res
+            + yaml.dump(d, allow_unicode=True)
+            + "=====================================================\n"
+        )
+
+    def _check_for_multiproperties(self):
+        """Warn if multiproperties are present."""
+        ids = set()
+        names = set()
+        for p in self.properties:
+            if p.name is not None:
+                if p.name in names:
+                    warn("Multiproperties are not supported by the crawler.")
+                names.add(p.name)
+            if p.id is not None:
+                if p.id in ids:
+                    warn("Multiproperties are not supported by the crawler.")
+                ids.add(p.id)
+
+
+def parent_in_list(parent: Parent, plist: ParentList) -> bool:
+    """Helper function that checks whether a parent with the same name or ID is in the plist."""
+    # filter_by_identity returns the matching elements; cast to bool to match the annotation.
+    return bool(plist.filter_by_identity(parent))
+
+
+def property_in_list(prop: db.Property, plist: PropertyList) -> bool:
+    """Helper function that checks whether a property with the same name or ID is in the plist."""
+    return bool(plist.filter_by_identity(prop))
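+
+
+# Usage sketch (illustrative only; the record and parent names are made up):
+#
+#     rec = db.Record(name="sample1").add_parent(name="Sample")
+#     node = SyncNode(rec)
+#     node.update(SyncNode(db.Record(name="sample1")))
+#     merged = node.export_entity()  # db.Record combining the information of both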
diff --git a/src/caoscrawler/transformer_functions.py b/src/caoscrawler/transformer_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..117d0b021d4ec0b0efc79c5db0d7ed397207933f
--- /dev/null
+++ b/src/caoscrawler/transformer_functions.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+"""Definition of default transformer functions.
+
+See https://docs.indiscale.com/caosdb-crawler/converters.html#transform-functions for more
+information.
+
+"""
+
+import datetime
+import re
+from typing import Any
+
+
+def submatch(in_value: Any, in_parameters: dict):
+    """
+    Substitute the variable if it matches the regexp stored in "match".
+
+    Returns the "in" value if it does NOT match the regexp of 'match'.
+    Otherwise (if it matches) the value of 'then' stored in the second argument is returned.
+    """
+    if "match" not in in_parameters or "then" not in in_parameters:
+        raise RuntimeError("Mandatory parameters missing.")
+    if re.match(in_parameters["match"], in_value) is not None:
+        return in_parameters["then"]
+    return in_value
+
+
+def split(in_value: Any, in_parameters: dict):
+    """Calls the string 'split' function on the first argument and uses the value of the key
+    'marker' stored in the second argument.
+    """
+    if "marker" not in in_parameters:
+        raise RuntimeError("Mandatory parameter missing.")
+    if not isinstance(in_value, str):
+        raise RuntimeError("Argument must be a string.")
+    return in_value.split(in_parameters['marker'])
+
+
+def replace(in_value: Any, in_parameters: dict):
+    """Calls the string 'replace' function on the first argument and uses the value of the keys
+    'remove' and 'insert' stored in the second argument.
+    """
+    if "remove" not in in_parameters or "insert" not in in_parameters:
+        raise RuntimeError("Mandatory parameter missing.")
+    if not isinstance(in_value, str):
+        raise RuntimeError("Argument must be a string.")
+    return in_value.replace(in_parameters['remove'], in_parameters['insert'])
+
+
+def date_parse(in_value: str, params: dict) -> str:
+    """Transform text so that it is formatted in a way that LinkAhead can understand it.
+
+Parameters
+==========
+
+- date_format: str, optional
+  A format string using the ``datetime`` specification:
+  https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
+    """
+    fmt_default = "%Y-%m-%d"
+    fmt = params.get("date_format", fmt_default)
+    dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default)
+    return dt_str
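+
+
+# Example (illustrative): normalize a German-style date to LinkAhead's format:
+#     date_parse("01.02.2024", {"date_format": "%d.%m.%Y"})  # -> "2024-02-01"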
+
+
+def datetime_parse(in_value: str, params: dict) -> str:
+    """Transform text so that it is formatted in a way that LinkAhead can understand it.
+
+
+Parameters
+==========
+
+- datetime_format: str, optional
+  A format string using the ``datetime`` specification:
+  https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
+    """
+    fmt_default = "%Y-%m-%dT%H:%M:%S"
+    fmt = params.get("datetime_format", fmt_default)
+    dt_str = datetime.datetime.strptime(in_value, fmt).strftime(fmt_default)
+    return dt_str
+
+
+def cast_to_int(in_value: Any, params: dict) -> int:
+    """
+    Cast the `in_value` to int.
+
+    Parameters
+    ==========
+    No parameters.
+    """
+    return int(in_value)
+
+
+def cast_to_float(in_value: Any, params: dict) -> float:
+    """
+    Cast the `in_value` to float.
+
+    Parameters
+    ==========
+    No parameters.
+    """
+    return float(in_value)
+
+
+def cast_to_bool(in_value: Any, params: dict) -> bool:
+    """
+    Cast the `in_value` to bool.
+
+    This is done by a case-insensitive comparison: variants of "true" and
+    "false" (such as "True" or "FALSE") are accepted as possible values.
+    All other input values raise an error.
+
+    Parameters
+    ==========
+    No parameters.
+    """
+    val = str(in_value).lower()
+    if val == "true":
+        return True
+    if val == "false":
+        return False
+    raise ValueError("Invalid value for type cast to bool: {}".format(in_value))
+
+
+def cast_to_str(in_value: Any, params: dict) -> str:
+    """
+    Cast the `in_value` to str.
+
+    Parameters
+    ==========
+    No parameters.
+    """
+    return str(in_value)
diff --git a/src/caoscrawler/utils.py b/src/caoscrawler/utils.py
index 61b363099d0892b74e91f257bccb6cc832c3d59f..5f736d5ad7550e0b29cb629b2fa140a2f38d6f5f 100644
--- a/src/caoscrawler/utils.py
+++ b/src/caoscrawler/utils.py
@@ -25,7 +25,12 @@
 
 # Some utility functions, e.g. for extending pylib.
 
-import caosdb as db
+import sys
+from posixpath import join as posixjoin
+from typing import Optional
+from urllib.parse import urljoin
+
+import linkahead as db
 
 
 def has_parent(entity: db.Entity, name: str):
@@ -39,3 +44,45 @@ def has_parent(entity: db.Entity, name: str):
         if parent.name == name:
             return True
     return False
+
+
+def MissingImport(name: str, hint: str = "", err: Optional[Exception] = None) -> type:
+    """Factory for dummy classes which may be assigned to variables but must never be used."""
+    def _error():
+        error_msg = f"This class ({name}) cannot be used, because some libraries are missing."
+        if hint:
+            error_msg += "\n\n" + hint
+
+        if err:
+            print(error_msg, file=sys.stdout)
+            raise RuntimeError(error_msg) from err
+        raise RuntimeError(error_msg)
+
+    class _Meta(type):
+        def __getattribute__(cls, *args, **kwargs):
+            _error()
+
+        def __call__(cls, *args, **kwargs):
+            _error()
+
+    class _DummyClass(metaclass=_Meta):
+        pass
+
+    _DummyClass.__name__ = name
+
+    return _DummyClass
+
+
+def get_shared_resource_link(host_url, filename):
+    """Return a link address which is basically {host_url}/Shared/{filename}.
+
+    Use urllib.parse.urljoin and posixpath.join to prevent missing or extra ``/`` and the like.
+
+    """
+
+    if not host_url.endswith('/'):
+        # Append a trailing '/' so that urljoin doesn't remove the context root.
+        host_url += '/'
+    # Use posixjoin to always have '/' in links, even when running on
+    # Windows systems.
+    return urljoin(host_url, posixjoin("Shared/", filename))
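+
+
+# Example (illustrative):
+#     get_shared_resource_link("https://example.com/linkahead", "export.csv")
+#     # -> "https://example.com/linkahead/Shared/export.csv"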
diff --git a/src/caoscrawler/validator.py b/src/caoscrawler/validator.py
new file mode 100644
index 0000000000000000000000000000000000000000..33e29b02db429e3382248bbd80d2d00cd7b07c6b
--- /dev/null
+++ b/src/caoscrawler/validator.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2024 Alexander Schlemmer
+#
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+This module contains functions to validate the output of a scanner run with a
+json schema.
+"""
+
+import jsonschema
+import linkahead as db
+# from caosadvancedtools.models.parser import parse_model_from_string
+from caosadvancedtools.json_schema_exporter import recordtype_to_json_schema
+from caosadvancedtools.models.parser import parse_model_from_yaml
+from jsonschema import ValidationError
+from linkahead.high_level_api import convert_to_python_object
+
+
+def load_json_schema_from_datamodel_yaml(filename: str) -> dict[str, dict]:
+    """
+    Load a data model yaml file (using caosadvancedtools) and convert
+    all record types into a json schema using the json_schema_exporter module.
+
+    Arguments
+    ---------
+    filename: str
+        The filename of the yaml file to load.
+
+    Returns
+    -------
+    A dict of json schema objects. The keys are the record types for which the schemas
+    are generated.
+    """
+
+    model = parse_model_from_yaml(filename)
+
+    rt_schemas = {}
+    for el_key, el in model.items():
+        if isinstance(el, db.RecordType):
+            rt_schemas[el_key] = recordtype_to_json_schema(el)
+
+    return rt_schemas
+
+
+def representer_ordereddict(dumper, data):
+    """
+    Helper function to be able to represent the converted json schema objects correctly as yaml.
+    This representer essentially replaces OrderedDict objects with simple dict objects.
+
+    Since Python 3.7 dicts are ordered by default, see e.g.:
+    https://softwaremaniacs.org/blog/2020/02/05/dicts-ordered/en/
+
+    Example how to use the representer:
+    ```python
+    yaml.add_representer(OrderedDict, caoscrawler.validator.representer_ordereddict)
+    ```
+    """
+    return dumper.represent_data(dict(data))
+
+
+def _apply_schema_patches(pobj: dict):
+    """
+    Apply patches to the dictionary form of a record. Changes applied:
+    - Properties are moved from the subitem "properties" to the top level.
+    - The following keys are deleted: parents, role, name, description, metadata, properties
+    """
+    if "properties" not in pobj:
+        # this is probably a file
+        return pobj
+    for prop in pobj["properties"]:
+        if isinstance(pobj["properties"][prop], dict):
+            pobj[prop] = _apply_schema_patches(pobj["properties"][prop])
+        else:
+            pobj[prop] = pobj["properties"][prop]
+
+    for keyd in ("parents", "role", "name",
+                 "description", "metadata", "properties"):
+        if keyd in pobj:
+            del pobj[keyd]
+
+    return pobj
+
+
+def convert_record(record: db.Record):
+    """
+    Convert a record into a form suitable for validation with jsonschema.
+
+    Uses `high_level_api.convert_to_python_object`.
+    Afterwards `_apply_schema_patches` is called recursively to refactor the dictionary
+    to match the current form of the jsonschema.
+
+    Arguments:
+    ----------
+    record: db.Record
+        The record that is supposed to be converted.
+    """
+    pobj = convert_to_python_object(record).serialize()
+    return _apply_schema_patches(pobj)
+
+
+def validate(records: list[db.Record], schemas: dict[str, dict]) -> list[tuple]:
+    """
+    Validate a list of records against a dictionary of schemas.
+
+    The keys of the dictionary are record types and the corresponding values are json schemata
+    associated with that record type. The current implementation assumes that each record that is
+    checked has exactly one parent and raises an error if that is not the case.
+    The schema belonging to a record is identified using the name of the first (and only) parent
+    of the record.
+
+    Arguments:
+    ----------
+
+    records: list[db.Record]
+        List of records that will be validated.
+
+    schemas: dict[str, dict]
+        A dictionary of JSON schemas generated using `load_json_schema_from_datamodel_yaml`.
+
+    Returns:
+    --------
+    A list of tuples, one element for each record:
+
+    - Index 0: A boolean that determines whether the schema belonging to the record type of the
+      record matched.
+    - Index 1: A validation error if the schema did not match or None otherwise.
+    """
+
+    retval = []
+    for r in records:
+        if len(r.parents) != 1:
+            raise NotImplementedError(
+                "Schema validation is only supported if records have exactly one parent.")
+        parname = r.parents[0].name
+        if parname not in schemas:
+            raise RuntimeError(
+                "No schema for record type {} in schema dictionary.".format(parname))
+        try:
+            jsonschema.validate(convert_record(r), schemas[parname])
+            retval.append((True, None))
+        except ValidationError as ex:
+            retval.append((False, ex))
+    return retval
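+
+
+# Usage sketch (illustrative; "model.yml" is an assumed file name):
+#
+#     schemas = load_json_schema_from_datamodel_yaml("model.yml")
+#     for ok, err in validate(records, schemas):
+#         if not ok:
+#             print(err.message)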
diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py
index e73905dcd25673eae88f718a7e45b7b4d0665e47..4cd435486aca26e20e785bbbeb65c013d8e727cb 100644
--- a/src/caoscrawler/version.py
+++ b/src/caoscrawler/version.py
@@ -17,16 +17,15 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
-try:
-    from importlib import metadata as importlib_metadata
-except ImportError:  # Python<3.8 dowesn"t support this so use
-    import importlib_metadata
+from importlib import metadata as importlib_metadata
+from warnings import warn
 
 from packaging.version import parse as parse_version
-from warnings import warn
 
-# Read in version of locally installed caoscrawler package
-version = importlib_metadata.version("caoscrawler")
+
+def get_caoscrawler_version():
+    """Read in the version of the locally installed caoscrawler package."""
+    return importlib_metadata.version("caoscrawler")
 
 
 class CfoodRequiredVersionError(RuntimeError):
@@ -41,7 +40,7 @@ def check_cfood_version(metadata: dict):
 
     if not metadata or "crawler-version" not in metadata:
         msg = """
-No crawler version specified in cfood definition, so there is now guarantee that
+No crawler version specified in cfood definition, so there is no guarantee that
 the cfood definition matches the installed crawler version.
 
 Specifying a version is highly recommended to ensure that the definition works
@@ -51,7 +50,7 @@ as expected with the installed version of the crawler.
 
         warn(msg, UserWarning)
         return
 
-    installed_version = parse_version(version)
+    installed_version = parse_version(get_caoscrawler_version())
     cfood_version = parse_version(metadata["crawler-version"])
 
     if cfood_version > installed_version:
diff --git a/src/doc/_static/assets/scifolder_tutorial.tar.gz b/src/doc/_static/assets/scifolder_tutorial.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..7c06dea70f95630278d17f2cac5174aa9e208509
Binary files /dev/null and b/src/doc/_static/assets/scifolder_tutorial.tar.gz differ
diff --git a/src/doc/cfood-schema.yml b/src/doc/cfood-schema.yml
new file mode 120000
index 0000000000000000000000000000000000000000..c34280ed44cdd07eccb20e314e08a60c899dfe8a
--- /dev/null
+++ b/src/doc/cfood-schema.yml
@@ -0,0 +1 @@
+../caoscrawler/cfood-schema.yml
\ No newline at end of file
diff --git a/src/doc/cfood-specification.rst b/src/doc/cfood-specification.rst
new file mode 100644
index 0000000000000000000000000000000000000000..89588b2da9e4bb828eba05cc353dedb3abd6c821
--- /dev/null
+++ b/src/doc/cfood-specification.rst
@@ -0,0 +1,10 @@
+CFood-Specification
+(((((((((((((((((((
+
+
+CFoods are defined using a YAML file that has to abide by the following
+specification. The specification is defined using a JSON schema (see
+`src/caoscrawler/cfood-schema.yml`). A CFood is basically composed of converter
+definitions. A converter definition must have the following structure:
+
+.. jsonschema:: cfood-schema.yml#/cfood/$defs/converter
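+
+For orientation, a minimal converter definition following this structure could
+look like the sketch below (all names are illustrative):
+
+.. code-block:: yaml
+
+   my_dir:              # a free-form identifier for this converter definition
+     type: Directory    # the converter that is applied
+     match: ^data$      # regular expression matched against the directory name
+     records:
+       MyRecord:        # a Record that is created whenever the converter matches
+         name: data
+     subtree:
+       my_file:
+         type: SimpleFile
+         match: .*\.csv$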
diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst
index 37f6a8c7d3be9298ec965c50a4ec29110988ddc6..0c7726d2017b955ecd7472d57dc259ff9a7bab53 100644
--- a/src/doc/cfood.rst
+++ b/src/doc/cfood.rst
@@ -4,6 +4,7 @@ CFood-Definition
 The crawler specification is called CFood-definition. It is stored inside a yaml file, or - more
 precisely - inside of one single or two yaml documents inside a yaml file. The specification
 consists of three separate parts:
+
 #. Metadata and macro definitions
 #. Custom converter registrations
 #. The converter tree specification
@@ -26,17 +27,17 @@ A single document with a converter tree specification:
 
 .. _example_1:
 .. code-block:: yaml
-  
+
   extroot:
     type: Directory
     match: ^extroot$
     subtree:
       DataAnalysis:
-        type: Directory
-        match: DataAnalysis
-        # (...)
+          type: Directory
+          match: DataAnalysis
+          # (...)
+
 
 A single document with a converter tree specification, but also including a custom converters section:
 
 .. _example_2:
@@ -49,15 +50,15 @@ A single document with a converter tree specification, but also including a cust
       CustomConverter_2:
         package: mypackage.converters
         converter: CustomConverter2
-  
+
   extroot:
     type: Directory
     match: ^extroot$
     subtree:
       DataAnalysis:
-        type: Directory
-        match: DataAnalysis
-        # (...)
+          type: Directory
+          match: DataAnalysis
+          # (...)
 
 
@@ -77,11 +78,11 @@ two custom converters in the second document (**not recommended**, see the recom
     - !defmacro
       name: SimulationDatasetFile
       params:
-        match: null
-        recordtype: null
-        nodename: null
+          match: null
+          recordtype: null
+          nodename: null
       definition:
-        # (...)
+          # (...)
 
 ---
 Converters:
   CustomConverter_1:
@@ -90,15 +91,15 @@ two custom converters in the second document (**not recommended**, see the recom
   CustomConverter_2:
     package: mypackage.converters
     converter: CustomConverter2
-  
+
   extroot:
     type: Directory
     match: ^extroot$
     subtree:
       DataAnalysis:
-        type: Directory
-        match: DataAnalysis
-        # (...)
+          type: Directory
+          match: DataAnalysis
+          # (...)
@@ -117,27 +118,27 @@ The **recommended way** of defining metadata, custom converters, macros and the
     - !defmacro
       name: SimulationDatasetFile
       params:
-        match: null
-        recordtype: null
-        nodename: null
+          match: null
+          recordtype: null
+          nodename: null
       definition:
-        # (...)
+          # (...)
 
 Converters:
   CustomConverter_1:
-    package: mypackage.converters
-    converter: CustomConverter1
+      package: mypackage.converters
+      converter: CustomConverter1
   CustomConverter_2:
-    package: mypackage.converters
-    converter: CustomConverter2
+      package: mypackage.converters
+      converter: CustomConverter2
 
 ---
 extroot:
   type: Directory
   match: ^extroot$
   subtree:
     DataAnalysis:
-      type: Directory
-      match: DataAnalysis
-      # (...)
+        type: Directory
+        match: DataAnalysis
+        # (...)
 
 
 List Mode
@@ -147,8 +148,125 @@ Specifying values of properties can make use of two special characters, in order
 create lists or multi properties instead of single values:
 
 .. code-block:: yaml
-  
-  Experiment1:
-    Measurement: +Measurement <- Element in List (list is cleared before run)
-                 *Measurement <- Multi Property (properties are removed before run)
-                 Measurement <- Overwrite
+
+  Experiment1:
+    Measurement: +Measurement  # Element in List (list is cleared before run)
+                 *Measurement  # Multi Property (properties are removed before run)
+                 Measurement   # Overwrite
+
+Values and units
+----------------
+
+Property values can be specified as simple strings (as above) or as
+dictionaries that may also specify the :ref:`collection mode <List
+Mode>`. Strings starting with a "$" will be replaced by a
+corresponding variable if there is any. See the :doc:`tutorials
+chapter<tutorials/index>` of this documentation for more elaborate
+examples on how the variable replacement works exactly. A simple
+example could look like the following.
+
+.. code-block:: yaml
+
+  ValueElt:
+    type: TextElement
+    match_name: ^my_prop$
+    match_value: "(?P<value>.*)"  # Anything in here is stored in the variable "value"
+    records:
+      MyRecord:
+        MyProp: $value  # will be replaced by whatever is stored in the "value" variable set above.
+
+If not given explicitly, the collection mode will be determined from
+the first character of the property value as explained above, and the
+following three definitions are all equivalent:
+
+.. code-block:: yaml
+
+  MyProp: +$value
+
+.. code-block:: yaml
+
+  MyProp:
+    value: +$value
+
+and
+
+.. code-block:: yaml
+
+  MyProp:
+    value: $value
+    collection_mode: list
+
+
+Units of numeric values can be set by providing a property value not
+as a single string, but as a dictionary with a ``value`` and a
+``unit`` key. Within a converter definition this could look like the
+following.
+
+.. code-block:: yaml
+
+  ValueWithUnitElt:
+    type: TextElement
+    match_name: ^my_prop$
+    match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)"  # Extract value and unit from a string which
+                                                              # has a number followed by at least one whitespace
+                                                              # character followed by a unit.
+    records:
+      MyRecord:
+        MyProp:
+          value: $number
+          unit: $unit
+
+
+File Entities
+-------------
+
+In order to use File Entities, you must set the appropriate ``role: File``.
+Additionally, the path and file keys have to be given, with values that set the
+paths remotely and locally, respectively. You can use the variable
+``<converter name>_path`` that is automatically created by converters that deal
+with file system related StructureElements. The file object itself is stored
+in a variable with the same name (as it is the case for other Records).
+
+
+.. code-block:: yaml
+
+  somefile:
+    type: SimpleFile
+    match: ^params.*$  # match any file that starts with "params"
+    records:
+      fileEntity:
+        role: File  # necessary to create a File Entity
+        path: somefile.path  # defines the path in CaosDB
+        file: somefile.path  # path where the file is found locally
+      SomeRecord:
+        ParameterFile: $fileEntity  # creates a reference to the file
+
+
+Transform Functions
+-------------------
+
+You can use transform functions to alter variable values that the crawler consumes (e.g. a string
+that was matched with a regexp). See the :doc:`Converter Documentation<converters/index>`.
+
+You can define your own transform functions by adding them the same way you add custom converters:
+
+.. code-block:: yaml
+
+  Transformers:
+    transform_foo:
+      package: some.package
+      function: some_foo
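+
+The referenced function itself is a plain Python callable that receives the
+input value and a parameter dict, like the built-in transformer functions in
+:py:mod:`caoscrawler.transformer_functions`. A minimal sketch (reusing the
+illustrative names ``some.package`` and ``some_foo`` from above) could be:
+
+.. code-block:: python
+
+   # contents of the illustrative module some/package.py
+   from typing import Any
+
+
+   def some_foo(in_value: Any, in_parameters: dict):
+       """Prefix the input value with the (illustrative) parameter "prefix"."""
+       return in_parameters.get("prefix", "") + str(in_value)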
+
+
+Automatically generated keys
+++++++++++++++++++++++++++++
+
+Some variable names are automatically generated and can be used using the
+``$<variable name>`` syntax. Those include:
+
+- ``<converter name>``: access the path of converter names to the current converter
+- ``<converter name>.path``: the file system path to the structure element
+  (file system related converters only; you need curly brackets to use them:
+  ``${<converter name>.path}``)
+- ``<Record key>``: all entities that are created in the ``records`` section
+  are available under the same key
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 89757f21958f3d94649b33e9f9112593f703191d..e1cbb10eff2c86034fc24a3fb0c73949e202df30 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -1,45 +1,54 @@
+========
 Concepts
-))))))))
+========
+
+The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is a
+directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB
+Crawler works.
+
+Basics
+======
+
 Structure Elements
 ++++++++++++++++++
 
-This hierarchical structure is assumed to be consituted of a tree of
-StructureElements. The tree is created on the fly by so called Converters which
-are defined in a yaml file. The tree of StructureElements is a model
-of the existing data (For example could a tree of Python file objects
-(StructureElements) represent a file tree that exists on some file server).
+The crawled hierarchical structure is represented by a tree of *StructureElements*. This tree is
+generated on the fly by so-called Converters which are defined in a yaml file (usually called
+``cfood.yml``). This generated tree of StructureElements is a model of the existing data. For
+example, a tree of Python *file objects* (StructureElements) could correspond to a file system tree.
 
 Relevant sources in:
 
-- ``src/structure_elements.py``
+- :py:mod:`caoscrawler.structure_elements`
+
+.. _ConceptConverters:
 
 Converters
 ++++++++++
 
-Converters treat StructureElements and thereby create the StructureElement that
-are the children of the treated StructureElement. Converters therefore create
-the above named tree. The definition of a Converter also contains what
-Converters shall be used to treat the generated child-StructureElements. The
-definition is therefore a tree itself.
-
-See :std:doc:`converters<converters>` for details.
+Converters treat a StructureElement and during this process create a number of new
+StructureElements: the children of the initially treated StructureElement. Thus by treatment of
+existing StructureElements, Converters create a tree of StructureElements.
+
+.. image:: img/converter.png
+   :height: 170
+
+See the chapter :std:doc:`Converters<converters/index>` for details.
 
 Relevant sources in:
 
-- ``src/converters.py``
+- :py:mod:`caoscrawler.converters`
 
 Identifiables
 +++++++++++++
 
-An Identifiable of a Record is like the fingerprint of a Record.
+An *Identifiable* of a Record is like the fingerprint of a Record.
 
-The identifiable contains the information that is used by the CaosDB Crawler to identify Records.
-For example, in order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query
-using the information contained in the Identifiable.
+The Identifiable contains the information that is used by the CaosDB Crawler to identify Records.
+For example, the CaosDB Crawler may create a query using the information contained in the
+Identifiable in order to check whether a Record exists in the CaosDB Server.
 
 Suppose a certain experiment is at most done once per day, then the identifiable could consist of
 the RecordType "SomeExperiment" (as a parent) and the Property "date" with the respective value.
@@ -61,8 +70,8 @@ In the current implementation an identifiable can only use one RecordType even t
 
 Relevant sources in
 
-- ``src/identifiable_adapters.py``
-- ``src/identifiable.py``
+- :py:mod:`caoscrawler.identifiable_adapters`
+- :py:mod:`caoscrawler.identifiable`
 
 Registered Identifiables
 ++++++++++++++++++++++++
@@ -70,7 +79,7 @@ A Registered Identifiable is the blue print for Identifiables.
 You can think of registered identifiables as identifiables without concrete values for properties.
 RegisteredIdentifiables are associated with RecordTypes and define of what information an identifiable for that RecordType
-exists. There can be multiple Registered Identifiables for one RecordType.
+exists. There cannot be multiple Registered Identifiables for one RecordType.
 If identifiables shall contain references to the object to be identified, the
 Registered Identifiable must list the RecordTypes of the Entities that have those references.
 
@@ -81,7 +90,40 @@ we can check whether a Record with the parent "Project" is referencing the "Expe
 Record. If that is the case, this reference is part of the identifiable for the "Experiment"
 Record. Note, that if there are multiple Records with the appropriate parent (e.g.
 multiple "Project" Records in the above example) it will be required that all of them
-reference the object to be identified.
+reference the object to be identified. You can also use the wildcard "*" as
+RecordType name in the configuration which will only require that ANY Record
+references the Record at hand.
+
+
+Instead of defining registered identifiables for a RecordType
+directly, they can be defined for their parents. I.e., if there is no
+registered identifiable for a RecordType, then it will be checked
+whether there is a parent that has one. If multiple recordtypes exist
+in the inheritance chain with a registered identifiable, then the one
+that is closest to the direct parent is used. In case of multiple
+inheritance, only one branch must have registered identifiables.
+
+The reason for this behavior is the following: if there were
+multiple registered identifiables that could be used to identify a
+given record and only a single one of them would be used, it might be
+that the existence check returns a different result than if the other
+one were used. This would allow for unpredictable and inconsistent
+behavior (Example: one registered identifiable contains the name,
+another one the property date. Using the name might imply that the record
Using the name might imply that the record +does not exist and using the date might imply that it does. Thus, for +any Record the registered identifiable must be unique). Analogous +Example: If you think in the context of relational databases, there +can always only be a foreign key associated with one table. + +.. note:: + + In case of using the registered identifiable of a parent, the + identifiable will be created by using the parent + RecordType. Example: The registered identifiable is defined for the + parent "Experiment" and the RecordType at hand "LaseExperiment" is + a child of "Experiment". Then the identifiable will construct a + query that searches for "Experiment" Records (and not + "LaseExperiment" Records). Identified Records @@ -92,12 +134,14 @@ The Crawler +++++++++++ The crawler can be considered the main program doing the synchronization in basically two steps: + #. Based on a yaml-specification scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. + #. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entites accordingly. Relevant sources in: -- ``src/crawl.py`` +- :py:mod:`caoscrawler.crawl` @@ -169,3 +213,13 @@ Example: File Objects ============ + +TODO + +Caching ++++++++ + +The Crawler uses the cached library function ``cached_get_entity_by``. The cache is cleared +automatically when the Crawler does updates, but if you ran the same Python process indefinitely, +the Crawler would not see changes in LinkAhead due to the cache. Thus, please make sure to clear the +cache if you create long running Python processes. diff --git a/src/doc/conf.py b/src/doc/conf.py index b2873c846e7275b2a5bfbc8bc5cd18dabaa843ef..2a783dec27a700f9d350a2b46cdd647ff0fccf2f 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -21,22 +21,22 @@ # import os import sys + sys.path.insert(0, os.path.abspath('..')) import sphinx_rtd_theme # noqa: E402 - # -- Project information ----------------------------------------------------- project = 'caosdb-caoscrawler' -copyright = '2021, MPIDS' +copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.2.1' +version = '0.11.1' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.2.1-dev' +release = '0.11.1-dev' # -- General configuration --------------------------------------------------- @@ -53,8 +53,11 @@ extensions = [ 'sphinx.ext.autosectionlabel', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', # For Google style docstrings + "sphinx.ext.todo", "recommonmark", # For markdown files. "sphinx_rtd_theme", + 'sphinx.ext.autodoc', + 'sphinx-jsonschema', ] # Add any paths that contain templates here, relative to this directory. @@ -100,7 +103,7 @@ html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] # ['_static'] +html_static_path = ['_static'] # ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. @@ -213,6 +216,10 @@ intersphinx_mapping = { # TODO Which options do we want? 
 autodoc_default_options = {
-    'members': None,
-    'undoc-members': None,
+    'members': True,
+    'undoc-members': True,
+    'member-order': 'bysource',
+    'special-members': '__init__',
 }
+
+todo_include_todos = True
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
deleted file mode 100644
index b4ba89ced3b5858ca2f8abe7bc724d6710d9203b..0000000000000000000000000000000000000000
--- a/src/doc/converters.rst
+++ /dev/null
@@ -1,504 +0,0 @@
-Converters
-))))))))))
-
-Converters treat StructureElements and thereby create the StructureElement that
-are the children of the treated StructureElement. Converters therefore create
-the tree of structure elements. The definition of a Converter also contains what
-Converters shall be used to treat the generated child-StructureElements. The
-definition is therefore a tree itself.
-
-Each StructureElement in the tree has a set of data values, i.e a dictionary of
-key value pairs.
-Some of those values are set due to the kind of StructureElement. For example,
-a file could have the file name as such a key value pair: 'filename': <sth>.
-Converters may define additional functions that create further values. For
-example, a regular expresion could be used to get a date from a file name.
-
-
-A converter is defined via a yml file or part of it. The definition states
-what kind of StructureElement it treats (typically one).
-Also, it defines how children of the current StructureElement are
-created and what Converters shall be used to treat those.
-
-The yaml definition looks like the following:
-
-TODO: outdated, see cfood-schema.yml
-
-.. code-block:: yaml
-
-    <NodeName>:
-        type: <ConverterName>
-        match: ".*"
-        records:
-            Experiment1:
-                parents:
-                - Experiment
-                - Blablabla
-                date: $DATUM
-                (...)
-            Experiment2:
-                parents:
-                - Experiment
-        subtree:
-            (...)
-
-The **<NodeName>** is a description of what it represents (e.g.
-'experiment-folder') and is used as identifier.
-
-**<type>** selects the converter that is going to be matched against the current structure
-element. If the structure element matches (this is a combination of a typecheck and a detailed
-match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used
-to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`.
-
-**records** is a dict of definitions that define the semantic structure
-(see details below).
-
-Subtree contains a list of Converter defnitions that look like the one
-described here.
-
-
-Standard Converters
-+++++++++++++++++++
-
-Directory Converter
-===================
-The Directory Converter creates StructureElements for each File and Directory
-inside the current Directory. You can match a regular expression against the
-directory name using the 'match' key.
-
-Simple File Converter
-=====================
-The Simple File Converter does not create any children and is usually used if
-a file shall be used as it is and be inserted and referenced by other entities.
-
-Markdown File Converter
-=======================
-Reads a YAML header from Markdown files (if such a header exists) and creates
-children elements according to the structure of the header.
-
-DictElement Converter
-==============
-Creates a child StructureElement for each key in the dictionary.
- -Typical Subtree converters --------------------------- -The following StructureElement are typically created: - -- BooleanElement -- FloatElement -- TextElement -- IntegerElement -- ListElement -- DictElement - -Scalar Value Converters -======================= -`BooleanElementConverter`, `FloatElementConverter`, `TextElementConverter`, and -`IntegerElementConverter` behave very similarly. - -These converters expect `match_name` and `match_value` in their definition -which allow to match the key and the value, respectively. - -Note that there are defaults for accepting other types. For example, -FloatElementConverter also accepts IntegerElements. The default -behavior can be adjusted with the fields `accept_text`, `accept_int`, -`accept_float`, and `accept_bool`. - -The following denotes what kind of StructureElements are accepted by default -(they are defined in `src/caoscrawler/converters.py`): - -- DictBooleanElementConverter: bool, int -- DictFloatElementConverter: int, float -- DictTextElementConverter: text, bool, int, float -- DictIntegerElementConverter: int -- DictListElementConverter: list -- DictDictElementConverter: dict - -YAMLFileConverter -================= - -A specialized Dict Converter for yaml files: Yaml files are opened and the contents are -converted into dictionaries that can be further converted using the typical subtree converters -of dict converter. - -**WARNING**: Currently unfinished implementation. - -JSONFileConverter -================= - - - - -TableConverter -============== - -A generic converter (abstract) for files containing tables. -Currently, there are two specialized implementations for xlsx-files and csv-files. - -All table converters generate a subtree that can be converted with DictDictElementConverters: -For each row in the table a DictDictElement (structure element) is generated. The key of the -element is the row number. The value of the element is a dict containing the mapping of -column names to values of the respective cell. - -Example: - -.. code-block:: yaml - - subtree: - TABLE: - type: CSVTableConverter - match: ^test_table.csv$ - records: - (...) # Records edited for the whole table file - subtree: - ROW: - type: DictDictElement - match_name: .* - match_value: .* - records: - (...) # Records edited for each row - subtree: - COLUMN: - type: DictFloatElement - match_name: measurement # Name of the column in the table file - match_value: (?P<column_value).*) - records: - (...) # Records edited for each cell - - -XLSXTableConverter -================== - -CSVTableConverter -================= - -Custom Converters -+++++++++++++++++ - -It was previously mentioned that it is possible to create custom converters. -These custom converters can be used to integrate arbitrary data extraction and ETL capabilities -into the caosdb-crawler and make these extensions available to any yaml specification. - -The basic syntax for adding a custom converter to a yaml cfood definition file is: - -.. code-block:: yaml - - Converters: - <NameOfTheConverterInYamlFile>: - package: <python>.<module>.<name> - converter: <PythonClassName> - -The Converters-section can be either put into the first or second document of the cfood yaml file. -It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. - -Details: - -- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file. 
-- **<python>.<module>.<name>**: The name of the module where the converter class resides. -- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. - -The following methods are abstract and need to be overwritten by your custom converter to make it work: - -- :py:meth:`~caoscrawler.converters.Converter.create_children` -- :py:meth:`~caoscrawler.converters.Converter.match` -- :py:meth:`~caoscrawler.converters.Converter.typecheck` - - -Example -======= - -In the following, we will explain the process of adding a custom converter to a yaml file using -a SourceResolver that is able to attach a source element to another entity. - -**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder". - -First we will create our package and module structure, which might be: - -.. code-block:: - - scifolder_package/ - README.md - setup.cfg - setup.py - Makefile - tox.ini - src/ - scifolder/ - __init__.py - converters/ - __init__.py - sources.py # <- the actual file containing - # the converter class - doc/ - unittests/ - -Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.Converter`, but use :py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.Converter.match` and :py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us. -Furthermore we will customize the method :py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is, that only a fixed -number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.Converter.create_records` is recommended. -In this context it is recommended to make use of the function :func:`caoscrawler.converters.create_records` that implements creation of record objects from python dictionaries of the same structure -that would be given using a yaml definition (see next section below). - -.. code-block:: python - - import re - from caoscrawler.stores import GeneralStore, RecordStore - from caoscrawler.converters import TextElementConverter, create_records - from caoscrawler.structure_elements import StructureElement, TextElement - - - class SourceResolver(TextElementConverter): - """ - This resolver uses a source list element (e.g. from the markdown readme file) - to link sources correctly. - """ - - def __init__(self, definition: dict, name: str, - converter_registry: dict): - """ - Initialize a new directory converter. 
- """ - super().__init__(definition, name, converter_registry) - - def create_children(self, generalStore: GeneralStore, - element: StructureElement): - - # The source resolver does not create children: - - return [] - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix): - if not isinstance(element, TextElement): - raise RuntimeError() - - # This function must return a list containing tuples, each one for a modified - # property: (name_of_entity, name_of_property) - keys_modified = [] - - # This is the name of the entity where the source is going to be attached: - attach_to_scientific_activity = self.definition["scientific_activity"] - rec = records[attach_to_scientific_activity] - - # The "source" is a path to a source project, so it should have the form: - # /<Category>/<project>/<scientific_activity>/ - # obtain these information from the structure element: - val = element.value - regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' - '/(?P<project_date>.*?)_(?P<project_identifier>.*)' - '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') - - res = re.match(regexp, val) - if res is None: - raise RuntimeError("Source cannot be parsed correctly.") - - # Mapping of categories on the file system to corresponding record types in CaosDB: - cat_map = { - "SimulationData": "Simulation", - "ExperimentalData": "Experiment", - "DataAnalysis": "DataAnalysis"} - linkrt = cat_map[res.group("category")] - - keys_modified.extend(create_records(values, records, { - "Project": { - "date": res.group("project_date"), - "identifier": res.group("project_identifier"), - }, - linkrt: { - "date": res.group("date"), - "identifier": res.group("identifier"), - "project": "$Project" - }, - attach_to_scientific_activity: { - "sources": "+$" + linkrt - }}, file_path_prefix)) - - # Process the records section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - - # The create_records function must return the modified keys to make it compatible - # to the crawler functions: - return keys_modified - - -If the recommended (python) package structure is used, the package containing the converter -definition can just be installed using `pip install .` or `pip install -e .` from the -`scifolder_package` directory. - -The following yaml block will register the converter in a yaml file: - -.. code-block:: yaml - - Converters: - SourceResolver: - package: scifolder.converters.sources - converter: SourceResolver - - -Using the `create_records` API function -======================================= - -The function :func:`caoscrawler.converters.create_records` was already mentioned above and it is -the recommended way to create new records from custom converters. Let's have a look at the -function signature: - -.. code-block:: python - - def create_records(values: GeneralStore, # <- pass the current variables store here - records: RecordStore, # <- pass the current store of CaosDB records here - def_records: dict): # <- This is the actual definition of new records! - - -`def_records` is the actual definition of new records according to the yaml cfood specification -(work in progress, in the docs). Essentially you can do everything here, that you could do -in the yaml document as well, but using python source code. - -Let's have a look at a few examples: - -.. 
code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - records: - Experiment: - identifier: $dir_name - -This block will just create a new record with parent `Experiment` and one property -`identifier` with a value derived from the matching regular expression. - -Let's formulate that using `create_records`: - -.. code-block:: python - - dir_name = "directory name" - - record_def = { - "Experiment": { - "identifier": dir_name - } - } - - keys_modified = create_records(values, records, - record_def) - -The `dir_name` is set explicitely here, everything else is identical to the yaml statements. - - -The role of `keys_modified` -=========================== - -You probably have noticed already, that :func:`caoscrawler.converters.create_records` returns -`keys_modified` which is a list of tuples. Each element of `keys_modified` has two elements: - -- Element 0 is the name of the record that is modified (as used in the record store `records`). -- Element 1 is the name of the property that is modified. - -It is important, that the correct list of modified keys is returned by -:py:meth:`~caoscrawler.converters.Converter.create_records` to make the crawler process work. - -So, a sketch of a typical implementation within a custom converter could look like this: - - -.. code-block:: python - - def create_records(self, values: GeneralStore, - records: RecordStore, - element: StructureElement, - file_path_prefix: str): - - # Modify some records: - record_def = { - # ... - } - - keys_modified = create_records(values, records, - record_def) - - # You can of course do it multiple times: - keys_modified.extend(create_records(values, records, - record_def)) - - # You can also process the records section of the yaml definition: - keys_modified.extend( - super().create_records(values, records, element, file_path_prefix)) - # This essentially allows users of your converter to customize the creation of records - # by providing a custom "records" section additionally to the modifications provided - # in this implementation of the Converter. - - # Important: Return the list of modified keys! - return keys_modified - - -More complex example -==================== - -Let's have a look at a more complex examples, defining multiple records: - -.. code-block:: yaml - - DirConverter: - type: Directory - match: (?P<dir_name>.*) - records: - Project: - identifier: project_name - Experiment: - identifier: $dir_name - Project: $Project - ProjectGroup: - projects: +$Project - - -This block will create two new Records: - -- A project with a constant identifier -- An experiment with an identifier, derived from a regular expression and a reference to the new project. - -Furthermore a Record `ProjectGroup` will be edited (its initial definition is not given in the -yaml block): The project that was just created will be added as a list element to the property -`projects`. - -Let's formulate that using `create_records` (again, `dir_name` is constant here): - -.. code-block:: python - - dir_name = "directory name" - - record_def = { - "Project": { - "identifier": "project_name", - } - "Experiment": { - "identifier": dir_name, - "Project": "$Project", - } - "ProjectGroup": { - "projects": "+$Project", - } - - } - - keys_modified = create_records(values, records, - record_def) -Debugging -========= - -You can add the key `debug_match` to the definition of a Converter in order to create debugging -output for the match step. The following snippet illustrates this: - -.. 
code-block:: yaml
-
-   DirConverter:
-     type: Directory
-     match: (?P<dir_name>.*)
-     debug_match: True
-     records:
-       Project:
-         identifier: project_name
-
-
-Whenever this Converter tries to match a StructureElement, it logs what was tried to macht against
-what and what the result was.
diff --git a/src/doc/converters/cfood_definition.rst b/src/doc/converters/cfood_definition.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ea2f14b23bec04e659aa3166f089c7d274f74811
--- /dev/null
+++ b/src/doc/converters/cfood_definition.rst
@@ -0,0 +1,53 @@
+CFood definition
+++++++++++++++++
+
+Converter application to data is specified via a tree-like yml file (called ``cfood.yml``, by
+convention). The yml file specifies which Converters shall be used on which StructureElements, and
+how to treat the generated *child* StructureElements.
+
+The yaml definition may look like this:
+
+.. todo::
+
+   This is outdated, see ``cfood-schema.yml`` for the current specification of a ``cfood.yml``.
+
+.. code-block:: yaml
+
+    <NodeName>:
+        type: <ConverterName>
+        match: ".*"
+        records:
+            Experiment1:
+                parents:
+                - Experiment
+                - Blablabla
+                date: $DATUM
+                (...)
+            Experiment2:
+                parents:
+                - Experiment
+        subtree:
+            (...)
+
+The **<NodeName>** is a description of what the current block represents (e.g.
+``experiment-folder``) and is used as an identifier.
+
+**<type>** selects the converter that is going to be matched against
+the current structure element. If the structure element matches (this
+is a combination of a typecheck and a detailed match, see the
+:py:class:`~caoscrawler.converters.converters.Converter` source
+documentation for details), the converter will:
+
+- generate records (with
+  :py:meth:`~caoscrawler.converters.converters.Converter.create_records`)
+- possibly process a subtree (with
+  :py:meth:`~caoscrawler.converters.converters.Converter.create_children`)
+
+**match** *TODO*
+
+**records** is a dict of definitions that define the semantic structure
+(see details below).
+
+**subtree** makes the yaml recursive: it contains a list of new Converter
+definitions, which work on the StructureElements that are returned by the
+current Converter.
diff --git a/src/doc/converters/custom_converters.rst b/src/doc/converters/custom_converters.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2738d66c483148fdecb9b189edac45e5b9a55a8b
--- /dev/null
+++ b/src/doc/converters/custom_converters.rst
@@ -0,0 +1,344 @@
+Custom Converters
++++++++++++++++++
+
+As mentioned before, it is possible to create custom converters.
+These custom converters can be used to integrate arbitrary data extraction and ETL capabilities
+into the LinkAhead crawler and make these extensions available to any yaml specification.
+
+Tell the crawler about a custom converter
+=========================================
+
+To use a custom converter, it must be defined in the ``Converters`` section of the CFood yaml file.
+The basic syntax for adding a custom converter to a definition file is:
+
+.. code-block:: yaml
+
+    Converters:
+      <NameOfTheConverterInYamlFile>:
+        package: <python>.<module>.<name>
+        converter: <PythonClassName>
+
+The Converters section can be either put into the first or the second
+document of the cfood yaml file. It can also be part of a
+single-document yaml cfood file. Please refer to :doc:`the cfood
+documentation<../cfood>` for more details.
+
+Details:
+
+- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file.
+- **<python>.<module>.<name>**: The name of the module where the converter class resides.
+- **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.converters.Converter`.
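+
+For example, a hypothetical converter class ``MyTableConverter`` defined in the
+module ``mypackage.converters`` (both names are purely illustrative) would be
+registered like this:
+
+.. code-block:: yaml
+
+    Converters:
+      MyTable:
+        package: mypackage.converters
+        converter: MyTableConverter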
+
+Implementing a custom converter
+===============================
+
+Converters inherit from the :py:class:`~caoscrawler.converters.converters.Converter` class.
+
+The following methods are abstract and need to be overwritten by your custom converter to make it work:
+
+- :py:meth:`~caoscrawler.converters.converters.Converter.create_children`:
+  Return a list of child StructureElement objects.
+- :py:meth:`~caoscrawler.converters.converters.Converter.match`
+- :py:meth:`~caoscrawler.converters.converters.Converter.typecheck`
+
+
+Example
+=======
+
+In the following, we will explain the process of adding a custom converter to a yaml file using
+a SourceResolver that is able to attach a source element to another entity.
+
+**Note**: This example might become a standard crawler soon, as part of the scifolder specification. See https://doi.org/10.3390/data5020043 for details. In this documentation example we will, therefore, add it to a package called "scifolder".
+
+First we will create our package and module structure, which might be:
+
+.. code-block::
+
+    scifolder_package/
+      README.md
+      setup.cfg
+      setup.py
+      Makefile
+      tox.ini
+      src/
+        scifolder/
+          __init__.py
+          converters/
+            __init__.py
+            sources.py  # <- the actual file containing
+                        #    the converter class
+      doc/
+      unittests/
+
+Now we need to create a class called "SourceResolver" in the file "sources.py". In this - more advanced - example, we will not inherit our converter directly from :py:class:`~caoscrawler.converters.converters.Converter`, but use :py:class:`~caoscrawler.converters.converters.TextElementConverter`. The latter already implements :py:meth:`~caoscrawler.converters.converters.Converter.match` and :py:meth:`~caoscrawler.converters.converters.Converter.typecheck`, so only an implementation for :py:meth:`~caoscrawler.converters.converters.Converter.create_children` has to be provided by us.
+Furthermore we will customize the method :py:meth:`~caoscrawler.converters.converters.Converter.create_records` that allows us to specify a more complex record generation procedure than provided in the standard implementation. One specific limitation of the standard implementation is that only a fixed
+number of records can be generated by the yaml definition. So for any applications - like here - that require an arbitrary number of records to be created, a customized implementation of :py:meth:`~caoscrawler.converters.converters.Converter.create_records` is recommended.
+In this context it is recommended to make use of the function :func:`caoscrawler.converters.converters.create_records` that implements creation of record objects from python dictionaries of the same structure
+that would be given using a yaml definition (see next section below).
+
+.. code-block:: python
+
+    import re
+    from caoscrawler.stores import GeneralStore, RecordStore
+    from caoscrawler.converters import TextElementConverter, create_records
+    from caoscrawler.structure_elements import StructureElement, TextElement
+
+
+    class SourceResolver(TextElementConverter):
+        """
+        This resolver uses a source list element (e.g. from the markdown readme file)
+        to link sources correctly.
+        """
+
+        def __init__(self, definition: dict, name: str,
+                     converter_registry: dict):
+            """
+            Initialize a new SourceResolver converter.
+            """
+            super().__init__(definition, name, converter_registry)
+
+        def create_children(self, generalStore: GeneralStore,
+                            element: StructureElement):
+
+            # The source resolver does not create children:
+            return []
+
+        def create_records(self, values: GeneralStore,
+                           records: RecordStore,
+                           element: StructureElement,
+                           file_path_prefix):
+            if not isinstance(element, TextElement):
+                raise RuntimeError("This converter can only be used on TextElements.")
+
+            # This function must return a list containing tuples, each one for a modified
+            # property: (name_of_entity, name_of_property)
+            keys_modified = []
+
+            # This is the name of the entity where the source is going to be attached:
+            attach_to_scientific_activity = self.definition["scientific_activity"]
+            rec = records[attach_to_scientific_activity]
+
+            # The "source" is a path to a source project, so it should have the form:
+            # /<Category>/<project>/<scientific_activity>/
+            # Obtain this information from the structure element:
+            val = element.value
+            regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))'
+                      '/(?P<project_date>.*?)_(?P<project_identifier>.*)'
+                      '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/')
+
+            res = re.match(regexp, val)
+            if res is None:
+                raise RuntimeError("Source cannot be parsed correctly.")
+
+            # Mapping of categories on the file system to corresponding record types in CaosDB:
+            cat_map = {
+                "SimulationData": "Simulation",
+                "ExperimentalData": "Experiment",
+                "DataAnalysis": "DataAnalysis"}
+            linkrt = cat_map[res.group("category")]
+
+            keys_modified.extend(create_records(values, records, {
+                "Project": {
+                    "date": res.group("project_date"),
+                    "identifier": res.group("project_identifier"),
+                },
+                linkrt: {
+                    "date": res.group("date"),
+                    "identifier": res.group("identifier"),
+                    "project": "$Project"
+                },
+                attach_to_scientific_activity: {
+                    "sources": "+$" + linkrt
+                }}, file_path_prefix))
+
+            # Process the records section of the yaml definition:
+            keys_modified.extend(
+                super().create_records(values, records, element, file_path_prefix))
+
+            # The create_records function must return the modified keys to make it
+            # compatible with the crawler functions:
+            return keys_modified
+
+
+If the recommended (python) package structure is used, the package containing the converter
+definition can simply be installed using ``pip install .`` or ``pip install -e .`` from the
+``scifolder_package`` directory.
+
+The following yaml block will register the converter in a cfood file:
+
+.. code-block:: yaml
+
+    Converters:
+      SourceResolver:
+        package: scifolder.converters.sources
+        converter: SourceResolver
+
+
+Using the ``create_records`` API function
+=========================================
+
+The function :func:`caoscrawler.converters.converters.create_records` was already mentioned above
+and is the recommended way to create new records from custom converters. Let's have a look at the
+function signature:
+
+.. code-block:: python
+
+    def create_records(values: GeneralStore,  # <- pass the current variables store here
+                       records: RecordStore,  # <- pass the current store of CaosDB records here
+                       def_records: dict):    # <- This is the actual definition of new records!
+
+
+``def_records`` is the actual definition of new records according to the yaml cfood specification
+(work in progress, in the docs). Essentially, you can do everything here that you could do
+in the yaml document as well, but using python source code.
+
+Let's have a look at a few examples:
+
+.. code-block:: yaml
+
+    DirConverter:
+      type: Directory
+      match: (?P<dir_name>.*)
+      records:
+        Experiment:
+          identifier: $dir_name
+
+This block will just create a new record with parent ``Experiment`` and one property
+``identifier`` with a value derived from the matching regular expression.
+
+Let's formulate that using ``create_records``:
+
+.. code-block:: python
+
+    dir_name = "directory name"
+
+    record_def = {
+        "Experiment": {
+            "identifier": dir_name
+        }
+    }
+
+    keys_modified = create_records(values, records,
+                                   record_def)
+
+The ``dir_name`` is set explicitly here, everything else is identical to the yaml statements.
+
+
+The role of ``keys_modified``
+=============================
+
+You have probably already noticed that :func:`caoscrawler.converters.converters.create_records`
+returns ``keys_modified``, which is a list of tuples. Each element of ``keys_modified`` has two
+elements:
+
+- Element 0 is the name of the record that is modified (as used in the record store ``records``).
+- Element 1 is the name of the property that is modified.
+
+It is important that the correct list of modified keys is returned by
+:py:meth:`~caoscrawler.converters.converters.Converter.create_records` to make the crawler process work.
+
+So, a sketch of a typical implementation within a custom converter could look like this:
+
+
+.. code-block:: python
+
+    def create_records(self, values: GeneralStore,
+                       records: RecordStore,
+                       element: StructureElement,
+                       file_path_prefix: str):
+
+        # Modify some records:
+        record_def = {
+            # ...
+        }
+
+        keys_modified = create_records(values, records,
+                                       record_def)
+
+        # You can of course do it multiple times:
+        keys_modified.extend(create_records(values, records,
+                                            record_def))
+
+        # You can also process the records section of the yaml definition:
+        keys_modified.extend(
+            super().create_records(values, records, element, file_path_prefix))
+        # This essentially allows users of your converter to customize the creation of records
+        # by providing a custom "records" section additionally to the modifications provided
+        # in this implementation of the Converter.
+
+        # Important: Return the list of modified keys!
+        return keys_modified
+
+
+More complex example
+====================
+
+Let's have a look at a more complex example, defining multiple records:
+
+.. code-block:: yaml
+
+    DirConverter:
+      type: Directory
+      match: (?P<dir_name>.*)
+      records:
+        Project:
+          identifier: project_name
+        Experiment:
+          identifier: $dir_name
+          Project: $Project
+        ProjectGroup:
+          projects: +$Project
+
+
+This block will create two new Records:
+
+- A project with a constant identifier
+- An experiment with an identifier, derived from a regular expression and a reference to the new project.
+
+Furthermore, a Record ``ProjectGroup`` will be edited (its initial definition is not given in the
+yaml block): The project that was just created will be added as a list element to the property
+``projects``.
+
+Let's formulate that using ``create_records`` (again, ``dir_name`` is constant here):
+
+.. code-block:: python
+
+    dir_name = "directory name"
+
+    record_def = {
+        "Project": {
+            "identifier": "project_name",
+        },
+        "Experiment": {
+            "identifier": dir_name,
+            "Project": "$Project",
+        },
+        "ProjectGroup": {
+            "projects": "+$Project",
+        },
+    }
+
+    keys_modified = create_records(values, records,
+                                   record_def)
+
+Debugging
+=========
+
+You can add the key ``debug_match`` to the definition of a Converter in order to create debugging
+output for the match step. The following snippet illustrates this:
+
+.. code-block:: yaml
+
+    DirConverter:
+      type: Directory
+      match: (?P<dir_name>.*)
+      debug_match: True
+      records:
+        Project:
+          identifier: project_name
+
+
+Whenever this Converter tries to match a StructureElement, it logs what was matched against
+what and what the result was.
diff --git a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0fffc2e7de1bd23327194c6379cca94bd7c72a29
--- /dev/null
+++ b/src/doc/converters/further_converters.rst
@@ -0,0 +1,187 @@
+Further converters
+++++++++++++++++++
+
+More converters, together with cfood definitions and examples, can be found in
+the `LinkAhead Crawler Extensions Subgroup
+<https://gitlab.com/linkahead/crawler-extensions>`_ on gitlab. In the following,
+we list converters that are shipped with the crawler library itself but are not
+part of the set of standard converters and may require this library to be
+installed with additional optional dependencies.
+
+HDF5 Converters
+===============
+
+For treating `HDF5 Files
+<https://docs.hdfgroup.org/hdf5/develop/_s_p_e_c.html>`_, there are in total
+four individual converters corresponding to the internal structure of HDF5
+files: the :ref:`H5FileConverter` which opens the file itself and creates
+further structure elements from HDF5 groups, datasets, and included
+multi-dimensional arrays that are in turn treated by the
+:ref:`H5GroupConverter`, the :ref:`H5DatasetConverter`, and the
+:ref:`H5NdarrayConverter`, respectively. You need to install the LinkAhead
+crawler with its optional ``h5-crawler`` dependency to use these converters.
+
+The basic idea when crawling HDF5 files is to treat them very similarly to
+:ref:`dictionaries <DictElement Converter>` in which the attributes on root,
+group, or dataset level are essentially treated like ``BooleanElement``,
+``TextElement``, ``FloatElement``, and ``IntegerElement`` in a dictionary: They
+are appended as children and can be accessed via the ``subtree``. The file
+itself and the groups within may contain further groups and datasets, which can
+have their own attributes, subgroups, and datasets, very much like
+``DictElements`` within a dictionary. The main difference to any other
+dictionary type is the presence of multi-dimensional arrays within HDF5
+datasets. Since LinkAhead doesn't have any datatype corresponding to these, and
+since it isn't desirable to store these arrays directly within LinkAhead for
+reasons of performance and of searchability, we wrap them within a specific
+Record as explained :ref:`below <H5NdarrayConverter>`, together with more
+metadata and their internal path within the HDF5 file. Users can thus query for
+datasets and their arrays according to their metadata within LinkAhead and then
+use the internal path information to access the dataset within the file
+directly. The type of this record and the property for storing the internal path
+need to be reflected in the datamodel. Using the default names, you would need a
+datamodel like
+
+.. code-block:: yaml
+
+   H5Ndarray:
+     obligatory_properties:
+       internal_hdf5-path:
+         datatype: TEXT
+
+although the names of both property and record type can be configured within the
+cfood definition.
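+Since these converters are not part of the standard set, they have to be registered in the
+``Converters`` section of your cfood definition before they can be used. The same block, with the
+package paths valid from crawler version 0.9.0 onwards, also appears in the upgrading notes:
+
+.. code-block:: yaml
+
+   Converters:
+     H5File:
+       converter: H5FileConverter
+       package: caoscrawler.converters.hdf5_converter
+     H5Group:
+       converter: H5GroupConverter
+       package: caoscrawler.converters.hdf5_converter
+     H5Dataset:
+       converter: H5DatasetConverter
+       package: caoscrawler.converters.hdf5_converter
+     H5Ndarray:
+       converter: H5NdarrayConverter
+       package: caoscrawler.converters.hdf5_converter
+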
+
+A simple example of a cfood definition for HDF5 files can be found in the `unit
+tests
+<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/h5_cfood.yml?ref_type=heads>`_
+and shows how the individual converters are used in order to crawl a `simple
+example file
+<https://gitlab.com/linkahead/linkahead-crawler/-/blob/main/unittests/hdf5_dummy_file.hdf5?ref_type=heads>`_
+containing groups, subgroups, and datasets, together with their respective
+attributes.
+
+H5FileConverter
+---------------
+
+This is an extension of the
+:py:class:`~caoscrawler.converters.converters.SimpleFileConverter`
+class. It opens the HDF5 file and creates children for any contained
+group or dataset. Additionally, the root-level attributes of the HDF5
+file are accessible as children.
+
+H5GroupConverter
+----------------
+
+This is an extension of the
+:py:class:`~caoscrawler.converters.converters.DictElementConverter`
+class. Children are created for all subgroups and datasets in this
+HDF5 group. Additionally, the group-level attributes are accessible as
+children.
+
+H5DatasetConverter
+------------------
+
+This is an extension of the
+:py:class:`~caoscrawler.converters.converters.DictElementConverter`
+class. Most importantly, it stores the array data of the HDF5 dataset in an
+:py:class:`~caoscrawler.converters.hdf5_converter.H5NdarrayElement`,
+which is added to its children together with the dataset attributes.
+
+H5NdarrayConverter
+------------------
+
+This converter creates a wrapper record for the contained dataset. The name of
+this record needs to be specified in the cfood definition of this converter via
+the ``recordname`` option. The RecordType of this record can be configured with
+the ``array_recordtype_name`` option and defaults to ``H5Ndarray``. Via the
+given ``recordname``, this record can be used within the cfood. Most
+importantly, this record stores the internal path of this array within the HDF5
+file in a text property, the name of which can be configured with the
+``internal_path_property_name`` option which defaults to ``internal_hdf5_path``.
+
+
+
+ROCrateConverter
+================
+
+The ROCrateConverter unpacks ro-crate files and creates one instance of the
+``ROCrateEntity`` structure element for each contained object. Currently, only
+zipped ro-crate files are supported. The created ROCrateEntities wrap a
+``rocrate.model.entity.Entity`` with a path to the folder the ROCrate data
+is saved in. They are appended as children and can then be accessed via the
+subtree and treated using the :ref:`ROCrateEntityConverter`.
+
+To use the ROCrateConverter, you need to install the LinkAhead crawler with its
+optional ``rocrate`` dependency.
+
+ELNFileConverter
+----------------
+
+As .eln files are zipped ro-crate files, the ELNFileConverter works analogously
+to the ROCrateConverter and also creates ROCrateEntities for contained objects.
+
+ROCrateEntityConverter
+----------------------
+
+The ROCrateEntityConverter unpacks the ``rocrate.model.entity.Entity`` wrapped
+within a ROCrateEntity and appends all properties, contained files, and parts
+as children. Properties are converted to a basic element matching their value
+(``BooleanElement``, ``IntegerElement``, etc.) and can be matched using
+``match_properties``. Each ``rocrate.model.file.File`` is converted to a crawler
+File object, which can be matched with SimpleFile. Each subpart of the
+ROCrateEntity is in turn converted to a ROCrateEntity, which can then again be
+treated using this converter.
+
+The ``match_entity_type`` keyword can be used to match a ROCrateEntity using its
+entity_type. With the ``match_properties`` keyword, properties of a ROCrateEntity
+can be either matched or extracted, as seen in the cfood example below:
+
+* With ``match_properties: "@id": ro-crate-metadata.json``, the ROCrateEntities
+  can be filtered to only match the metadata json files.
+* With ``match_properties: dateCreated: (?P<dateCreated>.*)``, the ``dateCreated``
+  entry of that metadata json file is extracted and accessible through the
+  ``dateCreated`` variable.
+* The example could then be extended to use any other entry present in the metadata
+  json to filter the results, or to insert the extracted information into generated records.
+
+Example cfood
+-------------
+
+One short cfood to generate records for each .eln file in a directory and
+their metadata files could be:
+
+.. code-block:: yaml
+
+    ---
+    metadata:
+      crawler-version: 0.9.0
+    ---
+    Converters:
+      ELNFile:
+        converter: ELNFileConverter
+        package: caoscrawler.converters.rocrate
+      ROCrateEntity:
+        converter: ROCrateEntityConverter
+        package: caoscrawler.converters.rocrate
+
+    ParentDirectory:
+      type: Directory
+      match: (.*)
+      subtree:
+        ELNFile:
+          type: ELNFile
+          match: (?P<filename>.*)\.eln
+          records:
+            ELNExampleRecord:
+              filename: $filename
+          subtree:
+            ROCrateEntity:
+              type: ROCrateEntity
+              match_properties:
+                "@id": ro-crate-metadata.json
+                dateCreated: (?P<dateCreated>.*)
+              records:
+                MDExampleRecord:
+                  parent: $ELNFile
+                  filename: ro-crate-metadata.json
+                  time: $dateCreated
+
diff --git a/src/doc/converters/index.rst b/src/doc/converters/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..38fc11335a2640f645e9b4e093690d1ffa7cd07f
--- /dev/null
+++ b/src/doc/converters/index.rst
@@ -0,0 +1,29 @@
+Converters
+))))))))))
+
+Converters treat a StructureElement and during this process create a number of new
+StructureElements: the children of the initially treated StructureElement. Thus, by treating
+existing StructureElements, Converters create a tree of StructureElements.
+
+.. image:: ../img/converter.png
+   :height: 170
+   :alt: Converters are Python classes that tell the crawler how to
+         interpret StructureElements.
+
+Each StructureElement in the tree has a set of properties, organized as
+key-value pairs.
+Some of those properties are specified by the type of StructureElement. For example,
+a file could have the file name as property: ``'filename': myfile.dat``.
+Converters may define additional functions that create further values. For
+example, a regular expression could be used to get a date from a file name.
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Contents:
+
+   CFood definition<cfood_definition>
+   Standard converters<standard_converters>
+   Further converters<further_converters>
+   Custom converters<custom_converters>
+   Transform functions<transform_functions>
+
diff --git a/src/doc/converters/standard_converters.rst b/src/doc/converters/standard_converters.rst
new file mode 100644
index 0000000000000000000000000000000000000000..96b089f2252e44e764ae35ceea560bcdf06858c6
--- /dev/null
+++ b/src/doc/converters/standard_converters.rst
@@ -0,0 +1,404 @@
+Standard Converters
++++++++++++++++++++
+
+These are the standard converters that exist in a default installation. For writing and applying
+*custom converters*, see :ref:`below <Custom Converters>`.
+
+Directory Converter
+===================
+
+The Directory Converter creates StructureElements for each File and
+Directory inside the current Directory. You can match a regular
+expression against the directory name using the 'match' key.
+
+With the optional ``match_newer_than_file`` key, a path to a file
+containing only an ISO-formatted datetime string can be specified. If
+this is done, a directory will only match if it contains at least one
+file or directory that has been modified since that datetime. If the
+file doesn't exist or contains an invalid string, the directory will
+be matched regardless of the modification times.
+
+Simple File Converter
+=====================
+The Simple File Converter does not create any children and is usually used if
+a file shall be used as it is, i.e., inserted and referenced by other entities.
+
+Markdown File Converter
+=======================
+Reads a YAML header from Markdown files (if such a header exists) and creates
+child elements according to the structure of the header.
+
+DictElement Converter
+=====================
+
+DictElement → StructureElement
+
+Creates a child StructureElement for each key in the dictionary.
+
+Typical Subtree converters
+--------------------------
+The following StructureElement types are typically created by the DictElement converter:
+
+- BooleanElement
+- FloatElement
+- TextElement
+- IntegerElement
+- ListElement
+- DictElement
+
+Note that you may use ``TextElement`` for anything that exists in a text format that can be
+interpreted by the server, such as date and datetime strings in ISO-8601 format.
+
+match_properties
+----------------
+
+``match_properties`` is a dictionary of key-regexp and value-regexp pairs and can be used to
+match direct properties of a ``DictElement``. Each key matches
+a property name and the corresponding value matches its property value.
+
+Example:
+........
+
+.. code-block:: json
+
+   {
+     "@type": "PropertyValue",
+     "additionalType": "str",
+     "propertyID": "testextra",
+     "value": "hi"
+   }
+
+When applied to a dict loaded from the above json, a ``DictElementConverter`` with the following
+definition:
+
+.. code-block:: yaml
+
+   Example:
+     type: DictElement
+     match_properties:
+       additionalType: (?P<addt>.*)$
+       property(.*): (?P<propid>.*)$
+
+will match and create two variables:
+
+- ``addt = "str"``
+- ``propid = "testextra"``
+
+
+Scalar Value Converters
+=======================
+``BooleanElementConverter``, ``FloatElementConverter``, ``TextElementConverter``, and
+``IntegerElementConverter`` behave very similarly.
+
+These converters expect ``match_name`` and ``match_value`` in their definition,
+which allow matching the key and the value, respectively.
+
+Note that there are defaults for accepting other types. For example, the
+FloatElementConverter also accepts IntegerElements. The default
+behavior can be adjusted with the fields ``accept_text``, ``accept_int``,
+``accept_float``, and ``accept_bool``.
+
+The following denotes what kind of StructureElements are accepted by default
+(they are defined in ``src/caoscrawler/converters/converters.py``):
+
+- BooleanElementConverter: bool, int
+- FloatElementConverter: int, float
+- TextElementConverter: text, bool, int, float
+- IntegerElementConverter: int
+- ListElementConverter: list
+- DictElementConverter: dict
+
+YAMLFileConverter
+=================
+
+A specialized Dict Converter for yaml files: Yaml files are opened and the contents are
+converted into dictionaries that can be further converted using the typical subtree converters
+of the dict converter.
+
+**WARNING**: Currently unfinished implementation.
+
+JSONFileConverter
+=================
+
+Analogous to the YAMLFileConverter: JSON files are opened and their contents are converted into
+dictionaries that can be further converted using the typical subtree converters of the dict
+converter.
+
+
+TableConverter
+==============
+
+Table → DictElement
+
+A generic converter (abstract) for files containing tables.
+Currently, there are two specialized implementations for XLSX files and CSV files.
+
+All table converters generate a subtree of dicts, which in turn can be converted with
+DictElementConverters: For each row in the table, the TableConverter generates a DictElement
+(structure element). The key of the element is the row number. The value of the element is a dict
+containing the mapping of column names to values of the respective cell.
+
+Example:
+
+.. code-block:: yaml
+
+   subtree:
+     TABLE:  # Any name for the table as a whole
+       type: CSVTableConverter
+       match: ^test_table.csv$
+       records:
+         (...)  # Records edited for the whole table file
+       subtree:
+         ROW:  # Any name for a data row in the table
+           type: DictElement
+           match_name: .*
+           match_value: .*
+           records:
+             (...)  # Records edited for each row
+           subtree:
+             COLUMN:  # Any name for a specific type of column in the table
+               type: FloatElement
+               match_name: measurement  # Name of the column in the table file
+               match_value: (?P<column_value>.*)
+               records:
+                 (...)  # Records edited for each cell
+
+
+XLSXTableConverter
+==================
+
+XLSX File → DictElement
+
+CSVTableConverter
+=================
+
+CSV File → DictElement
+
+PropertiesFromDictConverter
+===========================
+
+The :py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` is
+a specialization of the
+:py:class:`~caoscrawler.converters.converters.DictElementConverter` and offers
+all its functionality. It is meant to operate on dictionaries (e.g.,
+from reading in a json or a table file), the keys of which correspond
+closely to properties in a LinkAhead datamodel. This is especially
+handy in cases where properties may be added to the data model, or where
+data sources are not yet known when writing the cfood definition.
+
+The converter definition of the
+:py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` has an
+additional required entry ``record_from_dict`` which specifies the
+Record to which the properties extracted from the dict are
+attached. This Record is identified by its ``variable_name`` by which it can
+be referred to further down the subtree. You can also use the name of
+a Record that was specified earlier in the CFood definition in order
+to extend it by the properties extracted from a dict. Let's have a
+look at a simple example. A CFood definition
+
+.. code-block:: yaml
+
+   PropertiesFromDictElement:
+     type: PropertiesFromDictElement
+     match: ".*"
+     record_from_dict:
+       variable_name: MyRec
+       parents:
+       - MyType1
+       - MyType2
+
+applied to a dictionary
+
+.. code-block:: json
+
+   {
+     "name": "New name",
+     "a": 5,
+     "b": ["a", "b", "c"],
+     "author": {
+       "full_name": "Silvia Scientist"
+     }
+   }
+
+will create a Record ``New name`` with parents ``MyType1`` and
+``MyType2``. It has a scalar property ``a`` with value 5, a list
+property ``b`` with values "a", "b" and "c", and an ``author``
+property which references an ``author`` with a ``full_name`` property
+with value "Silvia Scientist":
+
+.. image:: ../img/properties-from-dict-records-author.png
+   :height: 210
+   :alt: A Record "New Name" and an author Record with full_name
+         "Silvia Scientist" are generated and filled automatically.
+
+Note how the different dictionary keys are handled differently
+depending on their types: scalar and list values are understood
+automatically, and a dictionary-valued entry like ``author`` is
+translated into a reference to an ``author`` Record automatically.
+
+You can further specify how references are treated with an optional
+``references`` key in ``record_from_dict``. Let's assume that in the
+above example, we have an ``author`` **Property** with datatype
+``Person`` in our data model. We could add this information by
+extending the above example definition by
+
+
+.. code-block:: yaml
+
+   PropertiesFromDictElement:
+     type: PropertiesFromDictElement
+     match: ".*"
+     record_from_dict:
+       variable_name: MyRec
+       parents:
+       - MyType1
+       - MyType2
+       references:
+         author:
+           parents:
+           - Person
+
+so that now, a ``Person`` record with a ``full_name`` property with
+value "Silvia Scientist" is created as the value of the ``author``
+property:
+
+.. image:: ../img/properties-from-dict-records-person.png
+   :height: 200
+   :alt: A new Person Record is created which is referenced as an
+         author.
+
+For the time being, only the parents of the referenced record can be
+set via this option. More complicated treatments can be implemented
+via the ``referenced_record_callback`` (see below).
+
+Properties can be blacklisted with the ``properties_blacklist``
+keyword, i.e., all keys listed under ``properties_blacklist`` will be
+excluded from automated treatment. Since the
+:py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter` has
+all the functionality of the
+:py:class:`~caoscrawler.converters.converters.DictElementConverter`, individual
+properties can still be used in a subtree. Together with
+``properties_blacklist`` this can be used to add custom treatment to
+specific properties by blacklisting them in ``record_from_dict`` and
+then treating them in the subtree the same way as you would in the
+standard
+:py:class:`~caoscrawler.converters.converters.DictElementConverter`. Note that
+the blacklisted keys are excluded on **all** levels of the dictionary,
+i.e., also when they occur in a referenced entity.
+
+For further customization, the
+:py:class:`~caoscrawler.converters.converters.PropertiesFromDictConverter`
+can be used as a basis for :ref:`custom converters<Custom Converters>`
+which can make use of its ``referenced_record_callback`` argument. The
+``referenced_record_callback`` can be a callable object which takes
+exactly a Record as an argument and needs to return that Record after
+doing whatever custom treatment is needed. Additionally, it is given
+the ``RecordStore`` and the ``GeneralStore`` in order to be able to
+access the records and values that have already been defined from
+within ``referenced_record_callback``. Such a function might look like
+the following:
+
+.. code-block:: python
+
+   def my_callback(rec: db.Record, records: RecordStore, values: GeneralStore):
+       # do something with rec, possibly using other records or values from the stores...
+       rec.description = "This was updated in a callback"
+       return rec
+
+It is applied to all Records that are created from the dictionary and
+it can be used to, e.g., transform values of some properties, or add
+special treatment to all Records of a specific
+type. ``referenced_record_callback`` is applied **after** the
+properties from the dictionary have been applied as explained above.
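+To illustrate the blacklisting described above, the following sketch excludes the ``author`` key
+from the automated treatment and instead handles it explicitly in the ``subtree``, just as one
+would with a standard DictElementConverter. The names (``MyRec``, ``MyType1``, ``AuthorElement``)
+and the exact subtree treatment are hypothetical and only serve as an illustration:
+
+.. code-block:: yaml
+
+   PropertiesFromDictElement:
+     type: PropertiesFromDictElement
+     match: ".*"
+     record_from_dict:
+       variable_name: MyRec
+       parents:
+       - MyType1
+       properties_blacklist:
+       - author
+     subtree:
+       AuthorElement:
+         type: DictElement
+         match_name: author
+         records:
+           (...)
+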
+
+XML Converters
+==============
+
+There are the following converters for XML content:
+
+
+XMLFileConverter
+----------------
+
+This is a converter that loads an XML file and creates an XMLElement containing the
+root element of the XML tree. It can be matched in the subtree using the XMLTagConverter.
+
+XMLTagConverter
+---------------
+
+The XMLTagConverter is a generic converter for XMLElements with the following main features:
+
+- It allows matching a combination of tag name, attribute names and text contents using the keys:
+
+  - ``match_tag``: regexp, default empty string
+  - ``match_attrib``: dictionary of key-regexp and value-regexp
+    pairs. Each key matches an attribute name and the corresponding
+    value matches its attribute value.
+  - ``match_text``: regexp, default empty string
+- It allows traversing the tree using XPath (using Python lxml's xpath functions):
+
+  - The key ``xpath`` is used to set the xpath expression and has a
+    default of ``child::*``. The default generates just the list of
+    subnodes of the current node. The result of the xpath expression
+    is used to generate structure elements as children. It furthermore
+    uses the keys ``tags_as_children``, ``attribs_as_children`` and
+    ``text_as_children`` to decide which information from the found
+    nodes will be used as children:
+
+    - ``tags_as_children``: (default ``true``) For each xml tag element
+      found by the xpath expression, generate one XMLTag structure
+      element. Its name is the full path to the tag using the function
+      ``getelementpath`` from ``lxml``.
+    - ``attribs_as_children``: (default ``false``) For each xml tag element
+      found by the xpath expression, generate one XMLAttributeNode
+      structure element for each of its attributes. The name of the
+      respective attribute node has the form:
+      ``<full path of the tag> @ <name of the attribute>``.
+      **Please note:** Currently, there is no converter implemented that
+      can match XMLAttributeNodes.
+    - ``text_as_children``: (default ``false``) For each xml tag element
+      found by the xpath expression, generate one XMLTextNode structure
+      element containing the text content of the tag element. Note that
+      in case of multiple text elements, only the first one is
+      added. The name of the respective text node has the form
+      ``<full path of the tag> /text()``, with the full path again
+      obtained using the function ``getelementpath`` from ``lxml``.
+      **Please note:** Currently, there is no converter implemented that
+      can match XMLTextNodes.
+
+Namespaces
+..........
+
+The default is to take the namespace map from the current node and use
+it in xpath queries. Because default namespaces cannot be handled by
+xpath, it is possible to remap the default namespace using the key
+``default_namespace``. The key ``nsmap`` can be used to define
+additional nsmap entries.
+
+XMLTextNodeConverter
+--------------------
+
+In the future, this converter can be used to match XMLTextNodes that
+are generated by the XMLTagConverter.
+
+
+ZipFileConverter
+================
+
+This converter opens zip files, unzips them into a temporary directory and
+exposes their contents as File structure elements.
+
+Usage Example:
+--------------
+
+.. code-block:: yaml
+
+   ExampleZipFile:
+     type: ZipFile
+     match: example\.zip$
+     subtree:
+       DirInsideZip:
+         type: Directory
+         match: experiments$
+       FileInsideZip:
+         type: File
+         match: description.odt$
+
+This converter will match and open files called ``example.zip``. If
+the file contains a directory called ``experiments``, it will be
+processed further by the respective converter in the subtree. The same
+is true for a file called ``description.odt``.
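+As a sketch of how the XML converters described above work together, the following cfood snippet
+matches an XML file and one tag within it. The converter type names (``XMLFile``, ``XMLTag``)
+follow the usual convention of dropping the ``Converter`` suffix, and the tag and attribute names
+are assumptions for illustration only:
+
+.. code-block:: yaml
+
+   ExampleXMLFile:
+     type: XMLFile
+     match: .*\.xml$
+     subtree:
+       ExperimentTag:
+         type: XMLTag
+         match_tag: experiment
+         match_attrib:
+           name: (?P<experiment_name>.*)
+         xpath: child::*
+         tags_as_children: true
+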
diff --git a/src/doc/converters/transform_functions.rst b/src/doc/converters/transform_functions.rst
new file mode 100644
index 0000000000000000000000000000000000000000..35c11093714f59cf3139a2544b5eae2f5a9c17f2
--- /dev/null
+++ b/src/doc/converters/transform_functions.rst
@@ -0,0 +1,157 @@
+Transform Functions
++++++++++++++++++++
+Often the situation arises that you cannot use a value as it is found. Maybe a value should be
+increased by an offset or a string should be split into a list of pieces. In order to allow such
+simple conversions, transform functions can be named in the converter definition; they are then
+applied to the respective variables when the converter is executed.
+
+.. code-block:: yaml
+
+   <NodeName>:
+     type: <ConverterName>
+     match: ".*"
+     transform:
+       <TransformNodeName>:
+         in: $<in_var_name>
+         out: $<out_var_name>
+         functions:
+         - <func_name>:  # name of the function to be applied
+             <func_arg1>: <func_arg1_value>  # key value pairs that are passed as parameters
+             <func_arg2>: <func_arg2_value>
+             # ...
+
+An example that splits the variable ``a`` and puts the generated list in ``b`` is the following:
+
+.. code-block:: yaml
+
+   Experiment:
+     type: Dict
+     match: ".*"
+     transform:
+       param_split:
+         in: $a
+         out: $b
+         functions:
+         - split:         # split is a function that is defined by default
+             marker: "|"  # its only parameter is the marker that is used to split the string
+     records:
+       Report:
+         tags: $b
+
+This splits the string in ``$a`` and stores the resulting list in ``$b``. Here, this is used to
+add a list-valued property to the Report Record. Note that from LinkAhead Crawler 0.11.0 onwards,
+the value of ``marker`` in the above example can also be read in from a variable in the usual
+``$`` notation:
+
+.. code-block:: yaml
+
+   # ... variable ``separator`` is defined somewhere above this part, e.g.,
+   # by reading a config file.
+   Experiment:
+     type: Dict
+     match: ".*"
+     transform:
+       param_split:
+         in: $a
+         out: $b
+         functions:
+         - split:
+             marker: $separator  # Now the separator is read in from a
+                                 # variable, so we can, e.g., change from
+                                 # '|' to ';' without changing the cfood
+                                 # definition.
+     records:
+       Report:
+         tags: $b
+
+
+
+There are a number of transform functions that are defined by default (see
+``src/caoscrawler/default_transformers.yml``). You can define custom transform functions by adding
+them to the cfood definition (see :doc:`CFood Documentation<../cfood>`).
+
+
+Custom Transformers
+===================
+
+Custom transformers are basically python functions with a special signature. They need to
+be registered in the cfood definition in order to be available during the scanning process.
+
+Let's assume we want to implement a transformer that replaces all occurrences of single letters
+in the value of a variable with a different letter each. So passing "abc" as ``in_letters`` and
+"xyz" as ``out_letters`` would result in a value of "scan started" being replaced by
+"szxn stxrted". We could implement this in python using the following code:
+
+.. code-block:: python
+
+    from typing import Any
+
+
+    def replace_letters(in_value: Any, in_parameters: dict) -> Any:
+        """
+        Replace letters in variables.
+        """
+
+        # The arguments to the transformer (as given by the definition in the cfood)
+        # are contained in `in_parameters`. We need to make sure they are set,
+        # and raise an error otherwise:
+
+        if "in_letters" not in in_parameters:
+            raise RuntimeError("Parameter `in_letters` missing.")
+
+        if "out_letters" not in in_parameters:
+            raise RuntimeError("Parameter `out_letters` missing.")
+
+        l_in = in_parameters["in_letters"]
+        l_out = in_parameters["out_letters"]
+
+        if len(l_in) != len(l_out):
+            raise RuntimeError("`in_letters` and `out_letters` must have the same length.")
+
+        for l1, l2 in zip(l_in, l_out):
+            in_value = in_value.replace(l1, l2)
+
+        return in_value
+
+
+This code needs to be put into a module that can be found during runtime of the crawler.
+One possibility is to install the package into the same virtual environment that is used
+to run the crawler.
+
+In the cfood, the transformer needs to be registered:
+
+.. code-block:: yaml
+
+   ---
+   metadata:
+     crawler-version: 0.10.2
+     macros:
+   ---
+   #Converters:  # put custom converters here
+   Transformers:
+     replace_letters:  # This name will be made available in the cfood
+       function: replace_letters
+       package: utilities.replace_letters
+
+This assumes that the code for the function ``replace_letters`` resides in a file
+called ``replace_letters.py`` that is stored in a package called ``utilities``.
+
+The transformer can then be used in a converter, e.g.:
+
+
+.. code-block:: yaml
+
+   Experiment:
+     type: Dict
+     match: ".*"
+     transform:
+       replace_letters:
+         in: $a
+         out: $b
+         functions:
+         - replace_letters:  # This is the name of our custom transformer
+             in_letters: "abc"
+             out_letters: "xyz"
+     records:
+       Report:
+         tags: $b
diff --git a/src/doc/getting_started/INSTALL.md b/src/doc/getting_started/INSTALL.md
new file mode 120000
index 0000000000000000000000000000000000000000..95b6037c7ab329d91e3a8ed4a2b31eba675eef62
--- /dev/null
+++ b/src/doc/getting_started/INSTALL.md
@@ -0,0 +1 @@
+../../../INSTALL.md
\ No newline at end of file
diff --git a/src/doc/getting_started/furtherreading.rst b/src/doc/getting_started/furtherreading.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8d8d3ecc4b5575f71e90e9e5a17b060a63403a07
--- /dev/null
+++ b/src/doc/getting_started/furtherreading.rst
@@ -0,0 +1,9 @@
+Further reading
+===============
+
+- A simple `documented example <https://gitlab.com/caosdb/documented-crawler-example>`_ which
+  demonstrates the crawler usage.
+- Some useful examples can be found in the `integration tests
+  <https://gitlab.com/caosdb/caosdb-crawler/-/tree/main/integrationtests>`_ (and to a certain extent
+  in the unit tests).
+- TODO: Information on caching
diff --git a/src/doc/getting_started/helloworld.md b/src/doc/getting_started/helloworld.md
new file mode 100644
index 0000000000000000000000000000000000000000..67fdf88974391ac6209f1010bfb4f2d883e51021
--- /dev/null
+++ b/src/doc/getting_started/helloworld.md
@@ -0,0 +1,95 @@
+# Hello World
+
+## Setting up the data model ##
+
+For this example, we need a very simple data model. You can insert it into your
+CaosDB instance by saving the following to a file called `model.yml`:
+
+```yaml
+HelloWorld:
+  recommended_properties:
+    time:
+      datatype: DATETIME
+    note:
+      datatype: TEXT
+```
+and insert the model using
+```sh
+python -m caosadvancedtools.models.parser model.yml --sync
+```
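+
+If you want to check that the model insertion worked, a quick query with the
+Python client can help (a minimal check, assuming your client connection is
+already configured):
+
+```python
+import linkahead as db
+
+# Should print the newly inserted RecordType:
+print(db.execute_query("FIND RECORDTYPE HelloWorld"))
+```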
+
+Let's first look at how the CaosDB Crawler synchronizes Records that are
+created locally with those that might already exist on the CaosDB server.
+
+For this, you need a file called `identifiables.yml` with this content:
+```yaml
+HelloWorld:
+  - name
+```
+
+## Synchronizing data ##
+
+You can do the following interactively in (I)Python, but we recommend that you
+copy the code into a script and execute it to spare yourself typing.
+
+```python
+import linkahead as db
+from datetime import datetime
+from caoscrawler import Crawler, SecurityMode
+from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
+
+
+# Create a Record that will be synced
+hello_rec = db.Record(name="My first Record")
+hello_rec.add_parent("HelloWorld")
+hello_rec.add_property(name="time", value=datetime.now().isoformat())
+
+# Create a Crawler instance that we will use for synchronization
+crawler = Crawler(securityMode=SecurityMode.UPDATE)
+# This defines how Records on the server are identified with the ones we have locally
+identifiables_definition_file = "identifiables.yml"
+ident = CaosDBIdentifiableAdapter()
+ident.load_from_yaml_definition(identifiables_definition_file)
+crawler.identifiableAdapter = ident
+
+# Here we synchronize the Record
+inserts, updates = crawler.synchronize(commit_changes=True, unique_names=True,
+                                       crawled_data=[hello_rec])
+print(f"Inserted {len(inserts)} Records")
+print(f"Updated {len(updates)} Records")
+```
+
+Now, start by executing the code. What happens? The output suggests that one
+entity was inserted. Please go to the web interface of your instance and have a
+look. You can use the query `FIND HelloWorld`. You should see a brand new
+Record with a current time stamp.
+
+So, how did this happen? In our script, we created a "HelloWorld" Record and
+gave it to the Crawler. The Crawler checks how "HelloWorld" Records are
+identified. We told the Crawler with our `identifiables.yml` that it should
+use the name. The Crawler thus checked whether a "HelloWorld" Record with our
+name exists on the server. It did not. Therefore, the Record that we provided
+was inserted on the server.
+
+## Running the synchronization again ##
+
+Now, run the script again. What happens? There is an update! This time, a
+Record with the required name existed. Thus, the "time" Property of the
+existing Record was updated.
+
+The Crawler does not touch Properties that are not present in the local data.
+Thus, if you add a "note" Property to the Record in the server (e.g. with the
+edit mode in the web interface) and run the script again, this Property is
+kept unchanged. This means that you can extend Records that were created using
+the Crawler.
+
+Note that if you change the name of the "HelloWorld" Record in the script and
+run it again, a new Record is inserted by the Crawler. This is because in the
+`identifiables.yml` we told the Crawler that it should use the *name* to check
+whether a "HelloWorld" Record already exists on the server.
+
+So far, you saw how the Crawler handles synchronization in a very simple
+scenario. In the following tutorials, you will learn what this looks like if
+there are multiple connected Records involved which may not simply be
+identified using the name. Also, we created the Record "manually" in this
+example, while the typical use case is to create it automatically from some
+files or directories. How this is done will also be shown in the following
+chapters.
diff --git a/src/doc/getting_started/index.rst b/src/doc/getting_started/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..86b34d069391b146d15599228067df2e9e41d642
--- /dev/null
+++ b/src/doc/getting_started/index.rst
@@ -0,0 +1,17 @@
+Getting Started
++++++++++++++++
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+   :hidden:
+
+   Installation<INSTALL>
+   prerequisites
+   helloworld
+   optionalfeatures
+   furtherreading
+
+This section will help you get going! From the first installation steps to the first simple crawl.
+
+Let's go!
diff --git a/src/doc/getting_started/optionalfeatures.rst b/src/doc/getting_started/optionalfeatures.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7b77646501d677b7a99799b97fae752107b11d6f
--- /dev/null
+++ b/src/doc/getting_started/optionalfeatures.rst
@@ -0,0 +1,54 @@
+Optional Features
+=================
+
+Email notifications
+-------------------
+
+The crawler can send email notifications if it made some changes or if
+new data was inserted. This is (currently) only available if the crawler
+runs as a server-side script of CaosDB. You need to add the following
+section to your ``.pycaosdb.ini``:
+
+.. code:: ini
+
+   [caoscrawler]
+   send_crawler_notifications=True
+   public_host_url=https://example.eu
+   sendmail_to_address=someone@example.de
+   sendmail_from_address=caosdb-no-reply@example.eu
+
+This feature uses the ``sendmail`` functionality of
+``caosadvancedtools``. Thus, it uses the setting
+
+.. code:: ini
+
+   [Misc]
+   sendmail = /usr/sbin/sendmail
+   #sendmail = /usr/local/bin/sendmail_to_file
+
+to decide which tool is used for sending mails (use the first one if you
+want to actually send mails; see the ``sendmail`` configuration in the
+LinkAhead docs).
+
+You can even supply the name of a custom CSS file that shall be used:
+
+.. code:: ini
+
+   [advancedtools]
+   crawler.customcssfile = theme-research.css
+
+Crawler Status Records
+----------------------
+
+The crawler can insert and update Records that contain essential
+information about the data integration process. This is (currently) only
+available if the crawler runs as a server-side script of CaosDB. To enable
+this, add the following to your ``.pycaosdb.ini``:
+
+.. code:: ini
+
+   [caoscrawler]
+   create_crawler_status_records=True
+
+You also need to add the data model needed for this, as described by
+``crawler_run_model.yml``.
diff --git a/src/doc/getting_started/prerequisites.md b/src/doc/getting_started/prerequisites.md
new file mode 100644
index 0000000000000000000000000000000000000000..06e33e922214786f1f48e9f210a297869c259379
--- /dev/null
+++ b/src/doc/getting_started/prerequisites.md
@@ -0,0 +1,30 @@
+# Prerequisites
+
+The CaosDB Crawler is a utility to create CaosDB Records from some data
+structure, e.g. files, and synchronize these Records with a CaosDB server.
+
+Thus, two prerequisites to use the CaosDB Crawler are clear:
+1. You need access to a running CaosDB instance. See the [documentation](https://docs.indiscale.com/caosdb-deploy/index.html).
+2. You need access to the data that you want to insert, i.e. the files or
+   the table from which you want to create Records.
+
+Make sure that you configured your Python client to speak
+to the correct CaosDB instance (see [configuration docs](https://docs.indiscale.com/caosdb-pylib/configuration.html)).
+
+We would like to make another prerequisite explicit that is related to the first
+point above: You need a data model.
+Typically, if you want to insert data into
+an actively used CaosDB instance, there is a data model already. However, if
+there is no data model yet, you can define one using the
+[edit mode](https://docs.indiscale.com/caosdb-webui/tutorials/edit_mode.html)
+or the [YAML format](https://docs.indiscale.com/caosdb-advanced-user-tools/yaml_interface.html).
+We will provide small data models for the examples to come.
+
+
+It is also recommended, and necessary for the following chapters, that you have
+some experience with the CaosDB Python client.
+If you don't, you can start with
+the [tutorials](https://docs.indiscale.com/caosdb-pylib/tutorials/index.html).
+
+If you want to write CaosDB Crawler configuration files (so-called CFoods), it helps if you know
+regular expressions. If regular expressions are new to you, don't worry, we keep it simple in this
+tutorial.
diff --git a/src/doc/how-to-upgrade.md b/src/doc/how-to-upgrade.md
index 931fa0cd2f2d621c89c35046d6df4ba6ac9b7a1e..8af805ea30cc85cdde88d789ee3538b2bbaef7e3 100644
--- a/src/doc/how-to-upgrade.md
+++ b/src/doc/how-to-upgrade.md
@@ -1,6 +1,68 @@
 # How to upgrade
 
+## 0.8.x to 0.9.0
+
+If you were using the optional HDF5 converter classes, you need to
+adapt the package path in your cfood definition from the **old**
+
+```yaml
+Converters:
+  H5Dataset:
+    converter: H5DatasetConverter
+    package: caoscrawler.hdf5_converter
+  H5File:
+    converter: H5FileConverter
+    package: caoscrawler.hdf5_converter
+  H5Group:
+    converter: H5GroupConverter
+    package: caoscrawler.hdf5_converter
+  H5Ndarray:
+    converter: H5NdarrayConverter
+    package: caoscrawler.hdf5_converter
+```
+
+to the **new** paths:
+
+```yaml
+Converters:
+  H5Dataset:
+    converter: H5DatasetConverter
+    package: caoscrawler.converters.hdf5_converter
+  H5File:
+    converter: H5FileConverter
+    package: caoscrawler.converters.hdf5_converter
+  H5Group:
+    converter: H5GroupConverter
+    package: caoscrawler.converters.hdf5_converter
+  H5Ndarray:
+    converter: H5NdarrayConverter
+    package: caoscrawler.converters.hdf5_converter
+```
+
+## 0.6.x to 0.7.0
+If you added Parents to Records at multiple places in the CFood, you must now
+do this at a single location because this key now overwrites previously set
+parents.
+
+## 0.5.x to 0.6.0
+[#41](https://gitlab.com/caosdb/caosdb-crawler/-/issues/41) was fixed. This
+means that if you previously used the name of Entities as an identifying
+property without adding it to the identifiable definition, you now need to
+add 'name' explicitly.
+
+## 0.4.x to 0.5.0
+The crawler was split into two modules: the scanner and the crawler. The scanner creates a Record
+structure from the data and the crawler synchronizes this with the server. Due to this change you
+should:
+- Remove the `debug` argument from the Crawler constructor. For debugging, supply a DebugTree as
+  an argument to functions like the scanner.
+- Remove the `generalStore` argument from the Crawler constructor. A store can no longer be
+  provided to the crawler.
+- `load_definition` and `initialize_converters` are now part of the scanner module
+- `crawl_directory` is replaced by `scan_directory` of the scanner module
+- `start_crawling` is replaced by `scan_structure_elements` of the scanner module
+
+## 0.2.x to 0.3.0
+DictElementConverter (old: DictConverter) now can use "match" keywords. If
+none are in the definition, the behavior is as before.
If you had "match", diff --git a/src/doc/img/converter.png b/src/doc/img/converter.png new file mode 100644 index 0000000000000000000000000000000000000000..c11517a32ceb164510a7731ff0516d19db71801a Binary files /dev/null and b/src/doc/img/converter.png differ diff --git a/src/doc/img/converter.svg b/src/doc/img/converter.svg new file mode 100644 index 0000000000000000000000000000000000000000..af32ff69cdd6c25805f929458556310b3ee34f41 --- /dev/null +++ b/src/doc/img/converter.svg @@ -0,0 +1,442 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="72.854424mm" + height="33.470383mm" + viewBox="0 0 72.854423 33.470383" + version="1.1" + id="svg13434" + inkscape:version="1.0.2 (e86c870879, 2021-01-15)" + sodipodi:docname="converter.svg" + inkscape:export-filename="/home/daniel/indiscale/software/linkahead/caosdb-crawler/src/doc/img/converter.png" + inkscape:export-xdpi="299.83078" + inkscape:export-ydpi="299.83078"> + <defs + id="defs13428"> + <marker + style="overflow:visible;" + id="marker1559" + refX="0.0" + refY="0.0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(0.6) rotate(180) translate(0,0)" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round;stroke:#000000;stroke-opacity:1;fill:#000000;fill-opacity:1" + id="path1557" /> + </marker> + <marker + style="overflow:visible" + id="marker1266" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1264" /> + </marker> + <marker + style="overflow:visible" + id="marker1218" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true" + inkscape:collect="always"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1216" /> + </marker> + <marker + style="overflow:visible" + id="Arrow2Mend" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true" + inkscape:collect="always"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path909" /> + </marker> + <marker + style="overflow:visible" + id="Arrow1Lend" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow1Lend" + inkscape:isstock="true"> + <path + transform="matrix(-0.8,0,0,-0.8,-10,0)" + 
style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + id="path885" /> + </marker> + <marker + style="overflow:visible" + id="marker1559-2" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1557-9" /> + </marker> + </defs> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="2.8" + inkscape:cx="120.68286" + inkscape:cy="23.831081" + inkscape:document-units="mm" + inkscape:current-layer="g1411" + inkscape:document-rotation="0" + showgrid="false" + inkscape:snap-global="false" + inkscape:window-width="1920" + inkscape:window-height="1135" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + lock-margins="true" + fit-margin-top="2" + fit-margin-left="2" + fit-margin-right="2" + fit-margin-bottom="2" /> + <metadata + id="metadata13431"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Ebene 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-8.1569115,-36.221295)"> + <g + id="g1411" + transform="translate(32.258972,-4.0381556)"> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 26.22787,46.991961 -0.04324,7.85981" + id="path870" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 27.268191,47.234524 6.5917,7.093847" + id="path872" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="M 17.211264,56.167197 12.543075,64.49543" + id="path874" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 19.403188,56.222309 1.865426,8.356695" + id="path876" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Mend);paint-order:markers fill stroke;stop-color:#000000" + d="m 34.590338,55.360048 c 1.051358,-1.820435 1.974353,-2.426981 3.317324,-2.31217 0.956924,0.08181 1.647835,1.289889 2.049783,2.024833" + id="path880" + sodipodi:nodetypes="cac" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 25.076267,47.179412 -6.5917,7.093847" + id="path14001" + sodipodi:nodetypes="cc" /> + <rect + style="opacity:1;fill:#25e325;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13997" + width="4.4276514" + height="3.9112766" + x="23.986937" + y="44.075451" /> + <rect + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + 
id="rect13999" + width="4.4276514" + height="3.9112766" + x="15.955473" + y="53.282654" /> + <path + sodipodi:type="star" + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14003" + sodipodi:sides="3" + sodipodi:cx="26.161613" + sodipodi:cy="55.658291" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 28.579928,57.054505 -4.836629,0 2.418314,-4.188643 z" + inkscape:transform-center-y="-0.69810795" /> + <path + style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-miterlimit:4;stroke-dasharray:0.529166, 0.529166;stroke-dashoffset:0;paint-order:markers fill stroke;stop-color:#000000" + d="M 11.791704,65.225482 9.0065326,70.566411" + id="path1467" + sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-miterlimit:4;stroke-dasharray:0.529166, 0.529166;stroke-dashoffset:0;paint-order:markers fill stroke;stop-color:#000000" + d="m 13.983628,65.280594 1.865426,5.369391" + id="path1469" + sodipodi:nodetypes="cc" /> + <circle + style="opacity:1;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14861" + cx="12.714239" + cy="65.343147" + r="2.3446827" /> + <path + sodipodi:type="star" + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14863" + sodipodi:sides="3" + sodipodi:cx="33.771244" + sodipodi:cy="55.658291" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 36.189559,57.054505 -4.83663,0 2.418315,-4.188643 z" + inkscape:transform-center-y="-0.69810795" /> + <path + sodipodi:type="star" + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="path14865" + sodipodi:sides="3" + sodipodi:cx="31.079979" + sodipodi:cy="69.469734" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 33.498294,70.865949 -4.83663,0 2.418315,-4.188644 z" + inkscape:transform-center-y="-0.69810795" /> + <circle + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="circle14867" + cx="21.223957" + cy="65.343147" + r="2.3446827" /> + <rect + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="rect14869" + width="4.4276514" + height="3.9112766" + x="36.216988" + y="66.749847" /> + <path + 
id="path82" + inkscape:connector-curvature="0" + d="m 41.515562,54.871846 -0.417806,1.20398 -1.168202,0.279888 -0.960641,-0.870246 -1.138707,0.832408 0.537693,1.179427 -0.621192,1.02818 -1.273991,0.03274 -0.218785,1.407938 1.203979,0.417805 0.279889,1.168202 -0.870245,0.960643 0.832407,1.138704 1.179427,-0.537691 1.028181,0.621191 0.03274,1.273992 1.407938,0.218785 0.417806,-1.20398 1.168202,-0.279888 0.96064,0.870244 1.138706,-0.832406 -0.537691,-1.179427 0.621192,-1.028182 1.273992,-0.03274 0.218784,-1.407938 -1.20398,-0.417805 -0.279888,-1.168203 0.870246,-0.96064 -0.83241,-1.138707 -1.179425,0.537693 -1.028181,-0.621192 -0.03274,-1.273992 z" + style="fill:#d0dbf5;fill-opacity:1;stroke:#0f2d59;stroke-width:0.284967;stroke-linecap:round;stroke-linejoin:round" + sodipodi:nodetypes="ccccccccccccccccccccccccccccccccc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#marker1218);paint-order:markers fill stroke;stop-color:#000000" + d="m 36.505382,61.212129 c -1.732593,0.460546 -2.239587,0.94846 -3.054171,1.805942 -0.855057,0.900086 -1.291029,1.914968 -1.728787,3.298907" + id="path1214" + sodipodi:nodetypes="cac" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#marker1266);paint-order:markers fill stroke;stop-color:#000000" + d="m 39.119283,63.680684 c -0.561579,0.349977 -1.171361,1.831472 -1.388934,2.468193" + id="path1262" + sodipodi:nodetypes="cc" /> + <g + id="g1624-1" + transform="translate(-24.776227,-7.0250037)"> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path14865-5-2" + sodipodi:sides="3" + sodipodi:cx="66.174721" + sodipodi:cy="64.759911" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + inkscape:transform-center-x="2.0589357e-06" + d="m 67.216088,65.361144 -2.082734,0 1.041367,-1.8037 z" /> + <path + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.15;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker1559-2);paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + d="M 66.173282,65.809672 V 67.26045" + id="path1555-0" /> + <g + id="g1807" + transform="translate(0.32991862)"> + <rect + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="rect14869-6-7" + width="1.906621" + height="1.6842613" + x="66.594307" + y="67.911743" /> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill 
stroke;stop-color:#000000" + id="path1803" + sodipodi:sides="3" + sodipodi:cx="64.271751" + sodipodi:cy="69.082977" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + inkscape:transform-center-x="2.0589357e-06" + d="m 65.313118,69.684211 -2.082733,0 1.041366,-1.803701 z" /> + </g> + </g> + </g> + <g + id="g1374" + transform="translate(-49.214304,-4.5219647)"> + <circle + style="font-variation-settings:normal;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="circle14867-5" + cx="61.878521" + cy="49.767113" + r="2.3446827" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.125046" + x="59.173122" + y="60.298515" + id="text14855"><tspan + sodipodi:role="line" + id="tspan14853" + x="59.173122" + y="60.298515" + style="font-size:3.52778px;stroke-width:0.125046">Converter</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.125046" + x="59.173122" + y="45.423546" + id="text14859"><tspan + sodipodi:role="line" + id="tspan14857" + x="59.173122" + y="45.423546" + style="font-size:3.52778px;stroke-width:0.125046">StructureElement</tspan></text> + <rect + style="fill:#25e325;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13997-6" + width="4.4276514" + height="3.9112766" + x="65.556091" + y="47.811474" /> + <path + sodipodi:type="star" + style="fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14003-7" + sodipodi:sides="3" + sodipodi:cx="73.831802" + sodipodi:cy="50.531364" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.69810795" + d="m 76.250117,51.927579 -4.836629,0 2.418314,-4.188644 z" /> + <g + id="g1649"> + <path + id="path82-3" + inkscape:connector-curvature="0" + d="m 66.342602,61.715213 -0.417806,1.20398 -1.168202,0.279888 -0.960641,-0.870246 -1.138707,0.832408 0.537693,1.179427 -0.621192,1.02818 -1.273991,0.03274 -0.218785,1.407938 1.203979,0.417805 0.279889,1.168202 -0.870245,0.960643 0.832407,1.138704 1.179427,-0.537691 1.028181,0.621191 0.03274,1.273992 1.407938,0.218785 0.417806,-1.20398 1.168202,-0.279888 0.96064,0.870244 1.138706,-0.832406 -0.537691,-1.179427 0.621192,-1.028182 1.273992,-0.03274 0.218784,-1.407938 -1.20398,-0.417805 -0.279888,-1.168203 0.870246,-0.96064 -0.83241,-1.138707 -1.179425,0.537693 -1.028181,-0.621192 -0.03274,-1.273992 z" + style="fill:#d0dbf5;fill-opacity:1;stroke:#0f2d59;stroke-width:0.284967;stroke-linecap:round;stroke-linejoin:round" + sodipodi:nodetypes="ccccccccccccccccccccccccccccccccc" /> + <g + id="g1624" + transform="translate(-0.23034383)"> + <path + sodipodi:type="star" + 
style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path14865-5" + sodipodi:sides="3" + sodipodi:cx="66.489288" + sodipodi:cy="64.759911" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + d="m 67.530655,65.361144 -2.082734,0 1.041367,-1.8037 z" + inkscape:transform-center-x="2.0589357e-06" /> + <rect + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="rect14869-6" + width="1.906621" + height="1.6842613" + x="65.535973" + y="67.911743" /> + <path + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.15;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker1559);paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + d="M 66.487845,65.809672 V 67.26045" + id="path1555" /> + </g> + </g> + </g> + </g> +</svg> diff --git a/src/doc/img/properties-from-dict-records-author.png b/src/doc/img/properties-from-dict-records-author.png new file mode 100644 index 0000000000000000000000000000000000000000..20ee9497ab5ae577c3d515f11da6294c88601fed Binary files /dev/null and b/src/doc/img/properties-from-dict-records-author.png differ diff --git a/src/doc/img/properties-from-dict-records-person.png b/src/doc/img/properties-from-dict-records-person.png new file mode 100644 index 0000000000000000000000000000000000000000..8b026056a42ff3ba203c6077a426640c864b24c1 Binary files /dev/null and b/src/doc/img/properties-from-dict-records-person.png differ diff --git a/src/doc/index.rst b/src/doc/index.rst index b4e30e4728068cabb92626cfac986ab858a0bbb6..fdb99d4d9a6cb8bf6972d7ee22489f362436bb90 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -7,14 +7,18 @@ CaosDB-Crawler Documentation :caption: Contents: :hidden: - Getting started<README_SETUP> + Getting started<getting_started/index> + Tutorials<tutorials/index> Concepts<concepts> - Converters<converters> + Converters<converters/index> CFoods (Crawler Definitions)<cfood> + CFood-Specification<cfood-specification> Macros<macros> - Tutorials<tutorials/index> How to upgrade<how-to-upgrade> API documentation<_apidoc/modules> + Related Projects<related_projects/index> + + Back to Overview <https://docs.indiscale.com/> @@ -31,7 +35,7 @@ The hierarchical structure can be for example a file tree. However it can be also something different like the contents of a JSON file or a file tree with JSON files. -This documentation helps you to :doc:`get started<README_SETUP>`, explains the most important +This documentation helps you to :doc:`get started<getting_started/index>`, explains the most important :doc:`concepts<concepts>` and offers a range of :doc:`tutorials<tutorials/index>`. 
@@ -40,4 +44,3 @@ Indices and tables
 
 * :ref:`genindex`
 * :ref:`modindex`
-* :ref:`search`
diff --git a/src/doc/macros.rst b/src/doc/macros.rst
index d3a3e9b9634a4e1d72228dd46692a824e1d5acfd..3a234973ee17791aaa2a0bd9e4b81836207a07e0 100644
--- a/src/doc/macros.rst
+++ b/src/doc/macros.rst
@@ -1,7 +1,11 @@
 Macros
 ------
 
-Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example:
+Introduction
+============
+
+Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following common
+example:
 
 .. _example_files:
 .. code-block:: yaml
@@ -24,7 +28,7 @@ Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider t
 
 This example just inserts a file called ``README.md``, contained in the folder
 ``ExperimentalData/``, into CaosDB, assigns the parent (RecordType) ``MarkdownFile`` and allows
 for later referencing this entity within the cfood. As file objects are created in the cfood
 specification using the ``records`` section with the special role ``File``, defining and using
 many files can become very cumbersome and make the cfood file difficult to read. The same
 version using cfood macros could be defined as follows:
-
+
 .. _example_files_2:
 .. code-block:: yaml
@@ -79,19 +83,49 @@ The expanded version of `ExperimentalData` will look like:
             type: SimpleFile
         type: Directory
 
-This :ref:`example<_example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`).
+This :ref:`example<example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`).
 
-Complex Example
-===============
-The following, more complex example, demonstrates the use
-of macro variable substitutions that generate crawler variable substitutions:
+Mixing macros and plain definitions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can also mix macros and plain definitions. Whenever a name cannot be resolved to a macro, a
+plain yaml node definition is used as a fallback:
+
+.. code:: yaml
+
+   ---
+   metadata:
+     macros:
+     - !defmacro
+       name: MarkdownFile
+       # ... Definition here ...
+   ---
+   ExperimentalData:
+     type: Directory
+     match: ExperimentalData
+     subtree: !macro
+       MarkdownFile:
+       - name: README
+         filename: ^README.md$
+       OtherContent: # There is no macro named "OtherContent", so this is parsed as normal content.
+         type: SimpleFile
+         match: .*txt
+         records:
+           # ... Normal content ...
+
-- `$$$nodename` will lead to a macro variable substitution of variable `$nodename` during macro expansion.
-- `$$` will be turned into `$`
-- So in the crawler cfood, the string will appear as `$value` if variable `nodename` would be set to `value` when using the macro.
+Complex example
+===============
+Let's try something more complex: what happens to multiple ``$``? This example demonstrates the use
+of `macro` variable substitutions to generate `crawler` variable substitutions:
+
+- ``$$`` will be converted into ``$``.
+- ``$$$nodename`` will retain a single ``$`` and substitute ``$nodename`` during macro expansion.
+- So in the cfood, if ``nodename: value``, the string ``$$$nodename`` will be converted to
+  ``$value``.
 
 .. _example_1:
 .. code-block:: yaml
@@ -117,7 +151,8 @@ of macro variable substitutions that generate crawler variable substitutions:
 
        Simulation:
           $recordtype: +$File
 
-The expanded version of :ref:`example<_example_1>` can be seen in :ref:`example<_example_1_expanded>`.
+The expanded version of the :ref:`example above<example_1>` (with ``nodename: Dataset``) can be seen
+:ref:`here<example_1_expanded>`:
 
 .. _example_1_expanded:
 
@@ -140,11 +175,11 @@ The expanded version of :ref:`example<_example_1>` can be seen in :ref:`example<
       type: SimpleFile
     type: Directory
 
-This :ref:`example<_example_1>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_1`).
-
+This example can also be found in the macro unit tests (see
+:func:`unittests.test_macros.test_documentation_example_1`).
 
-Using Macros Multiple Times
+Using macros multiple times
 ===========================
 
 To use the same macro multiple times in the same yaml node, lists can be used:
@@ -173,7 +208,7 @@ To use the same macro multiple times in the same yaml node, lists can be used:
 
     - {} # <- This is the third one, just using default arguments
 
-This :ref:`example<_example_multiple>` is taken from the macro unit tests (see :func:`unittests.test_macros.test_use_macro_twice`).
+This :ref:`example<example_multiple>` is taken from the macro unit tests (see :func:`unittests.test_macros.test_use_macro_twice`).
 
 The example will be expanded to:
 
@@ -192,13 +227,16 @@ The example will be expanded to:
 
         a: '5'
 
 
+Note that you need to make sure that subsequent macro calls do not
+use the same top-level key, because later definitions would overwrite
+previous ones. Here we used ``$macro_name`` to prevent that.
 
-Limitation
-----------
+Limitations
+===========
 
-Currently it is not possible to use the same macro twice in the same yaml node, but in different
-positions. Consider:
+Currently it is not possible to use the same macro twice in the same yaml node, if it occurs in
+different positions. Consider:
 
 .. _example_multiple_limitation:
 .. code-block:: yaml
@@ -223,11 +261,50 @@ positions. Consider:
 
       Other_node:
         type: test
-        test_twice: # This is NOT possible as each
-                    # dictionary element can only appear once in a yaml node.
+        test_twice: # This is NOT possible as each key
+                    # can only appear once in a yaml node.
        - macro_name: twice # <- This is the second one, with different arguments
            a: 5
        - {} # <- This is the third one, just using default arguments
 
-However, this should not be a real limitation, as the crawler is designed in a way,
-that the order of the nodes in the same level should not matter.
+This should not be a real limitation however, as the order of nodes does not matter for the crawler.
+
+
+Using macros within macro definitions
+=====================================
+
+It is possible to use other macros in macro definitions. Again, examples can be found in
+the macro unit tests (see e.g. :func:`unittests.test_macros.test_macros_in_macros`):
+
+.. _example_macros_in_macros:
+.. code-block:: yaml
+
+   ---
+   metadata:
+     crawler-version: 0.3.1
+     macros:
+     - !defmacro
+       name: one_macro
+       params:
+         a: 25
+       definition:
+         macro_sub_$a:
+           b: $a
+           another_param: 3
+     - !defmacro
+       name: test_macrodef
+       params: {}
+       definition:
+         macro_top: !macro
+           one_macro:
+           - a: 17
+           - {}
+           - a: 98
+           not_macro:
+             a: 26
+   ---
+   extroot: !macro
+     test_macrodef:
+
+TODO:
+to be continued
diff --git a/src/doc/related_projects/index.rst b/src/doc/related_projects/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1dc8f41d6b81cae202bd46e7945ca05c682e479f
--- /dev/null
+++ b/src/doc/related_projects/index.rst
@@ -0,0 +1,25 @@
+Related Projects
+++++++++++++++++
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+   :hidden:
+
+.. container:: projects
+
+   For in-depth documentation for users, administrators and developers, you may want to visit the subproject-specific documentation pages for:
+
+   :`Server <https://docs.indiscale.com/caosdb-server>`_: The Java part of the LinkAhead server.
+
+   :`MySQL backend <https://docs.indiscale.com/caosdb-mysqlbackend>`_: The MySQL/MariaDB components of the LinkAhead server.
+
+   :`WebUI <https://docs.indiscale.com/caosdb-webui>`_: The default web frontend for the LinkAhead server.
+
+   :`PyLinkAhead <https://docs.indiscale.com/caosdb-pylib>`_: The LinkAhead Python library.
+
+   :`Advanced user tools <https://docs.indiscale.com/caosdb-advanced-user-tools>`_: The advanced Python tools for LinkAhead.
+
+   :`LinkAhead <https://docs.indiscale.com/caosdb-deploy>`_: Your all-inclusive LinkAhead software package.
+
+   :`Back to Overview <https://docs.indiscale.com/>`_: LinkAhead Documentation.
diff --git a/src/doc/tutorials/example_crawler.svg b/src/doc/tutorials/example_crawler.svg
new file mode 100644
index 0000000000000000000000000000000000000000..d7af6fdaf37b550a9cc2adca6c7d7e411d3a70ad
--- /dev/null
+++ b/src/doc/tutorials/example_crawler.svg
@@ -0,0 +1,310 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:xlink="http://www.w3.org/1999/xlink"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="221.24644mm"
+   height="211.33792mm"
+   viewBox="0 0 221.24645 211.33792"
+   version="1.1"
+   id="svg348"
+   xml:space="preserve"
+   inkscape:version="1.0.2 (e86c870879, 2021-01-15)"
+   sodipodi:docname="example_crawler.svg"><metadata
+     id="metadata57"><rdf:RDF><cc:Work
+         rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><sodipodi:namedview
+     id="namedview350"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:showpageshadow="2"
+     inkscape:pageopacity="0.0"
+     inkscape:pagecheckerboard="0"
+     inkscape:deskcolor="#d1d1d1"
+     inkscape:document-units="mm"
+     showgrid="false"
+     inkscape:zoom="0.75001586"
+     inkscape:cx="430.07823"
+     inkscape:cy="364.2107"
+     inkscape:window-width="1920"
+     inkscape:window-height="1135"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     inkscape:current-layer="layer1"
+     inkscape:document-rotation="0"
+     fit-margin-right="5"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-bottom="0" /><defs
+     id="defs345" /><g
+     inkscape:label="Ebene 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-0.51905298,-0.01583024)"><image
+       width="180.18124"
+       height="209.55"
+       preserveAspectRatio="none"
+       xlink:href="data:image/png;base64,
[... base64-encoded PNG image data elided ...]
JRRmpGDn6knWmWMEGNn+yU1f03XMSzh6Xx047dt6Y9/WW6/OPU+9TOv2V1v06vMA/9uxTq9co9GQ kpLS4LVQKSkp+Pj41Fnn/fffx87OjhkzZlQrmzZtGh988AGKonDPPfewatUqXdmyZcuIjIwkJiam 2i9HXl4eGzZsIDs7GwsLC9q0acOgQYO4ePGirs4jjzwCXH0xw+nTp/H19eXAgQNGH9u8efPYunUr Go0GgPDwcLp168a8efN0dUxMTJg6dSr/+Mc/GDhwIDt27KBz584G29ZqtZiZmWFiYsLu3bspKioi ICBA70t7yJAh5OTk6MX179+fTz75BAcHB4KCgvj888+ZMmUKsbGxhIeH4+zsbPTxVVVVkZ6ezv79 +5kyZUq18uXLl2NtbU1oaGi9B9D6xPv6+vLzzz/z0EMP6Z0Rq+v4Dfk79O/1Pv30UwYOHIinp6fR 7RsbL/3bPP17jaOjI5cvX6a8vLzamShjxs+6NNf4a4gx47Mx42tERARdu3YFrp55rPe6yJvImPzT 09P55ptvGDBgAEC1K9sN/f4AWLFiBZs3b6a0tJSdO3fy/vvv89JLLzFixAi9enV9/owpr+vzUxe9 SWpycjJxcXGMGzdOr5KHh4fuZ1dXV/r27UtCQoJMUpuYSlXzz5dzMzmw7CMqtJdpfWcgKhNTrlwu NrrdwgupOHjUPUCZqP/8qFg5tKayorxandou9RqjvLwcM7O6X+l64sQJPvjggxrLPv74Y8aPH19j 2aBBg/j222+ZPn06n3zyiW4wAHR/3V7/l/1fpaenM3nyZIqLi+nWrRtqtZqCggIjjgqys7MpLCys tgyipr5KTU1lypQpDBs2zOgvHI1Gg6IolJWV8dNPPwHw22+/4erqqquzdevWOttYu3Yt77zzDkOH DqVXr168+eab1e5MrcuGDRt49913uffee7nzzjv1ytLS0pg5c2atl5EMqU/80KFDWb58OUOHDmXZ smW6s0KGjr8uLb1/rxcdHc2SJUuq3ThkLEPx0r/N07/XU6vVNU4CjBk/69Jc468hhsZnY8fX64/N xcWFsrIyo3O4mYzNX6PR6CaotWnI9wfAW2+9xdNPP03Xrl2pqKjg6NGjmJqa1li3ts+fMeV1fX7q ojsfn5GRwYYNGxg1apTBD5FKpdI7lS+aV8ynr+Pd70EGvbOIrk9OxrVTj2p1TM3MKSvKrzFe4+xG QXrKDedRUFBAeXn1yasx3N3dSUtLq7PO2rVrG3SZrV27duzcuZPAwEB69uxJdHS0rszZ2Znc3Fzd GtOajB49mtGjR7Nt2zYiIiK4//77q9WxtLSsdrYHrq4z02g0bNu2jVOnTun+O3LkiF69ZcuW8cAD D/Dqq6+ydOnSeq0B79ixo96Z3djYWDp16mR0vKenJytXriQmJobZs2dz7NixesUPHz6cY8eOkZaW Vu0M88aNGzE1NaVfv374+voyYsQIjh07hq+vL3l5eQbbrk/8V199xcCBA4mNjW3wZc+atOT+vebQ oUM8/fTTrF+/njvuuMPotusTL/3bvP1bWlpKVVVVjd/PxoyfdWmu8dcQQ+OzseOrIbWN3zdbY+Vv zPdHXd/P06dPZ+nSpTzyyCMsW7asxjp1ff6MKW/o58cErp6tWLNmDSNHjsTFxUWvQklJCevWrdN9 IeTn5/Pbb78REGDsxWRxsxVnXUD1/380FF44x+nta6vVcfDwIT81kZLsCwCUFv75Be/3wEgOf/s5 BenJ/99eBsd//LJeOZSUlODl5WXwr73a+Pj4UFhYqHcZ53q5ubl4enqyaNGiBrWvUqmYPHkyGzZs 4J133tGtnXF3d6dz587MnDkTRVFITEzUu1QFcO7cOd1flmfOnKnxpozAwECOHz9OamoqAFlZWbr9 Pvvsszz//PO6s6+ZmZl6g9Dbb7/N7t27iY2N5cEHH6z3sT3zzDO8//77lJeXc+nSJSIjI5kwYYLR 8bt376awsBC4un5qzZo1uhtR6sPNzY38fP0/hF544QXOnDmj+2/dunV06tSJM2fO6J0dURSFvn37 VrvEZGw8XB2b3Nzc6p23IS25f+HqpC40NJS1a9fWeomvtv41Nh6kf5u7f7dv3653w8z1DI2fhjTX +GuIofHZmPHVGLWN3zdbY+RvzPdHXd/PRUVFBAYGMmzYMN5//30uXLhQYxt1ff4Mld/I50ddUVHB 119/jUqlYvXq1VRWVgJXf2HHjBmDjY0Nd955J99//z1FRUWYmJjQs2dPo9c7iJuv54Q3iV+/hCNR 83Fo54vf4JGk7t+pV0fj3JauYZPZOv1pTM0tsXFyJfitBZiYmuIbPBylspLo2S9zpVSLpX0rOg03 /ksCwNzcHE9PT3x9fRt0DCqVikmTJjFnzhz+85//VCtXFKVB7f7VtUXm1w/GUVFRjB8/Hnd3d4KC gnjqqaf0FufPmzePiIgI3n77bTp27Mizzz5b7VmH3t7efPjhh9x3331YWVnRrl07fvrpJ9RqNRER EcyaNYtevXqhUqmwt7dn+vTpuuUyzzzzzA09O+/pp58mJSWFzp07Y2ZmxqxZs+p1Jik+Pp4XXniB 4uJifHx82Lp1a53LH2pTUlKCpaVlveOuMTU15YcffiAnJ6dBd5JbWlpSXGz8MhdjteT+vXz5MoMH D8bU1JThw4frLmPec8891S6T19S/9YmX/m2+/lUUhTlz5tR6udTQ+GlIc46/hhganw2Nr8aoa/y+ 2W40f2O+P+r6fra1tWXq1KnA1c/wu+++W62Ooc+fMeUNpVJu8NN3/QL3miw0cDfoNAO7N9S+ra3t LR1/O6moqKBv37588MEHN/XB+H9HOTk5PPTQQw1e29lYfH19OXPmTI1lY8eOZciQIYwaNapBbZeW lnLHHXdw6dIlLCws6h3/2muv4e3tzQsvvFDvWOlfw6R/63Yz+3fmzJmcO3euzkdz3ej4KeOvqI2h z58xn8+Guvl/JghhJDMzMzZt2sTTTz/Nvffe2+AnBdyqkpKSmDjx6vNthw4dyqOPPtok+/300091 b6G7dqWlJpMmTSI8PJxt27YxbNiweuc3a9YsRowYUe8v+Llz53L48GEyMjJ49dVX6xV7Penfmkn/ Gudm9e/x48c5ceIEX331VZ3t1DR+/v7777VeAs7Ozta7QUbGX1ETQ58/Yz+fDSVnUlt4vBC3g61b t/Lbb78xY8aMWu8sFQ0n/XtzSf8KcXPIJLWFxwshhBBC3I7kOVJCCCGEEKLFkUmqEEIIIYRocWSS KoQQQgghWhyZpAohhBBCiBZHDVffTb5z506ysrKwsLAgODi42hulEhMT2blzJ0VFRdjb2zNw4MB6 vR9WCCGEEEIIY6kVRSEmJobg4GDc3Nw4e/YsUVFRTJkyRXfneUZGBj/99BNPPPEELi4u5OTk6N6c IYQQQgghRGNTq1QqRo8erdvg4+ODi4sLly5d0k1SY2JiGDhwIC4uLgANemWhEEIIIYQQxqr2xqmq 
qiry8vJwcnLSbbt06RJ9+/blp59+IisrCw8PD+69917Mzc2bNFkhhBBCCHF7qHbjVGxsLO3bt8fB wUG3raioiF27dtG1a1dGjRpFTk4OO3fubNJEhRBCCCHE7UNvkpqcnExcXBxDhgzRq2RjY8MjjzyC q6srVlZW9OzZk8TExCZNVAghhBBC3D50k9SMjAw2bNjAqFGj0Gg0epXatGlDdna27v//Wi6EEEII IURjMgFIS0tjzZo1jBw5Undz1PV69uxJdHQ0paWlKIrCvn376NChQ5MnK4QQQgghbg/qiooKvv76 a1QqFatXr6ayshIANzc3xowZA4Cvry+FhYUsX76cyspKvLy8GDhwYHPmLYQQQgghbmEqRVGUG2mg qKiozvKFdnZ1lk8zsHtD7V97TNatGi+EEEIIcTuS16IKIYQQQogWRyapQgghhBCixZFJqhBCCCGE aHFkkiqEEEIIIVocmaQKIYQQQogWRyapQgghhBCixZFJqmh08euXsv+LWc2dhhBCCCH+xtQA6enp 7Ny5k6ysLCwsLAgODiYgIACAkpIS5s2bpxdUWVmJRqNhypQpTZ+xEEIIIYS45akVRSEmJobg4GDc 3Nw4e/YsUVFRTJkyBVtbW2xsbHjjjTf0gtasWUOnTp2aKWUhhBBCCHGrU6tUKkaPHq3b4OPjg4uL C5cuXarxbUgnTpxArVYTGBjYlHne1i7nZrLro8mETI/k4IrZZByNpVU7XwbPWApAVWUl8euWkLx3 CygKzgFd6TFuGmZWNro2kvf+zPENyyktyMXKwYm7Rz6HR/cBAJRfLubQl5+QcXQfJqZq7hz4KJ2G j0dlYmrU/suKC/h9SQQXjx/A1sUdjYs7Fhp7vfz3Rc6gIC0JEzNzWrcPpMuTL2Lr4q6rc+XKFbp3 705QUBArVqy42V0qhBBCiBZO/dcNVVVV5OXl4eTkVK2yoijs3r2bkSNHNkly4k/avGx+nfsGHQaN oPcz71B++c/XrR79LpLMhCM8NGcNZhZWHPxyDoe//ZyeE94EIGXfNg5/M5f7p/0XR29/CtKTyU0+ rYuPjZyBmbWG4Qu3UKEtYWfEC5haWHLXw/80av+xke9ham7JiMXbqdCWsPuTV/UmqUe/W4TGuS0h 4QsASDu4W28CDVBaWkpSUhLm5uaN23FCCCGE+FuqduNUbGws7du3x8HBoVrlpKQkbG1tadOmTZMk J/50OTeTTqET8Ow9CLWlFdaOzrqyhC2r6Bo2BTNLa1Cp6BQ6kbSDu3XlJzd9TdcxL+Ho7Q+AfVtv vPsNAaC8pIhzv/9Ct3+9hompGguNPUFPvMCZHeuN2n95cSGp+3fRc/wbmJqZY2nXCre7e+vFWrd2 4dLJOC6ejKOqqhKPHvdjaddKr45GoyElJYU9e/Y0Wp8JIYQQ4u9L70xqcnIycXFxjBs3rsbKiYmJ eHt7N0liQp/a0po77upebXtZYR7l2hJ+m/+23nYLWzvdz4UXUnHw8Kmx3eLMdCztWmFurdFts3Nt R3FmulH7L8pMx9K+FeYau2pl13QOnYCFxp7D38ylICMFj2796Ro2RW+iDeDo6FhrG0IIIYS4vegm qRkZGWzYsIEnn3wSjUZTY+XU1FRCQkKaLDlhmIWtA2aW1gx6ZxE2Tq411tE4u1GQnkIrzw7Vymza uFJamEeFtkR3Cb7o0nls2rgZtX8re0fKigqorCjD1MyixjoqE1P8H3wC/wefoKy4gP1LPyR28QcE v6n/1IiCggKsrKzkkr8QQgghrl7uT0tLY82aNYwcORIXF5daK+fl5dV4M5VoRioVfoMf5/clEZRf LgagtCCX3OQEXRW/B0Zy+NvPKUhPBqA4K4PjP34JgIXGnnY9BhL31WcoVZWUXy7mj9UL8Q0ZbtTu rVu70MqrA/Frl4CiUHQxleSYLXp1Dq/6nPy0pKv7s7HD3r09KIpenZKSEry8vBgwYEBDekEIIYQQ txh1RUUFX3/9NSqVitWrV1NZWQmAm5sbY8aM0VWsrKxEq9VibW3dXLmKWnQJm8yx9V+w5Y0wUKkw t9bQecQk3RpU3+DhKJWVRM9+mSulWiztW9Fp+ARdfJ/n3+Pgitmsf+4fmJia4jPgYe56+Cmj93/f yx+zb8G7rJ0UgqOXP+0HPMTlnExdeZs7O3FwxWyKsy6gVFVh59aOXhOn67Vhbm6Op6cnvr6+N9gb QgghhLgVqBTlL6e06qmoqKjO8oV2ta9VBJhmYPeG2jd0ZvfvHi+EEEIIcTuS16IKIYQQQogWRyap QgghhBCixZFJqhBCCCGEaHGqvXFKtCyyplUIIYQQtyM5kyqEEEIIIVocmaQKIYQQQogWRyapQhhJ UaqaOwUhhBDitqEGSE9PZ+fOnWRlZWFhYUFwcDABAQG6SpWVlWzZsoXk5GQURcHf35/BgwejUqma LXFhnLKifFY/fR+9Jr2F3wOjANj7+Vtknj7K8AWbmzm7v4+8c//j4JefMPjdJc2dihBCCHFbMFEU hZiYGIKDg3nllVd48MEHWb9+vd4NOwcPHqS4uJgXXniB5557jgsXLnDixIlmTFvUh4WtAyn7tgNQ daWCrDPHmjmjv5/SwvzmTkEIIYS4rahVKhWjR4/WbfDx8cHFxYVLly7p7hzXarW0a9cOU1NTTE1N 8fHxMXjXuWg5zG3sKC3IpbQgl+zE49i39SI/7ayuvKqykvh1S0jeuwUUBeeArvQYNw0zKxsA8tOS OPb9F+QknaS8pJC2XfrR65npmJpZAHA5N5N9kTMoSEvCxMyc1u0D6fLki9i6uAOwMrQzT3wZg4Wt AwBHouZzpfQy3ce+rovf9dFkQqZHcnDFbDKOxtKqnS+DZyw1mN/FE4c4uXElV8q0FGddoMfY19m/ 7CPs3DwZ9PYio45v8+uj6f3sOxz/YRkXjh1A49yW/q/MxvYOD0oLctk+YyKlhXmUlxSx/rkhANi6 eOjyA7hy5Qrdu3cnKCiIFStW3LR/SyGEEOJ2UW1NalVVFXl5eTg5Oem2de7cmbi4OP744w9KSkpI TEwkMDCwSRMVDXel9DKevUJI3b+Tc7E7cAvqq1d+9LtILp08xENz1vDYvE2YW2s4/O3nuvKiC6l4 9R3CsM/WM3zhz+SfT+J/29ddF78IjXNbQiO38tjnG/HuO0Q3ATSWNi+bX+e+QbseAwld+DP9JkcY nV/G0Vh6jH8TJ99OHPthOQ99vIrsxOOUZF80Kh4gNnIGHR8bz2PzNmLl0Jpj3y8DwNLekWGfrafn xHCcA7oQGrmV0MitehNUgNLSUpKSkjh58mS9jlsIIYQQNas2SY2NjaV9+/Y4ODjottnb2+Pq6srh w4f59NNPadu2Lfb29k2aqGi4yooyvO8bSuqBXeSmnMbZ72698oQtq+gaNgUzS2tQqegUOpG0g7t1 5R497sejW38qy8soOJ+Enaun3pIB69YuXDoZx8WTcVRVVeLR434s7VrVK8fLuZl0Cp2AZ+9BqC2t 
sHZ0Njo/e3dvHDx8sHP1pG3XfljYtcLGyZWiS+eNigfo8uRkWrcPwMLWAa8+D1CQnlyv/DUaDSkp KezZs6decUIIIYSomd7D/JOTk4mLi2PcuHF6lb799lt69uyJv78/ubm5bN68mdjYWHr37t2kyYqG s3fzorQwj7Zd+sF1N7yVFeZRri3ht/lv69W3sLXT/Xw5N5MDyz6iQnuZ1ncGojIx5crlYl1559AJ WGjsOfzNXAoyUvDo1p+uYVP0JpqGqC2tueOu7tW2G5PfNdffx3ftZ2PjTdR//ipYObSmsqLc6Nyv cXR0rHeMEEIIIWqm+2bOyMhgw4YNPPnkk2g0Gl0FrVbLpUuX8Pf3B65+EQ8aNIhNmzbJJPVvpv8r szHX2Osug8PVm6rMLK0Z9M4ibJxca4yL+fR1AoaG4dl7EABJuzeSun+XrlxlYor/g0/g/+ATlBUX sH/ph8Qu/oDgN+cBYKI2o7QwT7cmtepKhdE5G5PfzYy/xtTMnLKium+eKigowMrKCnNz8wbvRwgh hBBXmQCkpaWxZs0aRo4ciYuLi14FS0tLzM3NOX36NIqiUFVVRWJiolzu/xuyvaMdFpq//LupVPgN fpzfl0RQ/v9nR0sLcslNTtBVKc66gMrk6sqQwgvnOL19rV4Th1d9Tn5aEgAWNnbYu7cHRdGV27l5 krR7E5UVZZw/tIezMfV49JUR+d3U+P/n4OFDfmoiJdkXrrZRmKdXXlJSgpeXFwMGDKhXu0IIIYSo mbqiooKvv/4alUrF6tWrqaysBMDNzY0xY8Zw7e7/7du3s337dhRFwc3NjaFDhzZz6qKxdAmbzLH1 X7DljTBQqTC31tB5xCQcva+ePe854U3i1y/hSNR8HNr54jd4JKn7d+ri29zZiYMrZlOcdQGlqgo7 t3b0mjhdV95j7Ovsi5xB0u6NePYeRNewKfWaJBrK72bHA2ic29I1bDJbpz+NqbklNk6uBL+1ABNT UwDMzc3x9PTE19fX6DaFEEIIUTuVolx3yqsBDD2KaqFd9bWD15tmYPeG2r/2mCyJF0IIIYS4dchr UYUQQgghRIsjk1QhhBBCCNHiyCRVCCGEEEK0OGrDVcTfmaxpFUIIIcTfkZxJFUIIIYQQLY5MUoUQ QgghRIsjk1RRb/Hrl7L/i1n1jlOUqpuQjfH27t1LXFycwXoRERG8+OKL9W6/qqp5j08IIYS4lagB 0tPT2blzJ1lZWVhYWBAcHExAQICuUmFhIZs3byYrKwsrKyseeOABPD09my1p8feTd+5/HPzyEwa/ u6TZcjh69CgODg7cc889jd52fHw8r776Kjt27Gj0toUQQojbkVpRFGJiYggODsbNzY2zZ88SFRXF lClTdDfVfP/999x11108+eSTZGdn89VXXzFx4kS56UYYrbSw7vfe30xlZWW89dZbREVFUVlZyfbt 25k7dy6tWrVqtH1kZ2c3WltCCCGEAPW1155e4+Pjg4uLC5cuXcLW1pbS0lIuXLjAv/71LwCcnJzo 2rUrhw4d4v7772+uvEUTKisu4PclEVw8fgBbF3c0Lu5YaOx15flpSRz7/gtykk5SXlJI2y796PXM dEzNLCgtyGX7jImUFuZRXlLE+ueGAGDr4sHgGUsBqKqsJH7dEpL3bgFFwTmgKz3GTcPMyka3jytX rtC9e3eCgoJYsWJFvfJftmwZ+/fv58yZM5ibm7N8+XK0Wq1ukpqbm8vzzz9PdHQ07du3p3379jg6 OuriT5w4waxZs4iLiyMvL48HH3yQyMhILKEwn0sAACAASURBVC0tyczMJCQkhKysLPLz8/H29gau /h798ssvutxnzpxJVFQUiqLQr18//vvf/8ofeUIIIUQdqq1JraqqIi8vDycnJ922iooKysvLdf/v 7OxMVlZW02Qoml1s5HuYmKoZsXg7A9+cx+XcTL3yogupePUdwrDP1jN84c/kn0/if9vXAWBp78iw z9bTc2I4zgFdCI3cSmjkVt0EFeDod5FcOnmIh+as4bF5mzC31nD428/19lFaWkpSUhInT55s0DGo VCoURUGtVjNp0iTc3Nx0ZRMnTsTMzIzU1FQ2btxIenq6XmxiYiKjRo0iPj6es2fPcvLkSRYvXgxc /V2Ij49n/vz59OvXj+TkZJKTk3UTVID33nuPmJgY4uLiOH36NPb29oSHhzfoOIQQQojbRbVJamxs LO3bt8fBwQEAS0tLXF1d2b9/P2VlZSQlJbFz506Ki4ubPFnR9MqLC0ndv4ue49/A1MwcS7tWuN3d W6+OR4/78ejWn8ryMgrOJ2Hn6knWmWNG7yNhyyq6hk3BzNIaVCo6hU4k7eBuvToajYaUlBT27NlT 72OYMGEC/v7+eHl5ER4eTkFBga4sLy+PDRs28Pnnn2NhYUGbNm0YNGiQXvwjjzzCww8/TGlpKSdP nsTX15cDBw4Yvf958+bx4YcfotFoUKlUhIeHs3HjxnofhxBCCHE70XuYf3JyMnFxcYwbN06v0uOP P050dDSrVq3C3d2de++9l8TExCZNVDSPosx0LO1bYa6xq7XO5dxMDiz7iArtZVrfGYjKxJQrl437 I6asMI9ybQm/zX9bb7uFbfX9XX8Jvj7Mzc1ZsmQJL730Eh9//DF+fn5s27aNu+++m+TkZNq0aVPn +tT09HQmT55McXEx3bp1Q61W601065KdnU1hYSFjx45tlGMRQgghbhe6SWpGRgYbNmzgySefRKPR 6FVycHDgscce0/3/tm3bcHFxabosRbOxsnekrKiAyooyTM0saqwT8+nrBAwNw7P31TOQSbs3krp/ l14dUzNzyoqq3zxlYeuAmaU1g95ZhI2Ta525FBQUYGVlhbm5eYOOJTAwkJUrV/Laa6+xePFiFi5c iLOzM7m5uZSWlmJpaVlj3OjRo5k8eTIjRowAYOXKlWzYsEGvjqWlJTk5OdViW7dujUajYdu2bbRr 165BeQshhBC3IxOAtLQ01qxZw8iRI2ucfKakpFBWVgbA2bNnOX78ON26dWvaTEWzsG7tQiuvDsSv XQKKQtHFVJJjtujVKc66gMrk6sqRwgvnOL19bbV2HDx8yE9NpCT7AgClhXlXC1Qq/AY/zu9LIij/ /7OvpQW55CYn6MWXlJTg5eXFgAED6n0MkydPZtGiRWRkZHD27FkOHDhAhw4dAHB3d6dz587MnDkT RVFITExk1apVevHnzp3D1NQUgDNnzujWo14vMDCQ48ePk5qaCqBbs61SqXj22Wd5/vnndWdfMzMz OXLkSL2PQwghhLidqMrLy5U5c+agUqkwNzensrISADc3N8aMGQPA/v37OXToEOXl5Tg6OvLggw/i 7OwMGH43/EK72i8TA0xTlDrLb/Td8xJ/Y/FwdeK5b8G7FF1Kw9HLH+fArlzOyaTnhDcBSDu4m/j1 S7hSqsWhnS/uXe8ldf9O7p82V6+dEz9+ScLPUZiaW2Lj5ErwWwswMTWlqvIKx9Z/QfLen0Glwtxa 
Q+cRk3C/5z5dbEVFBd27d+fuu+9m5cqVBnO+XlJSErNnz2bLli04ODgwefJkxo0bpzfxHD9+PElJ SQQFBXHvvfeSnp7OvHnzANi4cSMRERGUlJTQsWNH/vGPf/DDDz/www8/6O3nk08+Yf78+VhZWdGu XTt++ukn1Go1FRUVzJo1i6ioKFQqFfb29kyfPp2hQ4fW6ziEEEKI24lKUQzMEg2QSeqtHX8rWbBg AQ4ODoSFhTV3KkIIIYQwQG24ihC3BldX12rrrYUQQgjRMskkVdw2hg8f3twpCCGEEMJI1Z6TKoQQ QgghRHOTSaoQQgghhGhxZJIqhBBCCCFaHJmkCiGEEEKIFkcmqcJouSmn+W78QLITj9dZL379UvZ/ MavR968oVXWWG8qvJPsC29+bxHfj72fjq4+T8ce+esU3t7179xIXF1dr+dGjR3F1deXgwYN1thMR EcGLL77Y2OlRVVX3v4+h/FJTUxk0aBB33HEHQUFBbNu2rV7xQgghbi1quPrlEB0dTU5ODiqVip49 e9KnTx9dpaqqKnbs2MHp06cxNTWlV69e3HPPPc2WtGge1q3a4Nk7BJvWTf9K3Lxz/+Pgl58w+N0l tdYxlN+hlZ9i39aL4PB5oMBfHxHcnMdnjKNHj+Lg4FDr756rqyuhoaG4u7s3cWYQHx/Pq6++yo4d O2qtYyi/qVOn4ufnx6ZNm1AUpdq/T3MenxBCiKanBkhOTiY4OBh3d3eys7NZvHgxbm5ueHl5AbBv 3z4KCwv597//TVlZGV9++SWOjo54e3s3Z+6iiVnaO9JzQniz7Lu0MN9gHUP55Z37H33//QGmZhYN im8uZWVlvPXWW0RFRVFZWcn27duZO3curVq10qvn7OzM/PnzmyXH7Oxsg3UM5RcfH8+KFSuwtLRs ULwQQohbixqgf//+ug1OTk54eHig1Wp12w4fPszo0aMxMTHBysqKPn36cPjwYZmk3ia2z5hI0aU0 AEqyLzLs0/U4ePjoysuKC/h9SQQXjx/A1sUdjYs7Fhp7XXlVZSXx65aQvHcLKArOAV3pMW4aZlY2 AGx+fTS9n32H4z8s48KxA2ic29L/ldnY3uFBaUEu22dMpLQwj/KSItY/NwQAWxcPBs9YalR+h776 lLSDuym6lMbuOS9jojarV7yh/C/nZrLro8mETI/k4IrZZByNpVU7X137AFeuXKF79+4EBQWxYsWK evX/smXL2L9/P2fOnMHc3Jzly5ej1Wp1k9SQkBCSkpIASEtL4+jRo9x11126+NzcXJ5//nmio6Np 37497du3x9HRUS+3mTNnEhUVhaIo9OvXj//+97+6t5F1796dxYsX89FHH7Fr1y68vb1ZvXo1Pj4+ ZGZmEhISQlZWFvn5+boxwcfHh19++cWo/KZOncqmTZtISkoiNDQUc3PzesUbyj89PZ1HHnmEn3/+ mZdffpkdO3bQqVMnXftCCCFaJt2aVEVRKC4u5uDBg2i1Wnx9fYGrl/oLCwtxcnLit99+IyEhAWdn Z3Jzc5stadG0Bs9YSmjkVkIjt2Jp51itPDbyPUxM1YxYvJ2Bb87jcm6mXvnR7yK5dPIQD81Zw2Pz NmFureHwt5//pY0ZdHxsPI/N24iVQ2uOfb8MuHp2c9hn6+k5MRzngC66PK6fABrKr9s/X+GxeRvR tGlL8FsL6x1vTP7avGx+nfsG7XoMJHThz/SbHKFXXlpaSlJSEidPnqytm+ukUqlQFAW1Ws2kSZNw c3PTlf3yyy8kJyeTnJxMmzZtqsVOnDgRMzMzUlNT2bhxI+np6Xrl7733HjExMcTFxXH69Gns7e0J Dw+v1sYbb7zB6dOncXFxYdasq2uOnZ2diY+PZ/78+fTr10+Xx/UTQEP5zZkzh4SEBLy8vNiyZUu9 443J/+LFi4SFhfHoo49y9uxZVq5cWVd3CyGEaAF0k9SEhAQiIyOJjo5m2LBhqNVXX0Z15coVTExM UKlUpKSkcOHCBczMzCgrK2u2pEXLUV5cSOr+XfQc/wamZuZY2rXC7e7eenUStqyia9gUzCytQaWi U+hE0g7u1qvT5cnJtG4fgIWtA159HqAgPbkJj6JuxuR/OTeTTqET8Ow9CLWlFdaOznrlGo2GlJQU 9uzZU+/9T5gwAX9/f7y8vAgPD6egoMDo2Ly8PDZs2MDnn3+OhYUFbdq0YdCgQXp15s2bx4cffohG o0GlUhEeHs7GjRv16kRERNC1a1dat27NyJEjSUhIqPdx3CzG5J+enk54eDgjRozAxsaGtm3bNlO2 QgghjKV7LWpAQAABAQHk5uaybt06+vTpQ8eOHTE3NweuTlbDwsKAqzdaXbuUJm5vRZnpWNq3wlxj V2N5WWEe5doSfpv/tt52C1v9+ibqP9/Qa+XQmsqK8sZPtgGMzV9tac0dd3Wvs63rL7HXh7m5OUuW LOGll17i448/xs/Pj23btnH33XcbjL129vGv61evyc7OprCwkLFjx9aZq5mZme5nFxeXFvNHqrH5 azQaBgwY0ISZCSGEuFHqv25wdHQkKCiIU6dO0bFjR+DqJb309HQ8PT0BOH/+PM7Ozn8NFbchK3tH yooKqKwoq/GGJAtbB8wsrRn0ziJsnFwbvB9TM3PKigzfPNXYGit/gIKCAqysrHR/+NVXYGAgK1eu 5LXXXmPx4sUsXLjQYMy1pTmlpaU13pDUunVrNBoN27Zto127dg3KC8DS0pKcnJwGxzdUY+UvhBCi 5THRarWsXbtW9wWTl5fHiRMn9C6HdevWjT179lBZWalbt9q1a9fmylm0INatXWjl1YH4tUtAUSi6 mEpyzJY/K6hU+A1+nN+XRFB+uRiA0oJccpPrd7nYwcOH/NRESrIvXG2jMK/RjqFOjZR/SUkJXl5e DTqbN3nyZBYtWkRGRgZnz57lwIEDdOjQwahYd3d3OnfuzMyZM1EUhcTERFatWqUrV6lUPPvsszz/ /PO6ZQSZmZkcOXKkXjkGBgZy/PhxUlNTAcjKyqpXfEM1Vv5CCCFaHrWVlRV+fn78+OOP5OfnoygK QUFB9OrVS1cpKCiI/Px8IiMjMTExISQkBBeXlvksSdH07nv5Y/YteJe1k0Jw9PKn/YCHuJzz581T XcImc2z9F2x5IwxUKsytNXQeMQlHb3+j96FxbkvXsMlsnf40puaW2Di5EvzWAkxMTW/GIelpjPzN zc3x9PTU3ZBYH1OmTGH27NlERETg4ODA5MmTGTdunNHxUVFRjB8/Hnd3d4KCgnjqqaf0bp6KiIhg 1qxZ9OrVC5VKhb29PdOnT6dLly5G78Pb25sPP/yQ++67DysrK9q1a8dPP/2kW9t+MzVG/kIIIVoe lfLXJ2bXU1FRUZ3lC+1qXqt4zTQDuzfUvqG1sRJ/Y/Gi5ViwYAEODg66teFCCCHErezmn+YQQjQK V1dXNBpNc6chhBBCNAmZpArxNzF8+PDmTkEIIYRoMiaGqwghhBBCCNG0ZJIqhBBCCCFaHJmkCiGE 
EEKIFkcmqUIIIYQQosWRSapoMXJTTvPd+IFkJx6/Ke0rStVNafeavXv3EhcXd1P3UZejR4/i6urK wYMHb0r7VVU3t/+EEEKI65kApKamsnLlSj799FM+++wz9u3bp1eppKSEXbt2sXDhQr755ptmSVTc +qxbtcGzdwg2rRv/RRF55/7HjvefbfR2r3f06FESEqq/ieqxxx7DyckJLy8vPD09GTx4MMePN/5E 3NXVldDQUNzd3Ru97fj4eB544IFGb1cIIYSojRogOTmZ4OBg3N3dyc7OZvHixbi5ueHl5QWAiYkJ bm5ulJeXk52d3Zz5iluYpb0jPSeE35S2Swvzb0q7AGVlZbz11ltERUVRWVnJ9u3bmTt3Lq1atdLV +fjjjxk/fjyKorBw4UJGjx7NsWPHGjUPZ2dn5s+f36htXiO/90IIIZqaCUD//v11Z1+cnJzw8PBA q9XqKllZWeHv74+bm1vzZCma1ebXR3M25id+futfrBk3gF2zXqSsME9Xfjk3k82vP0FpYR6//vdN 1owbwPYZE3Xl5ZeL2bdwBuueGcz3z/+D+HVLUKoqdeXbZ0xk/XNDWP/cEL56PIj8tCS9/VdVVvLH mkh+ePFhfvj3Q/y24B0qtCV6dZL3/sym1x5n7cRgNk8dRdrB3QCUFuSy8eVQfp37Bpmnjuj2c31+ AFeuXKFLly6MHTu23v2zbNky9u/fz5kzZzh//jx9+/bV+/25nkqlIjQ0lISEBN3l8/T0dLp160ZW VhZjxozBxcWFkJAQXUxBQQETJkygXbt2+Pj4MHPmTCor/+y/kJAQvL298fb2Rq1Wc+LEiWrHNmPG DPz8/OjQoQPjxo2r9iayqKgounTpQtu2bbnnnnvYuHEjAJmZmXTu3JmwsDD27t2r28/1+QkhhBA3 g+5h/oqiUFJSwqlTp9BqtQ16x7i4dSXt3siA1z7BwtaBmM+mcejrz+j7wvu6cm1eNr/OfYMOg0bQ +5l3KL/85yQoNnIGZtYahi/cQoW2hJ0RL2BqYcldD/8TgMEzlurqfjd+YLV9H/0uksyEIzw0Zw1m FlYc/HIOh7/9nJ4T3gQgZd82Dn8zl/un/RdHb38K0pPJTT4NXD07O+yz9Zz7/RdOb/uOwe8uqfH4 SktLSUpKwtzcvEH9o1KpUBQFtVrNpEmTaq1XVVXF8uXL6dGjByYmfy4Jv3jxImFhYUyaNInFixeT n//nmd+JEydib29PUlISRUVFDB06FGtra1555RUAfvnlF11dV1fXavt87733+O2334iLi8PGxoZX XnmF8PBw5s2bB8B3333Hm2++yYYNGwgKCiIhIYE//vgDuHp2Nj4+nvXr17No0SJ27NjRoP4RQggh 6kv3LZmQkEBkZCTR0dEMGzYMtVpeRiX+1PGxcVi1aoOJ2gyf+x8h/fBevfLLuZl0Cp2AZ+9BqC2t sHZ0BqC8pIhzv/9Ct3+9hompGguNPUFPvMCZHeuN3nfCllV0DZuCmaU1qFR0Cp2oO1MKcHLT13Qd 8xKO3v4A2Lf1xrvfkHodn0ajISUlhT179tQrDmDChAn4+/vj5eVFeHg4BQUF1epMmzYNLy8vvL29 OXToEKtWrdIrT09PJzw8nBEjRmBjY0Pbtm0ByM/P5/vvv+eTTz7BzMwMR0dH3n//fZYuXVptH7WZ N28eH374IRqNBpVKRXh4uO5MKcBnn33GrFmzCAoKAsDf358nnnii3v0ghBBCNCbdTDQgIICAgABy c3NZt24dffr0oWPHjs2Zm2ihHDx8KCvWn4ipLa25467u1eoWZ6ZjadcKc+s/3zlv59qO4sx0o/ZV VphHubaE3+a/rbfdwtZO93PhhVQcPHzqcwg1cnR0bFCcubk5S5Ys4aWXXuLjjz/Gz8+Pbdu2cffd d+vqXFuTWhuNRsOAAQOqbU9OTsbJyQl7e3vdtjvvvJPk5GSjcsvOzqawsLDaMobrj/XMmTPcdddd RrUnhBBCNJVqp0sdHR0JCgri1KlTMkkVNSq6kIrGua1RdW3auFJamEeFtgQzK5ur8ZfOY9PGuPXN FrYOmFlaM+idRdg4Vb+UDaBxdqMgPYVWnh1qbcfUzJyyorpvniooKMDKyqrBl/wDAwNZuXIlr732 GosXL2bhwoUNaud6np6eZGdnU1RUhK2tLQBnz57V3dRoSOvWrdFoNGzbto127drVWMfLy4vTp0/T uXPnWtuxtLQkJyen3vkLIYQQDWWi1WpZu3at7gsoLy+PEydO6C43CgGQsm87lRVllF8u5uh3i/AN fsyoOAuNPe16DCTuq89Qqiopv1zMH6sX4hsy3Lgdq1T4DX6c35dEUH65GLh6M1Ru8p+PevJ7YCSH v/2cgvSrZxeLszI4/uOXes04ePiQn5pISfaFq21cd+MXXH3MmpeXV41nMw2ZPHkyixYtIiMjg7Nn z3LgwAE6dKh9wlwfjo6OPPLII7z++utUVlZSUFDAu+++W+dZ2eupVCqeffZZnn/+ed0yhMzMTI4c OaKr89xzzxEeHq57fNa5c+eYM2eOXjuBgYEcP36c1NRUALKyshrj8IQQQohaqa2srPDz8+PHH38k Pz8fRVEICgqiV69eukrr1q3j/PnzlJeXU15ezty5c7Gzs2PcuHHNmLpoSmoLS356/UnKivPxvnco gQ8/ZXRsn+ff4+CK2ax/7h+YmJriM+Bh7qpHfJewyRxb/wVb3ggDlQpzaw2dR0zSrUH1DR6OUllJ 9OyXuVKqxdK+FZ2GT9BrQ+Pclq5hk9k6/WlMzS2xcXIl+K0FmJiaAlcv2Xt6ejbohsEpU6Ywe/Zs IiIicHBwYPLkyY36u7Fs2TJefvll2rdvj1qt5p///KfupiljREREMGvWLHr16oVKpcLe3p7p06fT pUsXAMaPH8+VK1cYPnw4JSUltGnThjfffFOvDW9vbz788EPuu+8+rKysaNeuHT/99JOsXRdCCHHT qBRFUW6kgb8+yuavFtrZ1Vk+zcDuDbV/7RKoxN+ceLj6CKp7nnoZ1049DNa9EVWVlUQ91ZtH5v5g 9HKClmTBggU4ODgQFhbWLPu/cuUK9vb2HD9+HG9v72bJQQghhGgs8lpUYaQb+lumTsWZGQBcPH4A tYUV1jfhjVNNwdXVlTZt2jT5flNSUgCIjo7GxsbmprxxSgghhGhqcq1ONKvLOZf49fM30eZmYWph yb1TZmFi+vf8WA4fbuQ620Z0/vx5nnrqKTIyMrC2tubrr7/GzMysyfMQQgghGptc7pf4OsuFEEII IZrD3/OU1d/IjU4CmzteCCGEEKI5yJpUIYQQQgjR4sgkVQghhBBCtDgySRVCCCGEEC2OGiA1NZXo 6GhycnJQqVT07NmTPn366Cqlp6ezc+dOsrKysLCwIDg4mICAgGZLWgghhBBC3NrUAMnJyQQHB+Pu 7k52djaLFy/Gzc0NLy8vFEUhJiaG4OBg3NzcOHv2LFFRUUyZMkVuyhFCCCGEEDeFGqB///66DU5O 
Tnh4eKDVaoGr7/4ePXq0rtzHxwcXFxcuXbokk1QhhBBCCHFT6B5BpSgKJSUlnDp1Cq1WW+s7zKuq qsjLy8PJyanJkhRCCCGEELcX3SQ1ISGBzZs3oygKTz31FGp1zY9QjY2NpX379jg4ODRZkkIIIYQQ 4vaim4kGBAQQEBBAbm4u69ato0+fPnTs2FGvcnJyMnFxcYwbN67JExVCCCGEELePao+gcnR0JCgo iFOnTultz8jIYMOGDYwaNQqNRtNkCQohhBBCiNuPiVarZe3ateTk5ACQl5fHiRMnaNu2ra5SWloa a9asYeTIkbi4uDRXrkIIIYQQ4jahtrKyws/Pjx9//JH8/HwURSEoKIhevXoBUFFRwddff41KpWL1 6tVUVlYC4ObmxpgxY5ozdyGEEEIIcYtSKYqi3EgDRUVFdZYvtLOrs3yagd0bat/QY7CaO14IIYQQ QtSfvBZVCCGEEEK0ODJJFUIIIYQQLY5MUoUQQgghRIsjk1QhhBBCCNHiyCRVCCGEEEK0ODJJFUII IYQQLY5MUoUQQgghRIujBkhNTSU6OpqcnBxUKhU9e/akT58+ukqGyoUQQgghhGhMaoDk5GSCg4Nx d3cnOzubxYsX4+bmhpeXF8aUCyGEEEII0ZjUAP3799dtcHJywsPDA61Wq9tmqFwIIYQQQojGpL72 g6IolJSUcOrUKbRaLb6+vnoVDZULIYQQQgjRWHST1ISEBDZv3oyiKDz11FOo1Wq9iobKhRBCCCGE aCy6mWZAQAABAQHk5uaybt06+vTpQ8eOHTG2XAghhBBCiMZS7RFUjo6OBAUFcerUqRoDDJULIYQQ Qghxo0y0Wi1r164lJycHgLy8PE6cOEHbtm0BMFQuhBBCCCFEY1NbWVnh5+fHjz/+SH5+PoqiEBQU RK9evQAwVC6EEEIIIURjUymKotxIA0VFRXWWL7Szq7N8moHdG2rf1ta2RccLIYQQQoj6k9eiCiGE EEKIFkcmqUIIIYQQosWRSaoQQgghhGhx5In8N5msaRVCCCGEqD85kyqEEEIIIVocmaQKIYQQQogW Ryapt4HNr4/mwrH9BuspSlUTZCOEEEIIYZgaIDU1lejoaHJyclCpVPTs2ZM+ffrUGBAVFUVRURGT Jk1q0kTFzZV37n8c/PITBr+7pLlTEUIIIYS4OklNTk4mODgYd3d3srOzWbx4MW5ubnh5eelVPnr0 KBUVFc2Rp7jJSgvzmzsFIYQQQggdNUD//v11G5ycnPDw8ECr1epVLCws5Ndff+XBBx9k586dTZvl be5ybib7ImdQkJaEiZk5rdsH0uXJF7F1cQdgZWhnnvgyBgtbBwCORM3nSulluo99XddGztlT/LFm IYUZ52jToTN9n38PC7tWlBbksn3GREoL8ygvKWL9c0MAsHXxYPCMpbr97/poMiHTIzm4YjYZR2Np 1c5XV15VWUn8uiUk790CioJzQFd6jJuGmZWNUeUAV65coXv37gQFBbFixYqb36lCCCGEaNF0j6BS FIWSkhJOnTqFVqvF19dXr+KmTZsYOHAgFhYWTZ7k7e7od4vQOLclJHwBAGkHd+tN8Ixx4WgsA179 BAtbB2I+m8ahr+fS94X3sLR3ZNhn6zn3+y+c3vZdrZf7tXnZ/Dr3DToMGkHvZ96h/PKfj9Y6+l0k mQlHeGjOGswsrDj45RwOf/s5PSe8aVQ5QGlpKUlJSZibm9e3e4QQQghxC9LdOJWQkEBkZCTR0dEM GzYMtfrPR6geOXIEMzMzAgMDmyXJ2511axcunYzj4sk4qqoq8ehxP5Z2rerVRsfHxmHVqg0majN8 7n+E9MO/1iv+cm4mnUIn4Nl7EGpLK6wdnXVlCVtW0TVsCmaW1qBS0Sl0ImkHdxtdDqDRaEhJSWHP nj31yksIIYQQtybdTDQgIICAgAByc3NZt24dffr0oWPHjhQUFBATE8P48eObM8/bWufQCVho7Dn8 zVwKMlLw6NafrmFT9CaK9fF/7N17XFRl/sDxz8AwMDBcRIHlDpoieAlNwduqeUt/pW5qlpm7pWJl G9Zuq6VWtkUXa81VEy+luW1SqeWtFNJMNE1JDRWFFEFUTOQ2A8MM1/n94Xa2WZCbCKTf9+s1r5c8 z/d5nu+Z7bV8Oec557j5d6C0WN+gMWoHR37XpXe19lJDAWUmI98te9Gq3d7ZpV79v+bu7t6gnIQQ Qghx66r2xil3d3fCw8M5ffo0Xbt2QavyWQAAIABJREFUJS0tDZVKxZo1a4BreweNRiNLliwhKiqq 2RO+HalsbOk86iE6j3qI0mI9h1a/zsGVrzL0haUA2KjtMBsKlD2pVRW139xW9PMFZT/rL2ztNJQW NfzmKXtnN+wcHBn+0gqc2nk3uP/X9Ho9Wq1WLvkLIYQQAhuTycSGDRvIy8sDoKCggJSUFHx9fQGI iIggOjpa+UycOBEvLy+io6PRarUtmftt4+j6JRReSAfA3skFV7/2YLEo/S4+gaR/u43K8lIu/rCX c4nbq81x/uDXVJaXUl5STPJnK7hjyB+s+t38O1CYdRZj7mUAzIaC+iWnUhEy4gG+XxVDWUnxtbH6 fPIzUuvX/x9Go5GgoCAGDx5cv3WFEEIIcUtTa7VaQkJC2LJlC4WFhVgsFsLDw+nTp09L5yb+w+OO biStXUjx1ctYqqpw8QmgT9R8pT/isdkciF1A+rdbCew7nJ6TZ1UrAnVefnw5ZzKlRQUED/g/wkZP se739KXn5Gh2zn8UW40DTu28GTrvPWxsbevMr8fkaE5sep+vnp8MKhUaRx3dJ8zAPbhzvfoBNBoN gYGB1W7YE0IIIcTtSWWx/OqUXCMUFRXV2r/cpfrew1+bU8fydc3v7Ox8S48XQgghhLgdyWtRhRBC CCFEqyNFqhBCCCGEaHWkSBVCCCGEEK1OtUdQidZF9rQKIYQQ4nYkZ1KFEEIIIUSrI0WqEEIIIYRo daRIFUIIIYQQrY4aICsriz179pCXl4dKpSIyMpJ+/fopQcnJyWzduhU7OzulbfTo0XTp0qX5MxZC CCGEELc8NUBGRgZDhw7Fz8+P3NxcVq5ciY+PD0FBQQCYzWZ69erFqFGjWjJXIYQQQghxm1ADDBo0 SGlo164d/v7+mEwmpc1kMuHk5NT82QkhhBBCiNuS8ggqi8WC0Wjk9OnTmEwmq3eom81mcnNziYuL o6qqitDQUHr27NkiCQshhBBCiFufUqSmpqayfft2LBYLU6ZMQa3+7yNUu3TpgslkIigoiLy8PDZs 2IBKpaJHjx4tkrQQQgghhLi1KZVoaGgooaGh5Ofns3HjRvr160fXrl0B8Pf3VwZ4e3vTv39/UlNT pUgVQgghhBA3RbVHULm7uxMeHs7p06evO0ilUmFjI0+vEkIIIYQQN4eNyWRiw4YN5OXlAVBQUEBK Sgq+vr4AGI1GNm7cSEFBAQCFhYV89913hIaGtljSQgghhBDi1qbWarWEhISwZcsWCgsLsVgshIeH 
06dPHwCcnJy44447+PzzzykqKsLGxobIyEi6d+/ewqkLIYQQQohblRqge/futRad4eHhhIeHN1tS QgghhBDi9iYbS4UQQgghRKsjRaoQQgghhGh1pEgVQgghhBCtjhSpQgghhBCi1ZEiVQghhBBCtDpS pAohhBBCiFZHitTb0PFNqzn0/hsNHmexVN3QuvmZaXw2bQi5Z08qbRWlZjY9OZJNT47kowfv4vKJ Qze0hhBCCCFuDWqArKws9uzZQ15eHiqVisjISPr162cVePbsWXbv3k1RURGurq4MGTKEDh06tEjS ovkVnP+JpA/fYcTLq6r1Zf94gF0xM9HoXJQ2O3st41fEW8U5tvEgsO8wnNp6KW1qewfGx+4EYPvs STcpeyGEEEL81qgBMjIyGDp0KH5+fuTm5rJy5Up8fHwICgoCIDs7my+//JKHHnoILy8v8vLyKC0t bcm8RTMzGwpr7W8T2JHR72yoNcbB1Z3I6XObMi0hhBBC3KLUAIMGDVIa2rVrh7+/PyaTSWlLTExk yJAheHldOwPWtm3bZk5T3IjSYj3fr4rh55OHcfbyQ+flh73OVekvvJDOic/fJy/9FGVGA749BtDn 8fnY2tlj1ueTsCAKs6GAMmMRm54cCYCzlz8jFqyu1/oJC6IounIBAGPuz4xZtAk3//qfha+qrOT4 xlVk7P8KLBY8Q3sSMXUOdlonJaaiooLevXsTHh7O2rVr6z23EEIIIVon9S//sFgsGI1GTp8+jclk omPHjkrQlStX6N+/P19++SVXr17F39+f3//+92g0mhZJWjTMwdhXsNU4MGFlAuUmI9++81erIrXo chZB/UfS/8+vUlVRQfzLU/kpYSOh907GwdWdMe9u4vz3u0iL/6zGy/11+XUx+9m0IQ0en/xZLDmp x7jv7U+xs9eS9OHbHP14CZHTX1BizGYz6enp8t+kEEIIcYtQitTU1FS2b9+OxWJhypQpqNVKF0VF RXzzzTeMGDECNzc3tm3bxu7duxk1alSLJC3qr6zYQNahb3jow0Rs7TTY2mnwubMvpoJcJcY/4m4A yk1GDNmZuHgHcvXMCUIbsE7B+TN88thA5ef+T72Kf69BtYyov9Sv1jPsxRXYOTgC0G18FNv/9pBV karT6cjMzMTR0bFJ1hRCCCFEy1Iq0dDQUEJDQ8nPz2fjxo3069ePrl27AuDk5MTYsWNxc3MDIDIy kq1bt7ZMxqJBinIu4eDaxuqmpv9Vkp/D4Q/epNxUQts7wlDZ2FJRUtygdeqzJ7UxSg0FlJmMfLfs Rat2e+fqx+Pu7t7k6wshhBCiZaj/t8Hd3Z3w8HBOnz6tFKkeHh7k5uYqRapOp2veLEWjaV3dKS3S U1leiq2dfY0xiYtmE3rvZAL7Dgcg/dutZB36xirG1k5DaVHtN0/dKJWNCktlpVWbvbMbdg6ODH9p BU7tvGsdr9fr0Wq1cslfCCGEuAXYmEwmNmzYQF5eHgAFBQWkpKTg6+urBEVGRrJnzx7MZjMWi4UD Bw7QqVOnlspZNIBjWy/aBHXi+IZVYLFQ9HMWGYlfWcUUX72MyubaI3MNl8+TllD9jKibfwcKs85i zL0MgNlQ0OS56jx8uHh0H1gslBbrrzWqVISMeIDvV8VQ9p+zu2Z9PvkZqVZjjUYjQUFBDB48uMnz EkIIIUTzU2u1WkJCQtiyZQuFhYVYLBbCw8Pp06ePEtSxY0cMBgNr1qyhsrKSoKAghgxp+A0womUM fPYtDrz3MhtmDMM9qDPtB99HSV6O0h85/QWOb1rFsbhluAV0JGTERLIO7baaQ+fpS8/J0eyc/yi2 Ggec2nkzdN572NjaNlme3SfMYO+i2Wx4fAReYXcx8Jk3AegxOZoTm97nq+cng0qFxlFH9wkzcA/u rIzVaDQEBgZa3fAnhBBCiN8ulcVisdzIBEVFRbX2L3e5/l5IgDl1LF/X/M7OzjJeCCGEEOIWI69F FUIIIYQQrY4UqUIIIYQQotWRIlUIIYQQQrQ61R5BJW4tsqdVCCGEEL9FciZVCCGEEEK0OlKkCiGE EEKIVkeKVFFvFktVS6dwUyUnJ+Pt7U1SUlKTzVlSUkJwcDDBwcHY29uze/fuugc1Uk35N+f6Qggh RFNSA2RlZbFnzx7y8vJQqVRERkbSr18/4NqbfJYuXWo1qLKyEp1Ox6xZs5o/Y9EiCs7/RNKH7zDi 5VUtnUqDmc1m/vKXv/D555+jUqkYOHAg77zzDv7+/lZx3t7ejB8/Hj8/vyZb29HRkYyMDAB69+7d qDni4+O59957adOmjdLm5OREZmamVVxN+TfF+kIIIURLUANkZGQwdOhQ/Pz8yM3NZeXKlfj4+BAU FISTkxPPP/+81aBPP/2Ubt26tUjComWYDYUtnUKjxcTEcOrUKY4cOYKLiwufffYZSUlJ1YpUT09P li1b1kJZ1q5bt24cO3as1pjWnL8QQgjRUDYAgwYNUs6+tGvXDn9/f0wmU40DUlJSUKvVhIWFNV+W osWY9flsfXY8+xY/T87pY2x6ciSbnhxJwoIoAAovpLPx8RFWWwHKjEV8OnUwleWlAGyfPYlziV+y Y96f+HTqYL5542lKDQVKfFVlJT9+GssXT4/miz/fx3fvvUS5yWiVR0VFBT169OCxxx5r8DGcOnWK iIgIfH19cXZ2Ztq0aYwbN07pHzZsmHJJXK1Wk5KSovTt3buXMWPGMHToUO644w62b99O+/btGTly pBLTu3dvPv74YwYMGICXlxdjxowhNze33vlVVFSwYMECQkJC6NSpE1OnTq3zqQy/Vlv+zbG+EEII cTMoe1ItFgvFxcUkJSVhMplqfAe6xWLh22+/ZeDAgc2apGg5Dq7ujHl3E5FRc/EM7cH42J2Mj93J iAWrAXDz74DOy49Lx75Txpw/tAv/XoOxtbNX2tK/3crg597hgVVfY6O244eP3lX6kj+L5cqpH7jv 7U+5f+k2NI46jn68xCoPs9lMeno6p06davAx/PGPfyQ2NpYXX3yxxuJx165dZGRkkJGRgYeHR7X+ hIQElixZQmRkJG+++SaHDx/m8OHDXLhwQYlZt24dGzZs4MKFC2g0GmbPnl3v/F555RUSExM5cuQI aWlpuLq6Mnfu3HqPryv/m72+EEIIcTMoRWpqaiqxsbHs2bOHMWPGoFZXf4Rqeno6zs7OjfpFKG5d nUc+xJmvNyk/ZyR+RftB91nFdL1/Kto2Htio7ehw91guHd2v9KV+tZ6ek2dh5+AIKhXdxkdxIelb q/E6nY7MzEz27t3b4PzGjh3L/v37SUpKIjAwkHnz5mE2m+s9PjQ0lC5dutCxY0dGjRpFu3btCAgI 4Ny5c0rMnDlz8Pb2RqPR8Kc//YkdO3bUe/6lS5fy+uuvo9PpUKlUzJ07l61bt1rFnDhxAg8PD+Wz bdu2es/fFOsLIYQQzU2pRENDQwkNDSU/P5+NGzfSr18/unbtahV89uxZgoODmz1J0boFRA7hh3X/ 
wFRwFVQqiq5c5Hdhd1033s2/A6XFegBKDQWUmYx8t+xFqxh7Z5dq49zd3Rud45133snOnTs5duwY M2bMIC0tjY0bNzZoDpVKVeO//1eXLl3Iz8+v15y5ubkYDIZq2xj+91jrsye1Meq7vhBCCNHcqp0u dXd3Jzw8nNOnT1crUrOyshg2bFizJSdaD1s7DaVFNd88ZWOr5o4hfyD9222oHbQE/34U1FLEFV3O QufpC4C9sxt2Do4Mf2kFTu28a81Br9ej1WrRaDSNPo4ePXqwcOFCJk6c2Og56nK9P+ZsbGyoqKiw amvbti06nY74+HgCAgJuWk6tYX0hhBCiIWxMJhMbNmwgLy8PgIKCAlJSUvD19a0WXFBQIK/RvE25 +XegMOssxtzLAJh/deMTQKfhE0jfu43MAwl0GDS62vjMAwlUlpdSVlJM8mcr6Dj0/msdKhUhIx7g +1UxlJUUX5tbn09+RqrVeKPRSFBQEIMHD25Q3hUVFfTv35+PPvoIg8HA1atX+fjjj5VHrDWVDRs2 YDab0ev1vPLKK0ydOrVaTGBgIDt27MBisShnWlUqFU888QQzZ85Er792djknJ+emnDVt6fWFEEKI hrDRarWEhISwZcsWFi1axJo1awgICKBPnz5WgZWVlZhMJhwdHVsoVdGSdJ6+9Jwczc75j7I5eiz7 Fr9AVWWl0u/o7oGrX3sqSs24+lY/i6i2d+DL2Q+zZdZYPDqHEzZ6itLXY3I07e7oylfPT2bzrD/w zZvRlOTnWI3XaDQEBgbWeENfbdRqNcuWLePTTz8lJCSELl26YDKZWL16dQO/gdo5OjoSERFBWFgY /fr149lnn60WM3/+fBISEggICODpp59W2mNiYoiIiKBPnz6EhYUxduxYsrOzmzS/1rC+EEII0RAq i8ViuZEJ6npUzXKX6nsLf21OHcvXNX9dZ3Zl/I2Nb4jvV76GW8AddB71kFX79tmTuGvKs3h3i2iy tVqT3r1789ZbbzFkyJCWTkUIIYS4ZchrUUWT+DkliZ9Tkug0fPx1Im7ob6FW7wb/1hNCCCHE/6j+ nCkhGqCi1Mzm6DHYaZ3o/9TfsVHbtXRKQgghhLgFSJEqboja3oEJKxNqjblvYVwzZdMykpKSWjoF IYQQ4pYjl/uFEEIIIUSrI0WqEEIIIYRodaRIFUIIIYQQrY4UqaLZWCxVjRq3ffYkLp841MTZNE55 eTmzZ8+mpKSkyeZMTU1l0aJF1+0/evQoV69ebbL1YmJirJ6TWl9VVbX/77d//36OHDnS2LSEEEII KzZw7XWn69atY9GiRbz77rscOHDAKqiyspJt27axZMkS/vnPfxIfHy+P3BENUnD+J77++xMtnUat +vXrx1//+tdaYx555BHatm3bpC+1uOOOOzh48CDvvvtujf0HDx7kH//4R5Ot1xjHjx/nnnvuqTUm OTmZ1NTUWmOEEEKI+lIDZGRkMHToUPz8/MjNzWXlypX4+PgQFBQEXLt7ubi4mKeeeorKykrWr19P SkoKXbt2bcncxW+I2VDY0inUKiUlBQ8PD7755hvKysrQaDTVYj755BPMZjNz5sxp0rXVajXr1q3j rrvuYtSoUXTu3Nmq/+GHH+auu+4iJiYGW1vbJl27vnJzc6/bV1payrx584iLi6OyspKEhAQWL15M mzZtmjFDIYQQtxo1wKBBg5SGdu3a4e/vj8lkUtpMJhMBAQHY2tpia2tLhw4d6nyTkbg1/JzyA6e2 rqOi1ETx1ctEPDabQx+8iYtPIMNfXAFA4YV0Tnz+PnnppygzGvDtMYA+j8/H1s4esz6fhAVRmA0F lBmL2PTkSACcvfwZseC/rybN2L+Dk5vXYNbno3Vrx50Tn8S/92Clv7ykmL3/eI7LJw6j8/Rl0F8W 4vw7f6W/oqKC3r17Ex4eztq1axt8nKtXr+bRRx8lKSmJL774ggcffLBazD/+8Q/WrVvX4Ll/sWjR Ivr27Uvfvn2r9Tk6OvK3v/2NZcuWsWzZMqu+Nm3a0Lt3bxISEhg1alSD183Pz2fmzJns2bOH9u3b 0759e9zd3ZX+lJQU3njjDY4cOUJBQQGjRo0iNjYWBwcHcnJyGDZsGFevXqWwsJDg4GuvvO3QoQO7 du0C4IMPPuDQoUOcOXMGjUbDmjVrMJlMUqQKIYS4IcqeVIvFQnFxMUlJSZhMJqt3pHfv3p0jR47w 448/YjQaOXv2LGFhYS2SsGh+2ckHiZj2Au06duPEF2u476315J49iTH3ZwCKLmcR1H8kY97dxLjl Oyi8mM5PCRsBcHB1Z8y7m4iMmotnaA/Gx+5kfOxOqwI180A8R/+9mP5PvcoDq3fz+2fepKLUbJXD j5/G0vX+ady/dCtat7ac+PwDq36z2Ux6ejqnTp1q8PGVlpayY8cO7r33Xv74xz+yevXqajHZ2dkY DIZG/XefnZ0NgNFopKioiKqqKq5cuVIt7g9/+AObN2+ucY5HH320UcU3QFRUFHZ2dmRlZbF161Yu Xbpk1X/27FkefPBBjh8/zrlz5zh16hQrV64EwNPTk+PHj7Ns2TIGDBhARkYGGRkZSoH6C5VKhcVi Qa1WM2PGDHx8fBqVqxBCCPEL5WH+qampbN++HYvFwpQpU1Cr//ucf1dXV7y9vTl69Cjbtm0jMjIS V1fXFklYND9Xv2Dc/Dvg4h2Im38H7F3a4NTOm6IrF3Fq9zv8I+4GoNxkxJCdiYt3IFfPnCC0nvOf 2vYRPR95Bvfga5e5XX2DcfUNtoq5a8qztG1/bcagfvfw09cbrfp1Oh2ZmZmN2iu6adMmRo4ciUaj oXPnzhQXF5Oenk6HDh2UmMzMTKufG2Lt2rVs374ds9nM7t27+fvf/84zzzzDhAkTrOLc3d0pKSmp cbvBiBEjeOqpp8jPz7c6C1qXgoICNm/eTG5uLvb29nh4eDB8+HB+/vlnJWbs2LEAFBUVkZaWRseO HTl8+HC915g+fTo//vgjQUFBREVFMWfOHPn/ByGEEDdMqURDQ0MJDQ0lPz+fjRs30q9fP2XP6ccf f0xkZCSdO3cmPz+f7du3c/DgwRovW4pbl0pV879L8nM4/MGblJtKaHtHGCobWypKius9r+FyFm7+ tReANr/6o0nr1pbK8rJqMQ0p3n5t4sSJVpf39+/fj42N9YMvysrKsLNr3Ctf582bx6OPPkrPnj0p Ly8nOTn5untL1Wp1jUWqra0tDz74IHFxcTz11FP1XjsjIwMPD49aL71funSJ6OhoiouL6dWrF2q1 Gr1eX+81NBoNq1at4plnnuGtt94iJCSE+Ph47rzzznrPIYQQQvyvao+gcnd3Jzw8nNOnTwPX9qNe uXJFuZnD3d2d4cOHc+LEiebNVLRaiYtmEzxgFMNfWkHPh6Px7hZRLcbWTkNpUc03T+k8fdBfyrzh PPR6PWVl1YvXuqjVaquiUa1WVytS/fz8uHDhQqPXnz9/PqtXr2bs2LF88MEHNcaYzWaqqqrQ6XQ1 
9td1yb+m9T09PcnPz8dsNl9nFEyaNIlJkyYRHx9PTEwMd999d7UYBwcH8vLyrjsHQFhYGOvWreOR Rx5RtgsIIYQQjWVjMpnYsGGD8guooKCAlJQUfH19gWu/nDQaDWlpaVgsFqqqqjh79qxczhOK4quX Uf2nqDNcPk9awoZqMW7+HSjMOosx9zIAZkOB0hdyz0SOfrwE/aWM/8yXzcktHzYoB6PRSFBQEIMH D27cQdShQ4cOGAwGq8vk9V2/qKiIsLAwxowZw9///ncuX75c4xwJCQkMGzbsujmEhISg0Whq/APx euv7+fnRvXt3XnvtNSwWC2fPnmX9+vVWMefPn1eK9DNnztRYYIaFhXHy5EmysrIArJ7bGh0dzYoV K8jOzubcuXMcPnyYTp06Xfc4hBBCiPpQa7VaQkJC2LJlC4WFhVgsFsLDw+nTpw9w7YaISZMmkZCQ QEJCAhaLBR8fH+69994WTl20FpHTX+D4plUci1uGW0BHQkZMJOvQbqsYnacvPSdHs3P+o9hqHHBq 583Qee9hY2tLx6HjsFRWsmfhs1SYTTi4tqHbuOkNykGj0RAYGGh1w19TUqlUzJgxg7fffrvGZ5bW tr6zszN/+9vfgGuX7V9++eVqMRaLhbfffptXX3211jwee+wxPvzww2o51LZ+XFwc06ZNw8/Pj/Dw cKZMmWJ189TSpUuJiYnhxRdfpGvXrjzxxBN88cUXVnMEBwfz+uuvM3DgQLRaLQEBAXz55Zeo1Wpm zZrFwoULiYmJwc3NjejoaKZOnVrrcQghhBB1UVlu8Kn8dT2KarmLS639c+pYvq75nZ2dZfxNHC/+ q7y8nP79+/Pqq6/W+WD7hnrttdc4f/58jU8W+DWDwcCdd97JmTNnrG5ubA3ee+893NzcmDx5ckun IoQQ4hYgr0UVop7s7OzYtm0bixcvbtLXop48eZKUlBSWL19eZ6yLiwv9+/fnyy+/bLL1m4q3tzce Hh4tnYYQQohbhJxJlfG19gshhBBCtAQ5kyqEEEIIIVodKVKFEEIIIUSrI0WqEEIIIYRodaRIFUII IYQQrY4UqUIIIYQQotVRA2RlZbFnzx7y8vJQqVRERkbSr18/JchgMLB9+3auXr2KVqvlnnvuITAw sMWSFkIIIYQQtzY1QEZGBkOHDsXPz4/c3FxWrlyJj48PQUFBAHz++ed06dKFhx9+mNzcXP71r38R FRUljy8SQgghhBA3hQ3AoEGD8PPzA6Bdu3b4+/tjMpkAMJvNXL58mV69ein9PXv25IcffmihlIUQ QgghxK1O2ZNqsVgoLi4mKSkJk8lk9Q7w8vJyysrKlJ89PT25evVq82YqhBBCCCFuG8rLv1NTU9m+ fTsWi4UpU6Yo7wV3cHDA29ubQ4cOERkZycWLF9m9ezdOTk4tlrQQQgghhLi1KUVqaGgooaGh5Ofn s3HjRvr160fXrl0BeOCBB9izZw/r16/Hz8+P3//+95w9e7bFkhZCCCGEELc29f82uLu7Ex4ezunT p5Ui1c3Njfvvv1+JiY+Px8vLq/myFEIIIYQQtxUbk8nEhg0byMvLA6CgoICUlBR8fX2VoMzMTEpL SwE4d+4cJ0+eVG6kEkIIIYQQoqmptVotISEhbNmyhcLCQiwWC+Hh4fTp00cJunLlCl9++SVlZWW4 u7szZcoUtFptC6YthBBCCCFuZWqA7t2707179+sGRUZGEhkZ2WxJCSGEEEKI25u8FlUIIYQQQrQ6 UqQKIYQQQohWR4pUIYQQQgjR6kiRKoQQQgghWh0pUoUQQgghRKsjRaoQQgghhGh1pEgVt5zjm1Zz 6P03WjoNIYQQQtyAaq9FjYuLo6ioiBkzZihtVVVVfP3116SlpWFra0ufPn246667mjVRIYQQQghx +7AqUpOTkykvL68WdODAAQwGA3/+858pLS3lww8/xN3dneDg4GZLVAghhBBC3D6UItVgMLBv3z5G jRrF7t27rYKOHj3KpEmTsLGxQavV0q9fP44ePSpF6m2iJD+Hb96MZtj8WJLWLiQ7+SBtAjoyYsFq AKoqKzm+cRUZ+78CiwXP0J5ETJ2DndZJmSNj/w5Obl6DWZ+P1q0dd058Ev/egwEoKynmhw/fITv5 ADa2au4Y8ge6jZuGysa2XuuXFuv5flUMP588jLOXHzovP+x1rlb5H4hdgP5COjZ2Gtq2D6PHw0/j 7OWnxFRUVNC7d2/Cw8NZu3btzf5KhRBCCFEHpUjdtm0bQ4YMwd7e3iqgqqoKg8FAu3bt+O6772jb ti2enp4cPny42ZMVLcdUkMu+xc/TafgE+j7+EmUlRUpf8mex5KQe4763P8XOXkvSh29z9OMlRE5/ AYDMA/Ec/fdi7p7zT9yDO6O/lEF+Rpoy/mDsAuwcdYxb/hXlJiO7Y57C1t6BLqP/WK/1D8a+gq3G gQkrEyg3Gfn2nb9aFanJn61A5+nLsLnvAXAh6VurAhrAbDaTnp6ORqNp2i9OCCGEEI1iA3Ds2DHs 7OwICwurFlBRUYGNjQ0qlYrMzEwuX76MnZ0dpaWlzZ6saDkl+Tl0Gz+dwL7DUTtocXT3VPpSv1pP z8mzsHNwBJWKbuOjuJD0rdJ/attH9HzkGdyDOwPg6htM8ICRAJQZizj//S56/ek5bGzV2OtcCX/o Kc58vale65cVG8g69A2R057ZCJBNAAAgAElEQVTH1k6Dg0sbfO7sazXWsa0XV04d4edTR6iqqsQ/ 4m4cXNpYxeh0OjIzM9m7d2+TfWdCCCGEaDy1Xq8nMTGRadOm1Rjwy5mliooKJk+eDEBWVhbOzs7N lqRoeWoHR37XpXe19lJDAWUmI98te9Gq3d7ZRfm34XIWbv4dapy3OOcSDi5t0DjqlDYX7wCKcy7V a/2inEs4uLZBo3Op1veL7uOnY69z5ei/F6PPzsS/1yB6Tp5lVWgDuLu7X3cOIYQQQjQvdVpaGiqV ijVr1gDXilGj0ciSJUuIiopCq9Xi6enJpUuXCAwMBODixYt4enrWNq+4Tdg7u2Hn4Mjwl1bg1M67 xhidpw/6S5m0CexUrc/JwxuzoYByk1G5BF905SJOHj71Wl/r6k5pkZ7K8lJs7exrjFHZ2NJ51EN0 HvUQpcV6Dq1+nYMrX2XoC0ut4vR6PVqtVi75CyGEEK2ATUREBNHR0cpn4sSJeHl5ER0djVarBaBX r17s3buXyspKiouLSUpKomfPni2cumgVVCpCRjzA96tiKCspBsCszyc/I1UJCblnIkc/XoL+UgYA xVezObnlQwDsda4ERAzhyL/exVJVSVlJMT9+spyOw8bVa3nHtl60CerE8Q2rwGKh6OcsMhK/soo5 un4JhRfSr63n5IKrX3uwWKxijEYjQUFBDB48uDHfghBCCCGaWLXnpNYkPDycwsJCYmNjsbGxYdiw YXh5ed3s3MRvRI/J0ZzY9D5fPT8ZVCo0jjq6T5ih7EHtOHQclspK9ix8lgqzCQfXNnQbN10Z32/m 
KyStXcimJ/8PG1tbOgweTZfRU+q9/sBn3+LAey+zYcYw3IM6037wfZTk5Sj9Hnd0I2ntQoqvXsZS VYWLTwB9ouZbzaHRaAgMDKRjx443+G0IIYQQoimoLJb/OaXUQEVFRbX2L3e5/l5BgDl1LF/X/HXt jZXxNzZeCCGEEKIlyGtRhRBCCCFEqyNFqhBCCCGEaHWkSBVCCCGEEK1OvW6cErcv2dMqhBBCiJYg Z1KFEEIIIUSrI0WqEEIIIYRodaRIFc3u+KbVHHr/jQaPs1iqbmjd/Mw0Pps2hNyzJ5W2ilIzm54c yaYnR/LRg3dx+cShG1rjt6Cx378QQgjRnKrtSY2Li6OoqIgZM2YobUajkUOHDpGamoqLiwuPPPJI syYpRMH5n0j68B1GvLyqWl/2jwfYFTMTje6/z+S1s9cyfkW8VZxjGw8C+w7Dqe1/X0ShtndgfOxO ALbPntSo3Ha/8TRXf0pGrdFSYS7Bo1N3ImfMQ1fPV7sKIYQQojqrIjU5OZny8vJqQTY2Nvj4+FBW VkZubm6zJSfEL8yGwlr72wR2ZPQ7G2qNcXB1J3L63KZMS9H7T8/RYfAYykuKOf75+3y/6jWGzVt+ U9YSQgghbgdKkWowGNi3bx+jRo1i9+7dVkFarZbOnTtLkSoapbRYz/erYvj55GGcvfzQeflhr3NV +gsvpHPi8/fJSz9FmdGAb48B9Hl8PrZ29pj1+SQsiMJsKKDMWMSmJ0cC4Ozlz4gFq+u1fsKCKIqu XADAmPszYxZtws2/Q73zr6qs5PjGVWTs/wosFjxDexIxdQ52WqdqsXaOOoL6Didz/44Gjc/Yv4OT m9dg1uejdWvHnROfxL/3YADKSor54cN3yE4+gI2tmjuG/IFu46ahsrEFoCQ/h2/ejGbY/FiS1i4k O/kgbQI6Kt9PXd8/QEVFBb179yY8PJy1a9fW+7sRQgghbhalSN22bRtDhgzB3t6+JfMRt6CDsa9g q3FgwsoEyk1Gvn3nr1ZFUtHlLIL6j6T/n1+lqqKC+Jen8lPCRkLvnYyDqztj3t3E+e93kRb/WY2X ++vy62L2s2lDGjw++bNYclKPcd/bn2JnryXpw7c5+vESIqe/UC221FDAmV1f4N4+rN7jMw/Ec/Tf i7l7zj9xD+6M/lIG+RlpyviDsQuwc9QxbvlXlJuM7I55Clt7B7qM/qMSYyrIZd/i5+k0fAJ9H3+J spKiX42v/fsHMJvNpKeno9FoGvz9CCGEEDeDDcCxY8ews7MjLCysrnghGqSs2EDWoW+InPY8tnYa HFza4HNnX6sY/4i78e81iMqyUvQX03HxDuTqmRMNWqfg/Bk+eWyg8rnww94mO4bUr9bTc/Is7Bwc QaWi2/goLiR9axWTtO4dNj5+D59MHUxVVQX9nny53uNPbfuIno88g3twZwBcfYMJHnDtjHGZsYjz 3++i15+ew8ZWjb3OlfCHnuLM15us1i/Jz6Hb+OkE9h2O2kGLo7vntfH1+P4BdDodmZmZ7N3bdN+b EEIIcSPUer2exMREpk2b1tK5iFtQUc4lHFzbWN3U9L9K8nM4/MGblJtKaHtHGCobWypKihu0Tn32 pDZGqaGAMpOR75a9aNVu72x9PL3/9BztB97LlmfG4XNnP+yd3eo93nA567rbD4pzLuHg0gaNo05p c/EOoDjnklWc2sGR33XpXW18fb7/X7i7u9cZI4QQQjQXdVpaGiqVijVr1gDX9qYZjUaWLFlCVFQU Wq22hVMUv2VaV3dKi/RUlpdia1fzVpLERbMJvXcygX2HA5D+7VayDn1jFWNrp6G0qPabp26UykaF pbLSqs3e2Q07B0eGv7QCp3bedYy3JfyhmRyLW0pA5FBsbG3rNV7n6YP+UiZtAjtV63Py8MZsKKDc ZFT2sBZduYhTPZ8cUJ/v/xd6vR6tViuX/IUQQrQKNhEREURHRyufiRMn4uXlRXR0tBSo4oY5tvWi TVAnjm9YBRYLRT9nkZH4lVVM8dXLqGyuPbLXcPk8aQnVz4i6+XegMOssxtzLAJgNBU2eq87Dh4tH 94HFQmmx/lqjSkXIiAf4flUMZf85u2vW55OfkVrjHEF9R2CrcSB9z5Z6jw+5ZyJHP16C/lIGAMVX szm55UMA7HWuBEQM4ci/3sVSVUlZSTE/frKcjsPG1euY6vP9w7XHzAUFBTF48OB6zSuEEELcbNWe k1qTjRs3cvHiRcrKyigrK2Px4sW4uLgwderUm52fuAUMfPYtDrz3MhtmDMM9qDPtB99HSV6O0h85 /QWOb1rFsbhluAV0JGTERLIOWT9hQufpS8/J0eyc/yi2Ggec2nkzdN572NjaNlme3SfMYO+i2Wx4 fAReYXcx8Jk3AegxOZoTm97nq+cng0qFxlFH9wkzlD2kVlQqejz0FN+vjqH9oHuxtbOvc3zHoeOw VFayZ+GzVJhNOLi2odu46cqU/Wa+QtLahWx68v+wsbWlw+DRdBk9pd7HVdf3D6DRaAgMDKRjx46N +OaEEEKIpqeyWCyWG5mgqKio1v7lLrXvhZtTx/J1ze/s7CzjW/F4IYQQQojGkNeiCiGEEEKIVkeK VCGEEEII0epIkSqEEEIIIVqdet04JURjyZ5WIYQQQjSGnEkVQgghhBCtjhSpQgghhBCi1ZEiVdzS KkrNbHpyJJueHMlHD97F5ROHWjql60pOTsbb25ukpCSlraSkhODgYIKDg7G3t2f37t21zCCEEELc OqrtSY2Li6OoqIgZM2YobZcuXWL37t1cvXoVe3t7hg4dSmhoaLMmKkRjqO0dGB+7E4Dtsye1SA7x 8fHce++9tGnTRmlzcnIiMzPTKs7b25vx48fj5+entDk6OpKRce1NVL17926WfIUQQojWwKpITU5O pry83CrAYrGQmJjI0KFD8fHx4dy5c8TFxTFr1iy56UWIeurWrRvHjh2rNcbT05Nly5Y1U0ZCCCFE 66Zc7jcYDOzbt4/+/ftbBahUKiZNmoSvry8qlYoOHTrg5eXFlStXmj1ZcXsqyc9h++yHMBsK2PfP F/h06mASFkQp/WUlxRxYvoCNj4/g85n/x/GNq7BUVdZ7/qrKSn78NJYvnh7NF3++j+/ee4lyk9Eq pqKigh49evDYY4811WEphg0bplzSV6vVpKSkNGh8RUUFCxYsICQkhE6dOjF16tQ6n6oghBBCtHZK kbpt2zaGDBmCvb19rQOqqqooKCigXbt2Nz05IX5hKshl3+LnCYgYwvjlOxgQHaP0HYxdACoYt/wr 7l0Yx8UjiZz68uN6z538WSxXTv3AfW9/yv1Lt6Fx1HH04yVWMWazmfT0dE6dOtVUh6TYtWsXGRkZ ZGRk4OHh0eDxr7zyComJiRw5coS0tDRcXV2ZO3duk+cphBBCNCcbgGPHjmFnZ0dYWFidAw4ePEj7 9u1xc3O76ckJ8YuS/By6jZ9OYN/hqB20OLp7AlBmLOL897vo9afnsLFVY69zJfyhpzjz9aZ6z536 
1Xp6Tp6FnYMjqFR0Gx/FhaRvrWJ0Oh2ZmZns3bu3UfmfOHECDw8P5bNt27ZGzVOTpUuX8vrrr6PT 6VCpVMydO5etW7c22fxCCCFES1Dr9XoSExOZNm1ancEZGRkcOXKEqVOnNkNqQvyX2sGR33WpfuNQ cc4lHFzaoHHUKW0u3gEU51yq17ylhgLKTEa+W/aiVbu9s0u1WHd39wZm/V/12ZPaGLm5uRgMhmrb EG4kVyGEEKI1UKelpaFSqVizZg1wbX+b0WhkyZIlREVFodVqAcjOzmbz5s08/PDD6HS62uYUotk4 eXhjNhRQbjJip3UCoOjKRZw8fKrFqmxUWCqt96raO7th5+DI8JdW4NTOu9a19Ho9Wq0WjUbTdAfQ ADY2NlRUVFi1tW3bFp1OR3x8PAEBAS2SlxBCCHEz2ERERBAdHa18Jk6ciJeXF9HR0UqBeuHCBT79 9FOlT4jWwl7nSkDEEI78610sVZWUlRTz4yfL6ThsXLVYnYcPF4/uA4uF0mL9tUaVipARD/D9qhjK SooBMOvzyc9ItRprNBoJCgpi8ODBN/uQriswMJAdO3ZgsVjIz88Hrt3Y+MQTTzBz5kz0+mvHlJOT c1PO2gohhBDNqc6H+ZeXl/PRRx9hNpv55JNPWLhwIQsXLuTf//53c+QnRJ36zXyFyvJSNj35f2x/ biI+d/ahy+gp1eK6T5hBdvJBNjw+gkPvv6G095gcTbs7uvLV85PZPOsPfPNmNCX5OVZjNRoNgYGB dOzY8aYfz/XMnz+fhIQEAgICePrpp5X2mJgYIiIi6NOnD2FhYYwdO5bs7OwWy1MIIYRoCiqLxWK5 kQnqetTNcpfqe/t+bU4dy9c1f13PapXxv+3xQgghhLg9yWtRhRBCCCFEqyNFqhBCCCGEaHWkSBVC CCGEEK2OFKlCCCGEEKLVkSJVCCGEEEK0OlKkCiGEEEKIVkeKVHHbsFiqamzfPnsSl08cuunr79+/ nyNHjtQZFxMTY/Uc1Pqqqqr5+OorOTkZb29vkpKSlLaSkhKCg4MJDg7G3t6e3bt339AaQgghRH1V K1Lj4uJYtWqVVVtWVhbr1q1j0aJFvPvuuxw4cKDZEhSiKRSc/4mv//5Ei+aQnJxMampq3YGNcPz4 ce65554a++Lj41Gr1Xh4eCifoKCganHe3t6MHz8ePz8/pc3R0ZGMjAwyMjLo3r37TcldCCGEqIn6 1z8kJydTXl5eLSgjI4OhQ4fi5+dHbm4uK1euxMfHp8ZfdEK0RmZDYYutXVpayrx584iLi6OyspKE hAQWL15MmzZtmmyN3NzcWvu7detW56tSPT09WbZsWZPlJIQQQtwIpUg1GAzs27ePUaNGVbukN2jQ IOXf7dq1w9/fH5PJ1HxZitva9tmTCLvvEdLiP8Nw+TweHbvR/6m/Y+9yrcgrvJDOic/fJy/9FGVG A749BtDn8fnY2tlj1ueTsCAKs6GAMmMRm54cCYCzlz8jFqxW1igvKWbvP57j8onD6Dx9GfSXhTj/ zl/pr6iooHfv3oSHh7N27doG5f/BBx9w6NAhzpw5g0ajYc2aNZhMJqVIzc/PZ+bMmezZs4f27dvT vn173N3dlfEpKSm88cYbHDlyhIKCAkaNGkVsbCwODg7k5OQwbNgwrl69SmFhIcHBwQB06NCBXbt2 1Su/YcOGkZ6eDsCFCxdITk6mS5cu9T6+iooKXnvtNeLi4rBYLAwYMIB//vOf8jYxIYQQN0S53L9t 2zaGDBmCvb19jYEWi4Xi4mKSkpIwmUwt+g5zcftJ/3Yrg597hwdWfY2N2o4fPnpX6Su6nEVQ/5GM eXcT45bvoPBiOj8lbATAwdWdMe9uIjJqLp6hPRgfu5PxsTutClSAHz+Npev907h/6Va0bm058fkH Vv1ms5n09HROnTrVqPxVKhUWiwW1Ws2MGTPw8fFR+qKiorCzsyMrK4utW7dy6dIlq7Fnz57lwQcf 5Pjx45w7d45Tp06xcuVK4NrZz+PHj7Ns2TIGDBigXJqvb4EKsGvXLmWch4dHg4/tlVdeITExkSNH jpCWloarqytz585t8DxCCCHEr6kBjh07hp2dHWFhYVy8eLHGwNTUVLZv347FYmHKlCmo1eoa44S4 GbrePxVtm2sFVIe7x3Iw9hWlzz/ibgDKTUYM2Zm4eAdy9cwJQhsw/11TnqVt+2sjgvrdw09fb7Tq 1+l0ZGZm4ujo2ODcp0+fzo8//khQUBBRUVHMmTMHV1dXAAoKCti8eTO5ubnY29vj4eHB8OHD+fnn n5XxY8eOBaCoqIi0tDQ6duzI4cOHG5TDiRMnrArQNWvWMHr06AYfS02WLl3Kzp070el0AMydO5de vXqxdOnSJplfCCHE7Umt1+tJTExk2rRptQaGhoYSGhpKfn4+GzdupF+/fnTt2rWZ0hTiv9z8O1Ba rFd+LsnP4fAHb1JuKqHtHWGobGypKClu0Jw2v/qjS+vWlsrysmoxv74E3xAajYZVq1bxzDPP8NZb bxESEkJ8fDx33nmncvaytv2ply5dIjo6muLiYnr16oVarUav1183vib12ZPaGLm5uRgMBh577DGr 9sZ+V0IIIcQv1GlpaahUKtasWQNc219mNBpZsmQJUVFRaLVaqwHu7u6Eh4dz+vRpKVJFiyi6nIXO 01f5OXHRbELvnUxg3+HAta0BWYe+sRpja6ehtOjGbp7S6/VotVo0Gk2jxoeFhbFu3Tqee+45Vq5c yfLly/H09CQ/Px+z2YyDg0ON4yZNmkR0dDQTJkwAYN26dWzevNkqxsHBgby8vEblVV82NjZUVFRY tbVt2xadTkd8fDwBAQE3dX0hhBC3F5uIiAiio6OVz8SJE/Hy8iI6OhqtVovJZGLDhg3KL8CCggJS UlLw9fWtY2ohmk7mgQQqy0spKykm+bMVdBx6v9JXfPUyKptr26sNl8+TlrCh2ng3/w4UZp3FmHsZ ALOhoEHrG41GgoKCGDx4cINzj46OZsWKFWRnZ3Pu3DkOHz5Mp06dAPDz86N79+689tprWCwWzp49 y/r1663Gnz9/HltbWwDOnDmj7Ef9tbCwME6ePElWVhYAV69ebXCedQkMDGTHjh1YLBby8/OBa3tt n3jiCWbOnKmc3c3JybkpZ22FEELcXup8mL9WqyUkJIQtW7awaNEi1qxZQ0BAAH369GmO/IQAQG3v wJezH2bLrLF4dA4nbPQUpS9y+guc+OIDtjxzP8fi3iNkxMRq43WevvScHM3O+Y+yOXos+xa/QFVl Zb3X12g0BAYGNuqGwVmzZnHs2DEiIyMZO3YsU6ZMsXpYf1xcHImJifj5+TFr1iymTJliNX7p0qW8 +eabdO3alRdffJEnnqj+vNfg4GBef/11Bg4cSGhoKI888ki1s543av78+SQkJBAQEGCVf0xMDBER EfTp04ewsDDGjh1LdnZ2k64thBDi9qOyWCyWG5mgqKio1v7lLi619s+pY/m65q/rMTcy/rc9Hq49 
guquKc/i3S2iztjW7L333sPNzY3Jkye3dCpCCCFEqye36IvfiBv6W6pV8Pb2Vu6AF0IIIUTtpEgV opmMGzeupVMQQgghfjOkSBWt3n0L41o6BSGEEEI0szpvnBJCCCGEEKK5SZEqhBBCCCFaHSlShRBC CCFEqyNFqvjNMOZeJuGVGXw27W62/vUBsn88YNWfn5nGZ9OGkHv2pNJWUWpm05Mj2fTkSD568C4u nzjU3Gkr9u/fz5EjR+qMi4mJsXoOaX1VVVU1Ji1FcnIy3t7eJCUlKW0lJSUEBwcTHByMvb09u3fv vqE1hBBCiPqqVqTGxcWxatWq6w6oq1+Im+WHdYtw9Q1i/Iqd3PvGv/EM7WnV79jGg8C+w3Bq66W0 qe0dGB+7k/GxO2kT2Km5U7aSnJxMamrqTZn7+PHj3HPPPTX2xcfHo1ar8fDwUD5BQUHV4ry9vRk/ fjx+fn5Km6OjIxkZGWRkZNC9e/ebkrsQQghRE6u7+5OTkykvL79ucF39QtxMBed/ov+fX8XWzr7G fgdXdyKnz23mrOpWWlrKvHnziIuLo7KykoSEBBYvXkybNm2abI3c3Nxa+7t161bnq0o9PT1ZtmxZ k+UkhBBC3AjlTKrBYGDfvn3079+/xsC6+oW4WX741yK+eHoMhp+z+PbtZ9n05EgSFkQp/QkLopRL +v96IJzCC+kNmr+qspIfP43li6dH88Wf7+O7916i3GS0iqmoqKBHjx489thjDc7/gw8+4NChQ5w5 c4aLFy/Sv39/TCaT0p+fn89DDz2El5cXffv25dSpU1bjU1JSeOSRRwgNDeV3v/sdjz32GGazGYCc nBy6d+/O5MmT2b9/v3JpftiwYfXOb9iwYco4tVpNSkpKg46voqKCBQsWEBISQqdOnZg6dWqdbxoT Qggh6qIUqdu2bWPIkCHY29d8lqqufiFull5//Av3L92KzsOXofOWMz52JyMWrFb6RyxYrVzSd3Bx b/D8yZ/FcuXUD9z39qfcv3QbGkcdRz9eYhVjNptJT0+vVkDWl0qlwmKxoFarmTFjBj4+PkpfVFQU dnZ2ZGVlsXXrVi5dumQ19uzZszz44IMcP36cc+fOcerUKVauXAlcO/t5/Phxli1bxoABA5RL87t2 7ap3brt27VLGeXh4NPjYXnnlFRITEzly5AhpaWm4uroyd27rO6MthBDit8UG4NixY9jZ2REWFlZj UF39QvyWpX61np6TZ2Hn4AgqFd3GR3Eh6VurGJ1OR2ZmJnv37m3w/NOnT6dz584EBQUxd+5c9Hq9 0ldQUMDmzZtZsmQJ9vb2eHh4MHz4cKvxY8eOZfTo0ZjNZk6dOkXHjh05fPhwg3I4ceKE1Z7Ubdu2 Nfg4rmfp0qW8/vrr6HQ6VCoVc+fOZevWrU02vxBCiNuTWq/Xk5iYyLRp02oMqKtfiN+yUkMBZSYj 3y170ard3tmlWqy7e8PP0gJoNBpWrVrFM888w1tvvUVISAjx8fHceeedytnL2vanXrp0iejoaIqL i+nVqxdqtdqq0K2P+uxJbYzc3FwMBkO1bRCN/a6EEEKIX6jT0tJQqVSsWbMGuLa/zGg0smTJEqKi oqirX4jfCpWNCktlpVWbvbMbdg6ODH9pBU7tvGsdr9fr0Wq1aDSaRq0fFhbGunXreO6551i5ciXL ly/H09OT/Px8zGYzDg4ONY6bNGkS0dHRTJgwAYB169axefNmqxgHBwfy8vIalVd92djYUFFRYdXW tm1bdDod8fHxBAQE3NT1hRBC3F5sIiIiiI6OVj4TJ07Ey8uL6OhotFotdfUL8Vuh8/Dh4tF9YLFQ WvyfM5EqFSEjHuD7VTGUlRQDYNbnk59h/agoo9FIUFAQgwcPbvC60dHRrFixguzsbM6dO8fhw4fp 1Ona47D8/Pzo3r07r732GhaLhbNnz7J+/Xqr8efPn8fW1haAM2fOKPtRfy0sLIyTJ0+SlZUFwNWr VxucZ10CAwPZsWMHFouF/Px84Npe2yeeeIKZM2cqZ3dzcnJuyllbIYQQtxd5mL+4bXSfMIPs5INs eHwEh95/Q2nvMTmadnd05avnJ7N51h/45s1oSvJzrMZqNBoCAwPp2LFjg9edNWsWx44dIzIykrFj xzJlyhSrh/XHxcWRmJiIn58fs2bNYsqUKVbjly5dyptvvknXrl158cUXeeKJJ6qtERwczOuvv87A gQMJDQ3lkUceqXbW80bNnz+fhIQEAgICrPKPiYkhIiKCPn36EBYWxtixY8nOzm7StYUQQtx+VBaL xXIjE9T1qJnlLtX39v3anDqWr2t+Z2dnGX8Lj7+VvPfee7i5uTF58uSWTkUIIYRo9dR1hwghmoK3 tzc6na6l0xBCCCF+E6RIFaKZjBs3rqVTEEIIIX4zZE+qEEIIIYRodeRMqripbqc9p0IIIYRoOnIm VQghhBBCtDpSpAohhBBCiFZHilQhfiMslqqWTkEIIYRoNtX2pMbFxVFUVMSMGTOUtuTkZLZu3Yqd nZ3SNnr0aLp06dI8WYrfrNKiQj55dCB9Zswj5J4HAdi/ZB45acmMe297C2f321Hw/+3de1BUV57A 8W9DQ9PSAoJAEKFBJQoqEYKoo5MYokbjOIxi1AxaGaM4mq00MymKRGIm6kaTGGp11PgcfOyuYSer jovRqFEnojFRoyyorIw8hAhGRaB52Tx7/2C8sQNC4wNRf5+qrqLu+f3O/d0rlKdPn3s6/x+c2pLI mPc3POxShBBCiA5hMUhNT0+nrq6uWZDJZCIsLIxx48Z1WGHi8aHp6sKl4wfo+9JUGuvruH7x7MMu 6ZFjKi972CUIIYQQHUoZpJaXl3P06FHGjRvHoUOHLIJu3ryJo6NjhxcnHg/2jk6YjCWYjCUUZ5/D 2duPsh9ylfbGhgYytm8g79heMJvxCAwl/PW3sdM2/c6V/ZDD2Z1/4UZOJrVV5XiHjGDo7xdga6cB oLrkGsfXLsT4Qw42dva49Qoi5Ldv0tWzJwBbo4KZtiUVTVcXANKSV1NvqmbwzHgl//BHBkYtWMup zcsoSv+Wbr4BjFm4sc36fjz/PZkpW6mvuUnl9SuEz4znRNJHOPXQM/q9dVZd3xfxrzJs7p8497ck rpw9ic7Dm+ffWkbXp71JRsEAAA8ySURBVHwwGUs4sDAGU3kptVUV7Jg3FoCunj5KfQD19fUMHjyY QYMGsXnz5gf2bymEEEJ0FGVN6u7du4mIiECj0TQLMplMFBQUkJyczLZt2zhz5kyHFikebfWmavRD R1Fw4hD5335Fj0HDLdrTP1/L1czv+dUnf2Xiqt3Yd9FxZttKpb3iSgF+w8fy6+U7mLTmS8ou5/CP A9tvy1+HzsObqLX7mLgyBf/hY5UBoLVulhZzdMU7+IZHELXmS0YYllhdX1H6t4TPmk/3gIGc/dsm fvXxZxRnn6Oq+Eer8gG+XbuQARNnMXFVCloXN87uTALAwdmVXy/fwZCYBDwCQ4hau4+otfssBqjQ 
9Deak5NDZmZmu65bCCGE6KxsANLS0rCzsyMoKKjFoP79+xMeHk5UVBQREREcO3aMtLS0Di1UPLoa 6mrwf248BScPU3IpC4++z1i0X9j7GaHRsdg5dAGVioFRMfxw6mul3Sf8BXzCnqehtgbj5RycvPQW Swa6uHlyNfM0P2aeprGxAZ/wF3Bw6tauGqtLrjEwajb6YaNRO2jp4uphdX3OPf1x8emNk5ce79AR aJy64djdi4qrl63KBwj5rQG3XoFourrg94uXMBbmtat+nU7HpUuXOHLkSLvyhBBCiM5KbTQaSU1N ZdasWXcM8vHxUX728vJi+PDhXLhwgZCQkI6oUTwGnHv4YSovxTtkBKhUyvGa8lJqb1bxzer3LOI1 XZ2Un6tLrnEy6SPqblbj1icIlY0t9dWVSntw1Gw0OmfO/OcKjEWX8Al7ntDoWIuBZlvUDl14qv/g Zsetqe+W2y5L+dnafBv1T8vDtS5uNNTVWl37La6uru3OEUIIITordVZWFiqVik2bNgFNa9uqqqpY uXIlMTExaLXaZkkqlQobG9m9SrTP828tw17nrHwMDk0PVdk5dGH0n9bh2N2rxbzUf4sncHw0+mGj Acj5OoWCE4eVdpWNLf3GTaPfuGnUVBo5sXEp367/V16cvwoAG7UdpvJSZU1qY33zhwPvxJr6HmT+ LbZ29tRUtP7wlNFoRKvVYm9vf9fnEUIIIToLm/DwcAwGg/KaMmUKnp6eGAwGtFotVVVVbN++ndLS UgDKysr45ptvCAwMfMili0dN16d80eicLQ+qVPQd8wrfbVhC7T9nR03GEkryLighldevoPrnm6Ly K/lkHfhviy7OfLaSsh9yANA4OuHcsxeYzUq7Uw89OV/vpqGuhsvfHyE3tR1bX1lR3wPN/ycXn96U FWRTVXylqY/yUov2qqoq/Pz8GDlyZLv6FUIIITqrZvuk/pyjoyN9+vRh586dVFRUYGNjw5AhQwgO Du6I+sQTICTawNkdf2HvO9GgUmHfRUfw5Dm4+vcDYMjs+WTs2EBa8mpcfAPoO2YKBSd+2oHCvc9A Tm1eRuX1K5gbG3Hq4cvQmAVKe/jMeI6vXUjO1ynoh40mNDq2XYPEtup70PkAOg9vQqMN7FvwO2zt HXDs7sWL736Kja0tAPb29uj1egICAqzuUwghhOjMVGbzbVNOd6GioqLV9jVOzdfu3e7tNk7fVv9d u3aV/E6cL4QQQghxN2RhqRBCCCGE6HRkkCqEEEIIITodGaQKIYQQQohOp80Hp4S4F7KmVQghhBB3 Q2ZShRBCCCFEpyODVCGEEEII0enIIFU8Mczmxke6/7YcO3aM06dPtxm3ZMkS3nzzzXb339h4b9eX np6Ol5cXp06dUo5VV1fj7++Pv78/Go2GQ4cOtdLD4+Fu778QQjxpmg1Sk5OT2bBhQ7PA7Oxs1q9f T2JiIhs3biQnJ6dDChTifijN/wdfLZ77yPZvjfT0dC5caN83WVkrIyODl156qcW2/fv3o1arcXd3 V15+fn7N4ry8vIiKiqJnz57KsS5dupCXl0deXt5df0HIhAkTcHd3R6/X4+bmxvjx48nPz7+rvoQQ QnQeFg9OpaenU1fX/HvNi4qK2LNnD9OmTcPT05MbN25QU1PTYUUKca9M5a1/731n7x/g3LlzdOvW DW9vb4vjNTU1vPvuuyQnJ9PQ0MCBAwdYsWIF3bp1u2/nLi4ubrV94MCBpKWltRrj4eHB6tWr71tN t0tMTOS1116jvLycpUuXMm/ePPbu3ftAziWEEKJjKDOp5eXlHD16lOHDhzcLSk1NJSIiAk9PTwDc 3Nzo0aNHx1Upnmi11ZUcX7OQ7b8fw843XiZj+wbMjQ1K+9aoYGoqfhokpiWv5tTmZQCYjCWk/DGK oyve4dr/pbFj3lh2zBvLgYUxSvwX8a+Sm7qHL999jb++PpLDH75JTXnpfesfoL6+npCQEGbOnHnX 92Hbtm189913zY4nJSVx4sQJLl68yOXLlxk+fDg3b95U2ktKSpQ3mMOGDSMzM9Mi//z580yfPp3A wECeeuopZs6ciclkAuDatWsEBwcTHR3NsWPHlI/mR40aZXXdo0aNUvLUajXnz59v13XX19ezcOFC +vbty9NPP83rr79+x10jnJycmDx5ssU1WpOfnJxMSEgI3t7ePPvss6SkpChtRqOR2bNn4+vrS+/e vfnggw9oaPjp96+wsJCwsDCuX7/O9OnT8fT0tLg/bd1/IYQQLVMGqbt37yYiIgKNRtMs6OrVq7i4 uLBnzx62bNnCoUOHqK2t7dBCxZPr27ULQQWT1uxl/LJkLp9OJXPPNqtyHZxd+fXyHQyJScAjMISo tfuIWruPMQs3WsTlfJ3CyLhEXtnwFTZqO77/j+X3tX+TyUROTs4DG6CoVCrMZjNqtZo5c+ZYvImM iYnBzs6OgoICUlJSKCwstMjNzs5m6tSpZGRkkJubS2ZmJuvXrweaZj8zMjJYvXo1I0aMUD6aP3jw oNW1HTx4UMlzd3dv97UtWrSI1NRUTp8+TVZWFs7OziQkJLQYW1xcTFJSEqGhoVbnf/7558yfP5/N mzdTWFjItm3bqK6uVtpjYmJQqVTk5ORw6tQp9uzZw5///GeL8/74449ER0fzm9/8htzcXLZu3WqR 39r9F0II0TIbgLS0NOzs7AgKCmoxqKKigsOHDxMaGsrUqVO5cePGE/GAg3j4aqsqyP/uIGGvxWFj q0ajc2bQtH/h4lc77ut5Bkx8HW03d2zUdvR+IZLCM8fua/86nY5Lly5x5MiRdueOHTuWwYMHs2XL FuLj4xk8eDBxcXFK++zZs+nXrx9+fn4kJCRgNBqVttLSUnbt2sXKlSvRaDS4u7szevRoi/4jIyOZ MGECJpOJzMxMAgICOHnyZLtqPHv2rMWa1N27d7f7Ou9k1apVLF26FJ1Oh0qlIiEhwWKmEyAuLg69 Xo+Hhwf19fVs3LjR6vzly5fz4YcfMmjQIAD69evHtGnTACgrK2Pnzp0kJiZiZ2eHq6srixcvtugf mmZTExISmDx5Mo6OjsqSDGvuvxBCiJapjUYjqampzJo1645Bjo6OREZG4uLiAsCQIUOa/SchxINQ ea0QB6du2HfRKcecvHypvPbgZqNcfHpTU2lsO7CdXF1d7ypv3759AMyfP5+wsDCioqIs2u3t7dmw YQN/+MMf+Pjjj+nbty/79+/nmWeeUWYvW1ufWlhYiMFgoLKykrCwMNRqtcVA1xrWrEm9G8XFxZSX lzdbJvHze5mYmMj06dMZMGAAY8aMwc3Nzer8ixcv0r9//xbPn5eXR/fu3XF2dlaO9enTh7y8PIs4 nU7HyJEjW8xv6/4LIYRomTorKwuVSsWmTZuApvVbVVVVrFy5kpiYGLRaLe7u7hQXFyuDVJ1O11qf Qtw3ju5emMpLqbtZhZ3WEYCKq5dxdP/p42wbtR2m8lI0XZt+Pxvrmz/8Z2tnb7GutDUVVwrQefz0 
cNL96t9oNKLVarG3t7eqjvYKCgpi69atxMXFsX79etasWYOHhwclJSWYTCYcHBxazHv11VcxGAxM njwZgK1bt7Jr1y6LGAcHB27cuPFA6r7FxsaG+vp6i2Nubm7odDr279+Pr69vq/m2trYsWrSIBQsW MHHiRNRqtVX5fn5+ZGVltbi7gF6vp7i4mIqKCuXb0XJzc1vcvaAl1tx/IYQQLbMJDw/HYDAorylT puDp6YnBYECr1QJNM6d///vfMZlMmM1mjh8/ztNPP/2QSxdPAo3OGd/wCE7/+3LMjQ3UVlfyv/+1 hoBRk5QYpx56cr7eTUNdDZe/P0Ju6hfN+nHx6U1ZQTZVxVcAMN32YBTApeMHaKiroba6kvTP1xHw 4sT72n9VVRV+fn4tzrZZ65e//GWLf3cGg4F169ZRVFREbm4uJ0+eVOJ69uxJcHAwH3zwAWazmezs bD777DOL/Pz8fGxtbYGmWcVb61FvFxQUxLlz5ygoKADg+vXrd30dd6LX6/nyyy8xm82UlJQATWtt 586dyxtvvKHM7l67du2Os7avvPIKWq2WLVu2WJ0/b948EhISlO278vPz+eSTT4CmGdfIyEji4+Np aGjAaDTy/vvvt/rJ0+2suf9CCCFaZtVm/gEBAYSGhrJp0yZlC5mIiIgHWpgQt/zijUU01NWwY97L fBE3hR7PDKX/hBlKe/jMePKO7WXnG+MpyviO0OjYZn3oPLwJjTawb8Hv2GWI5OiK+TTe9oS2WuPA nvjf8j+xkbj3G0TQfe7f3t4evV5PQEDAXd+Hl19+mYEDBzY7HhsbS1paGkOGDCEyMpIZM2ZYbBaf nJxMamoqPXv2JDY2lhkzZljkr1q1io8++ogBAwbw3nvvMXdu8/1e/f39Wbp0Kc899xyBgYFMnz69 2aznvVqwYAEHDhzA19fXov4lS5YQHh7O0KFDCQoKIjIykqKiohb7UKlULF68mEWLFik7FLSVP2vW LOLi4pg0aRJ6vZ6oqCh69eqltCclJWEymejVqxehoaGMHj2at956y+rrauv+CyGEaJnKbDab76WD O20Fc8saJ6dW299u4/Rt9X/rIzjJfzzzO8IX8a/y7Iw/4jUw/GGXck8+/fRTXFxciI6OftilCCGE EPdM3XaIEE+Ce3qv1il4eXnJenEhhBCPDRmkCvGYmDRpUttBQgghxCNCBqniiferZckPuwQhhBBC /MwDX5PaGdYcCiGEEEKIR4tVT/cLIYQQQgjRkWSQKoQQQgghOh0ZpAohhBBCiE6n2YNTycnJVFRU MGfOHKDpm3JWrVplEdPQ0IBOpyM2tvmm5kIIIYQQQtyr/wcr3UlLfH/DGgAAAABJRU5ErkJggg== " + id="image817" + x="2.5102806" + y="0.015830245" /><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect517" + width="73.620293" + height="15.372134" + x="2.1647058" + y="5.0289979" /><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="99.639122" + y="9.133729" + id="text1127"><tspan + sodipodi:role="line" + id="tspan1125" + style="font-weight:bold;stroke-width:0.264583" + x="99.639122" + y="9.133729">Converter-Node on level 0 (root):</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="99.639122" + y="13.543442" + id="tspan1129">It matches folders with name "ExperimentalData"</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="99.639122" + y="17.953154" + id="tspan1131">that are located in the crawler root folder.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="110.16309" + y="38.105129" + id="text1127-35"><tspan + sodipodi:role="line" + id="tspan1125-6" + style="font-weight:bold;stroke-width:0.264583" + x="110.16309" + y="38.105129">Converter-Node on level 1:</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="110.16309" + y="42.514843" + id="tspan1131-9">It matches folders that have a name matching the given</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="110.16309" + y="46.924553" + id="tspan1286">regular expression (e.g. "2022_TestData"). 
All subfolders</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="110.16309" + y="51.334267" + id="tspan2018">in "ExperimentalData" are considered.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="87.131874" + y="58.891338" + id="text1127-35-1"><tspan + sodipodi:role="line" + id="tspan1125-6-2" + style="font-weight:bold;stroke-width:0.264583" + x="87.131874" + y="58.891338">Create a "Project" record:</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="87.131874" + y="63.301052" + id="tspan2018-9">For each matching folder on level 1, a CaosDB record is created.</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="87.131874" + y="67.710762" + id="tspan2080">This record has one parent (name "Project") and two properties "date"</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="87.131874" + y="72.120476" + id="tspan2082">and "identifier". The two respective values are taken from the matched</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="87.131874" + y="76.53019" + id="tspan2084">regular expression. The dollar-signs indicate that the two variables,</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="87.131874" + y="80.939903" + id="tspan2086">which are created by the regular expression, are substituted.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="66.51297" + y="91.64978" + id="text1127-35-1-3"><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="66.51297" + y="91.64978" + id="tspan2086-1"><tspan + style="font-weight:bold" + id="tspan2175">Level 2 node "measurement"</tspan> uses a more complex regular expression</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="66.51297" + y="96.059494" + id="tspan2173">to match subfolders of the project folder.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="94.849007" + y="123.55322" + id="text1127-35-1-3-8"><tspan + sodipodi:role="line" + style="font-weight:bold;stroke-width:0.264583" + x="94.849007" + y="123.55322" + id="tspan2173-2">Measurement record:</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.849007" + y="127.96294" + id="tspan2278">As no parents are given, the parent is automatically set to the name,</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.849007" + y="132.37265" + id="tspan2280">in this case "Measurement". 
Apart from the two properties stemming</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.849007" + y="136.78236" + id="tspan2282">from the regexp (date and identifier), a reference property is created</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.849007" + y="141.19208" + id="tspan2284">that creates a link to the Project record that was created earlier.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="94.862091" + y="162.61325" + id="text1127-35-1-3-8-2"><tspan + sodipodi:role="line" + style="font-weight:bold;stroke-width:0.264583" + x="94.862091" + y="162.61325" + id="tspan2173-2-3">"dat"-Files on level 3:</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.862091" + y="167.02296" + id="tspan2284-2">Here, files are matched that end in ".dat".</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="94.639061" + y="185.28622" + id="text1127-35-1-3-8-2-2"><tspan + sodipodi:role="line" + style="font-weight:bold;stroke-width:0.264583" + x="94.639061" + y="185.28622" + id="tspan2173-2-3-8">File records:</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.639061" + y="189.69594" + id="tspan2284-2-9">For each of the matched files create a file entity in CaosDB</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="94.639061" + y="194.10565" + id="tspan3189">and set the path accordingly.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="107.82945" + y="202.48438" + id="text1127-35-1-3-8-2-2-7"><tspan + sodipodi:role="line" + style="font-weight:bold;stroke-width:0.264583" + x="107.82945" + y="202.48438" + id="tspan2173-2-3-8-3">File properties:</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="107.82945" + y="206.89409" + id="tspan3189-1">The file is added as a reference property (called "output")</tspan><tspan + sodipodi:role="line" + style="font-weight:normal;stroke-width:0.264583" + x="107.82945" + y="211.3038" + id="tspan3238">to the Measurement record.</tspan></text><text + xml:space="preserve" + style="font-size:3.52777px;line-height:1.25;font-family:sans-serif;stroke-width:0.264583" + x="45.830696" + y="27.973518" + id="text1127-3"><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="45.830696" + y="27.973518" + id="tspan1131-5"><tspan + id="tspan1234" + style="font-weight:bold;stroke-width:0.264583">"subtree" </tspan>provides access to child-StructureElements,</tspan><tspan + sodipodi:role="line" + style="stroke-width:0.264583" + x="45.830696" + y="32.383232" + id="tspan1237">in this case the children of a directory converter are subdirectories.</tspan></text><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect1133" + width="32.761078" + height="4.9509659" + x="2.1495595" + y="27.815798" /><rect + style="fill:none;stroke:#ff0000;stroke-width:0.999996;stroke-linejoin:round;stroke-dasharray:none" + id="rect2020" + width="105.07613" + height="13.460022" + x="2.0784345" + y="37.222599" /><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect2022" + 
width="82.460075" + height="26.69338" + x="1.8999104" + y="55.421516" /><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect2088" + width="185.95239" + height="15.581697" + x="1.5306897" + y="99.948242" /><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect2227" + width="90.042404" + height="18.340836" + x="1.019053" + y="122.36252" /><rect + style="fill:none;stroke:#ff0000;stroke-width:0.999996;stroke-linejoin:round;stroke-dasharray:none" + id="rect2227-0" + width="88.036903" + height="14.435826" + x="1.6963811" + y="157.79872" /><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect3146" + width="88.402161" + height="17.550062" + x="1.8143678" + y="181.32809" /><rect + style="fill:none;stroke:#ff0000;stroke-width:1;stroke-linejoin:round;stroke-dasharray:none" + id="rect3148" + width="102.97673" + height="5.1952801" + x="1.9082112" + y="203.22098" /></g></svg> diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst index 1652515968c3b0025a2916604632d57c042f119b..412d29f01018f05b84e0fe8e43fa631b61b91d04 100644 --- a/src/doc/tutorials/index.rst +++ b/src/doc/tutorials/index.rst @@ -1,2 +1,12 @@ Tutorials +++++++++ + +This chapter contains a collection of tutorials. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + Parameter File<parameterfile> + Scientific Data Folder<scifolder> + Single Structured File <single_file> diff --git a/src/doc/tutorials/parameterfile.rst b/src/doc/tutorials/parameterfile.rst new file mode 100644 index 0000000000000000000000000000000000000000..2442969541eebf9a4e058b797b48995b39372a3e --- /dev/null +++ b/src/doc/tutorials/parameterfile.rst @@ -0,0 +1,157 @@ +Tutorial: Parameter File +======================== + +Our data +-------- + +In the "HelloWorld" Example, the Record, that was synchronized with the +server, was created "manually" using the Python client. Now, we want to +have a look at how the Crawler can be told to do this for us. + +The Crawler needs instructions on what kind of Records it should +create given the data that it sees. This is done using so called +"CFood" YAML files. + +Let’s once again start with something simple. A common scenario is that we +want to insert the contents of a parameter file. Suppose the +parameter file is named ``params_2022-02-02.json`` and looks like the +following: + + +.. code-block:: json + :caption: params_2022-02-02.json + + { + "frequency": 0.5, + "resolution": 0.01 + } + +Suppose these are two Properties of an Experiment and the date in the file name +is the date of the Experiment. Thus, the data model could be described in a +``model.yml`` like this: + +.. code-block:: yaml + :caption: model.yml + + Experiment: + recommended_properties: + frequency: + datatype: DOUBLE + resolution: + datatype: DOUBLE + date: + datatype: DATETIME + +We will identify experiments solely using the date, so the ``identifiable.yml`` is: + +.. code-block:: yaml + :caption: identifiable.yml + + Experiment: + - date + + +Getting started with the CFood +------------------------------ + +CFoods (Crawler configurations) can be stored in YAML files: +The following section in a `cfood.yml` tells the crawler that the key value pair +``frequency: 0.5`` shall be used to set the Property "frequency" of an +"Experiment" Record: + +.. code:: yaml + + ... 
+   my_frequency:                  # just the name of this section
+     type: FloatElement           # it is a float value
+     match_name: ^frequency$      # regular expression: match the "frequency" key of the JSON data
+     match_value: ^(?P<value>.*)$  # regular expression: we match any value of that key
+     records:
+       Experiment:
+         frequency: $value
+   ...
+
+The first part of this section defines which kind of data element shall be handled
+(here: a key-value pair with the key "frequency" and a float value); we then use
+this to set the "frequency" Property.
+
+How is the value actually assigned? Let's look at what the
+regular expressions do:
+
+- ``^frequency$`` ensures that the key is exactly "frequency". "^" matches the
+  beginning of the string and "$" matches the end.
+- ``^(?P<value>.*)$`` creates a *named match group* with the name "value"; the
+  pattern of this group is ".*". The dot matches any character and the star means
+  that it may occur zero or more times. Thus, this regular expression
+  matches anything and puts it into a group named ``value``.
+
+We can reuse the groups from the regular expressions that are used for matching.
+In our example, we use the "value" group to assign the "frequency" value to the "Experiment".
+
+.. note::
+
+   For more information on the ``cfood.yml`` specification, read on in the chapter :ref:`Converters`.
+
+A fully grown CFood
+-------------------
+
+Since we will not pass this key-value pair on its own to the crawler, we need
+to embed it into its context. The full CFood file ``cfood.yml`` for
+this example might look like the following:
+
+.. code-block:: yaml
+   :caption: cfood.yml
+
+   ---
+   metadata:
+     crawler-version: 0.5.0
+   ---
+   directory:              # corresponds to the directory given to the crawler
+     type: Directory
+     match: .*             # we do not care how it is named here
+     subtree:
+       parameterfile:      # corresponds to our parameter file
+         type: JSONFile
+         match: params_(?P<date>\d+-\d+-\d+)\.json  # extract the date from the parameter file
+         records:
+           Experiment:     # one Experiment is associated with the file
+             date: $date   # the date is taken from the file name
+         subtree:
+           dict:           # the JSON contains a dictionary
+             type: Dict
+             match: .*     # the dictionary does not have a meaningful name
+             subtree:
+               my_frequency:  # here we parse the frequency ...
+                 type: FloatElement
+                 match_name: frequency
+                 match_value: (?P<val>.*)
+                 records:
+                   Experiment:
+                     frequency: $val
+               resolution:    # ... and here the resolution
+                 type: FloatElement
+                 match_name: resolution
+                 match_value: (?P<val>.*)
+                 records:
+                   Experiment:
+                     resolution: $val
+
+You do not need to understand every aspect of this right now; we will cover
+it later in greater depth. You might think: "Oh, this is lengthy." Yes, but
+this very generic approach allows data integration from ANY hierarchical data
+structure (directory trees, JSON, YAML, HDF5, DICOM, ... and combinations of
+those!), and as you will see in later chapters, there are ways to write this
+in a more condensed form.
+
+For now, we want to see it running!
+
+The crawler can now be run with the following command (assuming that
+the CFood file is in the current working directory):
+
+.. code:: sh
+
+   caosdb-crawler -s update -i identifiables.yml cfood.yml .
+
+.. note::
+
+   ``caosdb-crawler`` currently only works with CFoods that have a directory as their top-level element.
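+
+Testing the regular expressions
+-------------------------------
+
+If you are unsure what one of the regular expressions in a CFood matches, it can
+help to try it in plain Python first, independently of the crawler. Here is a
+minimal sketch using the two expressions from the ``my_frequency`` section above
+(the variable names are only for illustration):
+
+.. code-block:: python
+
+   import re
+
+   # The patterns from the "my_frequency" section of the CFood:
+   key_pattern = re.compile(r"^frequency$")
+   value_pattern = re.compile(r"^(?P<value>.*)$")
+
+   assert key_pattern.match("frequency") is not None   # the exact key matches
+   assert key_pattern.match("frequency2") is None      # anything else does not
+
+   # The named group "value" captures the matched text; the crawler then
+   # substitutes it for $value in the "records" section.
+   match = value_pattern.match("0.5")
+   print(match.group("value"))  # prints: 0.5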
diff --git a/src/doc/tutorials/scifolder.rst b/src/doc/tutorials/scifolder.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1fd7d2ba14d30631e51cd1b22a2a87c0c8b2be8a
--- /dev/null
+++ b/src/doc/tutorials/scifolder.rst
@@ -0,0 +1,103 @@
+Scientific Folder Structure
+===========================
+
+The SciFolder structure
+-----------------------
+
+Let's walk through a more elaborate example of using the CaosDB Crawler,
+this time making use of a simple directory structure. We assume
+the directory structure to have the following form:
+
+.. code-block:: text
+
+   ExperimentalData/
+
+     2022_ProjectA/
+
+       2022-02-17_TestDataset/
+         file1.dat
+         file2.dat
+         ...
+       ...
+
+     2023_ProjectB/
+       ...
+     ...
+
+This file structure is described in our article "Guidelines for a Standardized
+Filesystem Layout for Scientific Data" (https://doi.org/10.3390/data5020043).
+As a simplified example, we want to write a crawler that creates "Project" and
+"Measurement" records in CaosDB and sets some reasonable properties stemming
+from the file and directory names. Furthermore, we want to link the data files
+to the Measurement records.
+
+Let's first clarify the terms we are using:
+
+.. code-block:: text
+
+   ExperimentalData/             <--- Category level (level 0)
+     2022_ProjectA/              <--- Project level (level 1)
+       2022-02-17_TestDataset/   <--- Activity / Measurement level (level 2)
+         file1.dat               <--- Files on level 3
+         file2.dat
+         ...
+       ...
+     2023_ProjectB/              <--- Project level (level 1)
+       ...
+     ...
+
+So we can see that this follows the three-level folder structure described in the paper.
+We use the term "Activity level" here, instead of the terms used in the article, as
+it can be used in a more general way.
+
+A CFood for SciFolder
+---------------------
+
+The following image shows a YAML CFood that matches this structure and inserts /
+updates the records accordingly, together with a detailed explanation of the
+YAML definitions:
+
+.. image:: example_crawler.svg
+
+
+See for yourself
+----------------
+
+If you want to try this out for yourself, you will need the following content:
+
+- Data files in a SciFolder structure.
+- A data model which describes the data.
+- An identifiables definition which describes how data Entities can be identified.
+- A CFood definition which the crawler uses to map from the folder structure to entities in CaosDB.
+
+You can download all the necessary files, packed in `scifolder_tutorial.tar.gz
+<../_static/assets/scifolder_tutorial.tar.gz>`__. After storing this archive file, unpack it and go
+into the ``scifolder`` directory, then follow these steps:
+
+.. role:: shell(code)
+   :language: shell
+
+1. Copy the data files folder to the ``extroot`` directory of your LinkAhead installation:
+
+   :shell:`cp -r scifolder_data ../../<your_extroot>/`.
+2. Load the content of the data folder into CaosDB:
+
+   :shell:`python -m caosadvancedtools.loadFiles /opt/caosdb/mnt/extroot/scifolder_data`.
+
+   The path given to ``loadFiles`` is the one that the CaosDB server sees, which is not
+   necessarily the same as the one on your local machine. The prefix ``/opt/caosdb/mnt/extroot/``
+   is correct for all LinkAhead instances. If you are in doubt, please ask your administrator
+   for the correct path.
+
+   For more information on ``loadFiles``, call :shell:`python -m caosadvancedtools.loadFiles --help`.
+
+   .. note::
+
+      If the Records that are created shall reference CaosDB File Entities, you
+      (currently) need to make those files accessible in CaosDB in advance.
+      For example, if you have a folder with experimental data files and you want
+      those files to be referenced (for example by an Experiment Record), the files
+      must be loaded into CaosDB before the crawler creates those references.
+3. Teach the server about the data model:
+
+   :shell:`python -m caosadvancedtools.models.parser model.yml --sync`
+4. Run the crawler on the local ``scifolder_data`` folder, using the identifiables and CFood
+   definition files:
+
+   :shell:`caosdb-crawler -s update -i identifiables.yml scifolder_cfood.yml scifolder_data`
+
diff --git a/src/doc/tutorials/single_file.rst b/src/doc/tutorials/single_file.rst
new file mode 100644
index 0000000000000000000000000000000000000000..824a658985b9375e140df7fb63a1fc9e7f6a7563
--- /dev/null
+++ b/src/doc/tutorials/single_file.rst
@@ -0,0 +1,222 @@
+Tutorial: Single structured file
+================================
+
+In this tutorial, we will create a crawler that reads a single structured file,
+such as a CSV file.
+
+Declarations
+------------
+This tutorial is based on the following simple data model:
+
+``model.yml``
+
+.. code-block:: yaml
+
+   Fish:
+     recommended_properties:
+       date:
+         datatype: DATETIME
+       number:
+         datatype: INTEGER
+       weight:
+         datatype: DOUBLE
+       species:
+         datatype: TEXT
+
+You can insert this model with the following command:
+
+.. code-block:: shell
+
+   python -m caosadvancedtools.models.parser model.yml --sync
+
+
+We will identify `Fish` Records in LinkAhead using the following two
+attributes.
+
+``identifiables.yml``
+
+.. code-block:: yaml
+
+   Fish:
+     - date
+     - number
+
+And we will use the following crawler configuration.
+
+``cfood.yml``
+
+.. code-block:: yaml
+
+   ---
+   metadata:
+     crawler-version: 0.9.1
+   ---
+
+   fish_data_file:        # Root file
+     type: CSVTableConverter
+     match: ^fish_data_.*\.csv$   # match CSV files whose names start with "fish_data_"
+     subtree:
+       table_row:         # one row in the CSV file
+         type: DictElement
+         match_name: .*   # we want to treat every row, so match anything
+         match_value: .*
+         records:
+           Fish:          # Record for the current row; information from the statements
+                          # below is added to this Record
+         subtree:
+           date:          # element for the date column
+             type: TextElement
+             match_name: date   # name of the column in the table file
+             match_value: (?P<column_value>.*)   # we match any value of the row in this
+                                                 # column and assign it to the
+                                                 # ``column_value`` variable
+             records:     # Records edited for each cell
+               Fish:
+                 date: $column_value
+           species:
+             type: TextElement
+             match_name: species
+             match_value: (?P<column_value>.*)
+             records:
+               Fish:
+                 species: $column_value
+           number:
+             type: TextElement
+             match_name: identifier   # the CSV column is called "identifier" ...
+             match_value: (?P<column_value>.*)
+             records:
+               Fish:
+                 number: $column_value   # ... but we store it in the "number" Property
+           weight:
+             type: TextElement
+             match_name: weight
+             match_value: (?P<column_value>.*)
+             records:
+               Fish:
+                 weight: $column_value
+
+
+Python code
+-----------
+
+The following code allows us to read the CSV file, create corresponding `Fish`
+Records and synchronize those with LinkAhead.
+
+.. code-block:: python
+
+   #!/usr/bin/env python3
+
+   # Copyright (C) 2023-2024 IndiScale GmbH <info@indiscale.com>
+   #
+   # This program is free software: you can redistribute it and/or modify
+   # it under the terms of the GNU Affero General Public License as
+   # published by the Free Software Foundation, either version 3 of the
+   # License, or (at your option) any later version.
+   #
+   # This program is distributed in the hope that it will be useful,
+   # but WITHOUT ANY WARRANTY; without even the implied warranty of
+   # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   # GNU Affero General Public License for more details.
+   #
+   # You should have received a copy of the GNU Affero General Public License
+   # along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+   """Crawler for fish data"""
+
+   import os
+   import argparse
+   import sys
+   import logging
+
+   from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements
+   from caoscrawler.structure_elements import File
+   from caoscrawler import Crawler, SecurityMode
+   from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
+
+
+   def crawl_file(filename: str, dry_run: bool = False):
+       """Read a CSV file into a LinkAhead container.
+
+       Parameters
+       ----------
+       filename : str
+           The name of the CSV file.
+
+       dry_run : bool
+           If True, do not modify the database.
+       """
+       # setup logging
+       logger = logging.getLogger("caoscrawler")
+       logger.setLevel(logging.DEBUG)
+       logger.addHandler(logging.StreamHandler(stream=sys.stdout))
+
+       # load crawler configuration
+       definition = load_definition("cfood.yml")
+       converter_registry = create_converter_registry(definition)
+
+       # crawl the CSV file
+       records = scan_structure_elements(items=File(name=os.path.basename(filename),
+                                                    path=filename),
+                                         crawler_definition=definition,
+                                         converter_registry=converter_registry)
+       logger.debug(records)
+
+       crawler = Crawler(securityMode=SecurityMode.UPDATE)
+       # This defines how Records on the server are identified with the ones we have locally
+       ident = CaosDBIdentifiableAdapter()
+       ident.load_from_yaml_definition("identifiables.yml")
+       crawler.identifiableAdapter = ident
+
+       # Here we synchronize the data; with dry_run=True, no changes are committed.
+       inserts, updates = crawler.synchronize(commit_changes=not dry_run, unique_names=True,
+                                              crawled_data=records)
+
+
+   def _parse_arguments():
+       """Parse the arguments."""
+       parser = argparse.ArgumentParser(description='Crawler for fish data')
+       parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true")
+       parser.add_argument('csv_file', metavar="csv file", help="The csv file to be crawled.")
+       return parser.parse_args()
+
+
+   def main():
+       """Main function."""
+       args = _parse_arguments()
+       crawl_file(args.csv_file, dry_run=args.dry_run)
+
+
+   if __name__ == '__main__':
+       main()
+
+Running it
+----------
+This is an example of a data file that we can crawl:
+
+``fish_data_1.csv``
+
+.. code-block::
+
+   identifier,date,species,weight
+   1,2022-01-02,pike,3.4
+   2,2022-01-02,guppy,2.3
+   3,2022-01-02,pike,2.2
+   3,2022-01-06,pike,2.1
+
+
+If you have created all the files (save the Python code above as ``crawl.py``),
+you can run:
+
+.. code-block:: shell
+
+   python3 crawl.py fish_data_1.csv
+
+Note that you can run the same script again and no changes will be made to the
+data in LinkAhead.
+
+You may play around with changing data in the data table. Changes will
+propagate into LinkAhead when you run the Crawler again. If you change one of
+the identifying properties, the Crawler will consider the data that it reads as
+new and create new `Fish` Records.
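+
+If you first want to see what the crawler would do without committing any
+changes, you can use the script's ``-n`` / ``--dry-run`` option (in the sketch
+above it is wired to the ``commit_changes`` argument of ``synchronize``):
+
+.. code-block:: shell
+
+   python3 crawl.py --dry-run fish_data_1.csv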
diff --git a/src/doc_sources/scifolder/identifiables.yml b/src/doc_sources/scifolder/identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac2e458b02416e2cbd93b5132468a7daa31fb135 --- /dev/null +++ b/src/doc_sources/scifolder/identifiables.yml @@ -0,0 +1,8 @@ +Person: + - last_name +Measurement: + - date + - project +Project: + - date + - identifier diff --git a/src/doc_sources/scifolder/model.yml b/src/doc_sources/scifolder/model.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e1a391186be6a01fb10d0b32e8516238012f374 --- /dev/null +++ b/src/doc_sources/scifolder/model.yml @@ -0,0 +1,88 @@ +Experiment: + obligatory_properties: + date: + datatype: DATETIME + description: 'date of the experiment' + identifier: + datatype: TEXT + description: 'identifier of the experiment' + # TODO empty recommended_properties is a problem + #recommended_properties: + responsible: + datatype: LIST<Person> +Project: +SoftwareVersion: + recommended_properties: + version: + datatype: TEXT + description: 'Version of the software.' + binaries: + sourceCode: + Software: +DepthTest: + obligatory_properties: + temperature: + datatype: DOUBLE + description: 'temp' + depth: + datatype: DOUBLE + description: 'temp' +Person: + obligatory_properties: + first_name: + datatype: TEXT + description: 'First name of a Person.' + last_name: + datatype: TEXT + description: 'LastName of a Person.' + recommended_properties: + email: + datatype: TEXT + description: 'Email of a Person.' +revisionOf: + datatype: REFERENCE +results: + datatype: LIST<REFERENCE> +sources: + datatype: LIST<REFERENCE> +scripts: + datatype: LIST<REFERENCE> +single_attribute: + datatype: LIST<INTEGER> +Simulation: + obligatory_properties: + date: + identifier: + responsible: +Analysis: + obligatory_properties: + date: + identifier: + responsible: + suggested_properties: + mean_value: + datatype: DOUBLE +Publication: +Thesis: + inherit_from_suggested: + - Publication +Article: + inherit_from_suggested: + - Publication +Poster: + inherit_from_suggested: + - Publication +Presentation: + inherit_from_suggested: + - Publication +Report: + inherit_from_suggested: + - Publication +hdf5File: + datatype: REFERENCE +Measurement: + recommended_properties: + date: +ReadmeFile: + datatype: REFERENCE +ProjectMarkdownReadme: diff --git a/src/doc_sources/scifolder/scifolder_cfood.yml b/src/doc_sources/scifolder/scifolder_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..34256309989acf5447abf83e32162190acba90bf --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_cfood.yml @@ -0,0 +1,86 @@ +# This is only a scifolder test cfood with a limited functionality. 
+# The full scifolder cfood will be developed here: +# https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood + +--- +metadata: + crawler-version: 0.5.1 +--- +Definitions: + type: Definitions + #include "description.yml" + +Data: # name of the converter + type: Directory + match: (.*) + subtree: + DataAnalysis: # name of the converter + type: Directory + match: DataAnalysis + subtree: &template + project_dir: # name of the first subtree element which is a converter + type: Directory + match: ((?P<date>[0-9]{4,4})_)?(?P<identifier>.*) + records: + Project: # this is an identifiable in this case + parents: + - Project # not needed as the name is equivalent + date: $date + identifier: ${identifier} + + subtree: + measurement: # new name for folders on the 3rd level + type: Directory + match: (?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))? + records: + Measurement: + date: $date + identifier: $identifier + project: $Project + subtree: + README: + type: MarkdownFile # this is a subclass of converter File + # function signature: GeneralStore, StructureElement + # preprocessors: custom.caosdb.convert_values + match: ^README\.md$ + # how to make match case insensitive? + subtree: + description: + type: TextElement + match_value: (?P<description>.*) + match_name: description + records: + Measurement: + description: $description + responsible_single: + type: TextElement + match_name: responsible + match_value: &person_regexp ((?P<first_name>.+) )?(?P<last_name>.+) + records: &responsible_records + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person # each record also implicitely creates a variable + # with the same name. The "+" indicates, that + # this will become a list entry in list property + # "responsible" belonging to Measurement. + + responsible_list: + type: DictListElement + match_name: responsible + subtree: + Person: + type: TextElement + match_value: *person_regexp + records: *responsible_records + + ExperimentalData: # name of the converter + type: Directory + match: ExperimentalData + subtree: *template + + SimulationData: # name of the converter + type: Directory + match: SimulationData + subtree: *template diff --git a/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md b/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..87f2206efa83701d9a90757811462d3042d8eb3f --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_SpeedOfLight/2020-01-04_average-all-exp/README.md @@ -0,0 +1,20 @@ +--- +responsible: AuthorA +description: Average over all data of each type of experiment separately and comined. +sources: +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-03/velocities.txt +results: +- file: single-averages-*.csv + description: average speed of light from all single types of measurements +- file: all-averages.csv + description: average speed of light from all measurements combined +- file: "*.pdf" + description: Plots of the averages +scripts: +- file: calculate-averages.py + description: python code doing the calculation +- file: plot-averages.py + description: create nice plots for article +... 
diff --git a/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md b/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fe1cdb06194c473f44c4179210cc58692ee68e9d --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_SpeedOfLight/2020-01-05_average-all-exp-corr/README.md @@ -0,0 +1,21 @@ +--- +responsible: AuthorA +description: Average over all data of each type of experiment separately and comined. +sources: +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-03/velocities.txt +results: +- file: single-averages-*.csv + description: average speed of light from all single types of measurements +- file: all-averages.csv + description: average speed of light from all measurements combined +- file: "*.pdf" + description: Plots of the averages +scripts: +- file: calculate-averages.py + description: python code doing the calculation +- file: plot-averages.py + description: create nice plots for article +revisionOf: ../2020-01-04_average-all-exp +... diff --git a/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md b/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c9c2050816362f8f80887b9f964e70ba7a413f8f --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors/README.md @@ -0,0 +1,15 @@ +--- +responsible: AuthorD +description: comparison between predicted and measured temperatures for 2010 to 2019 +sources: +- ../../../ExperimentalData/2020_climate-model-predict/2010-01-01/temperatures-*.csv +- ../../../SimulationData/2020_climate-model-predict/2020-02-01/predictions-*.csv +results: +- file: "*.pdf" + description: Plots of absolute and relative errors +- file: errors.csv + description: prediction errors for all measurement locations +scripts: +- file: differences.py + description: Calculate the absolute and relative differences between predicted and measured temperatures, and plot them. +... diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6bad97bbd6697638f912ac99799b621719d1884 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight/README.md @@ -0,0 +1,6 @@ +--- +responsible: +- AuthorA +- AuthorB +description: Time-of-flight measurements to determine the speed of light +... 
diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f5302678afe92c507b735009918cba0425a3bf76 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity/README.md @@ -0,0 +1,6 @@ +--- +responsible: +- AuthorA +- AuthorC +description: Cavity resonance measurements for determining the speed of light +... diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4d32e1d7f682c138cf42b36dc482ce4cecb0e940 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_SpeedOfLight/2020-01-03/README.md @@ -0,0 +1,9 @@ +--- +responsible: +- AuthorA +- AuthorB +description: Radio interferometry measurements to determine the speed of light +results: +- file: velocities.txt + description: velocities of all measurements +... diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6a625d10fd1f7d1a0fa4f024872ab19084ebccec --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/1980-01-01/README.md @@ -0,0 +1,7 @@ +--- +responsible: AuthorD +description: Average temperatures of the years 1980-1989 as obtained from wheatherdata.example +results: +- file: temperatures-198*.csv + description: single year averages of all measurement stations with geographic locations +... diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md new file mode 100644 index 0000000000000000000000000000000000000000..87053c2c1902e791f42743b7de93bd79b6fd5649 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/1990-01-01/README.md @@ -0,0 +1,7 @@ +--- +responsible: AuthorD +description: Average temperatures of the years 1990-1999 as obtained from wheatherdata.example +results: +- file: temperatures-199*.csv + description: single year averages of all measurement stations with geographic locations +... diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95eb81650437267d67ddbaaceecc246a56e619cf --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/2000-01-01/README.md @@ -0,0 +1,7 @@ +--- +responsible: AuthorD +description: Average temperatures of the years 2000-2009 as obtained from wheatherdata.example +results: +- file: temperatures-200*.csv + description: single year averages of all measurement stations with geographic locations +... 
diff --git a/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bb91400eb3a8662e1b589eda7a0af65f0a68064a --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/ExperimentalData/2020_climate-model-predict/2010-01-01/README.md @@ -0,0 +1,7 @@ +--- +responsible: AuthorD +description: Average temperatures of the years 2010-2019 as obtained from wheatherdata.example +results: +- file: temperatures-201*.csv + description: single year averages of all measurement stations with geographic locations +... diff --git a/src/doc_sources/scifolder/scifolder_data/Publications/Articles/2020_AuthorA-JourRel/README.md b/src/doc_sources/scifolder/scifolder_data/Publications/Articles/2020_AuthorA-JourRel/README.md new file mode 100644 index 0000000000000000000000000000000000000000..25078c5084c72a9b0d7f0388605373fdf88a5cdd --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/Publications/Articles/2020_AuthorA-JourRel/README.md @@ -0,0 +1,16 @@ +--- +responsible: +- AuthorA +- AuthorB +- AuthorC +description: Article on the comparison of several experimental methods for determining the speed of light. +sources: +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-01_TimeOfFlight +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-02_Cavity +- ../../../ExperimentalData/2020_SpeedOfLight/2020-01-03/velocities.txt +- ../../../DataAnalysis/2020-01-05_average-all-exp-corr +... + +# Further Notes + + The corrected analysis was used in Figure 1. \ No newline at end of file diff --git a/src/doc_sources/scifolder/scifolder_data/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md b/src/doc_sources/scifolder/scifolder_data/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5f04c0747a9eb6910c23421905268b845baf7485 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/Publications/Presentations/2020-03-01_AuthorD-climate-model-conf/README.md @@ -0,0 +1,11 @@ +--- +responsible: AuthorD +description: beamer slides of the conference talk given at the 2020 climate modeling conference in Berlin +sources: +- ../../../ExperimentalData/2020_climate-model-predict/1980-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/1990-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/2000-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/2010-01-01/temperatures-*.csv +- ../../../SimulationData/2020_climate-model-predict/2020-02-01 +- ../../../DataAnalysis/2020_climate-model-predict/2020-02-08_prediction-errors +... diff --git a/src/doc_sources/scifolder/scifolder_data/Publications/Reports/2020-01-10_avg-speed-of-light/README.md b/src/doc_sources/scifolder/scifolder_data/Publications/Reports/2020-01-10_avg-speed-of-light/README.md new file mode 100644 index 0000000000000000000000000000000000000000..781d1550a661ff09633477af0efa22ca98cfdb76 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/Publications/Reports/2020-01-10_avg-speed-of-light/README.md @@ -0,0 +1,4 @@ +--- +responsible: AuthorA +description: Short report comparing different speed of light measurements +... 
diff --git a/src/doc_sources/scifolder/scifolder_data/SimulationData/2020_climate-model-predict/2020-02-01/README.md b/src/doc_sources/scifolder/scifolder_data/SimulationData/2020_climate-model-predict/2020-02-01/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c91d6b5f7601334b84a77328b888d227e779a93 --- /dev/null +++ b/src/doc_sources/scifolder/scifolder_data/SimulationData/2020_climate-model-predict/2020-02-01/README.md @@ -0,0 +1,24 @@ +--- +responsible: AuthorE +description: >- + Code for fitting the predictive model to the + training data and for predicting the average + annual temperature for all measurement stations + for the years 2010 to 2019 +sources: +- ../../../ExperimentalData/2020_climate-model-predict/1980-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/1990-01-01/temperatures-*.csv +- ../../../ExperimentalData/2020_climate-model-predict/2000-01-01/temperatures-*.csv +results: +- file: params.json + description: Model parameters for the best fit to the training set +- file: predictions-201*.csv + description: Annual temperature predictions with geographical locations +scripts: +- file: model.py + description: python module with the model equations +- file: fit_parameters.py + description: Fit model parameters to training data using a basinhopping optimizer +- file: predict.py + description: Use optimized parameters to simulate average temperatures from 2010 to 2019 +... diff --git a/tox.ini b/tox.ini index a7d4465ed36f0fe5e49c06721d3e3a0cdf453fa0..d44fbb6d50c58f44fe7944a2a49711b8def18cd6 100644 --- a/tox.ini +++ b/tox.ini @@ -1,20 +1,22 @@ [tox] -envlist = py37, py38, py39, py310, py311 +envlist = py38, py39, py310, py311, py312, py313 skip_missing_interpreters = true [testenv] -deps = . 
+deps = .[h5-crawler,spss,rocrate] pytest pytest-cov - # TODO: Make this f-branch sensitive - git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev - git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev + linkahead + caosadvancedtools commands = caosdb-crawler --help - py.test --cov=caosdb -vv {posargs} + py.test --cov=caoscrawler -vv {posargs} [flake8] max-line-length = 100 +[pycodestyle] +max-line-length = 100 + [pytest] testpaths = unittests -xfail_strict = True \ No newline at end of file +xfail_strict = True diff --git a/unittests/broken_cfoods/broken_record_from_dict.yml b/unittests/broken_cfoods/broken_record_from_dict.yml new file mode 100644 index 0000000000000000000000000000000000000000..fd8ffdbd29f6ad7b8b38fc17eb43686f4170dbcb --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict.yml @@ -0,0 +1,7 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/broken_cfoods/broken_record_from_dict_2.yml b/unittests/broken_cfoods/broken_record_from_dict_2.yml new file mode 100644 index 0000000000000000000000000000000000000000..ca321373c6c4d6bcc8c104c8c4b3c7147bf71375 --- /dev/null +++ b/unittests/broken_cfoods/broken_record_from_dict_2.yml @@ -0,0 +1,11 @@ +RecordFromDictElement: + type: PropertiesFromDictElement + record_from_dict: + parents: + - MyType1 + - MyType2 + match: "(.*)" + subtree: + AnotherElement: + type: Text + match_name: "(.*)" diff --git a/unittests/cfood_variable_deletion.yml b/unittests/cfood_variable_deletion.yml new file mode 100644 index 0000000000000000000000000000000000000000..9edfc1b06cdd6f57a52cc71a96306984ee9f2dbe --- /dev/null +++ b/unittests/cfood_variable_deletion.yml @@ -0,0 +1,29 @@ + +Data: + type: Directory + match: (.*) + subtree: + Data_1: + type: Directory + match: ^Data_1$ + subtree: + Subdir: + type: Directory + match: ^(?P<test_1>.*)$ + records: + DummyRecord: + name: "Record from Data_1" + var1: $test_1 + var2: $test_2 + Data_2: + type: Directory + match: ^Data_2$ + subtree: + Subdir: + type: Directory + match: ^(?P<test_2>.*)$ + records: + DummyRecord: + name: "Record from Data_2" + var1: $test_1 + var2: $test_2 diff --git a/unittests/cfood_variable_deletion2.yml b/unittests/cfood_variable_deletion2.yml new file mode 100644 index 0000000000000000000000000000000000000000..729fe519e00323c046d77e93904421c3ba6a666e --- /dev/null +++ b/unittests/cfood_variable_deletion2.yml @@ -0,0 +1,29 @@ + +Data: + type: Directory + match: (?P<test_1>.*) + subtree: + Data_1: + type: Directory + match: ^Data_1$ + subtree: + Subdir: + type: Directory + match: ^(?P<test_1>.*)$ + records: + DummyRecord: + name: "Record from Data_1" + var1: $test_1 + var2: $test_2 + Data_2: + type: Directory + match: ^Data_2$ + subtree: + Subdir: + type: Directory + match: ^(?P<test_2>.*)$ + records: + DummyRecord: + name: "Record from Data_2" + var1: $test_1 + var2: $test_2 diff --git a/unittests/datamodels/datamodel.yaml b/unittests/datamodels/datamodel.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2759ecba7f2967062937d9b2f4805a9b501ab6c4 --- /dev/null +++ b/unittests/datamodels/datamodel.yaml @@ -0,0 +1,6 @@ +Dataset: + obligatory_properties: + keywords: + datatype: TEXT + dateModified: + datatype: DATETIME diff --git a/unittests/eln_cfood.yaml b/unittests/eln_cfood.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb29b7da7c1e6c3fc555038412f42ff2ab4d28fa --- /dev/null +++ 
b/unittests/eln_cfood.yaml @@ -0,0 +1,43 @@ +--- +metadata: + crawler-version: 0.9.2 + macros: +--- +Converters: + ELNFile: + converter: ELNFileConverter + package: caoscrawler.converters + ROCrateEntity: + converter: ROCrateEntityConverter + package: caoscrawler.converters + +DataDir: + type: Directory + match: .* + subtree: + ELNFile: + type: ELNFile + match: ^.*\.eln$ + subtree: + RecordsExample: + type: ROCrateEntity + match_type: Dataset + match_properties: + "@id": records-example/$ + name: (?P<name>.*) + keywords: (?P<keywords>.*) + dateModified: (?P<dateModified>.*) + records: + Dataset: + name: $name + keywords: $keywords + dateModified: $dateModified + subtree: + Description: + type: ROCrateEntity + match_type: TextObject + match_properties: + text: (?P<description>.*) + records: + Dataset: + description: $description diff --git a/unittests/eln_files/records-example.eln b/unittests/eln_files/records-example.eln new file mode 100644 index 0000000000000000000000000000000000000000..4907bcc4e88e2152fdf2675a50ca661b666c947d Binary files /dev/null and b/unittests/eln_files/records-example.eln differ diff --git a/unittests/example_cfood.yml b/unittests/example_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..59cb601395f73bd26ed81bd6ea9c51f670798d36 --- /dev/null +++ b/unittests/example_cfood.yml @@ -0,0 +1,47 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +Definitions: + type: Definitions + +data: + type: Dict + match_name: '.*' + subtree: + Expiments: + type: ListElement + match_name: 'Experiments' + subtree: + Experiment: + type: DictElement + match: '.*' + records: + Ent: + parents: ["Experiment"] + subtree: &date_res + date: + type: Date + match_name: 'date' + match_value: '(?P<date>.*)' + records: + Ent: + date: $date + result: + type: TextElement + match_name: 'result' + match_value: '(?P<res>.*)' + records: + Ent: + result: $res + Analyses: + type: ListElement + match_name: 'Analyses' + subtree: + Analysis: + type: DictElement + match: '.*' + records: + Ent: + parents: ["Analysis"] + subtree: *date_res diff --git a/unittests/example_datastructure.yml b/unittests/example_datastructure.yml new file mode 100644 index 0000000000000000000000000000000000000000..1ec9d4575c7216fe5b8954db22cd9f2d03a7e749 --- /dev/null +++ b/unittests/example_datastructure.yml @@ -0,0 +1,10 @@ +Experiments: + - date: 2022-02-01 + result: FAIL + - date: 2022-02-02 + result: SUCCESS +Analyses: + - date: 2022-03-01 + result: homogeneous + - date: 2022-03-02 + result: heterogeneous diff --git a/unittests/example_identifiables.yml b/unittests/example_identifiables.yml new file mode 100644 index 0000000000000000000000000000000000000000..7e7da0a74cc202178bcae8a70be52d85f660d4e6 --- /dev/null +++ b/unittests/example_identifiables.yml @@ -0,0 +1,4 @@ +Experiment: + - date +Analysis: + - date diff --git a/unittests/h5_cfood.yml b/unittests/h5_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..dc789a85aabcbdc32388fd91460d42d477630f37 --- /dev/null +++ b/unittests/h5_cfood.yml @@ -0,0 +1,69 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +Converters: + H5Dataset: + converter: H5DatasetConverter + package: caoscrawler.converters.hdf5_converter + H5File: + converter: H5FileConverter + package: caoscrawler.converters.hdf5_converter + H5Group: + converter: H5GroupConverter + package: caoscrawler.converters.hdf5_converter + H5Ndarray: + converter: H5NdarrayConverter + package: caoscrawler.converters.hdf5_converter +# Top-level, we have just the HDF5 file. 
+ParentDirectory: + type: Directory + match: (.*) + subtree: + H5FileElement: + type: H5File + match: (.*)\.(hdf5|h5)$ + records: + H5File: + parents: + - H5File + role: File + path: $H5FileElement + file: $H5FileElement + subtree: + # Here, we have the groups, the top-level dataset, and possible + # attributes (empty for now). + RootIntegerElement: + type: H5Dataset + match_name: ^root_integers$ + records: + H5Dataset: + parents: + - H5Dataset + H5File: + H5Dataset: +$H5Dataset + subtree: + # included NDArray in this dataset + TopLevelIntNDElement: + type: H5Ndarray + match_name: (.*) + recordname: this + records: + # this: + # ContainingFile: $H5File + H5Dataset: + Ndarray: $this + # There is one more list-valued attribute to this dataset. + TopLevelDataAttribute: + type: ListElement + match_name: ^attr_data_root$ + subtree: + AttributeListEntry: + type: FloatElement + match_name: (.*) + match_value: (?P<value>.*) + records: + H5Dataset: + attr_data_root: +$value + + diff --git a/unittests/hdf5_dummy_file.hdf5 b/unittests/hdf5_dummy_file.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..41bfb7ab3bcac19d90fd4f018cdd8118ae806eaf Binary files /dev/null and b/unittests/hdf5_dummy_file.hdf5 differ diff --git a/unittests/record_from_dict_cfood.yml b/unittests/record_from_dict_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..1ea2159df9d63256d9a0b2e293d82a9ad694608f --- /dev/null +++ b/unittests/record_from_dict_cfood.yml @@ -0,0 +1,12 @@ +PropertiesFromDictElement: + type: PropertiesFromDictElement + match: ".*" + record_from_dict: + variable_name: MyRec + parents: + - MyType1 + - MyType2 + references: + author: + parents: + - Person diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 74fd027563907c5ae416ca389faba0ecd64d5848..f32c24e772b86ab8adf530d20ec208722b74deac 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -2,6 +2,10 @@ # The full scifolder cfood will be developed here: # https://gitlab.indiscale.com/caosdb/src/crawler-cfoods/scifolder-cfood +--- +metadata: + crawler-version: 0.9.0 +--- Definitions: type: Definitions #include "description.yml" @@ -22,7 +26,7 @@ Data: # name of the converter parents: - Project # not needed as the name is equivalent date: $date - identifier: $identifier + identifier: ${identifier} subtree: measurement: # new name for folders on the 3rd level diff --git a/unittests/simulated_server_data.py b/unittests/simulated_server_data.py deleted file mode 100644 index dd0c6b4e8693d64c9d96cafc5db2f447613daa1b..0000000000000000000000000000000000000000 --- a/unittests/simulated_server_data.py +++ /dev/null @@ -1,24 +0,0 @@ - -import caosdb as db -data_model = {"person": (db.RecordType(id=259, name="Person") - .add_property(name="first_name") - .add_property(name="last_name")), - "measurement": (db.RecordType(id=278, name="Measurement") - .add_property(name="identifier") - .add_property(name="date") - .add_property(name="project")), - "project": (db.RecordType(id=250, name="Project") - .add_property(name="date") - .add_property(name="identifier")), - "first_name": db.Property(name="first_name", datatype=db.TEXT, id=261), - "responsible": db.Property(name="responsible", datatype="Person", id=249), - "last_name": db.Property(name="last_name", datatype=db.TEXT, id=262), - "identifier": db.Property(name="identifier", datatype=db.TEXT, id=248), - "date": db.Property(name="date", datatype=db.DATETIME, id=247), - } -existing_data = { -} - -full_data = {} 
-full_data.update(data_model) -full_data.update(existing_data) diff --git a/unittests/test_cfood_metadata.py b/unittests/test_cfood_metadata.py index 09d6c88bdc27e1066ed18a9c5865cbfb95270c3a..b123f98584ba99ed4fec412732cb2bf536034a91 100644 --- a/unittests/test_cfood_metadata.py +++ b/unittests/test_cfood_metadata.py @@ -17,24 +17,14 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. # +from tempfile import NamedTemporaryFile +from unittest.mock import patch + import pytest import yaml -from tempfile import NamedTemporaryFile - import caoscrawler - -CRAWLER_VERSION = "" - - -def setup_function(function): - """Store original crawler version in case it is altered for tests.""" - CRAWLER_VERSION = caoscrawler.version.version - - -def teardown_function(function): - """Reset version""" - caoscrawler.version.version = CRAWLER_VERSION +from caoscrawler.scanner import load_definition def _temp_file_load(txt: str): @@ -43,11 +33,10 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. """ definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() - c = caoscrawler.Crawler() - definition = c.load_definition(f.name) + definition = load_definition(f.name) return definition @@ -68,9 +57,12 @@ SimulationData: with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "No crawler version specified in cfood definition" in uw[0].message.args[0] - assert "Specifying a version is highly recommended" in uw[0].message.args[0] + found = False + for w in uw: + if ("No crawler version specified in cfood definition" in w.message.args[0] and + "Specifying a version is highly recommended" in w.message.args[0]): + found = True + assert found # metadata section is missing alltogether definition_text = """ @@ -82,12 +74,16 @@ SimulationData: with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "No crawler version specified in cfood definition" in uw[0].message.args[0] - assert "Specifying a version is highly recommended" in uw[0].message.args[0] + found = False + for w in uw: + if ("No crawler version specified in cfood definition" in w.message.args[0] and + "Specifying a version is highly recommended" in w.message.args[0]): + found = True + assert found -def test_warning_if_version_too_old(): +@patch("caoscrawler.version.get_caoscrawler_version") +def test_warning_if_version_too_old(get_version): """Warn if the cfood was written for an older crawler version.""" definition_text = """ @@ -102,31 +98,38 @@ SimulationData: match: SimulationData """ - # higher minor - caoscrawler.version.version = "0.3.0" + get_version.side_effect = lambda: "0.3.0" with pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "cfood was written for a previous crawler version" in uw[0].message.args[0] - assert "version specified in cfood: 0.2.0" in uw[0].message.args[0] - assert "version installed on your system: 0.3.0" in uw[0].message.args[0] + found = False + for w in uw: + if ("cfood was written for a previous crawler version" in w.message.args[0] and + "version specified in cfood: 0.2.0" in w.message.args[0] and + "version installed on your system: 0.3.0" in w.message.args[0]): + found = True + assert found # higher major - caoscrawler.version.version = "1.1.0" + get_version.side_effect = lambda: "1.1.0" with 
pytest.warns(UserWarning) as uw: _temp_file_load(definition_text) - assert len(uw) == 1 - assert "cfood was written for a previous crawler version" in uw[0].message.args[0] - assert "version specified in cfood: 0.2.0" in uw[0].message.args[0] - assert "version installed on your system: 1.1.0" in uw[0].message.args[0] + found = False + for w in uw: + if ("cfood was written for a previous crawler version" in w.message.args[0] and + "version specified in cfood: 0.2.0" in w.message.args[0] and + "version installed on your system: 1.1.0" in w.message.args[0]): + found = True + assert found -def test_error_if_version_too_new(): +@patch("caoscrawler.version.get_caoscrawler_version") +def test_error_if_version_too_new(get_version): """Raise error if the cfood requires a newer crawler version.""" # minor too old + get_version.side_effect = lambda: "0.1.5" definition_text = """ --- metadata: @@ -138,7 +141,6 @@ SimulationData: type: Directory match: SimulationData """ - caoscrawler.version.version = "0.1.5" with pytest.raises(caoscrawler.CfoodRequiredVersionError) as cre: _temp_file_load(definition_text) @@ -166,7 +168,7 @@ SimulationData: assert "version installed on your system: 0.1.5" in str(cre.value) # patch to old - caoscrawler.version.version = "1.0.0" + get_version.side_effect = lambda: "1.0.0" with pytest.raises(caoscrawler.CfoodRequiredVersionError) as cre: _temp_file_load(definition_text) @@ -176,7 +178,8 @@ SimulationData: assert "version installed on your system: 1.0.0" in str(cre.value) -def test_matching_version(): +@patch("caoscrawler.version.get_caoscrawler_version") +def test_matching_version(get_version): """Test that there is no warning or error in case the version matches.""" definition_text = """ @@ -190,10 +193,10 @@ SimulationData: type: Directory match: SimulationData """ - caoscrawler.version.version = "0.2.1" + get_version.side_effect = lambda: "0.2.1" assert _temp_file_load(definition_text) # The version is also considered a match if the patch version of the # installed crawler is newer than the one specified in the cfood metadata - caoscrawler.version.version = "0.2.7" + get_version.side_effect = lambda: "0.2.7" assert _temp_file_load(definition_text) diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 5942b1e124ebd1228a619ed7a1024738c70ee0aa..e4b442d91060c7ba98cb1a910156b1800f050be3 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -3,8 +3,9 @@ # # This file is a part of the CaosDB Project. 
# -# Copyright (C) 2021,2022 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021-2024 Indiscale GmbH <info@indiscale.com> # Copyright (C) 2021,2022 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -23,27 +24,45 @@ """ test the converters module """ -import json -import yaml +import datetime import importlib +import json +import logging import os -from itertools import product import pytest import yaml -from caoscrawler.converters import (Converter, ConverterValidationError, DictElementConverter, - DirectoryConverter, DictIntegerElementConverter, - handle_value, MarkdownFileConverter, - FloatElementConverter, IntegerElementConverter, - JSONFileConverter, YAMLFileConverter) -from caoscrawler.converters import _AbstractScalarValueElementConverter +from itertools import product +from pathlib import Path +from tempfile import NamedTemporaryFile + +import linkahead as db + +from caoscrawler.converters import (Converter, ConverterValidationError, + DateElementConverter, DictElementConverter, + DictIntegerElementConverter, + DirectoryConverter, FloatElementConverter, + IntegerElementConverter, JSONFileConverter, + ListElementConverter, + MarkdownFileConverter, + PropertiesFromDictConverter, + YAMLFileConverter, handle_value, + replace_variables) +from caoscrawler.converters.converters import \ + _AbstractScalarValueElementConverter from caoscrawler.crawl import Crawler -from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import (File, TextElement, ListElement, DictElement, - BooleanElement, IntegerElement, - FloatElement, Directory) +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, + create_transformer_registry, load_definition, + scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) +from caoscrawler.transformer_functions import replace, split -from test_tool import rfp +UNITTESTDIR = Path(__file__).parent @pytest.fixture @@ -55,18 +74,22 @@ def converter_registry(): "MarkdownFile": { "converter": "MarkdownFileConverter", "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, "DictElement": { "converter": "DictElementConverter", "package": "caoscrawler.converters"}, + "PropertiesFromDictElement": { + "converter": "PropertiesFromDictConverter", + "package": "caoscrawler.converters" + }, "TextElement": { "converter": "TextElementConverter", "package": "caoscrawler.converters"}, "ListElement": { "converter": "ListElementConverter", "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, "JSONFile": { "converter": "JSONFileConverter", "package": "caoscrawler.converters"}, @@ -104,7 +127,7 @@ def testDirectoryConverter(converter_registry): }, name="Test", converter_registry=converter_registry) elements = dc.create_children(GeneralStore(), - Directory("test_directories", rfp("test_directories"))) + Directory("test_directories", UNITTESTDIR / "test_directories")) # Check whether the right structure elements were created # this has been updated, there are more directories now @@ -121,20 +144,16 @@ 
def testDirectoryConverter(converter_registry): def test_markdown_converter(converter_registry): test_readme = File( "README.md", - rfp( - "test_directories", "examples_article", "DataAnalysis", - "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md" - ) + UNITTESTDIR / + "test_directories" / "examples_article" / "DataAnalysis" / + "2020_climate-model-predict" / "2020-02-08_prediction-errors" / "README.md" ) - converter = MarkdownFileConverter({ - "match": "(.*)" - }, "TestMarkdownFileConverter", - converter_registry) + converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter", + converter_registry) - m = converter.match(File("test_tool.py", rfp( - "test_tool.py"))) - assert m is None + with pytest.raises(ConverterValidationError): + converter.create_children(None, File("test_tool.py", UNITTESTDIR / "test_crawler.py")) m = converter.match(test_readme) assert m is not None @@ -162,8 +181,8 @@ def test_markdown_converter(converter_registry): test_readme2 = File( "README.md", - rfp("test_directories", "examples_article", - "ExperimentalData", "2020_SpeedOfLight", "2020-01-01_TimeOfFlight", "README.md") + UNITTESTDIR / "test_directories" / "examples_article" / + "ExperimentalData" / "2020_SpeedOfLight" / "2020-01-01_TimeOfFlight" / "README.md" ) m = converter.match(test_readme2) @@ -182,8 +201,8 @@ def test_markdown_converter(converter_registry): def test_json_converter(converter_registry): - test_json = File("testjson.json", rfp( - "test_directories", "examples_json", "testjson.json")) + test_json = File("testjson.json", UNITTESTDIR / + "test_directories" / "examples_json" / "testjson.json") schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_directories", "examples_json", "testjson.schema.json") @@ -240,7 +259,7 @@ def test_json_converter(converter_registry): invalid_json = File( "invalidjson.json", - rfp("test_directories", "examples_json", "invalidjson.json") + UNITTESTDIR / "test_directories" / "examples_json" / "invalidjson.json" ) # Doesn't validate because of missing required 'name' property with pytest.raises(ConverterValidationError) as err: @@ -249,15 +268,15 @@ def test_json_converter(converter_registry): broken_json = File( "brokenjson.json", - rfp("test_directories", "examples_json", "brokenjson.json") + UNITTESTDIR / "test_directories" / "examples_json" / "brokenjson.json" ) with pytest.raises(json.decoder.JSONDecodeError) as err: jsonconverter.create_children(None, broken_json) def test_yaml_converter(converter_registry): - test_yaml = File("testyaml.yml", rfp( - "test_directories", "test_yamls", "testyaml.yml")) + test_yaml = File("testyaml.yml", UNITTESTDIR / + "test_directories" / "test_yamls" / "testyaml.yml") schema_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_directories", "test_yamls", "testyaml.schema.json") @@ -314,7 +333,7 @@ def test_yaml_converter(converter_registry): invalid_yaml = File( "invalidyaml.yml", - rfp("test_directories", "test_yamls", "invalidyaml.yml") + UNITTESTDIR / "test_directories" / "test_yamls" / "invalidyaml.yml" ) # Doesn't validate because of missing required 'name' property @@ -324,7 +343,7 @@ def test_yaml_converter(converter_registry): broken_yaml = File( "brokenyaml.yml", - rfp("test_directories", "test_yamls", "brokenyaml.yml") + UNITTESTDIR / "test_directories" / "test_yamls" / "brokenyaml.yml" ) with pytest.raises(yaml.parser.ParserError) as err: yamlconverter.create_children(None, broken_yaml) @@ -334,43 +353,86 @@ def 
test_variable_replacement():
     values = GeneralStore()
     values["a"] = 4
     values["b"] = "68"
-
-    assert handle_value("b", values) == ("b", "single")
-    assert handle_value("+b", values) == ("b", "list")
-    assert handle_value("*b", values) == ("b", "multiproperty")
-    assert handle_value("$b", values) == ("68", "single")
-    assert handle_value("+$b", values) == ("68", "list")
-    assert handle_value("*$b", values) == ("68", "multiproperty")
-
+    values["my_unit"] = "m"
+    values["cm"] = "cm"
+
+    # basic values stay unchanged
+    assert replace_variables(5, values) == 5
+    assert replace_variables(True, values) is True
+    assert replace_variables("$a", values) == 4
+    assert replace_variables("${b}", values) == "68"
+
+    # values given as simple strings never have units
+    assert handle_value("b", values) == ("b", None, "single")
+    assert handle_value("+b", values) == ("b", None, "list")
+    assert handle_value("*b", values) == ("b", None, "multiproperty")
+    assert handle_value("$b", values) == ("68", None, "single")
+    assert handle_value("+$b", values) == ("68", None, "list")
+    assert handle_value("*$b", values) == ("68", None, "multiproperty")
+
+    # No units in dicts
     assert handle_value({"value": "b",
-                         "collection_mode": "single"}, values) == ("b", "single")
+                         "collection_mode": "single"}, values) == ("b", None, "single")
     assert handle_value({"value": "b",
-                         "collection_mode": "list"}, values) == ("b", "list")
+                         "collection_mode": "list"}, values) == ("b", None, "list")
     assert handle_value({"value": "b",
-                         "collection_mode": "multiproperty"}, values) == ("b", "multiproperty")
+                         "collection_mode": "multiproperty"}, values) == ("b", None, "multiproperty")
     assert handle_value({"value": "$b",
-                         "collection_mode": "single"}, values) == ("68", "single")
+                         "collection_mode": "single"}, values) == ("68", None, "single")
     assert handle_value({"value": "$b",
-                         "collection_mode": "list"}, values) == ("68", "list")
+                         "collection_mode": "list"}, values) == ("68", None, "list")
     assert handle_value({"value": "$b",
-                         "collection_mode": "multiproperty"}, values) == ("68", "multiproperty")
+                         "collection_mode": "multiproperty"}, values) == ("68", None, "multiproperty")
 
-    assert handle_value(["a", "b"], values) == (["a", "b"], "single")
-    assert handle_value(["$a", "$b"], values) == (["4", "68"], "single")
+    # Unit specified in the same way as value:
+    assert handle_value({"value": 5, "unit": "m"}, values) == (5, "m", "single")
+    assert handle_value({"value": 5, "unit": "${my_unit}"}, values) == (5, "m", "single")
+    assert handle_value({"value": "+5", "unit": "${my_unit}"}, values) == ("5", "m", "list")
+    assert handle_value({"value": "*5", "unit": "${my_unit}"},
+                        values) == ("5", "m", "multiproperty")
+    assert handle_value(["a", "b"], values) == (["a", "b"], None, "single")
+    assert handle_value(["$a", "$b"], values) == ([4, "68"], None, "single")
+    assert handle_value({"value": ["$a", "$a"], "unit": "$cm"}, values) == ([4, 4], "cm", "single")
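+    # Editorial addition (hedged): assuming the braced form "${b}" is resolved
+    # by handle_value just as it is by replace_variables above, this also holds:
+    assert handle_value("${b}", values) == ("68", None, "single")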
 
 
-def test_filter_children_of_directory(converter_registry, capsys):
-    """Verify that children (i.e., files) in a directory are filtered or sorted
-    correctly.
-    """
-    test_dir = Directory("examples_filter_children", rfp(
-        "test_directories", "examples_filter_children"))
+def test_apply_transformers(converter_registry):
+    cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*",
+                 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{
+                     'split': {'marker': '|'}}]}}}
+    values = GeneralStore()
+    values["a"] = "a|b|c"
+
+    # transformer_functions = create_transformer_registry(crawler_definition)
+    transformer_functions = {"split": split}
+
+    conv = ListElementConverter(definition=cfood_def, name='test',
+                                converter_registry=converter_registry)
+
+    assert values['a'] == "a|b|c"
+    conv.apply_transformers(values, transformer_functions)
+    assert values['a'] == "a|b|c"
+    assert values['b'] == ["a", "b", "c"]
+
+    # Check replacing of existing variable
+    cfood_def = {"type": 'ListElement', "debug_match": True, "match_name": ".*",
+                 'transform': {'test': {'in': '$a', 'out': '$a', 'functions': [{
+                     'split': {'marker': '|'}}]}}}
+    conv = ListElementConverter(definition=cfood_def, name='test',
+                                converter_registry=converter_registry)
+
+    conv.apply_transformers(values, transformer_functions)
+    assert values['a'] == ["a", "b", "c"]
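+    # Reading note (editorial addition): a `transform` block takes the value of
+    # the store variable named by `in`, pipes it through the listed functions
+    # (here `split` with marker "|") and stores the result under the variable
+    # named by `out`, overwriting it if it already exists, as the second case
+    # above shows.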
+
+
+def test_filter_children_of_directory(converter_registry, capsys):
+    """Verify that children (i.e., files) in a directory are filtered or sorted correctly. """
+    test_dir = Directory("examples_filter_children", UNITTESTDIR /
+                         "test_directories" / "examples_filter_children")
     dc = DirectoryConverter(
         definition={
             "match": "(.*)",
-            "debug_match": True,
             "filter": {
                 "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json",
                 "group": "date",
@@ -383,14 +445,6 @@ def test_filter_children_of_directory(converter_registry, capsys):
     m = dc.match(test_dir)
     assert m is not None
 
-    # checking debug output
-    captured = capsys.readouterr()
-    # the name
-    assert "examples_filter_children" in captured.out
-    # the regexp
-    assert "(.*)" in captured.out
-    # the empty result set
-    assert "{}" in captured.out
 
     # This should only contain the youngest json and the csv that doesn't match
     # the above filter expression.
@@ -446,6 +500,7 @@ def test_filter_children_of_directory(converter_registry, capsys):
     children = dc.create_children(None, test_dir)
 
 
+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_validate_custom_converters():
     one_doc_yaml = """
 Converters:
   MyNewType:
     converter: MyNewTypeConverter
     package: some_package.my_converters
 MyElement:
   type: MyNewType
   match: something
 """
-    crawler1 = Crawler()
-    one_doc_definitions = crawler1._load_definition_from_yaml_dict(
+    one_doc_definitions = _load_definition_from_yaml_dict(
         [yaml.load(one_doc_yaml, Loader=yaml.SafeLoader)])
     assert "MyElement" in one_doc_definitions
     assert one_doc_definitions["MyElement"]["type"] == "MyNewType"
@@ -466,6 +520,7 @@ MyElement:
     two_doc_yaml = """
 ---
 metadata:
+  crawler-version: 0.9.0
 Converters:
   MyNewType:
     converter: MyNewTypeConverter
@@ -475,8 +530,7 @@ MyElement:
   type: MyNewType
   match: something
 """
-    crawler2 = Crawler()
-    two_doc_definitions = crawler2._load_definition_from_yaml_dict(
+    two_doc_definitions = _load_definition_from_yaml_dict(
         list(yaml.safe_load_all(two_doc_yaml)))
     assert "MyElement" in two_doc_definitions
     assert two_doc_definitions["MyElement"]["type"] == one_doc_definitions["MyElement"]["type"]
@@ -540,7 +594,8 @@ def test_converter_value_match(converter_registry):
     assert m is not None
 
 
-def test_match_debug(converter_registry, capsys):
+def test_match_debug(converter_registry, caplog):
+    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
     for m, mn, mv in product([".*", None], [".*", None], [".*", None]):
         defi = {"debug_match": True}
         if m:
@@ -562,11 +617,515 @@
         mtch = dc.match(IntegerElement(name="a", value=4))
         if not (m is None and mn is None and mv is None):
             assert mtch is not None
-            # checking debug output
-            captured = capsys.readouterr()
             # the name
-            assert "a" in captured.out
+            assert "a" in caplog.text
             # the regexp
-            assert ".*" in captured.out
+            assert ".*" in caplog.text
             # the empty result set
-            assert "{}" in captured.out
+            assert "{}" in caplog.text
+            caplog.clear()
+
+
+def test_date_converter(converter_registry):
+    dictconverter = DateElementConverter(
+        definition={"match_value": "(?P<date>.*)"},
+        name="conv",
+        converter_registry=converter_registry)
+    matches = dictconverter.match(TextElement("text", "2022-11-11"))
+    assert "date" in matches
+    assert isinstance(matches["date"], datetime.date)
+    assert matches["date"].year == 2022
+
+    dictconverter = DateElementConverter(
+        definition={"match_value": r"(?P<date>(\d|-)+)",
+                    "date_format": "%y-%m-%d"},
+        name="conv",
+        converter_registry=converter_registry)
+    matches = dictconverter.match(TextElement("text", "22-11-11"))
+    assert "date" in matches
+    assert isinstance(matches["date"], datetime.date)
+    assert matches["date"].year == 2022
+
+    matches = dictconverter.match(TextElement("text", "alve"))
+    assert matches is None
+
+
+def test_load_converters():
+    converter_registry = create_converter_registry({})
+    # The previous function call actually already asserts that all defined
+    # converter classes can be loaded from their respective packages.
+
+    # Please adapt, if defaults change!
+    assert len(converter_registry) == 29
+
+    # All of them are contained in caoscrawler.converters
+    # except for the xml converters:
+    for conv_key, conv in converter_registry.items():
+        assert conv["package"] == "caoscrawler.converters"
+        # ...
and their names all end in "Converter" + assert conv["converter"].endswith("Converter") + + # Some checks: + assert "CSVTableConverter" in converter_registry + assert "SimpleFile" in converter_registry + assert "Directory" in converter_registry + assert "ListElement" in converter_registry + + +def test_create_path_value(converter_registry): + """ test whether the variable containing the path is added to the general store""" + dc = Converter.converter_factory( + definition={ + "type": "Directory", + "match": ".*" + }, + name="Test", converter_registry=converter_registry) + values = GeneralStore() + dc.create_values(values, Directory("a", "/a")) + assert "Test.path" in values + assert values["Test.path"] == "/a" + + +def test_properties_from_dict_basic(converter_registry): + """Test that a record with the correct name and properties is created, and + that the children are still created correctly. + + """ + # definitions with blacklist and named references + pfdc = PropertiesFromDictConverter( + definition={ + "type": "PropertiesFromDictElement", + "match": ".*", + "record_from_dict": { + "variable_name": "MyRec", + "parents": ["DictRT1", "DictRT2"], + "properties_blacklist": ["blacklisted_int", "blacklisted_ref"], + "references": { + "authors": { + "parents": ["Person"] + } + } + } + }, + name="Test", converter_registry=converter_registry) + # Tests for Dict with scalars, dict with lists, dict with reference, + # dict with list of references, dict with reference with reference, named + # reference + values = GeneralStore() + records = RecordStore() + test_dict_element = DictElement("TestDictElement", { + "a": 5, + "b": ["a", "b", "c"], + "scalar_ref": { + "name": "Scalar Ref", + "a": 23, + "blacklisted_int": 42 + }, + "list_ref": [ + { + "c": True + }, + { + "c": False + } + ], + "ref_with_ref": { + "a": 789, + "ref_in_ref": { + "b": "something" + } + }, + "blacklisted_int": -123, + "blacklisted_ref": { + "a": 25 + }, + "authors": { + "full_name": "Some Author" + } + }) + pfdc.create_records(values=values, records=records, element=test_dict_element) + assert "MyRec" in records + my_rec = records["MyRec"] + assert isinstance(my_rec, db.Record) + assert len(my_rec.parents) == 2 + assert "DictRT1" in [par.name for par in my_rec.parents] + assert "DictRT2" in [par.name for par in my_rec.parents] + + # scalar prop + assert my_rec.get_property("a") is not None + assert my_rec.get_property("a").value == 5 + + # list prop + assert my_rec.get_property("b") is not None + assert len(my_rec.get_property("b").value) == 3 + for elt in ["a", "b", "c"]: + assert elt in my_rec.get_property("b").value + + # scalar ref + assert my_rec.get_property("scalar_ref") is not None + referenced = my_rec.get_property("scalar_ref").value + assert isinstance(referenced, db.Record) + assert referenced.name == "Scalar Ref" + assert len(referenced.parents) == 1 + assert "scalar_ref" in [par.name for par in referenced.parents] + assert referenced.get_property("a") is not None + assert referenced.get_property("a").value == 23 + # blacklisted + assert referenced.get_property("blacklisted_int") is None + + # list of ref + assert my_rec.get_property("list_ref") is not None + assert isinstance(my_rec.get_property("list_ref").value, list) + assert len(my_rec.get_property("list_ref").value) == 2 + for rec in my_rec.get_property("list_ref").value: + assert isinstance(rec, db.Record) + assert len(rec.parents) == 1 + assert "list_ref" in [par.name for par in rec.parents] + assert rec.get_property("c") is not None + assert 
type(rec.get_property("c").value) is bool
+    assert True in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value]
+    assert False in [rec.get_property("c").value for rec in my_rec.get_property("list_ref").value]
+
+    # ref with ref
+    assert my_rec.get_property("ref_with_ref") is not None
+    outer_rec = my_rec.get_property("ref_with_ref").value
+    assert isinstance(outer_rec, db.Record)
+    assert len(outer_rec.parents) == 1
+    assert "ref_with_ref" in [par.name for par in outer_rec.parents]
+    assert outer_rec.get_property("a") is not None
+    assert outer_rec.get_property("a").value == 789
+    assert outer_rec.get_property("ref_in_ref") is not None
+    inner_rec = outer_rec.get_property("ref_in_ref").value
+    assert isinstance(inner_rec, db.Record)
+    assert len(inner_rec.parents) == 1
+    assert "ref_in_ref" in [par.name for par in inner_rec.parents]
+    assert inner_rec.get_property("b") is not None
+    assert inner_rec.get_property("b").value == "something"
+
+    # blacklisted
+    assert my_rec.get_property("blacklisted_int") is None
+    assert my_rec.get_property("blacklisted_ref") is None
+
+    # named reference property
+    assert my_rec.get_property("authors") is not None
+    author_rec = my_rec.get_property("authors").value
+    assert isinstance(author_rec, db.Record)
+    assert len(author_rec.parents) == 1
+    assert "Person" in [par.name for par in author_rec.parents]
+    assert author_rec.get_property("full_name") is not None
+    assert author_rec.get_property("full_name").value == "Some Author"
+
+
+def test_properties_from_dict_callable(converter_registry):
+
+    def convert_some_values(rec: db.Record, records: RecordStore, values: GeneralStore):
+        """Add a URL prefix to a property value if applicable."""
+
+        if rec.get_property("url") is not None:
+
+            old_val = rec.get_property("url").value
+            if not (old_val is None or old_val.startswith("http")):
+
+                # only add if there is a value that doesn't look like a URL
+                rec.get_property("url").value = f"https://test.com/{old_val}"
+
+        return rec
+
+    pdfc = PropertiesFromDictConverter(
+        definition={
+            "record_from_dict": {
+                "variable_name": "MyRec",
+                "name": "My New Record"
+            }
+        },
+        name="TestConverter",
+        converter_registry=converter_registry,
+        referenced_record_callback=convert_some_values
+    )
+
+    values = GeneralStore()
+    records = RecordStore()
+    test_dict_element = DictElement("TestDictElement", {
+        "url": "something",
+        "referenced1": {
+            "url": "referenced"
+        },
+        "referenced2": {
+            "nourl": "something else",
+            "url": "https://indiscale.com"
+        }
+    })
+    pdfc.create_records(values=values, records=records, element=test_dict_element)
+    assert "MyRec" in records
+    my_rec = records["MyRec"]
+    assert isinstance(my_rec, db.Record)
+    assert len(my_rec.parents) == 1
+    assert "MyRec" in [par.name for par in my_rec.parents]
+    assert my_rec.name == "My New Record"
+
+    # simple conversion
+    assert my_rec.get_property("url") is not None
+    assert my_rec.get_property("url").value == "https://test.com/something"
+
+    # also works in referenced
+    assert my_rec.get_property("referenced1") is not None
+    referenced1 = my_rec.get_property("referenced1").value
+    assert isinstance(referenced1, db.Record)
+    assert referenced1.get_property("url") is not None
+    assert referenced1.get_property("url").value == "https://test.com/referenced"
+
+    # ...
and works as expected + assert my_rec.get_property("referenced2") is not None + referenced2 = my_rec.get_property("referenced2").value + assert isinstance(referenced2, db.Record) + assert referenced2.get_property("nourl") is not None + assert referenced2.get_property("nourl").value == "something else" + assert referenced2.get_property("url") is not None + assert referenced2.get_property("url").value == "https://indiscale.com" + + +def test_properties_from_dict_nested(converter_registry): + """Test the PropertiesFromDictConverter with a nested dict, + together with the regular DictElementConverter and Records created + and used on different subtree levels. + + """ + root_dict_element = DictElement("RootDict", { + "TopLevelRec": "MyRec", + "propertiesDict": { + "a": 5, + "blacklisted": { + "bl_name": "BlackList", + "date": "2023-12-31" + } + }, + "otherDict": { + "additional_from_other": "other" + } + }) + def_dict = { + "RootElt": { + # Root dictionary + "type": "DictElement", + "match": ".*", + "records": { + # Define top-level, use below in subtrees + "MyRec": { + "parents": ["MyType"] + } + }, + "subtree": { + # Top-level text element for the Record name + "NameElt": { + "type": "TextElement", + "match_name": "^TopLevelRec$", + "match_value": "(?P<name>.*)", + "records": { + "MyRec": { + "name": "$name" + } + } + }, + "PFDElement": { + "type": "PropertiesFromDictElement", + "match_name": "^propertiesDict$", + "record_from_dict": { + "variable_name": "MyRec", + "properties_blacklist": ["blacklisted"] + }, + "subtree": { + "BLElement": { + "type": "DictElement", + "match_name": "^blacklisted$", + "records": { + "BLRec": { + "parents": ["BlackListedType"], + "MyRec": "$MyRec" + } + }, + "subtree": { + "BLNameElt": { + "type": "TextElement", + "match_name": "^bl_name$", + "match_value": "(?P<name>.*)", + "records": { + "BLRec": { + "name": "$name" + } + } + }, + "BLDateElt": { + "type": "TextElement", + "match_name": "^date$", + "match_value": "(?P<date>.*)", + "records": { + "BLRec": { + "creation_date": "$date" + } + } + } + } + } + } + }, + # Other dict which uses the DictElementConverter + "OtherDictElement": { + "type": "DictElement", + "match_name": "^otherDict$", + "subtree": { + "additionalElt": { + "type": "TextElement", + "match_name": "^additional_from_other$", + "match_value": "(?P<val>.*)", + "records": { + "MyRec": { + "additional_from_other": "$val" + } + } + } + } + } + } + } + } + + records = scan_structure_elements(root_dict_element, def_dict, converter_registry) + + # All records need to be there + assert len(records) == 2 + myrec = None + blrec = None + for rec in records: + if rec.name == "MyRec": + myrec = rec + elif rec.name == "BlackList": + blrec = rec + assert myrec is not None + assert blrec is not None + + # Parent is set from top level + assert len(myrec.parents) == 1 + assert "MyType" in [par.name for par in myrec.parents] + + # Set automatically, with blacklist + assert myrec.get_property("a") is not None + assert myrec.get_property("a").value == 5 + assert myrec.get_property("blacklisted") is None + + # Now check blacklisted record from subtree + assert len(blrec.parents) == 1 + assert "BlackListedType" in [par.name for par in blrec.parents] + assert blrec.get_property("MyRec") is not None + assert blrec.get_property("MyRec").value == myrec + assert blrec.get_property("creation_date") is not None + assert blrec.get_property("creation_date").value == "2023-12-31" + + # The "old" DictConverter should have added the additional property: + assert 
myrec.get_property("additional_from_other") is not None
+    assert myrec.get_property("additional_from_other").value == "other"
+
+
+def test_dict_match_properties(converter_registry):
+
+    root_dict_element = DictElement("RootDict", {
+        "prop_a": "value",
+        "prop_b": "25",
+        "prop_c": 24
+    })
+
+    def_dict = {
+        "RootElt": {
+            # Root dictionary
+            "type": "DictElement",
+            "match_properties": {
+                "prop_a": "(?P<a>.*)$",
+                "prop_[^ac]": "(?P<b>.*)$",
+                "prop_c": "(?P<c>.*)$",
+            },
+            "records": {
+                # Define top-level, use below in subtrees
+                "MyRec": {
+                    "prop_a": "$a",
+                    "prop_b": "$b",
+                    "$a": "$c"
+                }
+            }}}
+    records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
+    assert len(records) == 1
+    record = records[0]
+    assert record.get_property("prop_a").value == "value"
+    assert record.get_property("prop_b").value == "25"
+    assert record.get_property("value").value == "24"  # Note the type change here
+
+    root_dict_element = DictElement("RootDict", {
+        "prop_a": "value",
+        "prop_b": "25",
+        # Property missing
+    })
+
+    records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
+    assert len(records) == 0
+
+    with pytest.raises(RuntimeError,
+                       match="Multiple properties match the same match_properties entry."):
+        root_dict_element = DictElement("RootDict", {
+            "prop_a": "value",
+            "prop_b": "25",
+            "prop_d": 24  # duplicate matches
+        })
+        records = scan_structure_elements(root_dict_element, def_dict, converter_registry)
+
+
+def test_directory_converter_change_date(caplog, converter_registry):
+    """Test that only directories that were modified after a certain
+    date are crawled.
+
+    """
+    test_dir_element = Directory("test_directories", UNITTESTDIR / "test_directories")
+    date_of_dir_change = DirectoryConverter._get_most_recent_change_in_dir(test_dir_element)
+    past_date = date_of_dir_change - datetime.timedelta(days=1)
+    future_date = date_of_dir_change + datetime.timedelta(days=1)
+
+    tmpfi = NamedTemporaryFile(delete=False)
+
+    # Write down past
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{past_date.isoformat()}\n")
+
+    converter_def = {
+        "type": "Directory",
+        "match": "^test_directories$",
+        "match_newer_than_file": tmpfi.name
+    }
+    dc = DirectoryConverter(name="DC1", definition=converter_def,
+                            converter_registry=converter_registry)
+    assert dc.match(test_dir_element) is not None
+
+    # Write down future, so nothing should match
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{future_date.isoformat()}\n")
+    assert dc.match(test_dir_element) is None
+
+    # Also match in the corner case of equality:
+    with open(tmpfi.name, "w") as fi:
+        fi.write(f"{date_of_dir_change.isoformat()}\n")
+    assert dc.match(test_dir_element) is not None
+
+    # Garbage instead of a datetime: match raises and logs an error
+    with open(tmpfi.name, "w") as fi:
+        fi.write("This is garbage.\n")
+    with pytest.raises(ValueError):
+        dc.match(test_dir_element)
+    assert len(caplog.record_tuples) == 1
+    assert caplog.record_tuples[0][1] == logging.ERROR
+    assert tmpfi.name in caplog.record_tuples[0][2]
+    assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2]
+
+    # Match anything since file doesn't exist, inform in debug log.
+    os.remove(tmpfi.name)
+    # Clear log and enforce debug level.
+    caplog.clear()
+    caplog.set_level(logging.DEBUG)
+    assert dc.match(test_dir_element) is not None
+    assert len(caplog.record_tuples) == 1
+    assert caplog.record_tuples[0][1] == logging.DEBUG
+    assert "Reference file doesn't exist."
== caplog.record_tuples[0][2] diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..bdb22ba2171c6d52633c4429d98735e560cf6375 --- /dev/null +++ b/unittests/test_crawler.py @@ -0,0 +1,950 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the Crawler class +""" +import logging +import os +import warnings +from copy import deepcopy +from functools import partial +from os.path import basename, dirname, join +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import linkahead.common.models as dbmodels +import pytest +import yaml +from caosadvancedtools.models.parser import parse_model_from_string +from linkahead.apiutils import compare_entities +from linkahead.cached import cache_clear +from linkahead.exceptions import EmptyUniqueQueryError +from pytest import raises + +import caoscrawler +from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, + crawler_main, split_restricted_path) +from caoscrawler.debug_tree import DebugTree +from caoscrawler.exceptions import (ImpossibleMergeError, + MissingIdentifyingProperty, + MissingReferencingEntityError) +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + LocalStorageIdentifiableAdapter) +from caoscrawler.scanner import (create_converter_registry, scan_directory, + scan_structure_elements) +from caoscrawler.stores import GeneralStore, RecordStore +from caoscrawler.structure_elements import (DictElement, DictListElement, + DictTextElement, File) +from caoscrawler.sync_graph import SyncGraph + +UNITTESTDIR = Path(__file__).parent + +EXAMPLE_SERVER_STATE = [ + db.Property(id=1, name='result', datatype=db.TEXT), + db.Property(id=2, name='date', datatype=db.DATETIME), + db.RecordType(id=3, name="Experiment"), + db.RecordType(id=4, name="Analysis"), + db.Record(id=5) + .add_parent(name="Experiment", id=3) + .add_property(name="date", value="2022-02-01") + .add_property(name="result", value="FAIL"), + db.Record(id=6) + .add_parent(name="Experiment", id=3) + .add_property(name="date", value="2022-02-02") + .add_property(name="result", value="SUCCESS"), + db.Record(id=7) + .add_parent(name="Analysis", id=4) + .add_property(name="date", value="2022-03-01") + .add_property(name="result", value="homogeneous"), + db.Record(id=8) + .add_parent(name="Analysis", id=4) + .add_property(name="date", value="2022-03-02") + .add_property(name="result", value="heterogeneous"), +] +NEW_ELEMENT = (db.Record() + .add_parent(name="Analysis", id=4) + .add_property(name="date", 
value="2022-03-05") # new date + .add_property(name="result", value="homogeneous")) + + +def reset_mocks(mocks): + for mock in mocks: + mock.reset_mock() + + +def mock_create_values(values, element): + pass + + +def mock_get_entity_by_query(query=None): + if query is not None: + return db.Record(id=1111, name='rec_name').add_parent('RT') + + +def mock_get_entity_by(eid=None, name=None, path=None): + if eid is not None: + candidates = [el for el in EXAMPLE_SERVER_STATE if el.id == eid] + if len(candidates) > 0: + return candidates[0] + else: + raise EmptyUniqueQueryError("") + if name is not None: + candidates = [el for el in EXAMPLE_SERVER_STATE + if (el.name is not None and el.name.lower() == name.lower())] + if len(candidates) > 0: + return candidates[0] + else: + raise EmptyUniqueQueryError("") + if path is not None: + candidates = [el for el in EXAMPLE_SERVER_STATE + if (el.path is not None and el.path == path)] + if len(candidates) > 0: + return candidates[0] + else: + raise EmptyUniqueQueryError("") + + +def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): + """ returns a stored Record if rec.name is an existing key, None otherwise """ + if rec.name in known: + return known[rec.name] + else: + return None + + +def mock_retrieve_record(identifiable: Identifiable): + """ assumes that the identifiable is always only the date""" + + for record in EXAMPLE_SERVER_STATE: + if (record.role == "Record" and "date" in identifiable.properties + and record.get_property("date").value == identifiable.properties['date']): + return record + return None + + +def mock_cached_only_rt(query_string: str): + """Always return an empty Container""" + result = db.Container() + lo_query = query_string.lower() + if lo_query.startswith("find record ") or lo_query.startswith("find file "): + return result + model = parse_model_from_string(""" +B: + obligatory_properties: + C: + obligatory_properties: + prop_other: + datatype: INTEGER + prop_ident: + datatype: INTEGER +A: + obligatory_properties: + B: + datatype: LIST<B> + prop_ident: +""") + if query_string == "FIND RECORDTYPE 'A'": + model.get_deep("A").id = 1 + return result + [model.get_deep("A")] + if query_string == "FIND RECORDTYPE 'B'": + model.get_deep("A").id = 2 + return result + [model.get_deep("B")] + print(query_string) + raise NotImplementedError(f"Mock for this case is missing: {query_string}") + + +def mock_cached_only_rt_allow_empty(query_string: str): + try: + result = mock_cached_only_rt(query_string) + except NotImplementedError: + result = db.Container() + return result + + +@pytest.fixture(autouse=True) +def clear_cache(): + cache_clear() + + +@pytest.fixture +def crawler_mocked_identifiable_retrieve(): + crawler = Crawler() + # TODO use minimal setup + # mock retrieval of registered identifiabls: return Record with just a parent + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent(x.parents[0].name).add_property(name='name')) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) + return crawler + + +@pytest.fixture +def crawler_mocked_for_backref_test(): + crawler = Crawler() + # mock retrieval of registered identifiabls: return Record with just a parent + + def get_reg_ident(x): + if x.parents[0].name 
== "C": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR"]).add_property("name") + elif x.parents[0].name == "D": + return db.Record().add_parent(x.parents[0].name).add_property( + "is_referenced_by", value=["BR", "BR2"]).add_property("name") + else: + return db.Record().add_parent(x.parents[0].name).add_property("name") + crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) + + # Simulate remote server content by using the names to identify records + # There is only a single known Record with name A + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( + side_effect=partial( + basic_retrieve_by_name_mock_up, known={"A": + db.Record(id=1111, name="A").add_parent("BR")})) + return crawler + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_constructor(): + # tests that appropriate DeprecationWarnings are triggered by the constructor when deprecated + # arguments are being passed. + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.filterwarnings("ignore") + warnings.filterwarnings("always", category=DeprecationWarning) + + Crawler(debug=True) + assert issubclass(w[-1].category, DeprecationWarning) + assert "The debug argument of the Crawler class" in str(w[-1].message) + + Crawler(generalStore=GeneralStore()) + assert issubclass(w[-1].category, DeprecationWarning) + assert "The generalStore argument of the Crawler" in str(w[-1].message) + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_deprecated_functions(): + # tests that appropriate DeprecationWarnings are triggered by deprecated methods + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. 
+        warnings.filterwarnings("ignore")
+        warnings.filterwarnings("always", category=DeprecationWarning)
+        cr = Crawler()
+        cr.crawl_directory(UNITTESTDIR, UNITTESTDIR / "scifolder_cfood.yml")
+        assert issubclass(w[-1].category, DeprecationWarning)
+        assert "The function crawl_directory in the crawl" in str(w[-1].message)
+
+        cr.start_crawling([], {}, {})
+        assert issubclass(w[-1].category, DeprecationWarning)
+        assert "The function start_crawling in the crawl module" in str(w[-1].message)
+
+        cr.crawled_data
+        assert issubclass(w[-1].category, DeprecationWarning)
+        assert "The use of self.crawled_data is depricated" in str(w[-1].message)
+
+
+def test_check_whether_parent_exists():
+    trivial_result = Crawler.check_whether_parent_exists([], [])
+    assert len(trivial_result) == 0
+    assert isinstance(trivial_result, list)
+
+    trivial_result2 = Crawler.check_whether_parent_exists([db.Record(), db.Record()], [])
+    assert len(trivial_result2) == 0
+    assert isinstance(trivial_result2, list)
+
+    # make sure records with a matching parent are collected
+    a_recs = Crawler.check_whether_parent_exists(
+        [
+            db.Record(id=1).add_parent("A"),
+            db.Record(id=2).add_parent("B"),
+            db.Record(id=3).add_parent("B"),
+            db.Record(id=4).add_parent("A"),
+        ], ["A"])
+    a_recs_ids = [el.id for el in a_recs]
+    assert 1 in a_recs_ids
+    assert 4 in a_recs_ids
+
+
+def test_remove_unnecessary_updates():
+    # test trivial case
+    crawled_data = [db.Record().add_parent("A")]
+    identified_records = [db.Record().add_parent("A")]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
+    assert len(updates) == 0
+
+    # test property difference case
+    crawled_data = [db.Record().add_parent("A").add_property("a", 3)]
+    identified_records = [db.Record().add_parent("A")]  # ID should be s
+    Crawler.remove_unnecessary_updates(crawled_data, identified_records)
+    assert len(crawled_data) == 1
+
+    # test value difference case
+    crawled_data = [db.Record().add_parent("A").add_property("a", 5)]
+    identified_records = [db.Record().add_parent("A").add_property("a")]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
+    assert len(updates) == 1
+    crawled_data = [db.Record().add_parent("A").add_property("a", 5)]
+    identified_records = [db.Record().add_parent("A").add_property("a", 5)]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
+    assert len(updates) == 0
+
+    # test unit difference case
+    crawled_data = [db.Record().add_parent("A").add_property("a", unit='cm')]
+    identified_records = [db.Record().add_parent("A").add_property("a")]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
+    assert len(updates) == 1
+
+    # test None difference case
+    crawled_data = [db.Record().add_parent("A").add_property("a")]
+    identified_records = [db.Record().add_parent("A").add_property("a", 5)]
+    updates = Crawler.remove_unnecessary_updates(crawled_data, identified_records)
+    assert len(updates) == 1
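+
+# (Editorial note on remove_unnecessary_updates, as exercised above: a crawled
+# record is dropped from the update list only when it adds nothing over its
+# identified counterpart; any difference in properties, values or units keeps
+# the update.)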
Identifiable(name="B", record_type="C")] + entlist = [db.Record(name="A").add_parent("C"), + db.Record(name="B").add_parent("C")] + + st = SyncGraph(entlist, crawler.identifiableAdapter) + # check setup + + insert, update = crawler._split_into_inserts_and_updates(st) + assert len(insert) == 1 + assert insert[0].name == "B" + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + +def test_split_into_inserts_and_updates_with_circ(crawler_mocked_identifiable_retrieve): + # test trying to split circular dependency + crawler = crawler_mocked_identifiable_retrieve + crawler.identifiableAdapter.get_registered_identifiable = Mock( + side_effect=lambda x: db.Record().add_parent('C').add_property(name='a') + ) + # two records that reference each other via identifying properties + a = db.Record().add_parent("C") + b = db.Record().add_parent("C").add_property(name='a', value=a) + a.add_property(name='a', value=b) + + st = SyncGraph([a, b], crawler.identifiableAdapter) + with pytest.raises(RuntimeError): + crawler._split_into_inserts_and_updates(st) + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): + crawler = crawler_mocked_identifiable_retrieve + # A + # ^ + # | + # F <- B <- G + a = db.Record(name="A").add_parent("C").add_property( + 'd', 13).add_property('e', "lskdjlsfdj") + b = db.Record(name="B").add_parent("C") + g = db.Record(name="G").add_parent("C") + f = db.Record(name="F").add_parent("C") + g.add_property("C", b) + b.add_property("A", a) + b.add_property("C", f) + entlist = [a, b, g] + st = SyncGraph(entlist, crawler.identifiableAdapter) + insert, update = crawler._split_into_inserts_and_updates(st) + assert len(insert) == 3 + assert "B" in [el.name for el in insert] + assert len(update) == 1 + assert update[0].name == "A" + # if this ever fails, the mock up may be removed + crawler.identifiableAdapter.get_registered_identifiable.assert_called() + crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() + + # TODO write test where the unresoled entity is not part of the identifiable + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt)) +def test_split_iiau_with_unmergeable_list_items(): + """Test for meaningful exception when referencing a list of unmergeable entities. + +Datamodel +--------- +A: + B: LIST<B> + prop_ident: INTEGER + +B: + prop_ident: + C: + +C: + prop_other: INTEGER + +Identifiables +------------- + +id_A: [prop_ident] +id_B: [prop_ident, "is_referenced_by: A"] +id_C: [prop_other, "is_referenced_by: B"] + +Data +---- + +c1: (23) +c2: (42) + +b1: ("same", c1) +b2: ("same", c2) + +a: ([b1, b2]) + + + +- a can be identified. 
+- bs can be identified with each other once a is identified
+- cs depend on b(s), but cannot be put in one Entity because they have conflicting properties
+    """
+    prop_ident = db.Property("prop_ident", datatype=db.INTEGER)
+    prop_other = db.Property("prop_other", datatype=db.INTEGER)
+    rt_c = db.RecordType("C").add_property(prop_other)
+    # Somehow it is necessary that `B` has a reference property. It is unclear
+    # whether C must have an identifiable as well.
+    rt_b = db.RecordType("B").add_property(prop_ident).add_property("C")
+    rt_a = db.RecordType("A").add_property(prop_ident).add_property("LIST<B>")
+
+    ident_a = db.RecordType().add_parent("A").add_property("prop_ident")
+    ident_b = db.RecordType().add_parent("B").add_property("prop_ident").add_property(
+        "is_referenced_by", value="A")
+    ident_c = db.RecordType().add_parent("C").add_property("prop_other").add_property(
+        "is_referenced_by", value="B")
+
+    rec_a = db.Record("a").add_parent(rt_a).add_property("prop_ident", value=1234)
+    rec_b = []
+    rec_c = []
+    for value in [23, 42]:
+        new_c = db.Record().add_parent(rt_c).add_property("prop_other", value=value)
+        rec_c.append(new_c)
+        rec_b.append(db.Record().add_parent(rt_b).add_property(
+            "prop_ident", value=2020).add_property("C", value=new_c))
+    rec_a.add_property("B", rec_b)
+
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.register_identifiable("B", ident_b)
+    ident_adapter.register_identifiable("C", ident_c)
+
+    crawler = Crawler(identifiableAdapter=ident_adapter)
+
+    st = SyncGraph(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter)
+    assert st._identity_relies_on_unchecked_entity(st.nodes[0]) is False
+    assert st._identity_relies_on_unchecked_entity(st.nodes[1])
+    assert st._identity_relies_on_unchecked_entity(st.nodes[2])
+    assert st._identity_relies_on_unchecked_entity(st.nodes[3])
+    assert st._identity_relies_on_unchecked_entity(st.nodes[4])
+    assert len(st.unchecked) == 5
+
+    # The Cs cannot be merged due to different identifying properties
+    # The Bs cannot be merged due to different references to Cs
+    with raises(ImpossibleMergeError) as rte:
+        crawler._split_into_inserts_and_updates(st)
+
+    # The order of the Cs is random so we only know that they are the
+    # last two elements but not in which order they have been tried to
+    # be merged.
+    assert "The problematic property is 'C' with values " in str(rte.value)
+    assert f"'[{st.nodes[-2]}]'" in str(rte.value)
+    assert f"'[{st.nodes[-1]}]'" in str(rte.value)
+
+    # TODO
+    # assert not isinstance(rte.value, NotImplementedError), \
+    #     "Exception must not be NotImplementedError, but plain RuntimeError."
+    # assert "Could not find referencing entities" in rte.value.args[0]
+    # assert "merge conflicts in the referencing" in rte.value.args[0]
+
+
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
+    # test that backrefs are appropriately considered in the identifiable
+    crawler = crawler_mocked_for_backref_test
+    identlist = [Identifiable(name="A", record_type="BR"),
+                 Identifiable(name="B", record_type="C", backrefs=[db.Entity()])]
+    referenced = db.Record(name="B").add_parent("C")
+    entlist = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]
+
+    # Test without referencing object
+    # currently a RuntimeError is raised if necessary properties are missing.
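+    # (More precisely a MissingReferencingEntityError: the mocked registered
+    # identifiable for parent "C" demands an entity that is referenced by a
+    # "BR" record, and no such referencing record exists in this graph.)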
+ with raises(MissingReferencingEntityError): + st = SyncGraph([db.Record(name="B").add_parent("C")], crawler.identifiableAdapter) + + # identifiables were not yet checked + st = SyncGraph(entlist, crawler.identifiableAdapter) + assert st.get_equivalent(st.nodes[1]) is None + assert st.get_equivalent(st.nodes[0]) is None + # one can be found remotely, one not + + # check the split... + insert, update = crawler._split_into_inserts_and_updates(st) + # A was found remotely and is therefore in the update list + assert len(update) == 1 + assert update[0].name == "A" + # B does not exist on the (simulated) remote server + assert len(insert) == 1 + assert insert[0].name == "B" + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the same record type are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("C") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="C").add_parent("BR").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + assert len(identifiable.backrefs) == 2 + + # check the split... + insert, update = crawler._split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): + # test whether multiple references of the different record types are correctly used + crawler = crawler_mocked_for_backref_test + referenced = db.Record(name="B").add_parent("D") + entlist = [referenced, + db.Record(id=1, name="A").add_parent("BR").add_property("ref", referenced), + db.Record(id=2, name="A").add_parent("BR2").add_property("ref", referenced), + ] + + # test whether both entities are listed in the backref attribute of the identifiable + st = SyncGraph(entlist, crawler.identifiableAdapter) + identifiable = crawler.identifiableAdapter.get_identifiable( + st.nodes[0], + st.backward_references_backref[id(st.nodes[0])]) + + assert len(identifiable.backrefs) == 2 + + # check the split... 
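+    # (Editorial note: the two referencing records already carry server ids and
+    # will therefore land in the update list; only the referenced "B" record,
+    # unknown to the mocked server, needs to be inserted.)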
+ insert, update = crawler._split_into_inserts_and_updates(st) + assert len(update) == 2 + assert len(insert) == 1 + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +def test_replace_entities_with_ids(): + crawler = Crawler() + a = (db.Record().add_parent("B").add_property("A", 12345) + .add_property("B", db.Record(id=12345)) + .add_property("C", [db.Record(id=12345), 233324])) + + crawler.replace_entities_with_ids(a) + assert a.get_property("A").value == 12345 + assert a.get_property("B").value == 12345 + assert a.get_property("C").value == [12345, 233324] + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." + "retrieve_identified_record_for_identifiable", + new=Mock(side_effect=mock_retrieve_record)) +@patch("caoscrawler.crawl.db.Container.insert") +@patch("caoscrawler.crawl.db.Container.update") +def test_synchronization_no_commit(upmock, insmock): + crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"] + # change one; add one + crawled_data[-1].get_property('result').value = "wst" + crawled_data.append(NEW_ELEMENT.copy()) + + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") + crawler = Crawler(securityMode=SecurityMode.UPDATE, identifiableAdapter=ident) + ins, ups = crawler.synchronize(commit_changes=False, crawled_data=crawled_data) + insmock.assert_not_called() + upmock.assert_not_called() + assert len(ins) == 1 + assert len(ups) == 1 + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +@patch("caoscrawler.crawl.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +@patch("caoscrawler.identifiable_adapters.CaosDBIdentifiableAdapter." 
+               "retrieve_identified_record_for_identifiable",
+       new=Mock(side_effect=mock_retrieve_record))
+@patch("caoscrawler.crawl.db.Container.insert")
+@patch("caoscrawler.crawl.db.Container.update")
+@patch("caoscrawler.crawl.UpdateCache.insert")
+def test_security_mode(updateCacheMock, upmock, insmock):
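+    # Overview (editorial comment): with SecurityMode.RETRIEVE nothing is
+    # committed and changes are queued in the UpdateCache; with
+    # SecurityMode.INSERT inserts are committed while updates are still only
+    # cached. The blocks below exercise each combination.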
+    # trivial case: nothing to do
+    crawled_data = [r.copy() for r in EXAMPLE_SERVER_STATE if r.role == "Record"]
+    crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
+    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
+    assert crawler.run_id is not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    updateCacheMock.assert_not_called()
+
+    # RETRIEVE: insert only
+    ident = CaosDBIdentifiableAdapter()
+    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
+    crawler = Crawler(securityMode=SecurityMode.RETRIEVE, identifiableAdapter=ident)
+
+    # add a new entity
+    crawled_data.append(NEW_ELEMENT.copy())
+
+    # insert forbidden
+    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
+    assert crawler.run_id is not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    assert updateCacheMock.call_count == 1
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # remove new record again
+    crawled_data.pop()
+
+    # RETRIEVE: update only
+    crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
+    # change one element
+    crawled_data[-1].get_property('result').value = "wst"
+    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
+    assert crawler.run_id is not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    assert updateCacheMock.call_count == 1
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # reset value
+    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
+
+    # INSERT: insert only
+    # add one element
+    crawled_data.append(NEW_ELEMENT.copy())
+    ident = CaosDBIdentifiableAdapter()
+    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
+    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
+    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
+    assert crawler.run_id is not None
+    insmock.assert_called_once()
+    upmock.assert_not_called()
+    updateCacheMock.assert_not_called()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # remove new record again
+    crawled_data.pop()
+
+    # INSERT: update only
+    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
+    # change one element
+    crawled_data[-1].get_property('result').value = "wst"
+    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
+    assert crawler.run_id is not None
+    insmock.assert_not_called()
+    upmock.assert_not_called()
+    updateCacheMock.assert_called_once()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # reset value
+    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
+
+    # INSERT: insert and update
+    ident = CaosDBIdentifiableAdapter()
+    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
+    crawler = Crawler(securityMode=SecurityMode.INSERT, identifiableAdapter=ident)
+    # change one; add one
+    crawled_data[-1].get_property('result').value = "wst"
+    crawled_data.append(NEW_ELEMENT.copy())
+    crawler.synchronize(commit_changes=True, crawled_data=crawled_data)
+    assert crawler.run_id is not None
+    insmock.assert_called_once()
+    upmock.assert_not_called()
+    updateCacheMock.assert_called_once()
+    # reset counts
+    reset_mocks([updateCacheMock, insmock, upmock])
+    # restore original ident
+    crawled_data.pop()
+    crawled_data[-1] = EXAMPLE_SERVER_STATE[-1].copy()
+
+
+def test_validation_error_print(caplog):
+    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
+    # there should be no server interaction since we only test the behavior if a validation error
+    # occurs during the data collection stage
+    DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
+    for fi in ["cfood.yml", "cfood2.yml"]:
+        ret = crawler_main(DATADIR,
+                           os.path.join(DATADIR, fi),
+                           os.path.join(DATADIR, "identifiables.yml"),
+                           True,
+                           None,
+                           False)
+        assert "Couldn't validate" in caplog.text
+        caplog.clear()
+
+
+@patch("caoscrawler.converters.IntegerElementConverter.create_values")
+def test_restricted_path(create_mock):
+    """
+    The restricted_path argument allows to ignore part of the crawled data structure. Here, we
+    make sure that, if that argument is provided, indeed only the given path of the tree is
+    traversed.
+
+    The check is done using the mock of the create_values function of the IntegerElementConverter.
+    This function is only called if elements are being treated.
+    """
+    crawler_definition = {
+        "DictTest": {
+            "type": "DictElement",
+            "match": "(.*)",
+            "subtree": {
+                "nextdict": {
+                    "type": "DictElement",
+                    "match": "(.*)",
+                    "subtree": {
+                        "int_element": {
+                            "type": "IntegerElement",
+                            "match_name": ".*",
+                            "match_value": "(?P<int_value>.*)",
+                            "records": {
+                                "Dataset": {
+                                    "Subject": "$int_value"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    crawler = Crawler()
+    converter_registry = create_converter_registry(crawler_definition)
+
+    # This structure is crawled
+    test_dict = {
+        "v1": {
+            "a": 1,
+            "b": 2,
+        },
+        "v2": {
+            "c": 3,
+            "d": 4,
+        }
+    }
+    # first test without a restricted_path
+    restricted_path = None
+    records = scan_structure_elements(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+
+    # test with a restricted_path but one that has no effect (single root element)
+    # this also tests that the remainder of the tree is fully traversed
+    restricted_path = ["TestDict"]
+    records = scan_structure_elements(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+
+    # test with a restricted_path that restricts the tree (single root element)
+    restricted_path = ["TestDict", "v2"]
+    records = scan_structure_elements(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 2
+    create_mock.reset_mock()
+
+    # test with a restricted_path that contains a bad element
+    restricted_path = ["TestDict", "v3"]
+    with raises(RuntimeError):
+        records = scan_structure_elements(
+            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+            restricted_path
+        )
+
+
+def test_split_restricted_path():
+    assert ["el"] == split_restricted_path(os.path.sep + "el")
+    assert ["el"] == split_restricted_path(os.path.sep + "el" + os.path.sep)
+    assert ["el", "el"] == split_restricted_path(os.path.sep + "el" + os.path.sep + "el")
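+
+# (Editorial note: as the three cases above suggest, split_restricted_path
+# splits on os.path.sep and drops the empty components produced by leading or
+# trailing separators.)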
+
+
+# Filter the warning because we expect it here; this way it does not break
+# test runs with -Werror.
+@pytest.mark.filterwarnings("ignore:The prefix:DeprecationWarning")
+def test_deprecated_prefix_option():
+    """Test that calling the crawler's main function with the deprecated
+    `prefix` option raises the correct errors and warnings.
+
+    """
+
+    with pytest.deprecated_call():
+        crawler_main("./", UNITTESTDIR / "scifolder_cfood.yml", prefix="to/be/removed")
+
+    # Check that crawler main terminates with an error
+    assert 1 == crawler_main("./", UNITTESTDIR / "scifolder_cfood.yml", prefix="to/be/removed",
+                             remove_prefix="to/be/removed")
+
+    with raises(ValueError) as ve:
+        _treat_deprecated_prefix(prefix="to/be/removed", remove_prefix="to/be/removed")
+
+    assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value)
+
+
+def test_create_entity_summary():
+    assert "" == Crawler.create_entity_summary([]).strip()
+
+    entities = [
+        db.Record(id=1).add_parent("A"),
+        db.Record(id=4, name='a').add_parent("B"),
+        db.Record(id=5).add_parent("A"),
+        db.Record(id=6, name='b').add_parent("B"),
+    ]
+    text = Crawler.create_entity_summary(entities).strip()
+    assert 'a' in text
+    assert 'b' in text
+    assert 'A:' in text
+    assert 'B:' in text
+    assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text
+
+
+@patch("caoscrawler.crawl.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by_query))
+def test_replace_name_with_referenced_entity():
+    test_text = 'lkajsdf'
+    test_int = 134343
+    test_id = 1111
+    test_name = 'rec_name'
+
+    # do not touch Properties with non-ref datatype
+    prop = db.Property(name='a', datatype=db.TEXT, value=test_text)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_text
+
+    # do not touch Properties with generic-ref datatype
+    prop = db.Property(name='a', datatype=db.REFERENCE, value=test_text)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_text
+
+    # do not touch Properties with file-ref datatype
+    prop = db.Property(name='a', datatype=db.FILE, value=test_text)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_text
+
+    # do not touch Properties with non-str values
+    prop = db.Property(name='a', datatype="RT", value=test_int)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value is test_int
+
+    # no LinkAhead access until here
+    assert caoscrawler.crawl.cached_get_entity_by.call_count == 0
+
+    # change Properties with custom dt and str value
+    prop = db.Property(name='a', datatype="RT", value=test_name)
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert isinstance(prop.value, int)
+    assert prop.value == test_id
+    assert caoscrawler.crawl.cached_get_entity_by.call_count == 1
+
+    # do not touch Properties with non-ref datatype (LIST)
+    prop = db.Property(name='a', datatype=db.LIST(db.TEXT), value=[test_text])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_text
+
+    # do not touch Properties with generic-ref datatype (LIST)
+    prop = db.Property(name='a', datatype=db.LIST(db.REFERENCE), value=[test_text])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_text
+
+    # do not touch Properties with file-ref datatype (LIST)
+    prop = db.Property(name='a', datatype=db.LIST(db.FILE), value=[test_text])
+    Crawler.replace_name_with_referenced_entity_id(prop)
+    assert prop.value[0] is test_text
+
+    # do not touch Properties with non-str values (LIST)
+    prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_int])
+
Crawler.replace_name_with_referenced_entity_id(prop) + assert prop.value[0] is test_int + + # change Properties with custom dt and str value + prop = db.Property(name='a', datatype=db.LIST("RT"), value=[test_name, db.Record(name='hi'), + test_name]) + Crawler.replace_name_with_referenced_entity_id(prop) + assert isinstance(prop.value[0], int) + assert prop.value[0] == test_id + assert isinstance(prop.value[1], db.Entity) + assert prop.value[1].name == "hi" + assert isinstance(prop.value[2], int) + assert prop.value[2] == test_id + assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 diff --git a/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee572a190bd7f439f638ef7c9a5d94a831aca81 --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_content_no_list.yaml @@ -0,0 +1,4 @@ +Experiment: + date: + - 1 + - 2 diff --git a/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a33c4ace9f8709a9b4a77c5fd8f38514acbe1e9c --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_no_str_or_dict.yaml @@ -0,0 +1,3 @@ +Experiment: +- date +- 23 diff --git a/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a504eab748d4891c3e1088ee785afcf6347fbbab --- /dev/null +++ b/unittests/test_data/invalid_identifiable/identifiable_referenced_no_list.yaml @@ -0,0 +1,5 @@ +Experiment: +- date +Event: +- is_referenced_by: Experiment +- event_id diff --git a/unittests/test_directories/example_variable_deletion/Data_1/bla/README.md b/unittests/test_directories/example_variable_deletion/Data_1/bla/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/example_variable_deletion/Data_2/test/README.md b/unittests/test_directories/example_variable_deletion/Data_2/test/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/examples_json/invalidjson.json b/unittests/test_directories/examples_json/invalidjson.json index 9c012bf062264014278fc2df7be6cf33b65c7469..49a00fc6df33fe8d82ec2735e39c400a2342f0bf 100644 --- a/unittests/test_directories/examples_json/invalidjson.json +++ b/unittests/test_directories/examples_json/invalidjson.json @@ -1,13 +1,13 @@ { - "projectId": 10002, - "archived": false, - "coordinator": { - "firstname": "Miri", - "lastname": "Mueller", - "email": "miri.mueller@science.de" - }, - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "projectId": 10002, + "archived": false, + "coordinator": { + "firstname": "Miri", + "lastname": "Mueller", + "email": "miri.mueller@science.de" + }, + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_directories/examples_json/testjson.json b/unittests/test_directories/examples_json/testjson.json index 
d37ea2defc21d767e4e13ad3b39d6682b3c452ef..29d59780f4824d9c2edbc8fe1da3a6b380def57b 100644 --- a/unittests/test_directories/examples_json/testjson.json +++ b/unittests/test_directories/examples_json/testjson.json @@ -1,22 +1,21 @@ { - "name": "DEMO", - "projectId": 10002, - "archived": false, - "Person": [ - { - "firstname": "Miri", - "lastname": "Mueller", - "other": null, - "email": "miri.mueller@science.de" - }, + "name": "DEMO", + "projectId": 10002, + "archived": false, + "Person": [{ + "firstname": "Miri", + "lastname": "Mueller", + "other": null, + "email": "miri.mueller@science.de" + }, { "firstname": "Mara", "lastname": "Mueller", - "email": "mara.mueller@science.de" + "email": "mara.mueller@science.de" } ], - "start_date": "2022-03-01", - "candidates": ["Mouse", "Penguine"], - "rvalue": 0.4444, - "url": "https://site.de/index.php/" + "start_date": "2022-03-01", + "candidates": ["Mouse", "Penguine"], + "rvalue": 0.4444, + "url": "https://site.de/index.php/" } diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv new file mode 100644 index 0000000000000000000000000000000000000000..be25239a6d96ecde3876a7bbabdae8769994b455 --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv @@ -0,0 +1,4 @@ +event,date +event_a,2025-02-06 +event_b, +event_c,2025-02-06T09:00:00 diff --git a/unittests/test_directories/examples_tables/crawler_for_issue_112.yml b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml new file mode 100644 index 0000000000000000000000000000000000000000..4bab5adabcf889d7784583a80dcbb94b714fd3fc --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml @@ -0,0 +1,27 @@ +ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + CSVTable: + type: CSVTableConverter + match: "test_with_empty\\.csv" + subtree: + Row: + type: DictElement + records: + Event: + subtree: + EventName: + type: TextElement + match_name: "event" + match_value: "(?P<name>.*)" + records: + Event: + name: $name + Date: + type: Datetime + match_name: "date" + match_value: "(?P<date>.+)" + records: + Event: + event_time: $date diff --git a/unittests/test_directories/test_transformers/Day_Mon/README.md b/unittests/test_directories/test_transformers/Day_Mon/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/Day_Tue/README.md b/unittests/test_directories/test_transformers/Day_Tue/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/Day_Unk/README.md b/unittests/test_directories/test_transformers/Day_Unk/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/unittests/test_directories/test_transformers/cfood.yml b/unittests/test_directories/test_transformers/cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..ec79eaf00203b5df1a436dfe50fbf17fa7b764db --- /dev/null +++ b/unittests/test_directories/test_transformers/cfood.yml @@ -0,0 +1,49 @@ + +# See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 +# +Transformers: + ceil: + function: quote + package: shlex + +RootDir: + type: Directory + match: ^.*$ + subtree: + 
DateDir: + type: Directory + match: ^Day_(?P<day_short>.*)$ # Example: Day_Mon + transform: + MakeDayLong: + in: $day_short + out: $day_long + functions: + - submatch: # name of the function + match: Mon # match is one specific argument + then: Monday # then another one + - submatch: # next function + match: Tue + then: Tuesday + TestSplit: + in: $day_short + out: $day_split + functions: + - split: + marker: o + records: + DayFolder: + Day: $day_long + DayShort: $day_short # just for checking, whether this variable remains + DaySplit: $day_split # just for checking, whether this variable remains + Testfi: + type: File + match: ^(?P<no>(\d+ )*)$ + transform: + up: + in: $no + out: $no + functions: + - ceil: {} + records: + Number: + num: $no diff --git a/unittests/test_entity_comparison.py b/unittests/test_entity_comparison.py index 549bc4f42a59765d25446d44fbb845e49ca4d9b9..8543732fde4d584e2022dcf6432e9572ae625eb5 100644 --- a/unittests/test_entity_comparison.py +++ b/unittests/test_entity_comparison.py @@ -2,8 +2,7 @@ # Tests for entity comparison # A. Schlemmer, 06/2021 -import caosdb as db - +import linkahead as db import pytest from pytest import raises diff --git a/unittests/test_file_identifiables.py b/unittests/test_file_identifiables.py deleted file mode 100644 index aff174d0228d2750efd1cca129547c821c974127..0000000000000000000000000000000000000000 --- a/unittests/test_file_identifiables.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/python -# Tests for file identifiables -# A. Schlemmer, 06/2021 - -import caosdb as db - -import pytest -from pytest import raises - -from caoscrawler.identifiable_adapters import LocalStorageIdentifiableAdapter -from caoscrawler.identifiable import Identifiable - - -def test_file_identifiable(): - ident = LocalStorageIdentifiableAdapter() - - # Without a path there is no identifying information - with raises(ValueError): - ident.get_identifiable(db.File(), []) - - fp = "/test/bla/bla.txt" - file_obj = db.File(path=fp) - identifiable = ident.get_identifiable(file_obj) - - # the path is copied to the identifiable - assert fp == identifiable.path - assert isinstance(identifiable, Identifiable) - - # __eq__ function is only defined for Identifiable objects - with raises(ValueError): - file_obj != identifiable - - # since the path does not exist in the data in ident, the follwoing functions return None - assert ident.retrieve_identified_record_for_record(file_obj) is None - assert ident.get_file(identifiable) is None - - # Try again with actual files in the store: - records = ident.get_records() - test_record_wrong_path = db.File(path="/bla/bla/test.txt") - test_record_correct_path = db.File(path="/test/bla/bla.txt") - test_record_alsocorrect_path = db.File(path="/test/bla/bla.txt") - records.append(test_record_wrong_path) - # Now, there is a file, but still wrong path -> result is still None - identified_file = ident.get_file(file_obj) - assert identified_file is None - - records.append(test_record_correct_path) - # now there is a match - identified_file = ident.get_file(file_obj) - assert identified_file is not None - assert identified_file.path == file_obj.path - - with raises(RuntimeError, match=".*unambigiously.*"): - records.append(test_record_alsocorrect_path) - identified_file = ident.get_file(file_obj) diff --git a/unittests/test_h5_converter.py b/unittests/test_h5_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1058812c75c6d1e5ee7028c8f6fccd7081a54c --- /dev/null +++ b/unittests/test_h5_converter.py @@ -0,0 +1,134 @@ 
+# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2023 Florian Spreckelsen <f.spreckelsen@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +from functools import partial +from pathlib import Path + +import linkahead as db +import numpy as np +from pytest import fixture, importorskip +from utils import dircheckstr as dircheck_base + +from caoscrawler.converters.hdf5_converter import ( + H5DatasetElement, H5GroupElement, H5NdarrayElement, + convert_basic_element_with_nd_array, convert_h5_element) +from caoscrawler.debug_tree import DebugTree +from caoscrawler.scanner import scan_directory +from caoscrawler.structure_elements import (FloatElement, ListElement, + TextElement) + +# Skip the whole module if h5py hasn't been installed +h5py = importorskip("h5py") + + +UNITTESTDIR = Path(__file__).parent + +# always add the path here +dircheckstr = partial(dircheck_base, UNITTESTDIR) + + +@fixture +def h5_dummy_file(): + + path = UNITTESTDIR / "hdf5_dummy_file.hdf5" + + return h5py.File(path, 'r') + + +def test_h5_elements(h5_dummy_file): + + elt = convert_h5_element(h5_dummy_file["group_level1_a"], "test") + assert isinstance(elt, H5GroupElement) + + elt = convert_h5_element(h5_dummy_file["root_integers"], "test") + assert isinstance(elt, H5DatasetElement) + + +def test_nd_array_conversion(): + + # Only test array handling here, `convert_basic_element` is tested + # elsewhere. 
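+    # Summary of the conversion rules exercised by the assertions below
+    # (a descriptive reading of the observed behavior, not a specification):
+    #  - arrays holding exactly one element collapse to the matching scalar
+    #    element (TextElement, FloatElement, ...),
+    #  - arrays that squeeze to one dimension become ListElements,
+    #  - anything higher-dimensional is kept as an H5NdarrayElement,
+    #  - non-array inputs are forwarded to the basic element conversion.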
+    arr = np.array([[["something"]]])
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, TextElement)
+    assert elt.value == "something"
+
+    arr = np.zeros((1, 1))
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, FloatElement)
+    assert elt.value == 0
+
+    arr = np.zeros((1, 3, 1))
+    elt = convert_basic_element_with_nd_array(arr)
+    assert isinstance(elt, ListElement)
+    assert elt.value == [0, 0, 0]
+
+    arr = np.array([[1, 2, 3], [4, 5, 6]])
+    elt = convert_basic_element_with_nd_array(arr, internal_path="some/path")
+    assert isinstance(elt, H5NdarrayElement)
+    assert elt.internal_path == "some/path"
+
+    # Non-arrays should be forwarded correctly
+    elt = convert_basic_element_with_nd_array("something")
+    assert isinstance(elt, TextElement)
+    assert elt.value == "something"
+
+    elt = convert_basic_element_with_nd_array([0, 0, 0])
+    assert isinstance(elt, ListElement)
+    assert elt.value == [0, 0, 0]
+
+
+def test_record_creation():
+
+    dbt = DebugTree()
+    records = scan_directory(UNITTESTDIR, UNITTESTDIR / "h5_cfood.yml", debug_tree=dbt)
+
+    # In total 3 records: the file, the Dataset, and its ndarray
+    assert len(records) == 3
+    file_rec = [rec for rec in records if isinstance(rec, db.File)]
+    # exactly one file
+    assert len(file_rec) == 1
+
+    subd = dbt.debug_tree[dircheckstr("hdf5_dummy_file.hdf5")]
+    # At this level, we have 5 variables (directories and paths, plus the
+    # H5File record), and one record.
+    assert len(subd[0]) == 5
+    assert len(subd[1]) == 1
+    file_rec = subd[1]["H5File"]
+    assert file_rec.get_property("H5Dataset") is not None
+    assert file_rec.get_property("H5Dataset").value is not None
+    # Reference properties currently need to be integration tested (especially
+    # with the circular dependency between H5File and NDArray).
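+    # A hedged sketch of what such an integration check might look like
+    # (kept commented out on purpose: it would need a running server, and it
+    # is shown only as an illustration of the intended check):
+    #   ds_id = file_rec.get_property("H5Dataset").value
+    #   ds = db.Record(id=ds_id).retrieve()
+    #   assert ds.get_property("Ndarray") is not None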
+ + # top level integers + subd = dbt.debug_tree["root_integers"] + # Two additional variables (RootIntegerElement + Dataset record), one + # additional record + assert len(subd[0]) == 7 + assert len(subd[1]) == 2 + ds_rec = subd[1]["H5Dataset"] + assert isinstance(ds_rec, db.Record) + assert len(ds_rec.parents) == 1 + assert ds_rec.parents[0].name == "H5Dataset" + assert ds_rec.get_property("Ndarray") is not None + assert ds_rec.get_property("Ndarray").value is not None + assert ds_rec.get_property("attr_data_root") is not None + assert isinstance(ds_rec.get_property("attr_data_root").value, list) + for number in [-2., -4., -8., -10.12345]: + assert number in [float(val) for val in ds_rec.get_property("attr_data_root").value] diff --git a/unittests/test_identifiable.py b/unittests/test_identifiable.py index 3f3c606b163df4dc238be9a669fd31eb630a582d..44aac6a3edd40e0df8558f68083e22245ff58127 100644 --- a/unittests/test_identifiable.py +++ b/unittests/test_identifiable.py @@ -24,10 +24,11 @@ test identifiable module """ +import linkahead as db import pytest -import caosdb as db + from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache +from caoscrawler.sync_node import SyncNode def test_create_hashable_string(): @@ -43,25 +44,20 @@ def test_create_hashable_string(): assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", - properties={'a': db.Record(id=12)}) + properties={'a': SyncNode(db.Record(id=12))}) ) == "P<B>N<A>R<[]>a:12") a = Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", properties={'a': [db.Record(id=12)]})) + Identifiable(name="A", record_type="B", properties={'a': [SyncNode(db.Record(id=12))]})) assert (a == "P<B>N<A>R<[]>a:[12]") assert (Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={'a': [12]})) == "P<B>N<A>R<[]>a:[12]") assert ( Identifiable._create_hashable_string( Identifiable(name="A", record_type="B", properties={ - 'a': [db.Record(id=12), 11]}) + 'a': [SyncNode(db.Record(id=12)), 11]}) ) == "P<B>N<A>R<[]>a:[12, 11]") - assert ( - Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}) - ) != Identifiable._create_hashable_string( - Identifiable(record_type="B", properties={'a': [db.Record()]}))) assert Identifiable._create_hashable_string( - Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)], + Identifiable(name="A", record_type="B", backrefs=[123, SyncNode(db.Record(id=124))], properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5" @@ -74,9 +70,9 @@ def test_repr(): # only test that something meaningful is returned assert 'properties' in str(Identifiable(name="A", record_type="B")) assert str(Identifiable(name="A", record_type="B", properties={'a': 0})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0}' + "properties:\n")[1].split('\n')[0] == '{"a": "0"}' assert str(Identifiable(name="A", record_type="B", properties={'a': 0, 'b': "test"})).split( - "properties:\n")[1].split('\n')[0] == '{"a": 0, "b": "test"}' + "properties:\n")[1].split('\n')[0] == '{"a": "0", "b": "test"}' # TODO(henrik): Add a test using backrefs once that's implemented. 
@@ -88,13 +84,5 @@ def test_equality(): record_id=12, properties={"a": 0}) != Identifiable(record_id=13, properties={"a": 0}) assert Identifiable( record_id=12, properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) != Identifiable(path="b", properties={"a": 0}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(path="a", properties={"a": 1}) - assert Identifiable( - path="a", properties={"a": 0}) == Identifiable(properties={"a": 0}) - assert Identifiable(properties={"a": 0}) == Identifiable( - properties={"a": 0}) - assert Identifiable(properties={"a": 0}) != Identifiable( - properties={"a": 1}) + assert Identifiable(properties={"a": 0}) == Identifiable(properties={"a": 0}) + assert Identifiable(properties={"a": 0}) != Identifiable(properties={"a": 1}) diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index c3b44e1c8e3775fc6e8b7118b82ffa6a20bef484..1c7733acfe952a2f47eff2853c2b90684c098dbf 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -27,18 +27,43 @@ test identifiable_adapters module """ -import os from datetime import datetime -from caoscrawler.identifiable_adapters import ( - CaosDBIdentifiableAdapter, convert_value, IdentifiableAdapter) +from pathlib import Path +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest + +from caoscrawler.exceptions import InvalidIdentifiableYAML from caoscrawler.identifiable import Identifiable -import caosdb as db +from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, + IdentifiableAdapter, + convert_value) +from caoscrawler.sync_graph import SyncNode + +UNITTESTDIR = Path(__file__).parent + + +def mock_retrieve_RecordType(id, name): + return { + "Person": db.RecordType(name="Person"), + "Keyword": db.RecordType(name="Keyword"), + "Project": db.RecordType(name="Project"), + "A": db.RecordType(name="A"), + "Experiment": db.RecordType(name="Experiment"), + "Lab": db.RecordType(name="Lab"), + "Analysis": db.RecordType(name="Analysis"), + "MetaAnalysis": db.RecordType(name="MetaAnalysis").add_parent("Analysis"), + # Test that two parents are possible; only one of them + # (Experiment) has an identifiable. 
+ "Measurement": db.RecordType(name="Measurement").add_parent("Experiment").add_parent("A") + }[name] def test_create_query_for_identifiable(): query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(record_type="Person", properties={"first_name": "A", "last_name": "B"})) - assert query.lower() == "find record person with 'first_name'='a' and 'last_name'='b' " + assert query.lower() == "find record 'person' with 'first_name'='a' and 'last_name'='b' " query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(name="A", record_type="B", properties={ @@ -50,38 +75,53 @@ def test_create_query_for_identifiable(): "h": db.Record(id=1111), "i": db.File(id=1112), "j": [2222, db.Record(id=3333)]})) - assert (query == "FIND RECORD B WITH name='A' AND 'c'='c' AND 'd'='5' AND 'e'='5.5'" + assert (query == "FIND RECORD 'B' WITH name='A' AND 'c'='c' AND 'd'='5' AND 'e'='5.5'" " AND 'f'='2020-10-10T00:00:00' AND 'g'='TRUE' AND 'h'='1111' AND 'i'='1112' AND " "'j'='2222' AND 'j'='3333' ") # The name can be the only identifiable query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(name="TestRecord", record_type="TestType")) - assert query.lower() == "find record testtype with name='testrecord'" + assert query.lower() == "find record 'testtype' with name='testrecord'" # With referencing entity (backref) query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(record_type="Person", backrefs=[14433], properties={'last_name': "B"})) - assert query.lower() == ("find record person which is referenced by 14433 and with " + assert query.lower() == ("find record 'person' which is referenced by 14433 and with " "'last_name'='b' ") # With two referencing entities (backref) query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(record_type="Person", backrefs=[14433, 333], properties={'last_name': "B"})) - assert query.lower() == ("find record person which is referenced by 14433 and which is " + assert query.lower() == ("find record 'person' which is referenced by 14433 and which is " "referenced by 333 and with 'last_name'='b' ") # With single quote in string query = IdentifiableAdapter.create_query_for_identifiable( Identifiable(record_type="Person", backrefs=[], properties={'last_name': "B'Or"})) - assert query == ("FIND RECORD Person WITH 'last_name'='B\\'Or' ") + assert query == ("FIND RECORD 'Person' WITH 'last_name'='B\\'Or' ") + + # With only backref + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(backrefs=[160], properties={})) + assert query == ("FIND RECORD WHICH IS REFERENCED BY 160") + + # With only backref and name + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(backrefs=[160], name='lo', properties={})) + assert query == ("FIND RECORD WHICH IS REFERENCED BY 160 AND WITH name='lo'") + + query = IdentifiableAdapter.create_query_for_identifiable( + Identifiable(record_type="record type", name="it's weird")) + assert query == ("FIND RECORD 'record type' WITH name='it\\'s weird'") +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=mock_retrieve_RecordType)) def test_load_from_yaml_file(): ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition( - os.path.join(os.path.dirname(__file__), "test_directories", - "single_file_test_data", "identifiables.yml") + UNITTESTDIR / "test_directories" / "single_file_test_data" / "identifiables.yml" ) person_i = ident.get_registered_identifiable( @@ -101,6 +141,49 @@ def 
test_load_from_yaml_file(): assert project_i.get_property("title") is not None +def test_invalid_yaml(): + ident = CaosDBIdentifiableAdapter() + invalid_dir = UNITTESTDIR / "test_data" / "invalid_identifiable" + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_content_no_list.yaml") + assert str(exc.value) == "Identifiable contents must be lists, but this was not: Experiment" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_referenced_no_list.yaml") + assert str(exc.value) == "'is_referenced_by' must be a list. Found in: Event" + + with pytest.raises(InvalidIdentifiableYAML) as exc: + ident.load_from_yaml_definition(invalid_dir / "identifiable_no_str_or_dict.yaml") + assert str(exc.value) == ("Identifiable properties must be str or dict, but this one was not:\n" + " Experiment/23") + + +def test_non_default_name(): + ident = CaosDBIdentifiableAdapter() + identifiable = ident.get_identifiable(SyncNode(db.Record(name="don't touch it") + .add_parent("Person") + .add_property(name="last_name", value='Tom'), + db.RecordType() + .add_parent(name="Person") + .add_property(name="last_name")), []) + assert identifiable.name is None + + +def test_wildcard_ref(): + ident = CaosDBIdentifiableAdapter() + rec = (db.Record(name="don't touch it").add_parent("Person") + .add_property(name="last_name", value='Tom')) + dummy = SyncNode(db.Record(), None) + dummy.id = 1 + identifiable = ident.get_identifiable(SyncNode(rec, db.RecordType() + .add_parent(name="Person") + .add_property(name="is_referenced_by", + value=["*"])), + [dummy] + ) + assert identifiable.backrefs[0] == 1 + + def test_convert_value(): # test that string representation of objects stay unchanged. No stripping or so. 
     class A():
@@ -108,3 +191,154 @@ def test_convert_value():
             return " a "
+
     assert convert_value(A()) == " a "
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=mock_retrieve_RecordType))
+def test_get_identifiable():
+    ident = CaosDBIdentifiableAdapter()
+    ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml")
+    rec = (db.Record(id=5)
+           .add_parent(name="Experiment", id=3)
+           .add_property(name="date", value="2022-02-01")
+           .add_property(name="result", value="FAIL"))
+    se = SyncNode(rec,
+                  ident.get_registered_identifiable(rec))
+    id_r0 = ident.get_identifiable(se, [])
+    assert rec.parents[0].name == id_r0.record_type
+    assert rec.get_property("date").value == id_r0.properties["date"]
+    assert len(rec.parents) == 1
+    assert len(rec.properties) == 2
+    assert len(id_r0.properties) == 1
+
+    ident = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType(name="A").add_parent("A").add_property("name").add_property("a")
+    ident.register_identifiable("A", ident_a)
+    rec = (db.Record(id=5)
+           .add_parent(name="A", id=3)
+           .add_property(name="a", value="2022-02-01")
+           .add_property(name="result", value="FAIL"))
+    se = SyncNode(rec, ident.get_registered_identifiable(rec))
+    for el in [
+            db.Record()
+            .add_parent(name="A", id=3)
+            .add_property(name="a", value="2022-02-01")
+            .add_property(name="result", value="FAIL"),
+            db.Record(name='a')
+            .add_parent(name="A", id=3)
+            .add_property(name="a", value="2022-02-01")
+            .add_property(name="result", value="FAIL"),
+    ]:
+        se.update(SyncNode(el))
+
+    id_r0 = ident.get_identifiable(se, [])
+    assert "A" == id_r0.record_type
+    assert "2022-02-01" == id_r0.properties["a"]
+    assert 'a' == id_r0.name
+    assert len(id_r0.properties) == 1
+
+    rec = (db.Record(name='a')
+           .add_parent(name="A")
+           .add_property(name="a", value="2")
+           )
+    se = SyncNode(rec, ident.get_registered_identifiable(rec))
+    se.update(SyncNode(
+        db.Record(name='a')
+        .add_parent(name="A")
+        .add_property(name="a", value="3")
+    ))
+
+    with pytest.raises(RuntimeError):
+        id_r0 = ident.get_identifiable(se, [])
+
+
+@pytest.mark.xfail
+def test_retrieve_identified_record_for_identifiable():
+    # TODO modify this such that it becomes a test that actually tests (sufficiently) the
+    # retrieve_identified_record_for_identifiable function
+    idr_r0_test = ident.retrieve_identified_record_for_identifiable(id_r0)
+    idr_r0 = ident.retrieve_identified_record_for_record(r_cur)
+    assert idr_r0 == idr_r0_test
+
+    # take the first measurement in the list of records:
+    for r in ident.get_records():
+        if r.parents[0].name == "Measurement":
+            r_cur = r
+            break
+
+    id_r1 = ident.get_identifiable(r_cur, [])
+    assert r_cur.parents[0].name == id_r1.record_type
+    assert r_cur.get_property(
+        "identifier").value == id_r1.properties["identifier"]
+    assert r_cur.get_property("date").value == id_r1.properties["date"]
+    assert r_cur.get_property(
+        "project").value == id_r1.properties["project"]
+    assert len(r_cur.parents) == 1
+    assert len(r_cur.properties) == 4
+    assert len(id_r1.properties) == 3
+
+    idr_r1_test = ident.retrieve_identified_record_for_identifiable(id_r1)
+    idr_r1 = ident.retrieve_identified_record_for_record(r_cur)
+    assert idr_r1 == idr_r1_test
+    assert idr_r1 != idr_r0
+    assert idr_r1_test != idr_r0_test
+
+    assert len(idr_r1.properties) == 4
+    assert r_cur.get_property(
+        "responsible").value == idr_r1.get_property("responsible").value
+    assert r_cur.description == idr_r1.description
+
+
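+# A minimal usage sketch, not from the original suite; it mirrors
+# test_non_default_name above and only uses calls already exercised in this
+# file (the names "Person", "last_name" and "Doe" are illustrative).
+def test_identifiable_from_registered_definition_sketch():
+    ident = CaosDBIdentifiableAdapter()
+    rec = (db.Record()
+           .add_parent("Person")
+           .add_property(name="last_name", value="Doe"))
+    registered = (db.RecordType()
+                  .add_parent(name="Person")
+                  .add_property(name="last_name"))
+    identifiable = ident.get_identifiable(SyncNode(rec, registered), [])
+    assert identifiable.record_type == "Person"
+    assert identifiable.properties["last_name"] == "Doe"
+
+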
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_referencing_entity_has_appropriate_type(): + dummy = db.Record().add_parent("A") + registered_identifiable = db.RecordType() + rft = IdentifiableAdapter.referencing_entity_has_appropriate_type + assert not rft([], registered_identifiable) + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.add_property("is_referenced_by", "B") + assert not rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "A"] + assert rft(dummy.parents, registered_identifiable) + registered_identifiable.properties[0].value = ["B", "*"] + assert rft(dummy.parents, registered_identifiable) + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=mock_retrieve_RecordType)) +def test_get_registered_identifiable(): + # Test the case that the record has a parent for which an identifiable is registered + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") + rec = db.Record().add_parent(name="Experiment") + registered = ident.get_registered_identifiable(rec) + assert registered is not None + assert registered.parents[0].name == "Experiment" + + # Test the same but with an additional parent + rec = db.Record().add_parent(name="Lab").add_parent(name="Experiment") + registered = ident.get_registered_identifiable(rec) + assert registered is not None + assert registered.parents[0].name == "Experiment" + + # Test the same but with an additional parent that also has a registered identifiable + rec = db.Record().add_parent(name="Analysis").add_parent(name="Experiment") + with pytest.raises(RuntimeError): + registered = ident.get_registered_identifiable(rec) + + # Test the same but with an additional parent that has a parent with a registered identifiable + rec = db.Record().add_parent(name="MetaAnalysis").add_parent(name="Experiment") + with pytest.raises(RuntimeError): + registered = ident.get_registered_identifiable(rec) + + # Test the case that the record has a parent for which no + # identifiable is registered and there is a registered + # identifiable for a grand parent. Note that this also tests the + # case of two grandparents, only one of which has an identifiable. + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_definition(UNITTESTDIR / "example_identifiables.yml") + rec = db.Record().add_parent(name="Measurement") + registered = ident.get_registered_identifiable(rec) + assert registered is not None + assert registered.parents[0].name == "Experiment" diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py deleted file mode 100644 index 4ed7c55c7326415308917e20e9f391b17b07ad87..0000000000000000000000000000000000000000 --- a/unittests/test_identified_cache.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# ** header v3.0 -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> -# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# -# ** end header -# - -""" -test identified_cache module -""" - -import caosdb as db -from caoscrawler.identifiable import Identifiable -from caoscrawler.identified_cache import IdentifiedCache - - -def test_IdentifiedCache(): - ident = Identifiable(name="A", record_type="B") - record = db.Record("A").add_parent("B").add_property('b', 5) - cache = IdentifiedCache() - assert ident not in cache - cache.add(record=record, identifiable=ident) - assert ident in cache - assert cache[ident] is record - assert Identifiable(name="A", record_type="C") != Identifiable(name="A", record_type="B") - assert Identifiable(name="A", record_type="C") not in cache diff --git a/unittests/test_issues.py b/unittests/test_issues.py index a1724e5a989190977a7ec0d86846fc2b7433ab5d..779f77711fe18df2433f03580e7e3e4f2035f0f4 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -20,15 +20,44 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import mark +import importlib -import caosdb as db +from pathlib import Path +from pytest import fixture, mark +from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter) from caoscrawler.crawl import Crawler -from caoscrawler.identifiable import Identifiable -from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter -from caoscrawler.structure_elements import DictElement -from test_tool import rfp +from caoscrawler.scanner import (create_converter_registry, scan_directory, + scan_structure_elements) +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import DictElement, TextElement + + +UNITTESTDIR = Path(__file__).parent + + +@fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "Datetime": { + "converter": "DatetimeElementConverter", + "package": "caoscrawler.converters" + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry def test_issue_10(): @@ -55,14 +84,13 @@ def test_issue_10(): } } - crawler = Crawler(debug=True) - converter_registry = crawler.load_converters(crawler_definition) + converter_registry = create_converter_registry(crawler_definition) test_dict = { "float_value": 4 } - records = crawler.start_crawling( + records = scan_structure_elements( DictElement("TestDict", test_dict), crawler_definition, converter_registry) assert len(records) == 1 assert records[0].parents[0].name == "TestRec" @@ -94,7 +122,7 @@ def test_list_datatypes(): } } - crawler = Crawler(debug=True) + crawler = Crawler() converter_registry = crawler.load_converters(crawler_definition) test_dict = { @@ -110,3 +138,89 @@ def test_list_datatypes(): assert 
isinstance(records[0].get_property("Subject").value, list) assert records[0].get_property("Subject").datatype is not None assert records[0].get_property("Subject").datatype.startswith("LIST") + + +def test_issue_93(): + """https://gitlab.com/linkahead/linkahead-crawler/-/issues/93 + + cfood.yaml does not allow umlaut in $expression""" + values = GeneralStore() + expressions = [ + "foo", + "foo.bär", + "_1", + "Ä", + "ųøîµ", + ] + for exp in expressions: + values[exp] = f"This is {exp}" + # ## Test preliminary check + # With braces + for exp in expressions: + assert replace_variables(f"${{{exp}}}", values) == f"This is {exp}" + # Without braces + for exp in expressions: + assert replace_variables(f"${exp}", values) == f"This is {exp}" + + # ## Test actual replacement + for exp in expressions: + # as-is + propvalue = f"${{{exp}}}" + propvalue_template = CrawlerTemplate(propvalue) + # from IPython import embed + # embed() + + assert propvalue_template.safe_substitute(**values.get_storage()) == f"This is {exp}" + + # String embedded into context + propvalue = f"some text before >> ${{{exp}}} << some text after" + print(propvalue) + propvalue_template = CrawlerTemplate(propvalue) + assert (propvalue_template.safe_substitute(**values.get_storage()) + == f"some text before >> This is {exp} << some text after") + + +def test_issue_112(converter_registry): + """Test that empty table cells are not matched in case of + ``match_value: ".+"``. + + See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112. + + """ + tec = TextElementConverter( + name="TestTextConverter", + definition={ + "match_name": ".*", + "match_value": "(?P<content>.+)" + }, + converter_registry=converter_registry + ) + + empty = TextElement(name="empty", value='') + assert tec.match(empty) is None + + empty_none = TextElement(name="empty", value=None) + assert tec.match(empty_none) is None + + non_empty = TextElement(name="empty", value=' ') + matches = tec.match(non_empty) + assert "content" in matches + assert matches["content"] == ' ' + + # Cfood definition for CSV example file + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml") + assert records + for rec in records: + print(rec.name) + assert len(rec.parents.filter_by_identity(name="Event")) > 0 + assert rec.name in ["event_a", "event_b", "event_c"] + if rec.name == "event_a": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06" + if rec.name == "event_b": + # `date` field is empty, so there must be no match + assert rec.get_property("event_time") is None + if rec.name == "event_c": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06T09:00:00" diff --git a/unittests/test_json.py b/unittests/test_json.py index 41fd31a43389148ad6fbc4167fd3fbd4f7f2ee9f..5d145b38fd36fa2de4e4ab754cbadda0fff6eff7 100644 --- a/unittests/test_json.py +++ b/unittests/test_json.py @@ -26,30 +26,31 @@ """ test the JSON converter """ -import json import os +from pathlib import Path +import linkahead as db from pytest import raises -import caosdb as db - from caoscrawler.converters import JSONFileConverter from caoscrawler.crawl import Crawler +from caoscrawler.scanner import (create_converter_registry, load_definition, + scan_structure_elements) from caoscrawler.structure_elements import File, JSONFile -from test_tool import rfp, dircheckstr + 
+UNITTESTDIR = Path(__file__).parent def test_json(): - crawler_definition_path = rfp("test_directories", "examples_json", - "jsontest_cfood.yml") - json_file_path = rfp("test_directories", "examples_json", "testjson.json") + crawler_definition_path = (UNITTESTDIR / "test_directories" / "examples_json" + / "jsontest_cfood.yml") + json_file_path = UNITTESTDIR / "test_directories" / "examples_json" / "testjson.json" - crawler = Crawler(debug=True) - crawler_definition = crawler.load_definition(crawler_definition_path) + crawler_definition = load_definition(crawler_definition_path) # Load and register converter packages: - converter_registry = crawler.load_converters(crawler_definition) + converter_registry = create_converter_registry(crawler_definition) - records = crawler.start_crawling( + records = scan_structure_elements( JSONFile(os.path.basename(json_file_path), json_file_path), crawler_definition, converter_registry @@ -68,10 +69,8 @@ def test_json(): def test_broken_validation(): - crawler_definition_path = rfp( - "broken_cfoods", "broken_validation_path.yml") - crawler = Crawler() + crawler_definition_path = UNITTESTDIR / "broken_cfoods" / "broken_validation_path.yml" with raises(FileNotFoundError) as err: - crawler_definition = crawler.load_definition(crawler_definition_path) + crawler_definition = load_definition(crawler_definition_path) assert str(err.value).startswith("Couldn't find validation file") diff --git a/unittests/test_macros.py b/unittests/test_macros.py index b5ea5d84846f5f33853910c292132d7b5026600e..03fe0e665652bb12e204d76857771c1d064ec28a 100644 --- a/unittests/test_macros.py +++ b/unittests/test_macros.py @@ -22,14 +22,15 @@ # ** end header # -from caoscrawler.macros import defmacro_constructor, macro_constructor -from caoscrawler.macros.macro_yaml_object import macro_store -from caoscrawler.crawl import Crawler - from tempfile import NamedTemporaryFile -import yaml import pytest +import yaml + +from caoscrawler.crawl import Crawler +from caoscrawler.macros import defmacro_constructor, macro_constructor +from caoscrawler.macros.macro_yaml_object import macro_store +from caoscrawler.scanner import load_definition @pytest.fixture @@ -49,17 +50,16 @@ def _temp_file_load(txt: str): definition using load_definition from Crawler. 
""" definition = None - with NamedTemporaryFile() as f: + with NamedTemporaryFile(delete=False) as f: f.write(txt.encode()) f.flush() - c = Crawler() - definition = c.load_definition(f.name) + definition = load_definition(f.name) return definition def test_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -85,7 +85,7 @@ testnode: def test_macro_list_replacment(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -112,7 +112,7 @@ testnode: def test_multi_macros(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test_one params: {} @@ -142,6 +142,7 @@ def test_multi_macros_toplevel(register_macros, macro_store_reset): dat_loader = list(yaml.safe_load_all(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: test_one @@ -168,6 +169,10 @@ testnode: !macro def test_load_definition(register_macros, macro_store_reset): txt = """ +--- +metadata: + crawler-version: 0.9.0 +--- extroot: type: Directory match: extroot @@ -183,11 +188,13 @@ extroot: cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: test_one params: {} definition: + type: TextElement replaced1: ok - !defmacro name: test_two @@ -207,6 +214,7 @@ extroot: extroot2: !macro # test top level macro test_one: extroot3: + type: Directory subtree: SimulationData: !macro test_two: @@ -217,38 +225,124 @@ extroot3: assert cfood["extroot3"]["subtree"]["SimulationData"]["match"] == "SimulationData" -@pytest.mark.xfail def test_replace_arbitrary_objects(register_macros, macro_store_reset): """ See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/24 """ dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: b: 25 + testvar_list_empty: [] testvar_list: - a - $b + testvar_dict_empty: {} testvar_dict: t1: a t2: $b definition: replaced1: $b: ok - c: $testvar_dict - d: $testvar_list + dict_empty: $testvar_dict_empty + dict: $testvar_dict + list_empty: $testvar_list_empty + list: ${testvar_list} testnode: obl: !macro test: """, Loader=yaml.SafeLoader) print(yaml.dump(dat)) - assert dat["testnode"]["obl"]["replaced1"]["c"]["t1"] == "a" - assert dat["testnode"]["obl"]["replaced1"]["c"]["t2"] == "25" - assert dat["testnode"]["obl"]["replaced1"]["d"][0] == "a" - assert dat["testnode"]["obl"]["replaced1"]["d"][1] == "25" + replaced = dat["testnode"]["obl"]["replaced1"] + assert replaced["dict_empty"] == {} + assert replaced["dict"]["t1"] == "a" + assert replaced["dict"]["t2"] == 25 + assert replaced["list_empty"] == [] + assert replaced["list"][0] == "a" + assert replaced["list"][1] == 25 + + +def test_macros_in_macros(register_macros, macro_store_reset): + """ + Test that macros can be used in macro definitions. 
+ """ + cfood = _temp_file_load(""" +--- +metadata: + crawler-version: 0.9.0 + macros: + - !defmacro + name: one_macro + params: + a: 25 + definition: + type: DictElement + macro_sub_$a: + b: $a + another_param: 3 + - !defmacro + name: test_macrodef + params: {} + definition: + type: DictElement + macro_top: !macro + one_macro: + - a: 17 + - {} + - a: 98 + not_macro: + a: 26 +--- +extroot: !macro + test_macrodef: + """) + + assert "test_macro" not in cfood["extroot"] + assert cfood["extroot"]["macro_top"]["not_macro"]["a"] == 26 + d = cfood["extroot"]["macro_top"] + assert d["macro_sub_17"]["b"] == 17 + assert d["macro_sub_17"]["another_param"] == 3 + assert d["macro_sub_25"]["b"] == 25 + assert d["macro_sub_25"]["another_param"] == 3 + assert d["macro_sub_98"]["b"] == 98 + assert d["macro_sub_98"]["another_param"] == 3 + + +@pytest.mark.xfail( + reason="This is discussed in the following issue" + "https://gitlab.com/caosdb/caosdb-crawler/-/issues/74." +) +def test_silent_overwrite(register_macros, macro_store_reset): + cfood = _temp_file_load(""" +--- +metadata: + crawler-version: 0.9.0 + macros: + - !defmacro + name: one_macro + params: + a: 25 + definition: + macro_sub: + b: $a + another_param: 3 + - !defmacro + name: test_macrodef + params: {} + definition: + macro_top: !macro + one_macro: + - a: 17 + - a: 98 +--- +extroot: !macro + test_macrodef: + """) + + assert len(cfood["extroot"]["macro_top"]) == 2 def test_circular_macro_definition(register_macros, macro_store_reset): @@ -256,11 +350,13 @@ def test_circular_macro_definition(register_macros, macro_store_reset): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: test_one params: {} definition: !macro + type: TextElement test_two: - !defmacro name: test_two @@ -276,6 +372,7 @@ metadata: name: test_four params: {} definition: !macro + type: TextElement test_four: --- extroot: !macro @@ -304,6 +401,7 @@ def test_use_macro_twice(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: test_twice @@ -311,6 +409,7 @@ metadata: macro_name: default_name a: 4 definition: + type: DictElement $macro_name: something: a: $a @@ -324,9 +423,9 @@ extroot: !macro """) for name in ["once", "twice", "default_name"]: assert name in cfood["extroot"] - assert cfood["extroot"]["once"]["something"]["a"] == "4" - assert cfood["extroot"]["twice"]["something"]["a"] == "5" - assert cfood["extroot"]["default_name"]["something"]["a"] == "4" + assert cfood["extroot"]["once"]["something"]["a"] == 4 + assert cfood["extroot"]["twice"]["something"]["a"] == 5 + assert cfood["extroot"]["default_name"]["something"]["a"] == 4 # Code sample to generate the expanded macro: # with open("expanded_test_macro.yaml", "w") as f: # f.write(yaml.dump(cfood)) @@ -337,6 +436,7 @@ def test_documentation_example_2(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: MarkdownFile @@ -374,6 +474,7 @@ def test_documentation_example_1(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: SimulationDatasetFile @@ -422,6 +523,7 @@ def test_def_replacements(): cfood = _temp_file_load(""" --- metadata: + crawler-version: 0.9.0 macros: - !defmacro name: test_def_replacements @@ -460,7 +562,7 @@ extroot: !macro def test_list_macro_application(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -484,14 +586,14 @@ testnode: test2: a: 4 """, Loader=yaml.SafeLoader) - assert 
dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_4_test2"]["param"] == 4 def test_variable_in_macro_definition(register_macros, macro_store_reset): dat = yaml.load(""" -defs: +macros: - !defmacro name: test params: @@ -509,7 +611,7 @@ testnode: - a: 2 b: 4 """, Loader=yaml.SafeLoader) - assert dat["testnode"]["obl"]["expanded_4"]["param"] == "4" - assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == "4" - assert dat["testnode"]["obl"]["expanded_2"]["param"] == "2" - assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == "4" + assert dat["testnode"]["obl"]["expanded_4"]["param"] == 4 + assert dat["testnode"]["obl"]["expanded_4"]["param_b"] == 4 + assert dat["testnode"]["obl"]["expanded_2"]["param"] == 2 + assert dat["testnode"]["obl"]["expanded_2"]["param_b"] == 4 diff --git a/unittests/test_parent_cfood.yml b/unittests/test_parent_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..21b49a2db8ac44f806c77718b2fa49fbc7488828 --- /dev/null +++ b/unittests/test_parent_cfood.yml @@ -0,0 +1,39 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +Definitions: + type: Definitions + +data: + type: Dict + match_name: '.*' + records: + Experiment: + Projekt: + parents: ["project"] + name: "p" + Campaign: + name: "c" + Stuff: + name: "s" + subtree: + Experiment: + type: DictElement + match: '.*' + records: + Experiment: + parents: ["Exp"] + name: "e" + Projekt: + parents: ["Projekt"] + Campaign: + parents: ["Cap"] + Stuff: + name: "s" + Experiment2: + type: DictElement + match: '.*' + records: + Campaign: + parents: ["Cap2"] diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..4b6bde171c789017e95a38729ae93f49ecf3f97b --- /dev/null +++ b/unittests/test_rocrate_converter.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+#
+
+"""
+test the ELN file and RO-Crate entity converters
+"""
+import importlib
+import os
+from pathlib import Path
+
+import linkahead as db
+import pytest
+import rocrate
+import yaml
+from caoscrawler import scanner
+from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter
+from caoscrawler.stores import GeneralStore
+from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity,
+                                            TextElement)
+from rocrate.model.entity import Entity
+
+UNITTESTDIR = Path(__file__).parent
+
+
+@pytest.fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "ELNFile": {
+            "converter": "ELNFileConverter",
+            "package": "caoscrawler.converters"},
+        "ROCrateEntity": {
+            "converter": "ROCrateEntityConverter",
+            "package": "caoscrawler.converters",
+        }
+    }
+
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
+
+
+@pytest.fixture
+def basic_eln_converter(converter_registry):
+    return ELNFileConverter(yaml.safe_load("""
+type: ELNFile
+match: .*\\.eln
+"""), "TestELNConverter", converter_registry)
+
+
+@pytest.fixture
+def eln_entities(basic_eln_converter):
+    f_k4mat = File("records-example.eln",
+                   os.path.join(UNITTESTDIR, "eln_files", "records-example.eln"))
+    store = GeneralStore()
+    entities = basic_eln_converter.create_children(store, f_k4mat)
+    return entities
+
+
+@pytest.mark.xfail(
+    reason="The example files for PASTA have not yet been updated in: "
+    "https://github.com/TheELNConsortium/TheELNFileFormat/tree/master/examples/PASTA. "
+    "However, it has been announced that these files will soon follow the "
+    "flattened structure: https://github.com/TheELNConsortium/TheELNFileFormat/issues/98"
+)
+def test_load_pasta(basic_eln_converter):
+    """
+    Test for loading the .eln example export from PASTA.
+    """
+    f_pasta = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln"))
+    match = basic_eln_converter.match(f_pasta)
+    assert match is not None
+    entities = basic_eln_converter.create_children(GeneralStore(), f_pasta)
+    assert len(entities) == 20
+    assert isinstance(entities[0], ROCrateEntity)
+    assert isinstance(entities[0].folder, str)
+    assert isinstance(entities[0].entity, Entity)
+
+
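+# A hedged negative-match sketch, not part of the original suite: a file
+# element whose name does not end in ".eln" should not be matched by the ELN
+# converter. Only the match() pattern from the surrounding tests is used; the
+# backing path simply reuses an existing example file.
+def test_eln_converter_rejects_other_names(basic_eln_converter):
+    f_other = File("records-example.json",
+                   os.path.join(UNITTESTDIR, "eln_files", "records-example.eln"))
+    assert basic_eln_converter.match(f_other) is None
+
+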
+ """ + f_k4mat = File("records-example.eln", + os.path.join(UNITTESTDIR, "eln_files", "records-example.eln")) + match = basic_eln_converter.match(f_k4mat) + assert match is not None + entities = basic_eln_converter.create_children(GeneralStore(), f_k4mat) + assert len(entities) == 17 + assert isinstance(entities[0], ROCrateEntity) + assert isinstance(entities[0].folder, str) + assert isinstance(entities[0].entity, Entity) + + +def test_match_rocrate_entities(eln_entities): + ds1 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_properties: + "@id": \\./ + datePublished: (?P<datePublished>.*) +"""), "TestELNConverter", converter_registry) + + match = ds1.match(eln_entities[0]) + assert match is not None + + ds2 = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: CreativeWork +match_properties: + "@id": ro-crate-metadata.json + dateCreated: (?P<dateCreated>.*) +"""), "TestELNConverter", converter_registry) + + match = ds2.match(eln_entities[0]) + assert match is None + match = ds1.match(eln_entities[1]) + assert match is None + + match = ds2.match(eln_entities[1]) + assert match is not None + assert match["dateCreated"] == "2024-11-19T13:44:35.476888+00:00" + + children = ds2.create_children(GeneralStore(), eln_entities[1]) + assert len(children) == 8 + assert isinstance(children[0], TextElement) + assert children[0].name == "@id" + assert children[0].value == "ro-crate-metadata.json" + assert isinstance(children[5], ROCrateEntity) + assert children[5].name == "https://kadi.iam.kit.edu" + + +def test_file(eln_entities): + ds_csv = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: File +match_properties: + "@id": .*\.csv$ +"""), "TestELNConverter", converter_registry) + + ent_csv = eln_entities[5] + match = ds_csv.match(ent_csv) + assert match is not None + + children = ds_csv.create_children(GeneralStore(), ent_csv) + + # Number of children = number of properties + number of files: + assert len(children) == len(ent_csv.entity.properties()) + 1 + # Get the file: + f_csv = [f for f in children if isinstance(f, File)][0] + with open(f_csv.path) as f: + text = f.read() + assert "Ultrasound Transducer" in text + + +def test_has_part(eln_entities): + ds_parts = ROCrateEntityConverter(yaml.safe_load(""" +type: ROCrateEntity +match_type: Dataset +match_properties: + "@id": records-example/ +"""), "TestELNConverter", converter_registry) + + ent_parts = eln_entities[2] + match = ds_parts.match(ent_parts) + assert match is not None + + children = ds_parts.create_children(GeneralStore(), ent_parts) + # Number of children = number of properties + number of parts + + # number of variables measured + number of files + assert len(children) == (len(ent_parts.entity.properties()) + + len(ent_parts.entity.properties()["hasPart"]) + + len(ent_parts.entity.properties()["variableMeasured"])) + + entity_children = [f for f in children if isinstance(f, ROCrateEntity)] + assert len(entity_children) == 13 + file_counter = 0 + + for f in entity_children: + if isinstance(f.entity, rocrate.model.file.File): + file_counter += 1 + assert file_counter == 4 + + +def test_scanner(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert len(rlist) == 1 + assert isinstance(rlist[0], db.Record) + assert rlist[0].name == "records-example" + # This assertion was moved to a different test, see below: + # assert rlist[0].description == "This is a sample record." 
+ assert rlist[0].parents[0].name == "Dataset" + assert rlist[0].get_property("keywords").value == "sample" + assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" + + +def test_description_reference(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert rlist[0].description == "This is a sample record." diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index 1bf8f0b7d67f00f2018b5b68424d6b9cc17602eb..577fcd5f6c93bee2bc05451983d358aa2e07f798 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -2,24 +2,20 @@ # Tests for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/9 # A. Schlemmer, 06/2021 +from pathlib import Path import pytest +from utils import dircheckstr # The main function that is affected by this issue: from caoscrawler.converters import handle_value from caoscrawler.crawl import Crawler +from caoscrawler.debug_tree import DebugTree +from caoscrawler.scanner import scan_directory # We need the store for the above function from caoscrawler.stores import GeneralStore -from test_tool import dircheckstr, rfp - - -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("cfoods_scalar.yml")) - return crawler +UNITTESTDIR = Path(__file__).parent def test_handle_value(): @@ -27,31 +23,43 @@ def test_handle_value(): store = GeneralStore() # This one should work: - assert handle_value("bla", store) == ("bla", "single") + assert handle_value("bla", store) == ("bla", None, "single") # These failed: - assert handle_value(4, store) == (4, "single") - assert handle_value(4.2, store) == (4.2, "single") - assert handle_value(True, store) == (True, "single") + assert handle_value(4, store) == (4, None, "single") + assert handle_value(4.2, store) == (4.2, None, "single") + assert handle_value(True, store) == (True, None, "single") # List test: - assert handle_value([4, 3, 2], store) == ([4, 3, 2], "single") + assert handle_value([4, 3, 2], store) == ([4, 3, 2], None, "single") -def test_record_structure_generation(crawler): - subd = crawler.debug_tree[dircheckstr("DataAnalysis")] +def test_record_structure_generation(): + dbt = DebugTree() + scan_directory(UNITTESTDIR / "test_directories" / "examples_article", + UNITTESTDIR / "cfoods_scalar.yml", + debug_tree=dbt) + subd = dbt.debug_tree[dircheckstr( + UNITTESTDIR / "test_directories" / "examples_article", "DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree - assert len(subd[0]) == 3 - assert "Data" in subd[0] - assert "DataAnalysis" in subd[0] - assert "RecordThatGetsParentsLater" in subd[0] + if "Data" in subd[0]: + subddata = subd[0] + subdRTGPL = subd[1] + else: + subddata = subd[1] + subdRTGPL = subd[0] + assert len(subddata) == 5 + assert "DataAnalysis" in subddata + assert "DataAnalysis.path" in subddata + assert "Data.path" in subddata + assert "RecordThatGetsParentsLater" in subddata - prop = subd[0]["RecordThatGetsParentsLater"].get_property("someId") + prop = subddata["RecordThatGetsParentsLater"].get_property("someId") assert type(prop.value) == int assert prop.value == 23 # record store on Data Analysis node of debug tree - assert len(subd[1]) == 1 - prop2 = subd[1]["RecordThatGetsParentsLater"].get_property("someId") + assert len(subdRTGPL) == 1 + prop2 = subdRTGPL["RecordThatGetsParentsLater"].get_property("someId") assert prop == 
prop2 diff --git a/unittests/test_scanner.py b/unittests/test_scanner.py new file mode 100644 index 0000000000000000000000000000000000000000..c531f66fd38a714ba4f6f538d41c9fbaeb364d44 --- /dev/null +++ b/unittests/test_scanner.py @@ -0,0 +1,497 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2023,2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023,2024 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# 2021-2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +""" +Unit test functions for the scanner. +""" + +from functools import partial +from pathlib import Path +from tempfile import NamedTemporaryFile +from unittest.mock import MagicMock, Mock, patch +import os +import linkahead as db +import pytest +import yaml +from pytest import raises +from utils import dircheckstr as dircheck_base + +from caoscrawler.crawl import Crawler +from caoscrawler.debug_tree import DebugTree +from caoscrawler.scanner import (_load_definition_from_yaml_dict, + create_converter_registry, load_definition, + scan_directory, scan_structure_elements) +from caoscrawler.structure_elements import (DictElement, DictListElement, + DictTextElement, File) + +UNITTESTDIR = Path(__file__).parent + +dircheckstr = partial(dircheck_base, UNITTESTDIR / "test_directories" / "examples_article") + + +def test_scan_structure_elements(): + tmpfi = NamedTemporaryFile(delete=False) + with open(UNITTESTDIR / "example_datastructure.yml", "r") as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + + crawler_definition = load_definition(UNITTESTDIR / "example_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + recs = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(recs) == 4 + + +def test_provenance_debug_data(): + # TODO rewrite the test to use a smaller example setup + tmpfi = NamedTemporaryFile(delete=False) + debug_tree = DebugTree() + with open(UNITTESTDIR / "example_datastructure.yml", "r") as f: + data = yaml.load(f, Loader=yaml.SafeLoader) + + crawler_definition = load_definition(UNITTESTDIR / "example_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + stuff = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry, debug_tree=debug_tree) + crawler = Crawler() + crawler.save_debug_data(tmpfi.name, debug_tree) + with open(tmpfi.name, "r") as f: + provenance = yaml.load(f, Loader=yaml.SafeLoader) + + pr = provenance["provenance"] + + def check_key_count(prefix): + return sum([1 for key in pr.keys() if key.startswith(prefix)]) + assert check_key_count("Ent") == 4 + + +def test_record_structure_generation(): + # TODO 
create a test from this that tests scan_structure + # the cfood should be minimal but cover typical scenarios (e.g. children) + # add also a minimal test for scan_directory; it can be very basic since the only difference + # to scan_structure is the kind of starting structure_element (check this statement) + # The test should not check debug tree output but actual created records + + # TODO test creation of debug information in a separate test + + dbt = DebugTree() + scan_directory(UNITTESTDIR / "test_directories" / "examples_article", + UNITTESTDIR / "scifolder_cfood.yml", + debug_tree=dbt) + subd = dbt.debug_tree[dircheckstr("DataAnalysis")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis")] + assert len(subd) == 2 + # variables store on Data Analysis node of debug tree + assert len(subd[0]) == 4 + # record store on Data Analysis node of debug tree + assert len(subd[1]) == 0 + assert len(subc) == 2 + assert len(subc[0]) == 4 + assert len(subc[1]) == 0 + + # The data analysis node creates one variable for the node itself: + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") + assert subc[0]["DataAnalysis"] is False + + subd = dbt.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis", "2020_climate-model-predict")] + + assert len(subd[1]) == 1 + assert len(subd[1]["Project"].get_parents()) == 1 + assert subd[1]["Project"].get_parents()[0].name == "Project" + assert subd[1]["Project"].get_property("date").value == "2020" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" + + assert len(subd[0]) == 9 + assert subd[0]["date"] == "2020" + assert subd[0]["identifier"] == "climate-model-predict" + assert subd[0]["Project"].__class__ == db.Record + + assert subd[0]["DataAnalysis"] == os.path.join("examples_article", "DataAnalysis") + assert subc[0]["DataAnalysis"] is True + assert subd[0]["project_dir"] == os.path.join( + "examples_article", "DataAnalysis", "2020_climate-model-predict") + assert subc[0]["project_dir"] is False + + # Check the copy flags for the first level in the hierarchy: + assert len(subc[0]) == 9 + assert len(subc[1]) == 1 + assert subc[1]["Project"] is False + assert subc[0]["Project"] is False + assert subc[0]["date"] is False + assert subc[0]["identifier"] is False + + subd = dbt.debug_tree[dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + subc = dbt.debug_metadata["copied"][dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] + assert len(subd[0]) == 12 + assert subd[0]["date"] == "2020-02-08" + assert subd[0]["identifier"] == "prediction-errors" + assert subd[0]["Project"].__class__ == db.Record + assert subd[0]["Measurement"].__class__ == db.Record + + assert len(subd[1]) == 2 + + assert len(subd[1]["Project"].get_parents()) == 1 + assert subd[1]["Project"].get_parents()[0].name == "Project" + assert subd[1]["Project"].get_property("date").value == "2020" + assert subd[1]["Project"].get_property( + "identifier").value == "climate-model-predict" + + assert len(subd[1]["Measurement"].get_parents()) == 1 + assert subd[1]["Measurement"].get_parents()[0].name == "Measurement" + assert subd[1]["Measurement"].get_property("date").value == "2020-02-08" + assert subd[1]["Measurement"].get_property( + "identifier").value == "prediction-errors" + assert subd[1]["Measurement"].get_property("project").value != "$Project" + assert 
subd[1]["Measurement"].get_property( + "project").value.__class__ == db.Record + assert subd[1]["Measurement"].get_property( + "project").value == subd[0]["Project"] + + # Check the copy flags for the second level in the hierarchy: + assert subc[1]["Project"] is True + assert subc[0]["Project"] is True + assert subc[1]["Measurement"] is False + assert subc[0]["Measurement"] is False + assert subc[0]["date"] is False + assert subc[0]["identifier"] is False + + +def test_record_generation(): + """ + Test the correct list of returned records by the scanner using the + scifolder example from the article. + """ + + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_article", + UNITTESTDIR / "scifolder_cfood.yml") + + def parent_filter(parent_name): + return [p for p in records if len(p.parents) == 1 and p.parents[0].name == parent_name] + + def check_properties(records, check_props, check_additional=True): + records_found = [0 for r in check_props] + for rec in records: + rec_found = 0 + # Try each record to check + for i, check_prop in enumerate(check_props): + matches = True + # Verify that all props are in the record and have the right value + for pr in check_prop: + if rec.get_property(pr) is None: + matches = False + break + if check_prop[pr] is None: + if rec.get_property(pr).value is not None: + matches = False + break + else: + if rec.get_property(pr).value != check_prop[pr]: + matches = False + break + if check_additional: + # Verify that there are no additional props in the record + for rpr in rec.properties: + if rpr.name not in check_prop: + matches = False + break + if matches: + records_found[i] += 1 + return records_found + + # Check projects: + # Ther are two projects in mixed categories: climate_model_predict and SpeedOfLight + projects_found = check_properties(parent_filter("Project"), [ + {"identifier": "climate-model-predict", "date": "2020"}, + {"identifier": "SpeedOfLight", "date": "2020"} + ]) + assert projects_found == [3, 2] + + measurements = parent_filter("Measurement") + assert len(measurements) == 11 + measurements_found = check_properties(measurements, [ + {"identifier": "prediction-errors", "date": "2020-02-08"}, + {"identifier": "average-all-exp", "date": "2020-01-04"}, + {"identifier": "average-all-exp-corr", "date": "2020-01-05"}, + {"date": "1980-01-01", "identifier": None}, + {"date": "1990-01-01", "identifier": None}, + {"date": "2000-01-01", "identifier": None}, + {"date": "2010-01-01", "identifier": None}, + {"date": "2020-01-01", "identifier": "TimeOfFlight"}, + {"date": "2020-01-02", "identifier": "Cavity"}, + {"date": "2020-01-03", "identifier": None}, + {"date": "2020-02-01", "identifier": None}, + ], False) + for f in measurements_found: + assert f == 1 + + persons = parent_filter("Person") + check_props = [ + {"first_name": None, "last_name": "Author" + letter} for letter in + ("A", "B", "C", "D", "E")] + persons_found = check_properties(persons, check_props) + for f in persons_found: + assert f > 0 + + +def test_variable_deletion_problems(): + records = scan_directory(UNITTESTDIR / "test_directories" / "example_variable_deletion", + UNITTESTDIR / "cfood_variable_deletion.yml") + + for record in records: + if record.name == "Record from Data_1": + assert record.get_property("var1").value == "bla" + assert record.get_property("var2").value == "$test_2" + elif record.name == "Record from Data_2": + assert record.get_property("var1").value == "$test_1" + assert record.get_property("var2").value == "test" + else: + raise 
RuntimeError("Wrong name") + + records = scan_directory(UNITTESTDIR / "test_directories" / "example_variable_deletion", + UNITTESTDIR / "cfood_variable_deletion2.yml") + + # For the following test the order of records is actually important: + assert records[0].name == "Record from Data_1" + assert records[1].name == "Record from Data_2" + for record in records: + if record.name == "Record from Data_1": + assert record.get_property("var1").value == "bla" + assert record.get_property("var2").value == "$test_2" + elif record.name == "Record from Data_2": + assert record.get_property("var1").value == "example_variable_deletion" + assert record.get_property("var2").value == "test" + else: + raise RuntimeError("Wrong name") + + +def test_record_parents(): + """ Test the correct list of returned records by the scanner """ + + data = { + 'Experiments': {} + } + + crawler_definition = load_definition(UNITTESTDIR / "test_parent_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 4 + for rec in records: + if rec.name == 'e': + assert rec.parents[0].name == 'Exp' # default parent was overwritten + assert len(rec.parents) == 1 + elif rec.name == 'c': + assert rec.parents[0].name == 'Cap2' # default parent was overwritten by second + # converter + assert len(rec.parents) == 1 + elif rec.name == 'p': + assert rec.parents[0].name == 'Projekt' # top level set parent was overwritten + assert len(rec.parents) == 1 + elif rec.name == 's': + assert rec.parents[0].name == 'Stuff' # default parent stays if no parent is given on + # lower levels + assert len(rec.parents) == 1 + + +def test_error_messages(): + data = { + 'Experiments': {} + } + + broken_yaml = """ +EmptyConverter: + """ + broken_definition = _load_definition_from_yaml_dict( + [yaml.load(broken_yaml, Loader=yaml.SafeLoader)]) + + converter_registry = create_converter_registry(broken_definition) + + with pytest.raises(RuntimeError, match="Definition of converter \"EmptyConverter\" is empty"): + scan_structure_elements(DictElement(name="", value=data), + broken_definition, converter_registry) + + broken_yaml = """ +Converter: + type: DictElement + records: + TestRecord: "42" + """ + + broken_definition = _load_definition_from_yaml_dict( + [yaml.load(broken_yaml, Loader=yaml.SafeLoader)]) + + converter_registry = create_converter_registry(broken_definition) + + with pytest.raises(RuntimeError, match="dict expected, but found str: 42"): + scan_structure_elements(DictElement(name="", value=data), + broken_definition, converter_registry) + + +def test_units(): + """Test the correct setting of units.""" + crawler_definition = load_definition(UNITTESTDIR / "test_unit_cfood.yml") + converter_registry = create_converter_registry(crawler_definition) + + data = { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 1 + rec = records[0] + # This is hard-coded in cfood: + assert rec.get_property("may_be_overwritten") is not None + assert rec.get_property("may_be_overwritten").value == "12" + assert rec.get_property("may_be_overwritten").unit == "K" + # Those are set from data + assert rec.get_property("value_with_unit") is not None + assert rec.get_property("value_with_unit").value == "1.1" + assert 
rec.get_property("value_with_unit").unit == "m" + assert rec.get_property("list_with_unit") is not None + assert rec.get_property("list_with_unit").value == ["1.1", "2.2"] + assert rec.get_property("list_with_unit").unit == "cm" + + # Contradictory units + data = { + "array_with_units": [ + "1.1 K", + "45 W" + ] + } + with raises(RuntimeError) as rte: + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert "Property 'list_with_unit' has contradictory units" in str(rte.value) + + # Overwrite value and unit + data = { + "may_be_overwritten": "400 °C" + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + assert len(records) == 1 + rec = records[0] + # Now set from data + assert rec.get_property("may_be_overwritten") is not None + assert rec.get_property("may_be_overwritten").value == "400" + assert rec.get_property("may_be_overwritten").unit == "°C" + + +def test_recursive_definition(): + """ + This is basically a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + """ + + recursive_yaml = """ +Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + } + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + assert len(records) == 2 + assert len(records[0].parents) == 1 + assert records[0].parents[0].name == "Block" + assert len(records[1].parents) == 1 + assert records[1].parents[0].name == "Experiment" + + assert records[0].get_property("Experiment").value == records[1] + assert records[1].get_property("Block").value == records[0] + + +def test_recursive_definition_2(): + """ + This is another a test for: + https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/16 + + It defines Experiment on a different level, therefore allowing the recursive definition. + This is, however, no workaround for test_recursive_definition as a bidirectional link on the + same level is still not achieved. 
+ """ + + recursive_yaml = """ +FirstConverter: + type: DictElement + records: + Experiment: + subtree: + Converter: + type: DictElement + records: + Block: + Experiment: $Experiment + Experiment: + Block: $Block + """ + + crawler_definition = _load_definition_from_yaml_dict( + [yaml.load(recursive_yaml, Loader=yaml.SafeLoader)]) + converter_registry = create_converter_registry(crawler_definition) + + data = {"data": { + "value_with_unit": "1.1 m", + "array_with_units": [ + "1.1 cm", + "2.2 cm" + ] + }} + records = scan_structure_elements(DictElement(name="", value=data), crawler_definition, + converter_registry) + + assert len(records) == 2 + assert len(records[0].parents) == 1 + assert records[0].parents[0].name == "Block" + assert len(records[1].parents) == 1 + assert records[1].parents[0].name == "Experiment" + + assert records[0].get_property("Experiment").value == records[1] + assert records[1].get_property("Block").value == records[0] diff --git a/unittests/test_schema.py b/unittests/test_schema.py index 0736698eb32146fb3cfbee6acbcf11f5436df27e..96c388ac362583eda13ca368519467c34446868e 100644 --- a/unittests/test_schema.py +++ b/unittests/test_schema.py @@ -2,16 +2,16 @@ # Tests for schema validation # A. Schlemmer, 06/2021 -from importlib_resources import files -import caosdb as db - -from os.path import join, dirname -from caoscrawler import Crawler +from os.path import dirname, join +import linkahead as db import pytest +from importlib_resources import files +from jsonschema.exceptions import ValidationError from pytest import raises -from jsonschema.exceptions import ValidationError +from caoscrawler import Crawler +from caoscrawler.scanner import load_definition def rfp(*pathcomponents): @@ -23,9 +23,15 @@ def rfp(*pathcomponents): def test_schema_validation(): - cr = Crawler() - cr.load_definition(rfp("scifolder_cfood.yml")) - cr.load_definition(rfp("scifolder_extended.yml")) + load_definition(rfp("scifolder_cfood.yml")) + load_definition(rfp("scifolder_extended.yml")) + load_definition(rfp("record_from_dict_cfood.yml")) with raises(ValidationError, match=".*enum.*"): - cr.load_definition(rfp("broken_cfoods", "broken1.yml")) + load_definition(rfp("broken_cfoods", "broken1.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict.yml")) + + with raises(ValidationError, match=".*required.*"): + load_definition(rfp("broken_cfoods", "broken_record_from_dict_2.yml")) diff --git a/unittests/test_scripts.py b/unittests/test_scripts.py new file mode 100644 index 0000000000000000000000000000000000000000..da03c1f24fbd3d7ca13cfa55d6f69c0cb5a6a6f1 --- /dev/null +++ b/unittests/test_scripts.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# This file is a part of the LinkAhead project. +# +# Copyright (C) 2024 IndiScale GmbH <www.indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. 
If not, see <https://www.gnu.org/licenses/>. + +"""Test if the scripts work as expected. +""" + +from subprocess import run + +SCRIPTS = [ + "linkahead-crawler", + "caosdb-crawler", + "spss_to_datamodel", + "csv_to_datamodel", +] + + +def test_script_loading(): + """Run the scripts with "-h".""" + for script in SCRIPTS: + run([script, "-h"], check=True) diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..59fe723849dadcda21a699416372f08f2756f4e1 --- /dev/null +++ b/unittests/test_spss_converter.py @@ -0,0 +1,79 @@ +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com> +# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +"""Testing converter for SPSS files.""" + +import datetime +import importlib +from pathlib import Path + +import numpy as np +import pytest + +from caoscrawler.converters import ConverterValidationError, SPSSConverter +from caoscrawler.structure_elements import (BooleanElement, DictElement, + Directory, File, FloatElement, + IntegerElement, ListElement, + TextElement) + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_spss_converter(converter_registry): + converter = SPSSConverter({ + "match": ("sample.sav") + }, + "ThisConverterNameIsIrrelevant", converter_registry + ) + + spss_dir = UNITTESTDIR / "test_tables" / "spss" + for sav_file, length, thistype in [ + (File("sample.sav", spss_dir / "sample.sav"), 5, str), + (File("sample.sav", spss_dir / "sample_large.sav"), 485, int), + ]: + m = converter.match(sav_file) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, sav_file) + assert len(children) == length + + for ii, child in enumerate(children): + assert child.__class__ == DictElement + assert child.name == str(ii) + my_dict = child.value + assert isinstance(my_dict["mychar"], str) + assert isinstance(my_dict["mydate"], datetime.date) or np.isnan(my_dict["mydate"]) + assert isinstance(my_dict["dtime"], datetime.datetime) or np.isnan(my_dict["dtime"]) + assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) + assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}" + assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}" diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py new file mode 100644 index 
0000000000000000000000000000000000000000..030306b95578865cdbfe19bdef2998a573848bd5
--- /dev/null
+++ b/unittests/test_sync_graph.py
@@ -0,0 +1,698 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# This file is a part of the LinkAhead Project.
+#
+# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
+import logging
+from functools import partial
+from itertools import product
+from unittest.mock import MagicMock, Mock, patch
+
+import linkahead as db
+import pytest
+from test_crawler import (basic_retrieve_by_name_mock_up,
+                          mock_cached_only_rt_allow_empty, mock_get_entity_by)
+
+from caoscrawler.exceptions import (MissingIdentifyingProperty,
+                                    MissingRecordType)
+from caoscrawler.identifiable import Identifiable
+from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
+from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value
+from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list
+
+
+@pytest.fixture
+def simple_adapter():
+    # different RTs with different registered identifiables to allow testing various behaviors
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_adapter.register_identifiable(
+        "RT1",
+        db.RecordType().add_parent("RT1").add_property("RT2"))
+    ident_adapter.register_identifiable(
+        "RT2",
+        db.RecordType().add_parent("RT2").add_property("is_referenced_by", ["RT1", "RT3"]))
+    ident_adapter.register_identifiable(
+        "RT3",
+        db.RecordType().add_parent("RT3").add_property("a"))
+    ident_adapter.register_identifiable(
+        "RT4",
+        db.RecordType().add_parent("RT4").add_property("RT3"))
+    ident_adapter.register_identifiable(
+        "RT5",
+        db.RecordType().add_parent("RT5").add_property("name"))
+    return ident_adapter
+
+
+def test_create_flat_list():
+    a = db.Record()
+    b = db.Record()
+    a.add_property(name="a", value=a)
+    a.add_property(name="b", value=b)
+    flat = SyncGraph._create_flat_list([a])
+    assert len(flat) == 2
+    assert a in flat
+    assert b in flat
+    c = db.Record()
+    c.add_property(name="a", value=a)
+    # This would cause a recursion error if it is not dealt with properly.
+    a.add_property(name="c", value=c)
+    flat = SyncGraph._create_flat_list([c])
+    assert len(flat) == 3
+    assert a in flat
+    assert b in flat
+    assert c in flat
+
+    # Test for lists:
+    a = db.Record()
+    b = db.Record()
+    d = db.Record()
+    a.add_property(name="a", value=a)
+    a.add_property(name="list", value=[b, d])
+    flat = SyncGraph._create_flat_list([a])
+    assert len(flat) == 3
+    assert a in flat
+    assert b in flat
+    assert d in flat
+
+    c = db.Record()
+    c.add_property(name="a", value=a)
+    # This would cause a recursion error if it is not dealt with properly.
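+    # (c references a, and a's new list property references c back, closing the cycle.)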
+ a.add_property(name="second_list", value=[b, d, c]) + flat = SyncGraph._create_flat_list([c]) + assert len(flat) == 4 + assert a in flat + assert b in flat + assert c in flat + assert d in flat + + +@patch("caoscrawler.identifiable_adapters.get_children_of_rt", + new=Mock(side_effect=lambda x: [x])) +def test_create_reference_mapping(): + a = SyncNode(db.Record().add_parent("RT1"), + db.RecordType().add_property("is_referenced_by", ["RT2"])) + b = SyncNode(db.Record(id=132).add_parent("RT2").add_property('a', a), + db.RecordType().add_property("a")) + ses = [a, b] + + mappings = SyncGraph._create_reference_mapping(ses) + # test initialization + for index, mapping in product((0, 1), mappings): + assert id(ses[index]) in mapping + + (forward_references, backward_references, forward_references_id_props, + backward_references_id_props, forward_references_backref, + backward_references_backref) = mappings + + # a has no ref + assert len(forward_references[id(a)]) == 0 + assert backward_references[id(a)] == set([b]) + # b does + assert forward_references[id(b)] == set([a]) + assert backward_references[id(b)] == set() + # a has no identifying reference + assert forward_references_id_props[id(a)] == set() + assert backward_references_id_props[id(a)] == set([b]) + # b has an identifying reference + assert forward_references_id_props[id(b)] == set([a]) + assert backward_references_id_props[id(b)] == set() + # a has an identifying back reference + assert forward_references_backref[id(a)] == set() + assert backward_references_backref[id(a)] == set([b]) + # b does not + assert forward_references_backref[id(b)] == set([a]) + assert backward_references_backref[id(b)] == set() + + +@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +@patch("caoscrawler.sync_graph.cached_get_entity_by", + new=Mock(side_effect=mock_get_entity_by)) +def test_SyncGraph_init(): + # trivial case + a = db.Record(id=101).add_parent("A") + ident_a = db.RecordType().add_parent("A").add_property("prop_ident") + ident_adapter = CaosDBIdentifiableAdapter() + ident_adapter.register_identifiable("A", ident_a) + SyncGraph([a], ident_adapter) + SyncGraph([], ident_adapter) # should not fail either... 
+    # test whether missing identifying properties cause an exception
+    with pytest.raises(MissingIdentifyingProperty):
+        SyncGraph([db.Record().add_parent("A")], ident_adapter)
+
+    entlist = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=102).add_parent("A"),
+        db.File(path='a').add_parent("A"),
+        db.File(path='b').add_parent("A"),
+        db.Record(id=103).add_parent("A"),
+        db.Record(id=104).add_parent("A").add_property(name='prop_ident', value="MERGEME"),
+        db.Record().add_parent("A").add_property(name='prop_ident', value="MERGEME"),
+        db.File(path='a', file='b').add_parent("A"),
+        db.Record(id=101).add_parent("A"),
+        db.Record().add_parent("A").add_property(name='prop_ident', value="other"),
+        db.Record().add_parent("A").add_property(name='prop_ident',
+                                                 value=db.Record().add_parent("A")
+                                                 .add_property(name='prop_ident', value="other")),
+        db.File(path='a', file='b').add_parent("A"),
+        db.Record(id=101).add_parent("A"),
+    ]
+    st = SyncGraph(entlist, ident_adapter)
+    # all nodes with ID=101 have been merged
+    assert len([el for el in st.nodes if el.id == 101]) == 1
+    # all nodes with path='a' have been merged
+    assert len([el for el in st.nodes if el.path == 'a']) == 1
+    # all nodes with ID or path were removed from unchecked
+    for el in st.nodes:
+        if el.id is not None or el.path is not None:
+            assert el not in st.unchecked
+    # all nodes with ID are in the ID lookup
+    for el in st.nodes:
+        if el.id is not None:
+            assert st._id_look_up[el.id] is el
+    # all nodes with path are in the path lookup
+    for el in st.nodes:
+        if el.path is not None:
+            assert st._path_look_up[el.path] is el
+    # all nodes with identifiable are in the identifiable lookup
+    for el in st.nodes:
+        if el.identifiable is not None:
+            assert st._identifiable_look_up[el.identifiable.get_representation()] is el
+    # The node, which has no ID but has an identifiable, was merged with another node with ID (due
+    # to the shared identifiable)
+    new_one = [el for el in st.nodes if len(el.properties) > 0
+               and el.properties[0].value == "MERGEME"]
+    assert len(new_one) == 1
+    assert new_one[0].id == 104
+    # every node that does not rely on something unchecked has an identifiable or an ID
+    for el in st.nodes:
+        if not st._identity_relies_on_unchecked_entity(el):
+            assert el.identifiable is not None or el.id is not None
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_merge_into_trivial(simple_adapter):
+    # simplest case: a -> c
+    #                b
+    # (a references c; b does not reference anything; a & b have the same target
+    # record)
+    c = db.Record(name='c').add_parent("RT2")
+    a = db.Record(name='a').add_parent("RT1").add_property('RT2', c)
+    b = db.Record(id=101).add_parent("RT1")
+
+    st = SyncGraph([a, b], simple_adapter)
+    se_a, se_b, se_c = st.nodes
+    assert se_a.name == 'a'
+    assert se_b.id == 101
+    assert se_c.name == 'c'
+
+    # CHECK REFERENCE MAP (before merge):
+    # c is referenced by a
+    assert len(st.forward_references[id(se_a)]) == 1
+    assert se_c in st.forward_references[id(se_a)]
+    assert len(st.forward_references[id(se_b)]) == 0
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert len(st.backward_references[id(se_a)]) == 0
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 1
+    assert se_a in st.backward_references[id(se_c)]
+
+    assert 
len(st.forward_references_id_props[id(se_a)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_a)]
+    assert len(st.forward_references_id_props[id(se_b)]) == 0
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert len(st.backward_references_id_props[id(se_a)]) == 0
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_a in st.backward_references_id_props[id(se_c)]
+
+    assert len(st.forward_references_backref[id(se_a)]) == 1
+    assert se_c in st.forward_references_backref[id(se_a)]
+    assert len(st.forward_references_backref[id(se_b)]) == 0
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert len(st.backward_references_backref[id(se_a)]) == 0
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_a in st.backward_references_backref[id(se_c)]
+
+    st.set_id_of_node(se_a, 101)
+
+    # CHECK REFERENCE MAP (after merge):
+    # c is now referenced by b
+    assert id(se_a) not in st.forward_references
+    assert len(st.forward_references[id(se_b)]) == 1
+    assert se_c in st.forward_references[id(se_b)]
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 1
+    assert se_b in st.backward_references[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_id_props
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_id_props
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_backref
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_backref
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_b in st.backward_references_backref[id(se_c)]
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_merge_into_simple(simple_adapter):
+    # simple case: a -> c <- b (a & b reference c; a & b have the same target record)
+    c = db.Record(name='c').add_parent("RT2")
+    a = db.Record().add_parent("RT1").add_property('RT2', c)
+    b = db.Record().add_parent("RT1").add_property('RT2', c)
+
+    st = SyncGraph([a, b], simple_adapter)
+    se_a = st.nodes[0]
+    se_b = st.nodes[1]
+    se_c = st.nodes[2]
+
+    # CHECK REFERENCE MAP:
+    # c is referenced by a & b
+    assert len(st.forward_references[id(se_a)]) == 1
+    assert se_c in st.forward_references[id(se_a)]
+    assert len(st.forward_references[id(se_b)]) == 1
+    assert se_c in st.forward_references[id(se_b)]
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert len(st.backward_references[id(se_a)]) == 0
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 2
+    assert se_a in st.backward_references[id(se_c)]
+    assert se_b in 
st.backward_references[id(se_c)]
+
+    assert len(st.forward_references_id_props[id(se_a)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_a)]
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert len(st.backward_references_id_props[id(se_a)]) == 0
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 2
+    assert se_a in st.backward_references_id_props[id(se_c)]
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert len(st.forward_references_backref[id(se_a)]) == 1
+    assert se_c in st.forward_references_backref[id(se_a)]
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert len(st.backward_references_backref[id(se_a)]) == 0
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 2
+    assert se_a in st.backward_references_backref[id(se_c)]
+    assert se_b in st.backward_references_backref[id(se_c)]
+
+    st._merge_into(se_a, se_b)
+
+    # CHECK REFERENCE MAP (after merge):
+    # c is now referenced by b
+    # (same situation as above)
+    assert id(se_a) not in st.forward_references
+    assert len(st.forward_references[id(se_b)]) == 1
+    assert se_c in st.forward_references[id(se_b)]
+    assert len(st.forward_references[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references
+    assert len(st.backward_references[id(se_b)]) == 0
+    assert len(st.backward_references[id(se_c)]) == 1
+    assert se_b in st.backward_references[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_id_props
+    assert len(st.forward_references_id_props[id(se_b)]) == 1
+    assert se_c in st.forward_references_id_props[id(se_b)]
+    assert len(st.forward_references_id_props[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_id_props
+    assert len(st.backward_references_id_props[id(se_b)]) == 0
+    assert len(st.backward_references_id_props[id(se_c)]) == 1
+    assert se_b in st.backward_references_id_props[id(se_c)]
+
+    assert id(se_a) not in st.forward_references_backref
+    assert len(st.forward_references_backref[id(se_b)]) == 1
+    assert se_c in st.forward_references_backref[id(se_b)]
+    assert len(st.forward_references_backref[id(se_c)]) == 0
+    assert id(se_a) not in st.backward_references_backref
+    assert len(st.backward_references_backref[id(se_b)]) == 0
+    assert len(st.backward_references_backref[id(se_c)]) == 1
+    assert se_b in st.backward_references_backref[id(se_c)]
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_backward_references_backref():
+    # We use the reference as identifying reference in both directions. 
Thus the map is the same
+    # for all three categories: references, id_references and id_referenced_by
+    ident_a = db.RecordType().add_parent("BR").add_property("name")
+    ident_b = db.RecordType().add_parent("C").add_property("is_referenced_by", ["BR"])
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_adapter.register_identifiable("BR", ident_a)
+    ident_adapter.register_identifiable("C", ident_b)
+
+    referenced = db.Record(name="B").add_parent("C")
+    ent_list = [referenced, db.Record(name="A").add_parent("BR").add_property("ref", referenced), ]
+
+    st = SyncGraph(ent_list, ident_adapter)
+    assert st.nodes[1] in st.backward_references_backref[id(st.nodes[0])]
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+@patch("caoscrawler.identifiable_adapters.get_children_of_rt",
+       new=Mock(side_effect=lambda x: [x]))
+def test_set_id_of_node(simple_adapter):
+    # setting the id should lead to the node being marked as existing
+    ent_list = [db.Record(name='a').add_parent("RT5")]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 1
+    st.set_id_of_node(st.unchecked[0], 101)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._existing
+
+    # setting the id to None should lead to the node being marked as missing
+    ent_list = [db.Record().add_parent("RT1").add_property(name="RT2", value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 1
+    # the identifiable is automatically set during initialization of the graph
+    assert st.nodes[0].identifiable is not None
+    st.set_id_of_node(st.unchecked[0])
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._missing
+
+    # setting the id to one that already exists should lead to a merge
+    ent_list = [
+        db.Record(id=101).add_parent("RT5"),
+        db.Record(name='a').add_parent("RT5").add_property(name="RT2", value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 1
+    st.set_id_of_node(st.unchecked[0], 101)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert st.nodes[0].properties[0].name == "RT2"
+
+    # setting the id to None should lead to depending nodes being marked as missing
+    ent_list = [
+        db.Record().add_parent("RT3").add_property(name="a", value=1).add_property(
+            name="RT2", value=db.Record().add_parent("RT2")),
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    st.set_id_of_node(st.unchecked[0])
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._missing
+    assert id(st.nodes[1]) in st._missing
+
+    # same as above but with backref
+    ent_list = [
+        db.Record()
+        .add_parent("RT4")
+        .add_property(name="RT3",
+                      value=db.Record().add_parent("RT3").add_property(name="a", value=1)),
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert st.unchecked[1].identifiable is not None
+    st.set_id_of_node(st.unchecked[1])
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 0
+    assert id(st.nodes[0]) in st._missing
+    assert id(st.nodes[1]) in st._missing
+
+    # setting an id might allow checking another node that depends on the former
+    ent_list = [
+        db.Record()
+        .add_parent("RT4")
+        .add_property(name="RT3",
+                      value=db.Record().add_parent("RT3").add_property(name="a", value=1)),
+    ]
+    st = 
SyncGraph(ent_list, simple_adapter)
+    assert st.nodes[0].identifiable is None
+    assert st.nodes[1].identifiable is not None
+    st.set_id_of_node(st.unchecked[1], 111)
+    assert st.nodes[0].identifiable is not None
+    assert st.nodes[1].identifiable is not None
+
+    # same as above but going one step further: the new identifiable allows that node to be merged
+    ent_list = [
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3",
+                       value=db.Record().add_parent("RT3").add_property(name="a", value=1))),
+
+        (db.Record()
+         .add_parent("RT4")
+         .add_property(name="RT3", value=111))
+    ]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert st.nodes[0].identifiable is None
+    assert st.nodes[1].identifiable is not None
+    assert st.nodes[2].identifiable is not None
+    assert len(st.nodes) == 3
+    st.set_id_of_node(st.unchecked[2], 111)
+    assert st.nodes[0].identifiable is not None
+    assert len(st.nodes) == 2
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+@patch("caoscrawler.sync_graph.cached_get_entity_by",
+       new=Mock(side_effect=mock_get_entity_by))
+def test_merging(simple_adapter):
+    # identifying information can be given at various locations in the hierarchical tree;
+    # test whether an object is correctly combined for all cases
+    ident_adapter = CaosDBIdentifiableAdapter()
+    ident_a = db.RecordType().add_parent("A").add_property("name").add_property("a")
+    ident_adapter.register_identifiable("A", ident_a)
+    ident_adapter.retrieve_identified_record_for_identifiable = Mock(
+        side_effect=partial(
+            basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")}))
+
+    # merging based on id
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101).add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert 101 == st.nodes[0].id
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on path
+    ent_list = [
+        db.File(path='101').add_parent("A"),
+        db.File(path='101').add_parent("A")]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert '101' == st.nodes[0].path
+    assert "A" == st.nodes[0].parents[0].name
+
+    # merging based on identifiable (non-identifying properties are ignored)
+    ent_list = [
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=1),
+        db.File(name='101').add_parent("A").add_property('a', value=1).add_property('b', value=2)]
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert st.nodes[0].id is None
+    assert '101' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
+
+    # Merging a mix. One Record needs the identifiable to be merged. But the identifying
+    # information is scattered in the other case. 
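+    # (All four records below describe the same entity: the first three share id=101, and the
+    # last one shares the name and the identifying property 'a' with them.)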
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101, name='a').add_parent("A"),
+        db.Record(id=101).add_parent("A").add_property('a', value=1),
+        db.Record(name='a').add_parent("A").add_property('a', value=1)]
+
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 1
+    assert len(st.unchecked) == 0
+    assert 'a' == st.nodes[0].name
+    assert "A" == st.nodes[0].parents[0].name
+    assert 1 == st.nodes[0].properties[0].value
+    assert "a" == st.nodes[0].properties[0].name
+    assert 101 == st.nodes[0].id
+
+    # test that adding an ID can lead to a cascade of merges
+    # This also tests whether setting something to missing allows creating an identifiable
+    # and thus allows a merge
+    subtree = db.Record(name='a').add_parent("A").add_property('a', value=db.Record(
+        name='b').add_parent("A").add_property('a', value=db.Record(
+            name='c').add_parent("A").add_property('a', value="missing")))
+    ent_list = [
+        db.Record(id=101).add_parent("A"),
+        db.Record(id=101, name='z').add_parent("A"),
+        db.Record(id=101).add_parent("A").add_property('a', value=subtree),
+        db.Record(name='z').add_parent("A").add_property('a', value=subtree),
+    ]
+
+    st = SyncGraph(ent_list, ident_adapter)
+    assert len(st.nodes) == 5
+    assert len(st.unchecked) == 4
+    missing_one = [el for el in st.nodes if el.name == 'c'][0]
+    st.set_id_of_node(missing_one)
+    # setting c to missing means that b cannot exist, which means that a cannot exist;
+    # this allows the two z nodes to be merged
+    assert len(st.nodes) == 4
+    assert len(st.unchecked) == 0
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+def test_update_of_reference_values(simple_adapter):
+    # multiple nodes are merged, including one that is referenced;
+    # ensure that this still leads to the value of the property of the referencing node being
+    # updated when the id is set. 
(Value object is replaced appropriately)
+    a = db.Record().add_parent("RT3").add_property('a', value=1)
+    ent_list = [
+        a,
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT4").add_property('RT3', value=a),
+        db.Record().add_parent("RT3").add_property('a', value=1),
+        db.Record().add_parent("RT3").add_property('a', value=1)]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert 'RT4' == st.nodes[1].parents[0].name
+    st.set_id_of_node(st.nodes[0], 101)
+    b_prop = st.nodes[1].properties[0].value
+    assert b_prop.id == 101
+
+
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType",
+       new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name)))
+def test_ignoring_irrelevant_references(simple_adapter):
+    # make sure that a circle of references is no problem if one reference is not identifying
+    b = db.Record(name='b').add_parent("RT5")
+    a = db.Record().add_parent("RT3").add_property('a', value=b)
+    b.add_property('a', value=a)
+    ent_list = [a, b]
+    st = SyncGraph(ent_list, simple_adapter)
+    assert len(st.nodes) == 2
+    assert len(st.unchecked) == 2
+    assert st.nodes[1].name == 'b'
+
+    # a relies on b
+    assert st._identity_relies_on_unchecked_entity(st.nodes[0])
+    # b relies on nothing
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[1])
+    # set ID of b
+    st.set_id_of_node(st.nodes[1], 101)
+    assert len(st.unchecked) == 1
+    # now a no longer relies on unchecked
+    assert not st._identity_relies_on_unchecked_entity(st.nodes[0])
+
+# The following test is marked xfail because the implementation is currently insufficient.
+
+
+@pytest.mark.xfail()
+def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog):
+    crawler = crawler_mocked_identifiable_retrieve
+    crawler.identifiableAdapter.get_registered_identifiable = Mock(
+        side_effect=lambda x: db.Record().add_parent('C').add_property(name='C'))
+    a = db.Record(name='a').add_parent("C")
+    b = db.Record(name='b').add_parent("C").add_property(name="C", value=a)
+    c = db.Record(name='c').add_parent("C").add_property(name='D', value='e'
+                                                         ).add_property(name="C", value=b)
+    d = db.Record(name='c').add_parent("C")
+    a.add_property(name="C", value=c)
+    flat = [a, b, c]
+    circle = Crawler.detect_circular_dependency(flat)
+    assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]]
+
+    assert Crawler.detect_circular_dependency([d]) is None
+    st = SyncGraph(flat, crawler.identifiableAdapter)
+    with pytest.raises(RuntimeError):
+        _, _ = crawler._split_into_inserts_and_updates(st)
+    caplog.set_level(logging.ERROR, logger="caoscrawler.converters")
+    assert "Found circular dependency" in caplog.text
+    assert "\n--------\n\n> Parent: C\n\n>> Name: a\n[\'C\']" in caplog.text
+    caplog.clear()
+
+
+def test_set_each_scalar_value():
+    """Test whether properties with None as value are treated appropriately."""
+    a = SyncNode(db.Record().add_parent("RT1").add_property(name="bla"),
+                 db.RecordType().add_property("is_referenced_by", ["RT2"]))
+    _set_each_scalar_value(a, lambda x: False, None)
+    _set_each_scalar_value(a, lambda x: isinstance(x, SyncNode), None)
+    _set_each_scalar_value(a, lambda x: x is None, lambda x: 42)
+    assert a.properties[0].value == 42
+    _set_each_scalar_value(a, lambda x: x == 42, lambda x: None)
+    assert a.properties[0].value is None
+
+
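+# Note: mock_cached_only_rt_allow_empty (imported from test_crawler above) is assumed to let
+# cached queries return empty results instead of raising, so that the nonexistent record type
+# RT_A below triggers MissingRecordType.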
+@patch("caoscrawler.identifiable_adapters._retrieve_RecordType", + new=Mock(side_effect=lambda id, name: db.RecordType(id=id, name=name))) +@patch("caoscrawler.identifiable_adapters.cached_query", + new=Mock(side_effect=mock_cached_only_rt_allow_empty)) +def test_merge_referenced_by(): + """Merging two entities that are referenced by a third entity with nonexistent RecordType. + + See also https://gitlab.com/linkahead/linkahead-crawler/-/issues/95 + """ + ident = CaosDBIdentifiableAdapter() + ident.load_from_yaml_object({ + "RT_A": ["name"], + "RT_B": [{"is_referenced_by": ["RT_A"]}, "my_id"] + }) + crawled_data: list = [] + references: list = [] + for ii in [0, 1]: + rec = db.Record().add_parent("RT_B").add_property("my_id", value=ii) + references.append(rec) + crawled_data.append(rec) + rec_a = db.Record(name="Rec_A").add_parent("RT_A") + rec_a.add_property("my_ref", value=references) + crawled_data.append(rec_a) + + with pytest.raises(MissingRecordType) as mrt: + SyncGraph(crawled_data, ident) + assert str(mrt.value).endswith("Record type could not be found on server: RT_A") diff --git a/unittests/test_sync_node.py b/unittests/test_sync_node.py new file mode 100644 index 0000000000000000000000000000000000000000..1f95551d34f9e06ab3e2fc196e1e7809eabfa019 --- /dev/null +++ b/unittests/test_sync_node.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+# +from unittest.mock import MagicMock, Mock, patch + +import linkahead as db +import pytest +from test_crawler import basic_retrieve_by_name_mock_up, mock_get_entity_by + +from caoscrawler.exceptions import ImpossibleMergeError +from caoscrawler.identifiable import Identifiable +from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter +from caoscrawler.sync_graph import SyncGraph +from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list + + +def assert_parents_equal(p1, p2): + """Special assertion for comparing parents.""" + for a, b in zip(p1, p2): + assert a.id == b.id + assert a.name == b.name + + +def assert_properties_equal(p1, p2): + """Special assertion for comparing properties.""" + for a, b in zip(p1, p2): + assert a.id == b.id + assert a.name == b.name + assert a.value == b.value + assert a.datatype == b.datatype + + +def test_sync_node(): + # initialization + rec = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent("B") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec.description = "hallo" + sna = SyncNode(rec) + # check information stored in initialized SyncNode + assert "Record" in str(sna) + assert sna.id == rec.id + assert sna.role == rec.role + assert sna.name == rec.name + assert sna.description == rec.description + assert_parents_equal(sna.parents, rec.parents) + assert_properties_equal(sna.properties, rec.properties) + # ... special case File (path and file attributes) + fi = db.File(id=101, name='101', path='/a/') + snb = SyncNode(fi) + assert snb.role == fi.role + assert snb.name == fi.name + assert snb.id == fi.id + assert snb.path == fi.path + assert snb.file == fi.file + + # check information in exported db.Entity + export = sna.export_entity() + assert export.id == rec.id + assert export.role == rec.role + assert export.name == rec.name + assert export.description == rec.description + assert_parents_equal(export.parents, rec.parents) + assert_properties_equal(export.properties, rec.properties) + export = snb.export_entity() + assert export.role == fi.role + assert export.name == fi.name + assert export.id == fi.id + assert export.path == fi.path + assert export.file == fi.file + + # merge no common information + # --------------------------- + rec_a = (db.Record(name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + + rec_b = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value='a') + .add_property(id=103, value='b')) + rec_b.description = "tja" + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + # test information in updated node + assert sn_a.id == rec_b.id + assert sn_a.role == rec_a.role + assert sn_a.name == rec_a.name + assert sn_a.description == rec_b.description + for p in rec_a.parents + rec_b.parents: + assert p in sn_a.parents + for p in rec_a.properties + rec_b.properties: + assert p in sn_a.properties + # Check for duplicated property: + ps = [p for p in sn_a.properties if p.name == "a"] + assert len(ps) == 2 + assert ps[0].value == "a" + assert ps[1].value == "a" + + # test information in exported entity + export = sn_a.export_entity() + assert export.id == rec_b.id + assert export.name == rec_a.name + for p in rec_a.parents + rec_b.parents: + assert parent_in_list(p, export.parents) + for p in rec_a.properties + rec_b.properties: + if p.name is not None: + assert p.name in [el.name for el in export.properties] + if 
p.id is not None: + assert p.id in [el.id for el in export.properties] + assert len(export.properties) == 2 + assert export.get_property('a').value == 'a' + assert export.get_property(103).value == 'b' + assert export.description == rec_b.description + assert export.role == rec_a.role + + # merge with common information + # ----------------------------- + rec_a = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + rec_b = (db.Record(id=101, name='101') + .add_parent("A") + .add_parent(id=102) + .add_property(name="a", value='a')) + + sn_a = SyncNode(rec_a) + sn_b = SyncNode(rec_b) + sn_a.update(sn_b) + assert sn_a.id == rec_b.id + assert sn_a.name == rec_a.name + for p in rec_a.parents + rec_b.parents: + assert parent_in_list(p, sn_a.parents) + for p in rec_a.properties + rec_b.properties: + assert property_in_list(p, sn_a.properties) + assert sn_a.description == rec_b.description + assert sn_a.role == rec_a.role + + # merge with conflicting information + # ---------------------------------- + # ID mismatch + sn_a = SyncNode(db.Record(id=102)) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(id=101))) + + # name mismatch + sn_a = SyncNode(db.Record(name='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(name='101'))) + + # type mismatch + sn_a = SyncNode(db.Record(name='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.File(name='102'))) + + # description mismatch + sn_a = SyncNode(db.Record(description='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.Record(description='101'))) + + # path mismatch + sn_a = SyncNode(db.File(path='102')) + with pytest.raises(ImpossibleMergeError, match="Trying to update"): + sn_a.update(SyncNode(db.File(path='101'))) + + # identifiable mismatch + sn_a = SyncNode(db.File(path='102')) + sn_a.identifiable = Identifiable(name='a') + sn_b = SyncNode(db.File(path='101')) + sn_b.identifiable = Identifiable(name='b') + with pytest.raises(ValueError, match="identifiable"): + sn_a.update(sn_b) + + +def test_export_node(): + rec_a = (db.Record(id=101) + .add_parent("B") + .add_parent(id=103) + .add_property(name="a", value=[SyncNode(db.Record())]) + .add_property(name='b', id=103, value='b')) + + sn_a = SyncNode(rec_a) + exp = sn_a.export_entity() + assert exp.id == rec_a.id + assert exp.name == rec_a.name + for p in rec_a.parents: + assert len([el for el in exp.parents if p.name == el.name]) == 1 + for p in rec_a.properties: + assert p.value == exp.get_property(p.name).value + if isinstance(p.value, list): + assert len(p.value) == len(exp.get_property(p.name).value) + assert len(exp.properties) == len(rec_a.properties) + assert len(exp.parents) == len(rec_a.parents) + + # --------------------------------------------------------------------------------------------- + # NOTE: in the following we create a SyncNode object with twice the same Property as a short + # hand for a SyncNode that was created from one Entity with such a Property and then updating + # it with another SyncNode that also has the Property + # --------------------------------------------------------------------------------------------- + + # same property name, different values + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value='b') + .add_property(name="a", value='a')) + + # there should be a 
warning when multiproperties are used + with pytest.warns(UserWarning) as caught: + SyncNode(rec_a) + messages = {str(w.message) for w in caught} + assert ("Multiproperties are not supported by the crawler.") in messages + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + assert "The problematic property is 'a' with values '['b']' and '['a']'" in str(ime.value) + + # SyncNodes with same ID are considered equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record(id=1))) + .add_property(name="a", value=SyncNode(db.Record(id=1)))) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value.id == 1 + # SyncNodes convert multi properties into single properties + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # same SyncNode object is obviously equal + sn = SyncNode(db.Record(id=1)) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=sn) + .add_property(name="a", value=sn)) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value.id == 1 + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # different SyncNode Objects (without an ID) are not equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record())) + .add_property(name="a", value=SyncNode(db.Record()))) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = (f"The problematic property is 'a' with values '[{SyncNode(db.Record())}]' " + f"and '[{SyncNode(db.Record())}]'") + assert msg in str(ime.value) + + # different SyncNode Objects with differing ID are not equal + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=SyncNode(db.Record(id=1))) + .add_property(name="a", value=SyncNode(db.Record(id=2)))) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = (f"The problematic property is 'a' with values '[{SyncNode(db.Record(id=1))}]' " + f"and '[{SyncNode(db.Record(id=2))}]'") + assert msg in str(ime.value) + + # SyncNodes with same ID are considered equal (list) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]) + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value[0].id == 1 + assert len([p for p in exp.properties if p.name == "a"]) == 1 + + # SyncNodes with same ID are not equal when in different order (list) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]) + .add_property(name="a", value=[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))])) + + with pytest.raises(ImpossibleMergeError) as ime: + exp = SyncNode(rec_a).export_entity() + + msg = ("The problematic property is 'a' with values " + f"'{[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]}' " + f"and '{[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))]}'") + assert msg in str(ime.value) + + # same SyncNode object is obviously equal (list) + sn = SyncNode(db.Record(id=1)) + rec_a = (db.Record(id=101) + .add_parent("B") + .add_property(name="a", value=[sn]) + .add_property(name="a", value=[sn])) + + exp = SyncNode(rec_a).export_entity() + assert exp.get_property('a').value[0].id == 1 + + # different SyncNode Objects are not equal (list) 
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record())])
+             .add_property(name="a", value=[SyncNode(db.Record())]))
+
+    with pytest.raises(ImpossibleMergeError) as ime:
+        exp = SyncNode(rec_a).export_entity()
+
+    msg = ("The problematic property is 'a' with values "
+           f"'{[SyncNode(db.Record())]}' and '{[SyncNode(db.Record())]}'")
+    assert msg in str(ime.value)
+
+    # different SyncNode objects with differing IDs are not equal (list)
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=2))]))
+
+    with pytest.raises(ImpossibleMergeError) as ime:
+        exp = SyncNode(rec_a).export_entity()
+
+    msg = ("The problematic property is 'a' with values "
+           f"'{[SyncNode(db.Record(id=1))]}' and '{[SyncNode(db.Record(id=2))]}'")
+    assert msg in str(ime.value)
+
+    # list vs no list
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=SyncNode(db.Record(id=1)))
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError) as ime:
+        exp = SyncNode(rec_a).export_entity()
+    msg = ("The problematic property is 'a' with values "
+           f"'[{SyncNode(db.Record(id=1))}]' and '{[SyncNode(db.Record(id=1))]}'")
+    assert msg in str(ime.value)
+
+    # different list sizes
+    rec_a = (db.Record(id=101)
+             .add_parent("B")
+             .add_property(name="a", value=[SyncNode(db.Record(id=1))])
+             .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]))
+
+    with pytest.raises(ImpossibleMergeError) as ime:
+        exp = SyncNode(rec_a).export_entity()
+
+    msg = ("The problematic property is 'a' with values "
+           f"'{[SyncNode(db.Record(id=1))]}' and "
+           f"'{[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]}'")
+    assert msg in str(ime.value)
diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py
index abe4ac85ec4fc0a78e71c177222817e1b84e9e56..c606c1d3cdf9a95f00728eaae88153631b08af53 100644
--- a/unittests/test_table_converter.py
+++ b/unittests/test_table_converter.py
@@ -26,27 +26,30 @@
 test the converters module
 """
 
-from caoscrawler.converters import Converter
-from caoscrawler.stores import GeneralStore
-from caoscrawler.converters import (ConverterValidationError,
-                                    DictConverter, XLSXTableConverter, CSVTableConverter)
-from caoscrawler.structure_elements import Directory
-from caoscrawler.structure_elements import (File, TextElement, ListElement, DictElement,
-                                            BooleanElement, IntegerElement, FloatElement)
-
-from os.path import join, dirname, basename
-
-from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
-
-import pytest
-import os
 import importlib
-
 import math
+from os.path import basename, dirname, join
+from pathlib import Path
+
+import linkahead as db
+import pytest
+from utils import dircheckstr
 
 from caoscrawler import Crawler
+from caoscrawler.converters import (Converter, ConverterValidationError,
+                                    CSVTableConverter, DictConverter,
+                                    XLSXTableConverter)
+from caoscrawler.debug_tree import DebugTree
+from caoscrawler.identifiable_adapters import (IdentifiableAdapter,
+                                               LocalStorageIdentifiableAdapter)
+from caoscrawler.scanner import scan_directory
+from caoscrawler.stores import GeneralStore
+from caoscrawler.structure_elements import (BooleanElement, DictElement,
+                                            Directory, File, FloatElement,
+                                            IntegerElement, ListElement,
+                                            TextElement)
 
-import caosdb as db
+UNITTESTDIR = Path(__file__).parent
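+# The module-level ``crawler`` fixture and the local ``dircheckstr`` helper
+# are removed below; ``dircheckstr`` now comes from the shared ``utils``
+# module and takes the directory prefix explicitly. A usage sketch (assumed
+# signature, mirroring ``test_crawl_csv_table`` below):
+#
+#     dbt = DebugTree()
+#     scan_directory(rfp("test_directories", "examples_tables", "ExperimentalData"),
+#                    rfp("test_directories", "examples_tables", "crawler_for_tables.yml"),
+#                    debug_tree=dbt)
+#     subd = dbt.debug_tree[dircheckstr(
+#         UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData",
+#         "test1.csv")]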
 @pytest.fixture
@@ -84,21 +87,6 @@ def rfp(*pathcomponents):
     return join(dirname(__file__), *pathcomponents)
 
 
-def dircheckstr(*pathcomponents):
-    """
-    Return the debug tree identifier for a given path.
-    """
-    return "caoscrawler.structure_elements.File: " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "examples_tables", "ExperimentalData", *pathcomponents)
-
-
-@pytest.fixture
-def crawler():
-    crawler = Crawler(debug=True)
-    crawler.crawl_directory(rfp("test_directories", "examples_tables", "ExperimentalData"),
-                            rfp("test_directories", "examples_tables", "crawler_for_tables.yml"))
-    return crawler
-
-
 def test_convert_table(converter_registry):
     extentions = ["xlsx", "csv", "tsv"]
     if importlib.util.find_spec("odf") is not None:
@@ -151,9 +139,15 @@
     assert res[0].name == "jdsfkljadskf"
 
 
-def test_crawl_csv_table(crawler):
+def test_crawl_csv_table():
+    dbt = DebugTree()
+    scan_directory(rfp("test_directories", "examples_tables", "ExperimentalData"),
+                   rfp("test_directories", "examples_tables", "crawler_for_tables.yml"),
+                   debug_tree=dbt)
     for file_ext in ["xlsx", "csv"]:
-        subd = crawler.debug_tree[dircheckstr("test1." + file_ext)]
+        subd = dbt.debug_tree[dircheckstr(
+            UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData",
+            "test1." + file_ext)]
         record_experiment = subd[1]["Experiment"]
         assert isinstance(record_experiment, db.Record)
         assert isinstance(record_experiment.get_property("Measurements").value, list)
diff --git a/unittests/test_tables/spss/CITATION.cff b/unittests/test_tables/spss/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..140fcc071bf2d5f5709cf31bf11bd9676b81ca5f
--- /dev/null
+++ b/unittests/test_tables/spss/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+- family-names: "Fajardo"
+  given-names: "Otto"
+  orcid: "https://orcid.org/0000-0002-3363-9287"
+title: "Pyreadstat"
+version: 1.2.7
+doi: 10.5281/zenodo.6612282
+date-released: 2018-09-24
+url: "https://github.com/Roche/pyreadstat"
diff --git a/unittests/test_tables/spss/LICENSE b/unittests/test_tables/spss/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..a2f94b1a2a5a4255fc8ef6d0beb94cce89f545e8
--- /dev/null
+++ b/unittests/test_tables/spss/LICENSE
@@ -0,0 +1,210 @@
+Test data files were copied from [pyreadstat](https://github.com/Roche/pyreadstat); they are
+licensed under the Apache License, cited below.
+
+Copyright (C) 2018-2024 Otto Fajardo
+Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
+Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
+
+pyreadstat license:
+---------------------------------------------------------------------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/unittests/test_tables/spss/sample.sav b/unittests/test_tables/spss/sample.sav new file mode 100644 index 0000000000000000000000000000000000000000..20d0c5ce6689a60adfa329a17b4347274e9a863b Binary files /dev/null and b/unittests/test_tables/spss/sample.sav differ diff --git a/unittests/test_tables/spss/sample_large.sav b/unittests/test_tables/spss/sample_large.sav new file mode 100644 index 0000000000000000000000000000000000000000..b0c16c1390a15a4f62a859ade76aa17b89c6ae40 Binary files /dev/null and b/unittests/test_tables/spss/sample_large.sav differ diff --git a/unittests/test_tool.py b/unittests/test_tool.py deleted file mode 100755 index 6a828532c1de9796008a6e51c21811f83b85657a..0000000000000000000000000000000000000000 --- a/unittests/test_tool.py +++ /dev/null @@ -1,869 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# -# This file is a part of the CaosDB Project. -# -# Copyright (C) 2021 Alexander Schlemmer -# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> -# Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -# - -""" -Tests for the tool using pytest -Adapted from check-sfs -""" - -import os -from caoscrawler.crawl import Crawler, SecurityMode -from caoscrawler.identifiable import Identifiable -from caoscrawler.structure_elements import File, DictTextElement, DictListElement -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter -from simulated_server_data import full_data -from functools import partial -from copy import deepcopy -from unittest.mock import patch -from caoscrawler.crawl import crawler_main -import caosdb.common.models as dbmodels -from unittest.mock import MagicMock, Mock -from os.path import join, dirname, basename -import yaml -import caosdb as db -from caosdb.apiutils import compare_entities - -import pytest -from pytest import raises - - -def rfp(*pathcomponents): - """ - Return full path. - Shorthand convenience function. - """ - return join(dirname(__file__), *pathcomponents) - - -ident = LocalStorageIdentifiableAdapter() -ident.restore_state(rfp("records.xml")) -full_data.update({el.name: el for el in ident._records if el.name is not None}) -full_data.update({el.id: el for el in ident._records if el.name is None}) - - -def dircheckstr(*pathcomponents): - """ - Return the debug tree identifier for a given path. 
- """ - return ("caoscrawler.structure_elements.Directory: " + basename( - join(*pathcomponents)) + ", " + rfp( - "test_directories", "examples_article", *pathcomponents)) - - -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml")) - return crawler - - -@pytest.fixture -def ident(crawler): - ident = LocalStorageIdentifiableAdapter() - crawler.identifiableAdapter = ident - - # The records.xml file is constructed as follows: - # To a full run of the crawler, resolve all identifiables and insert all resulting entities. - # See: test-setup/datamodel/generate_test_data.py for details. - ident.restore_state(rfp("records.xml")) - - ident.register_identifiable( - "Person", db.RecordType() - .add_parent(name="Person") - .add_property(name="first_name") - .add_property(name="last_name")) - ident.register_identifiable( - "Measurement", db.RecordType() - .add_parent(name="Measurement") - .add_property(name="identifier") - .add_property(name="date") - .add_property(name="project")) - ident.register_identifiable( - "Project", db.RecordType() - .add_parent(name="Project") - .add_property(name="date") - .add_property(name="identifier")) - return ident - - -def test_record_structure_generation(crawler): - subd = crawler.debug_tree[dircheckstr("DataAnalysis")] - subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")] - assert len(subd) == 2 - # variables store on Data Analysis node of debug tree - assert len(subd[0]) == 2 - # record store on Data Analysis node of debug tree - assert len(subd[1]) == 0 - assert len(subc) == 2 - assert len(subc[0]) == 2 - assert len(subc[1]) == 0 - - # The data analysis node creates one variable for the node itself: - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" - assert subc[0]["DataAnalysis"] is False - - subd = crawler.debug_tree[dircheckstr( - "DataAnalysis", "2020_climate-model-predict")] - subc = crawler.debug_metadata["copied"][dircheckstr( - "DataAnalysis", "2020_climate-model-predict")] - - assert len(subd[1]) == 1 - assert len(subd[1]["Project"].get_parents()) == 1 - assert subd[1]["Project"].get_parents()[0].name == "Project" - assert subd[1]["Project"].get_property("date").value == "2020" - assert subd[1]["Project"].get_property( - "identifier").value == "climate-model-predict" - - assert len(subd[0]) == 6 - assert subd[0]["date"] == "2020" - assert subd[0]["identifier"] == "climate-model-predict" - assert subd[0]["Project"].__class__ == db.Record - - assert subd[0]["DataAnalysis"] == "examples_article/DataAnalysis" - assert subc[0]["DataAnalysis"] is True - assert subd[0]["project_dir"] == "examples_article/DataAnalysis/2020_climate-model-predict" - assert subc[0]["project_dir"] is False - - # Check the copy flags for the first level in the hierarchy: - assert len(subc[0]) == 6 - assert len(subc[1]) == 1 - assert subc[1]["Project"] is False - assert subc[0]["Project"] is False - assert subc[0]["date"] is False - assert subc[0]["identifier"] is False - - subd = crawler.debug_tree[dircheckstr("DataAnalysis", - "2020_climate-model-predict", - "2020-02-08_prediction-errors")] - subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis", - "2020_climate-model-predict", - "2020-02-08_prediction-errors")] - assert len(subd[0]) == 8 - assert subd[0]["date"] == "2020-02-08" - assert subd[0]["identifier"] == "prediction-errors" - assert subd[0]["Project"].__class__ == db.Record - assert subd[0]["Measurement"].__class__ == 
db.Record - - assert len(subd[1]) == 2 - - assert len(subd[1]["Project"].get_parents()) == 1 - assert subd[1]["Project"].get_parents()[0].name == "Project" - assert subd[1]["Project"].get_property("date").value == "2020" - assert subd[1]["Project"].get_property( - "identifier").value == "climate-model-predict" - - assert len(subd[1]["Measurement"].get_parents()) == 1 - assert subd[1]["Measurement"].get_parents()[0].name == "Measurement" - assert subd[1]["Measurement"].get_property("date").value == "2020-02-08" - assert subd[1]["Measurement"].get_property( - "identifier").value == "prediction-errors" - assert subd[1]["Measurement"].get_property("project").value != "$Project" - assert subd[1]["Measurement"].get_property( - "project").value.__class__ == db.Record - assert subd[1]["Measurement"].get_property( - "project").value == subd[0]["Project"] - - # Check the copy flags for the second level in the hierarchy: - assert subc[1]["Project"] is True - assert subc[0]["Project"] is True - assert subc[1]["Measurement"] is False - assert subc[0]["Measurement"] is False - assert subc[0]["date"] is False - assert subc[0]["identifier"] is False - - -# def prepare_test_record_file(): -# ident = LocalStorageIdentifiableAdapter() -# crawler = Crawler(debug=True, identifiableAdapter=ident) -# crawler.crawl_directory(rfp("test_directories", "examples_article"), -# rfp("scifolder_cfood.yml")) - -# # clean record list: -# recordlist = ident.get_records() -# for i in range(len(recordlist)-1, 1, -1): -# if recordlist[i].parents[0].name == "Person": -# del recordlist[i] - -# ident.store_state(rfp("records.xml")) - - -def test_crawler_update_list(crawler, ident): - # If the following assertions fail, that is a hint, that the test file records.xml has changed - # and this needs to be updated: - assert len(ident.get_records()) == 18 - assert len( - [r for r in ident.get_records() if r.parents[0].name == "Person"] - ) == 5 - assert len( - [r for r in ident.get_records() if r.parents[0].name == "Measurement"] - ) == 11 - assert len( - [r for r in ident.get_records() if r.parents[0].name == "Project"] - ) == 2 - - # The crawler contains lots of duplicates, because identifiables have not been resolved yet: - assert len(ident.get_records()) != len(crawler.crawled_data) - - # Check consistency: - # Check whether identifiables retrieved from current identifiable store return - # the same results. 
- - # take the first person in the list of records: - for r in ident.get_records(): - if r.parents[0].name == "Person": - r_cur = r - break - - id_r0 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r0.record_type - assert r_cur.get_property( - "first_name").value == id_r0.properties["first_name"] - assert r_cur.get_property( - "last_name").value == id_r0.properties["last_name"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 2 - assert len(id_r0.properties) == 2 - - idr_r0_test = ident.retrieve_identified_record_for_identifiable(id_r0) - idr_r0 = ident.retrieve_identified_record_for_record(r_cur) - assert idr_r0 == idr_r0_test - - # take the first measurement in the list of records: - for r in ident.get_records(): - if r.parents[0].name == "Measurement": - r_cur = r - break - - id_r1 = ident.get_identifiable(r_cur) - assert r_cur.parents[0].name == id_r1.record_type - assert r_cur.get_property( - "identifier").value == id_r1.properties["identifier"] - assert r_cur.get_property("date").value == id_r1.properties["date"] - assert r_cur.get_property( - "project").value == id_r1.properties["project"] - assert len(r_cur.parents) == 1 - assert len(r_cur.properties) == 4 - assert len(id_r1.properties) == 3 - - idr_r1_test = ident.retrieve_identified_record_for_identifiable(id_r1) - idr_r1 = ident.retrieve_identified_record_for_record(r_cur) - assert idr_r1 == idr_r1_test - assert idr_r1 != idr_r0 - assert idr_r1_test != idr_r0_test - - assert len(idr_r1.properties) == 4 - assert r_cur.get_property( - "responsible").value == idr_r1.get_property("responsible").value - assert r_cur.description == idr_r1.description - - -def test_synchronization(crawler, ident): - insl, updl = crawler.synchronize(commit_changes=False) - assert len(insl) == 0 - assert len(updl) == 0 - - -def test_remove_unnecessary_updates(): - # test trvial case - upl = [db.Record().add_parent("A")] - irs = [db.Record().add_parent("A")] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 0 - - # test property difference case - # TODO this should work right? - # upl = [db.Record().add_parent("A").add_property("a", 3)] - # irs = [db.Record().add_parent("A")] # ID should be s - # Crawler.remove_unnecessary_updates(upl, irs) - # assert len(upl) == 1 - - # test value difference case - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 1 - upl = [db.Record().add_parent("A").add_property("a", 5)] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 0 - - # test unit difference case - upl = [db.Record().add_parent("A").add_property("a", unit='cm')] - irs = [db.Record().add_parent("A").add_property("a")] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 1 - - # test None difference case - upl = [db.Record().add_parent("A").add_property("a")] - irs = [db.Record().add_parent("A").add_property("a", 5)] - updates = Crawler.remove_unnecessary_updates(upl, irs) - assert len(updates) == 1 - - -# Current status: -# TODO: currently, this test fails, because non identifiable records cannot -# be inserted into the cache. Solution might be, just not to add them -# into the local cache. Probably in split_into_inserts_and_updates. 
-@pytest.mark.xfail -def test_identifiable_adapter_no_identifiable(crawler, ident): - del ident._registered_identifiables["Person"] - insl, updl = crawler.synchronize() - assert len(updl) == 0 - - pers = [r for r in crawler.crawled_data if r.parents[0].name == "Person"] - # All persons are inserted, because they are not identifiable: - assert len(insl) == len(pers) - - -def test_provenance_debug_data(crawler): - crawler.save_debug_data(rfp("provenance.yml")) - - with open(rfp("provenance.yml"), "r") as f: - provenance = yaml.load(f, Loader=yaml.SafeLoader) - - pr = provenance["provenance"] - - def check_key_count(prefix): - return sum([1 for key in pr.keys() if key.startswith(prefix)]) - assert check_key_count("Measurement") == 11 - assert check_key_count("Project") == 5 - assert check_key_count("Person") == 14 - - -def test_split_into_inserts_and_updates_trivial(crawler): - crawler.split_into_inserts_and_updates([]) - - -def basic_retrieve_by_name_mock_up(rec, referencing_entities=None, known=None): - """ returns a stored Record if rec.name is an existing key, None otherwise """ - if rec.name in known: - return known[rec.name] - else: - return None - - -@pytest.fixture -def crawler_mocked_identifiable_retrieve(crawler): - # mock retrieval of registered identifiabls: return Record with just a parent - crawler.identifiableAdapter.get_registered_identifiable = Mock( - side_effect=lambda x: db.Record().add_parent(x.parents[0].name)) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": db.Record(id=1111, name="A")})) - return crawler - - -def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - identlist = [Identifiable(name="A", record_type="C"), Identifiable(name="B", record_type="C")] - entlist = [db.Record(name="A").add_parent( - "C"), db.Record(name="B").add_parent("C")] - - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None - assert not crawler._has_reference_value_without_id(identlist[0]) - assert not crawler._has_reference_value_without_id(identlist[1]) - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_split_into_inserts_and_updates_with_duplicate(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - # This is identical to a and should be removed - c = db.Record(name="A").add_parent("C") - entlist = [a, b, c] - insert, update = 
crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_split_into_inserts_and_updates_with_ref(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # try it with a reference - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 1 - assert insert[0].name == "B" - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_split_into_inserts_and_updates_with_circ(crawler): - # try circular - a = db.Record(name="A").add_parent("C") - b = db.Record(name="B").add_parent("C") - b.add_property("A", a) - a.add_property("B", b) - entlist = [a, b] - # TODO this does not seem to be complete! - - -def test_split_into_inserts_and_updates_with_complex(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # A - # ^ - # | - # F <- B <- G - a = db.Record(name="A").add_parent("C").add_property( - 'd', 13).add_property('e', "lskdjlsfdj") - b = db.Record(name="B").add_parent("C") - g = db.Record(name="G").add_parent("C") - f = db.Record(name="F").add_parent("C") - g.add_property("A", a) - b.add_property("A", f) - b.add_property("A", a) - entlist = [a, b, g] - insert, update = crawler.split_into_inserts_and_updates(entlist) - assert len(insert) == 3 - assert "B" in [el.name for el in insert] - assert len(update) == 1 - assert update[0].name == "A" - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - # TODO write test where the unresoled entity is not part of the identifiable - - -def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiable_retrieve): - crawler = crawler_mocked_identifiable_retrieve - # assume identifiable is only the name - a = db.Record(name="A").add_parent("C") - a.add_property("foo", 1) - b = db.Record(name="A").add_parent("C") - b.add_property("bar", 2) - entlist = [a, b] - insert, update = crawler.split_into_inserts_and_updates(entlist) - - assert update[0].get_property("bar").value == 2 - assert update[0].get_property("foo").value == 1 - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called() - - -def test_has_missing_object_in_references(crawler): - # Simulate remote server content by using the names to identify records - # There are only two known Records with name A and B - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC") - .add_property("d"), - "D": db.Record(name="D").add_parent("RTD") - .add_property("d").add_property("e"), - })) - - 
# one reference with id -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123}), []) - # one ref with Entity with id -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123) - .add_parent("C")}), []) - # one ref with id one with Entity with id (mixed) -> check - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTD", - properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), []) - # entity to be referenced in the following - a = db.Record(name="C").add_parent("C").add_property("d", 12311) - # one ref with id one with Entity without id (but not identifying) -> fail - assert not crawler._has_missing_object_in_references( - Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), []) - - # one ref with id one with Entity without id (mixed) -> fail - assert not crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - - crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC", - properties={'d': 12311})) - # one ref with id one with Entity without id but in cache -> check - assert crawler._has_missing_object_in_references( - Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), []) - - # if this ever fails, the mock up may be removed - crawler.identifiableAdapter.get_registered_identifiable.assert_called() - - -@pytest.mark.xfail() -def test_references_entities_without_ids(crawler, ident): - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('last_name', 123) - .add_property('first_name', 123)) - # id and rec with id - assert not crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', - db.Record(id=123))) - # id and rec with id and one unneeded prop - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('stuff', db.Record()) - .add_property('last_name', db.Record(id=123))) - - # one identifying prop is missing - assert crawler._has_reference_value_without_id(db.Record().add_parent("Person") - .add_property('first_name', 123) - .add_property('last_name', db.Record())) - - -def test_replace_entities_with_ids(crawler): - a = (db.Record().add_parent("B").add_property("A", 12345) - .add_property("B", db.Record(id=12345)) - .add_property("C", [db.Record(id=12345), 233324])) - - crawler.replace_entities_with_ids(a) - assert a.get_property("A").value == 12345 - assert a.get_property("B").value == 12345 - assert a.get_property("C").value == [12345, 233324] - - -def mock_get_entity_by_id(id): - candidates = [el for el in list(full_data.values()) if el.id == id] - if len(candidates) > 0: - return candidates[0] - else: - raise ValueError() - - -def mock_get_entity_by_name(name): - candidates = [el for el in full_data.values() - if (el.name is not None and el.name.lower() == name.lower())] - if len(candidates) > 0: - return candidates[0] - else: - raise ValueError() - - -def prepare_crawler_with_sec_mode(mode, ident): - crawler = Crawler(debug=True, securityMode=mode) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_cfood.yml")) - crawler.identifiableAdapter = ident - - return crawler - - -def reset_mocks(mocks): - for mock in 
mocks: - mock.reset_mock() - - -def change_identifiable_prop(ident): - """ - This function is supposed to change a non identifiing property. - """ - for ent in ident._records: - if len(ent.parents) == 0 or ent.parents[0].name != "Measurement": - continue - for prop in ent.properties: - if prop.name != "date": - continue - # change one element; This removes a responsible which is not part of the identifiable - prop.value = "2022-01-04" - return - # If it does not work, this test is not implemented properly - raise RuntimeError("Did not find the property that should be changed.") - - -def change_non_identifiable_prop(ident): - """ - This function is supposed to change a non identifiing property. - """ - for ent in ident._records: - if len(ent.parents) == 0 or ent.parents[0].name != "Measurement": - continue - - for prop in ent.properties: - if prop.name != "responsible" or len(prop.value) < 2: - continue - # change one element; This removes a responsible which is not part of the identifiable - del prop.value[-1] - return - raise RuntimeError("Did not find the property that should be changed.") - - -@patch("caoscrawler.crawl.Crawler._get_entity_by_id", - new=Mock(side_effect=mock_get_entity_by_id)) -@patch("caoscrawler.crawl.Crawler._get_entity_by_name", - new=Mock(side_effect=mock_get_entity_by_name)) -@patch("caoscrawler.crawl.db.Container.insert") -@patch("caoscrawler.crawl.db.Container.update") -@patch("caoscrawler.crawl.UpdateCache.insert") -def test_security_mode(updateCacheMock, upmock, insmock, ident): - records_backup = deepcopy(ident._records) - - # trivial case: nothing to do - crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - crawler.synchronize(commit_changes=True) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - updateCacheMock.assert_not_called() - - # RETRIEVE: insert only - crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - # remove one element - del ident._records[-1] - # insert forbidden - crawler.synchronize(commit_changes=True) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - assert updateCacheMock.call_count == 1 - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # RETRIEVE: update only - crawler = prepare_crawler_with_sec_mode(SecurityMode.RETRIEVE, ident) - # change one element - change_non_identifiable_prop(ident) - crawler.synchronize(commit_changes=True) - assert crawler.run_id is not None - insmock.assert_not_called() - upmock.assert_not_called() - assert updateCacheMock.call_count == 1 - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # INSERT: insert only - crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) - # remove one element - del ident._records[-1] - crawler.synchronize(commit_changes=True) - assert crawler.run_id is not None - insmock.assert_called_once() - upmock.assert_not_called() - updateCacheMock.assert_not_called() - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # INSERT: update only - crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) - # change one element - change_non_identifiable_prop(ident) - crawler.synchronize(commit_changes=True) - assert crawler.run_id is not None - insmock.assert_not_called() - 
upmock.assert_not_called() - updateCacheMock.assert_called_once() - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - # INSERT: insert and update - crawler = prepare_crawler_with_sec_mode(SecurityMode.INSERT, ident) - # change two elements - change_non_identifiable_prop(ident) - change_identifiable_prop(ident) - crawler.synchronize(commit_changes=True) - assert crawler.run_id is not None - insmock.asser_called_once() - upmock.assert_not_called() - updateCacheMock.assert_called_once() - # reset counts - reset_mocks([updateCacheMock, insmock, upmock]) - # restore original ident - ident._records = deepcopy(records_backup) - - -def test_create_reference_mapping(): - a = db.Record().add_parent("A") - b = db.Record().add_parent("B").add_property('a', a) - ref = Crawler.create_reference_mapping([a, b]) - assert id(a) in ref - assert id(b) not in ref - assert "B" in ref[id(a)] - assert ref[id(a)]["B"] == [b] - - -def test_create_flat_list(): - a = db.Record() - b = db.Record() - a.add_property(name="a", value=a) - a.add_property(name="b", value=b) - flat = Crawler.create_flat_list([a]) - assert len(flat) == 2 - assert a in flat - assert b in flat - c = db.Record() - c.add_property(name="a", value=a) - # This would caus recursion if it is not dealt with properly. - a.add_property(name="c", value=c) - flat = Crawler.create_flat_list([c]) - assert len(flat) == 3 - assert a in flat - assert b in flat - assert c in flat - - -@pytest.fixture -def crawler_mocked_for_backref_test(crawler): - # mock retrieval of registered identifiabls: return Record with just a parent - def get_reg_ident(x): - if x.parents[0].name == "C": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR"]) - elif x.parents[0].name == "D": - return db.Record().add_parent(x.parents[0].name).add_property( - "is_referenced_by", value=["BR", "BR2"]) - else: - return db.Record().add_parent(x.parents[0].name) - crawler.identifiableAdapter.get_registered_identifiable = Mock(side_effect=get_reg_ident) - - # Simulate remote server content by using the names to identify records - # There is only a single known Record with name A - crawler.identifiableAdapter.retrieve_identified_record_for_record = Mock(side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - crawler.identifiableAdapter.retrieve_identified_record_for_identifiable = Mock( - side_effect=partial( - basic_retrieve_by_name_mock_up, known={"A": - db.Record(id=1111, name="A").add_parent("BR")})) - return crawler - - -def test_validation_error_print(capsys): - # there should be no server interaction since we only test the behavior if a validation error - # occurs during the data collection stage - DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation") - for fi in ["cfood.yml", "cfood2.yml"]: - ret = crawler_main(DATADIR, - os.path.join(DATADIR, fi), - os.path.join(DATADIR, "identifiables.yml"), - True, - None, - False, - "/use_case_simple_presentation") - captured = capsys.readouterr() - assert "Couldn't validate" in captured.out - - -def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test): - crawler = crawler_mocked_for_backref_test - identlist = [Identifiable(name="A", record_type="BR"), - Identifiable(name="B", record_type="C", backrefs=[db.Entity()])] - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, 
db.Record(name="A").add_parent("BR").add_property("ref", referenced), ] - - # Test without referencing object - # currently a NotImplementedError is raised if necessary properties are missing. - with raises(NotImplementedError): - crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")]) - - # identifiables were not yet checked - assert crawler.get_from_any_cache(identlist[0]) is None - assert crawler.get_from_any_cache(identlist[1]) is None - # one with reference, one without - assert not crawler._has_reference_value_without_id(identlist[0]) - assert crawler._has_reference_value_without_id(identlist[1]) - # one can be found remotely, one not - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[0]).id == 1111 - assert crawler.identifiableAdapter.retrieve_identified_record_for_record( - identlist[1]) is None - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - # A was found remotely and is therefore in the update list - assert len(update) == 1 - assert update[0].name == "A" - # B does not exist on the (simulated) remote server - assert len(insert) == 1 - assert insert[0].name == "B" - - -def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the same record type are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("C") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="C").add_parent("BR").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 1 - assert len(insert) == 2 - - -def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_test): - # test whether multiple references of the different record types are correctly used - crawler = crawler_mocked_for_backref_test - referenced = db.Record(name="B").add_parent("D") - entlist = [referenced, - db.Record(name="A").add_parent("BR").add_property("ref", referenced), - db.Record(name="A").add_parent("BR2").add_property("ref", referenced), - ] - - # test whether both entities are listed in the backref attribute of the identifiable - referencing_entities = crawler.create_reference_mapping(entlist) - identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities) - assert len(identifiable.backrefs) == 2 - - # check the split... - insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist)) - assert len(update) == 2 - assert len(insert) == 1 diff --git a/unittests/test_tool_extended.py b/unittests/test_tool_extended.py deleted file mode 100644 index d0b431a539a15e3e83906540c69becff437742ec..0000000000000000000000000000000000000000 --- a/unittests/test_tool_extended.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/python -# Tests for the tool using pytest -# Adapted from check-sfs -# A. 
Schlemmer, 06/2021 - -from caoscrawler import Crawler -from caoscrawler.structure_elements import File, DictTextElement, DictListElement -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter -from functools import partial -from copy import deepcopy -from unittest.mock import MagicMock, Mock -from os.path import join, dirname, basename -import yaml -import caosdb as db -from caosdb.apiutils import compare_entities - -import pytest -from pytest import raises - - -def rfp(*pathcomponents): - """ - Return full path. - Shorthand convenience function. - """ - return join(dirname(__file__), *pathcomponents) - - -def dircheckstr(*pathcomponents, structure_element_type="Directory"): - """ - Return the debug tree identifier for a given path. - """ - return ("caoscrawler.structure_elements." + structure_element_type + ": " + - basename(join(*pathcomponents)) + ", " + - rfp("test_directories", "examples_article", *pathcomponents)) - - -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "examples_article"), - rfp("scifolder_extended.yml")) - return crawler - - -# @pytest.fixture -# def ident(crawler): -# ident = LocalStorageIdentifiableAdapter() -# crawler.identifiableAdapter = ident - -# ident.restore_state(rfp("records.xml")) - -# ident.register_identifiable( -# "Person", db.RecordType() -# .add_parent(name="Person") -# .add_property(name="first_name") -# .add_property(name="last_name")) -# ident.register_identifiable( -# "Measurement", db.RecordType() -# .add_parent(name="Measurement") -# .add_property(name="identifier") -# .add_property(name="date") -# .add_property(name="project")) -# ident.register_identifiable( -# "Project", db.RecordType() -# .add_parent(name="Project") -# .add_property(name="date") -# .add_property(name="identifier")) -# return ident - - -def test_file_structure_generation(crawler): - sd = crawler.debug_tree[dircheckstr("SimulationData", - "2020_climate-model-predict", "2020-02-01", - "README.md", structure_element_type="File")] - assert sd[1]["ReadmeFile"].role == "File" - assert len(sd[1]["ReadmeFile"].path) > 0 - assert len(sd[1]["ReadmeFile"].file) > 0 diff --git a/unittests/test_transformers.py b/unittests/test_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..a2d227adc5b0c6a8f2f96cb054e1c7670e980e10 --- /dev/null +++ b/unittests/test_transformers.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +Unit test functions for the transformer feature of the scanner. + +Currently, this is under development. 
+See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/107 +""" + +import importlib +from pathlib import Path +from unittest.mock import Mock + +import pytest +from caoscrawler.converters import Converter, ListElementConverter +from caoscrawler.scanner import create_transformer_registry, scan_directory +from caoscrawler.stores import GeneralStore +from caoscrawler.transformer_functions import (cast_to_bool, cast_to_float, + cast_to_int, cast_to_str, + replace, split) +from pytest import raises + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "MarkdownFile": { + "converter": "MarkdownFileConverter", + "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, + "DictElement": { + "converter": "DictElementConverter", + "package": "caoscrawler.converters"}, + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "ListElement": { + "converter": "ListElementConverter", + "package": "caoscrawler.converters"}, + "JSONFile": { + "converter": "JSONFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +def test_simple_transformer(): + """ + Test the correct list of records returned by the scanner for the + transformer example in the test directories. + """ + + records = scan_directory(UNITTESTDIR / "test_directories" / "test_transformers", + UNITTESTDIR / "test_directories" / "test_transformers" / + "cfood.yml") + + for r in records: + if r.parents[0].name == "DayFolder": + assert r.get_property("Day") is not None + assert r.get_property("DayShort") is not None + assert r.get_property("DayShort").value != "$day_short" + if r.get_property("DayShort").value == "Unk": + # This unknown folder should not lead to a replacement + assert r.get_property("Day").value == "Unk" + assert r.get_property("DaySplit").value == ["Unk"] + elif r.get_property("DayShort").value == "Mon": + assert r.get_property("Day").value == "Monday" + assert r.get_property("DaySplit").value == ["M", "n"] + elif r.get_property("DayShort").value == "Tue": + assert r.get_property("Day").value == "Tuesday" + assert r.get_property("DaySplit").value == ["Tue"] + else: + # unexpected occurrence of a short form, something wrong with test directories + assert False + elif r.parents[0].name == "Number": + assert r.get_property("num") is not None + assert r.get_property("num").value == "'12345 5 '" + else: + # unknown error, something wrong with test directories + assert False + + +def test_apply_replace(converter_registry): + cfood_def = {"type": 'ListElement', "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'replace': {'insert': ':', "remove": "_"}}]}}} + values = GeneralStore() + values["a"] = "16_45" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16:45"
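The test above hands apply_transformers a plain mapping from function name to callable, and the built-ins used here (replace, split, the cast_to_* family) all share the signature (value, params). A minimal sketch of a custom transformer registered the same way; pad_left and its parameters are hypothetical and not part of the caoscrawler package:

def pad_left(value, params):
    # Left-pad the incoming value; width/fill come from the function's
    # parameters in the cfood, e.g. {'pad_left': {'width': 5, 'fill': '0'}}.
    width = int(params.get("width", 2))
    fill = str(params.get("fill", "0"))
    return str(value).rjust(width, fill)

transformer_functions = {"replace": replace, "pad_left": pad_left}
# conv.apply_transformers(values, transformer_functions) would then resolve
# a 'pad_left' entry in a cfood's 'functions' list to this callable.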
+ + +def test_apply_replace_from_def(converter_registry): + cfood_def = {"type": 'ListElement', "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', 'functions': [{ + 'replace': {'insert': ':', "remove": "_"}}]}}} + values = GeneralStore() + values["a"] = "16_45" + + transformer_functions = create_transformer_registry({}) + # transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16:45" + + +def test_empty_functions_list(converter_registry): + cfood_def = {"type": 'ListElement', + "match_name": ".*", + 'transform': {'test': {'in': '$a', 'out': '$b', + 'functions': []}}} + values = GeneralStore() + values["a"] = "16_45" + + # transformer_functions = create_transformer_registry(crawler_definition) + transformer_functions = {"replace": replace} + + conv = ListElementConverter(definition=cfood_def, name='test', + converter_registry=converter_registry) + + conv.apply_transformers(values, transformer_functions) + assert values['b'] == "16_45" + + +def test_cast_transformer_functions(): + for val in ("True", "true", "False", "false"): + assert type(cast_to_bool(val, {})) == bool + if val[1] == "r": + assert cast_to_bool(val, {}) is True + else: + assert cast_to_bool(val, {}) is False + for val_err in ("jaksdlfj", "0", 1): + with pytest.raises(ValueError): + cast_to_bool(val_err, {}) + assert cast_to_bool(False, {}) is False + assert cast_to_bool(True, {}) is True + + assert cast_to_int("24", {}) == 24 + assert cast_to_int(24.0, {}) == 24 + assert cast_to_int(24, {}) == 24 + assert cast_to_int("-24", {}) == -24 + with pytest.raises(ValueError): + cast_to_int("24dsf", {}) + with pytest.raises(ValueError): + cast_to_int("24.0", {}) + + assert cast_to_float("24", {}) == 24.0 + assert cast_to_float("24.0", {}) == 24.0 + assert cast_to_float(24.0, {}) == 24.0 + assert cast_to_float(24, {}) == 24.0 + with pytest.raises(ValueError): + cast_to_float("24dsf", {}) + + assert cast_to_str(24, {}) == "24" + + +def test_replace_variables(): + vals = GeneralStore() + vals["test"] = "with" + vals["a"] = "str_without_replacement" + conv = Mock() + conv.definition = {} + conv.definition["transform"] = { + "test": { + "in": "$a", + "out": "$a", + "functions": [ + {"replace": { + "remove": "without", + "insert": "$test" + }} + ]}} + Converter.apply_transformers(conv, vals, {"replace": replace}) + assert vals["a"] == "str_with_replacement" diff --git a/unittests/test_unit_cfood.yml b/unittests/test_unit_cfood.yml new file mode 100644 index 0000000000000000000000000000000000000000..214aa49adceedce49a162f380ec453fb8597f215 --- /dev/null +++ b/unittests/test_unit_cfood.yml @@ -0,0 +1,43 @@ +--- +metadata: + crawler-version: 0.9.0 +--- +data: + type: Dict + match_name: '.*' + records: + MyRec: + may_be_overwritten: + value: "12" + unit: K + subtree: + ValueWithUnit: + type: TextElement + match_name: ^value_with_unit$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + value_with_unit: + value: $number + unit: $unit + MayBeOverwritten: + type: TextElement + match_name: ^may_be_overwritten$ + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + may_be_overwritten: + value: $number + unit: $unit + ListOfValues: + type: ListElement + match_name: ^array_with_units$ + subtree: + SingleValueWithUnit: + type: TextElement + match_value: "^(?P<number>\\d+\\.?\\d*)\\s+(?P<unit>.+)" + records: + MyRec: + list_with_unit: + value: +$number + unit: $unit
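The new test_unit_cfood.yml exercises the unit feature: each match_value pattern captures a number and a unit group, the records block assigns them separately to a property's value and unit, and the +$number form appends to the list_with_unit list property. The capture itself can be checked standalone with Python's re module (same pattern as above, with the YAML escaping resolved):

import re

pattern = r"^(?P<number>\d+\.?\d*)\s+(?P<unit>.+)"
m = re.match(pattern, "273.15 K")
assert m is not None
assert m.group("number") == "273.15"
assert m.group("unit") == "K"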
diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b052524957b6f8c1e0378e3153fc06f4f36806 --- /dev/null +++ b/unittests/test_utilities.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +import pytest +from os.path import sep +from caoscrawler.crawl import split_restricted_path +from caoscrawler.utils import MissingImport, get_shared_resource_link + + +def test_split_restricted_path(): + assert split_restricted_path("") == [] + assert split_restricted_path(f"{sep}") == [] + assert split_restricted_path(f"test{sep}") == ["test"] + assert split_restricted_path(f"{sep}test{sep}") == ["test"] + assert split_restricted_path(f"test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}test1{sep}test2{sep}bla") == ["test1", "test2", "bla"] + assert split_restricted_path(f"{sep}test{sep}{sep}bla") == ["test", "bla"] + assert split_restricted_path(f"{sep}{sep}test{sep}bla") == ["test", "bla"] + assert split_restricted_path( + f"{sep}{sep}{sep}test{sep}{sep}bla{sep}{sep}{sep}{sep}") == ["test", "bla"] + + +def test_dummy_class(): + Missing = MissingImport(name="Not Important", hint="Do the thing instead.") + with pytest.raises(RuntimeError) as err_info_1: + print(Missing.__name__) + with pytest.raises(RuntimeError) as err_info_2: + Missing() + with pytest.raises(RuntimeError) as err_info_3: + print(Missing.foo) + + for err_info in (err_info_1, err_info_2, err_info_3): + msg = str(err_info.value) + assert "(Not Important)" in msg + assert msg.endswith("Do the thing instead.") + + MissingErr = MissingImport(name="Not Important", hint="Do the thing instead.", + err=ImportError("Old error")) + with pytest.raises(RuntimeError) as err_info_1: + print(MissingErr.__name__) + with pytest.raises(RuntimeError) as err_info_2: + MissingErr() + with pytest.raises(RuntimeError) as err_info_3: + print(MissingErr.foo) + + for err_info in (err_info_1, err_info_2, err_info_3): + msg = str(err_info.value) + assert "(Not Important)" in msg + orig_msg = str(err_info.value.__cause__) + assert orig_msg == "Old error" + + +def test_shared_resource_link(): + + assert get_shared_resource_link( + "https://example.com/", "file.txt") == "https://example.com/Shared/file.txt" + assert get_shared_resource_link( + "https://example.com", "file.txt") == "https://example.com/Shared/file.txt" + assert get_shared_resource_link( + "https://example.com", "path/to/file.txt") == "https://example.com/Shared/path/to/file.txt" + assert get_shared_resource_link( + "https://example.com/context-root", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt" +
assert get_shared_resource_link( + "https://example.com/context-root/", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt" diff --git a/unittests/test_validation.py b/unittests/test_validation.py new file mode 100644 index 0000000000000000000000000000000000000000..a3215963f67b61241b321a0eb7345f9fe6fde1f2 --- /dev/null +++ b/unittests/test_validation.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test validation +""" +from os.path import join +from pathlib import Path + +import jsonschema +import linkahead as db +import pytest +from caoscrawler.validator import (convert_record, + load_json_schema_from_datamodel_yaml, + validate) +from jsonschema import ValidationError + +UNITTESTDIR = Path(__file__).parent + + +def test_create_json_schema(): + json = load_json_schema_from_datamodel_yaml(join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywords", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + # print(yaml.dump(pobj)) + # print(yaml.dump(json[0])) + assert "Dataset" in json + jsonschema.validate(pobj, json["Dataset"]) + + # Failing test: + r = db.Record() + r.add_parent(name="Dataset") + r.add_property(name="keywordss", value="jakdlfjakdf") + r.add_property(name="dateModified", value="2024-11-16") + + pobj = convert_record(r) + + with pytest.raises(ValidationError, match=".*'keywords' is a required property.*"): + jsonschema.validate(pobj, json["Dataset"]) + + +def test_validation(): + """ + Test for the main validation API function `validate` + """ + json = load_json_schema_from_datamodel_yaml( + join(UNITTESTDIR, "datamodels", "datamodel.yaml")) + r1 = db.Record() + r1.add_parent(name="Dataset") + r1.add_property(name="keywords", value="jakdlfjakdf") + r1.add_property(name="dateModified", value="2024-11-16") + + r2 = db.Record() + r2.add_parent(name="Dataset") + r2.add_property(name="keywordss", value="jakdlfjakdf") + r2.add_property(name="dateModified", value="2024-11-16") + + valres = validate([r1, r2], json) + assert valres[0][0] is True + assert valres[0][1] is None + assert not valres[1][0] + assert isinstance(valres[1][1], ValidationError) + assert valres[1][1].message == "'keywords' is a required property" diff --git a/unittests/test_variable_substitutions.py b/unittests/test_variable_substitutions.py index f6c3b6375a3111faff9d746779805ba16af260b7..c75e37956c1ec24e47ff9cbd9b03572ed4a0f80e 100644 --- a/unittests/test_variable_substitutions.py +++ b/unittests/test_variable_substitutions.py @@ -1,20 +1,47 @@ -#!/bin/python -# Tests for variable substitutions 
-# A. Schlemmer, 05/2022 +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2022 Alexander Schlemmer +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# -from caoscrawler import Crawler -from caoscrawler.structure_elements import File, DictTextElement, DictListElement -from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter from functools import partial -from copy import deepcopy +from os.path import basename, dirname, join +from pathlib import Path from unittest.mock import MagicMock, Mock -from os.path import join, dirname, basename -import yaml -import caosdb as db -from caosdb.apiutils import compare_entities +import linkahead as db import pytest +import yaml +from linkahead.apiutils import compare_entities from pytest import raises +from utils import dircheckstr as dircheckstr_base + +from caoscrawler import Crawler +from caoscrawler.debug_tree import DebugTree +from caoscrawler.identifiable_adapters import (IdentifiableAdapter, + LocalStorageIdentifiableAdapter) +from caoscrawler.scanner import scan_directory +from caoscrawler.structure_elements import (DictListElement, DictTextElement, + File) + +UNITTESTDIR = Path(__file__).parent +dircheckstr = partial(dircheckstr_base, UNITTESTDIR / "test_directories" / + "example_substitutions") def rfp(*pathcomponents): @@ -25,40 +52,20 @@ def rfp(*pathcomponents): return join(dirname(__file__), *pathcomponents) -def dircheckstr(element_type, *pathcomponents): - """ - Return the debug tree identifier for a given path. - """ - return "caoscrawler.structure_elements." 
+ element_type + ": " + basename(join(*pathcomponents)) + ", " + rfp("test_directories", "example_substitutions", *pathcomponents) - - -@pytest.fixture -def crawler(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), - rfp("test_directories", "example_substitutions", "substitutions.yml")) - return crawler - - -@pytest.fixture -def crawler_2(): - crawler = Crawler(debug=True) - crawler.crawl_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), - rfp("test_directories", "example_substitutions", - "substitutions_parents.yml")) - return crawler - +def test_substitutions(): -def test_substitutions(crawler): + dbt = DebugTree() + scan_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", "substitutions.yml"), + debug_tree=dbt) # @review Florian Spreckelsen 2022-05-13 for i in range(2): - subd = crawler.debug_tree[dircheckstr( - "File", "ExperimentalData", "220512_data.dat")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData", "220512_data.dat")] assert subd[i]["Experiment"].get_property("date").value == "2022-05-12" assert isinstance(subd[i]["ExperimentSeries"].get_property( "Experiment").value, db.Record) - subd = crawler.debug_tree[dircheckstr("Directory", "ExperimentalData")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData")] assert subd[i]["Project"].name == "project" assert isinstance(subd[i]["Project"].get_property( "Experiments").value, list) @@ -70,12 +77,16 @@ def test_substitutions(crawler): "dates").value[0] == "2022-05-12" -def test_substitutions_parents(crawler_2): +def test_substitutions_parents(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", + "substitutions_parents.yml"), + debug_tree=dbt) # This is a test for: # https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/35 # ... testing whether variable substitutions can be used in parent declarations. - subd = crawler_2.debug_tree[dircheckstr( - "File", "ExperimentalData", "220512_data.dat")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData", "220512_data.dat")] # subd[0] <- generalStore # subd[1] <- recordStore @@ -85,12 +96,16 @@ def test_substitutions_parents(crawler_2): assert parents[1].name == "Month_05" -def test_empty_parents(crawler_2): +def test_empty_parents(): + dbt = DebugTree() + scan_directory(rfp("test_directories", "example_substitutions", "ExperimentalData"), + rfp("test_directories", "example_substitutions", + "substitutions_parents.yml"), + debug_tree=dbt) # This is a test for: # https://gitlab.com/caosdb/caosdb-crawler/-/issues/8 - subd = crawler_2.debug_tree[dircheckstr( - "File", "ExperimentalData", "220512_data.dat")] + subd = dbt.debug_tree[dircheckstr("ExperimentalData", "220512_data.dat")] parents = subd[1]["RecordWithoutParents"].get_parents() assert len(parents) == 0 diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..e8869ef6ffad511159a583a14fd49d2fad48766b --- /dev/null +++ b/unittests/test_xml_converter.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
+# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +test the XML converters +""" +import importlib +from pathlib import Path + +import pytest +import yaml +from lxml.etree import fromstring + +from caoscrawler.converters import (XMLAttributeNodeConverter, XMLTagConverter, + XMLTextNodeConverter) +from caoscrawler.scanner import load_definition +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import XMLTagElement + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "XMLTag": { + "converter": "XMLTagConverter", + "package": "caoscrawler.converters"}, + + "XMLTextNode": { + "converter": "XMLTextNodeConverter", + "package": "caoscrawler.converters"}, + "XMLAttributeNode": { + "converter": "XMLAttributeNodeConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.fixture +def basic_xmltag_converter(converter_registry): + return XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: a +match_attrib: # default is the empty dictionary + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character +match_text: \\s*(?P<node_text>.+)\\s* + +subtree: + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 +"""), "TestXMLTagConverter", converter_registry) + + +@pytest.fixture +def basic_xpath_xmltag_converter(converter_registry): + return XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: a +match_attrib: # default is the empty dictionary + "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + alt: (.+) # this attribute must be present and contain at least one character +match_text: \\s*(?P<node_text>.+)\\s* +xpath: child::*/* + +subtree: + img: + type: XMLTag + match_name: img + match_attrib: + src: test2 + testnode: + type: XMLTag + match_name: testnode +"""), "TestXMLTagConverter", converter_registry) + + +def test_simple_xml(basic_xmltag_converter): + """ + Test for basic xml conversion functionality. + """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + xml = fromstring(xml_text) + tag = XMLTagElement(xml) + assert tag.name == "." 
+ + m = basic_xmltag_converter.match(tag) + + assert m is not None + assert m["ref"] == "href" + assert m["number"] == "1" + assert m["node_text"] == "test " + + +def test_not_matching(basic_xmltag_converter): + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1"> + test <img src="test2"/> + </a> + """))) + + assert m is None # alt-attribute was missing + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href attribute did not match + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" url="http" alt="no link"> + test <img src="test2"/> + </a> + """))) + + assert m is None # href and url must not be present simultaneously + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" alt="no link"><img src="test2"/></a> + """))) + + assert m is None # text node is empty + + m = basic_xmltag_converter.match(XMLTagElement(fromstring(""" + <a href="test1" alt="no link"/> + """))) + + assert m is None # text node is empty + + # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node + # TODO: adapt tests + # TODO: how to match " ajskdlfjaldsf ajsdklfjadkl " without the whitespaces in regexp correctly? + + +def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter): + """ + Test for xml conversion including children. + """ + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"/> + </a> + """ + + tag = XMLTagElement(fromstring(xml_text)) + m = basic_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + children = basic_xmltag_converter.create_children(general_store, tag) + + assert len(children) == 1 + assert isinstance(children[0], XMLTagElement) + assert children[0].name == "img" + + xml_text = """ + <a href="test1" alt="no link"> + test <img src="test2"> + <testnode/> </img> + </a> + """ + + tag = XMLTagElement(fromstring(xml_text)) + m = basic_xpath_xmltag_converter.match(tag) + assert m is not None + + general_store = GeneralStore() + children = basic_xpath_xmltag_converter.create_children(general_store, tag) + + assert len(children) == 1 + assert isinstance(children[0], XMLTagElement) + assert children[0].name == "img/testnode" + + +def test_namespace_xml(converter_registry): + """ + Test for xml conversion including children. + Nodes have namespaces. 
+ """ + + xml_text = """ + <root xmlns="default-namespace" xmlns:test="alternative-namespace"> + <node1 active="true"> + Bla + </node1> + <node1 active="true" size="45"> + text + <node2 xmlns="sub-namespace"> + <node3> + ok + </node3> + </node2> + <test:node2> + sep + </test:node2> + </node1> + </root> +""" + + # Test unsupported xpath (containing text()): + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1/text()" +default_namespace: default +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + + with pytest.raises(RuntimeError, match="Only standard xml nodes.*"): + converter.create_children(GeneralStore(), tag) + + # Test complex xml using namespaces and text nodes: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +default_namespace: default +attribs_as_children: false +text_as_children: true +tags_as_children: false +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + assert children[0].name == "{default-namespace}node1[1]/text()" + assert children[0].value.strip() == "Bla" + assert children[1].name == "{default-namespace}node1[2]/text()" + assert children[1].value.strip() == "text" + + # Check child generation of attributes: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "default:node1" +default_namespace: default +attribs_as_children: true +text_as_children: false +tags_as_children: false +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + + assert len(children) == 3 + assert children[0].name == "{default-namespace}node1[1]@active" + assert children[0].value.strip() == "true" + assert children[1].name == "{default-namespace}node1[2]@active" + assert children[1].value.strip() == "true" + assert children[2].name == "{default-namespace}node1[2]@size" + assert children[2].value.strip() == "45" + + # Test setting nsmap entries: + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "{default-namespace}root" +xpath: "//s:node2" +default_namespace: default +nsmap: + s: sub-namespace +"""), "TestXMLTagConverter", converter_registry) + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 1 + assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2" + + +def test_attrib_nodes(converter_registry): + """ + Test attribute node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . 
+tags_as_children: false +attribs_as_children: true +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 2 + + attrib_converter = XMLAttributeNodeConverter(yaml.safe_load(""" +type: XMLAttributeNode +match_name: active +match_value: (?P<val>.*) +"""), "TestXMLAttributeNodeConverter", converter_registry) + m = attrib_converter.match(children[1]) + assert m is None + m = attrib_converter.match(children[0]) + assert m is not None + assert m["val"] == "true" + + attrib_converter = XMLAttributeNodeConverter(yaml.safe_load(""" +type: XMLAttributeNode +match_name: size +match_value: (?P<val>.*) +"""), "TestXMLAttributeNodeConverter", converter_registry) + m = attrib_converter.match(children[0]) + assert m is None + m = attrib_converter.match(children[1]) + assert m is not None + assert m["val"] == "45" + + +def test_text_nodes(converter_registry): + """ + Test text node converters. + """ + + xml_text = """ + <node1 active="true" size="45"> + Bla + </node1> +""" + + converter = XMLTagConverter(yaml.safe_load(""" +type: XMLTag +match_tag: "node1" +xpath: . +tags_as_children: false +text_as_children: true +"""), "TestXMLTagConverter", converter_registry) + + tag = XMLTagElement(fromstring(xml_text)) + m = converter.match(tag) + assert m is not None + children = converter.create_children(GeneralStore(), tag) + assert len(children) == 1 + + text_converter = XMLTextNodeConverter(yaml.safe_load(""" +type: XMLTextNode +match_text: \\s*(?P<val>\\w*)\\s* +"""), "TestXMLTextNodeConverter", converter_registry) + m = text_converter.match(children[0]) + assert m is not None + assert m["val"] == "Bla" diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..451d23c93bfc15889d5b7a9f97ef1f157aece6ee --- /dev/null +++ b/unittests/test_zipfile_converter.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# + +""" +test the zip-file converter +""" +import importlib +import os +from pathlib import Path + +import pytest +import yaml +from caoscrawler.converters import DirectoryConverter, ZipFileConverter +from caoscrawler.stores import GeneralStore +from caoscrawler.structure_elements import Directory, File + +UNITTESTDIR = Path(__file__).parent + + +@pytest.fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "ZipFile": { + "converter": "ZipFileConverter", + "package": "caoscrawler.converters"}, + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry + + +@pytest.mark.xfail( + reason="The example files for PASTA have not yet been updated in: " + "https://github.com/TheELNConsortium/TheELNFileFormat/tree/master/examples/PASTA " + "However, there was the announcement that these files are going to follow the " + "flattened structure soon: https://github.com/TheELNConsortium/TheELNFileFormat/issues/98" +) +def test_zipfile_converter(converter_registry): + zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ +"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 1 + assert children[0].name == "PASTA" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^PASTA$ +"""), "TestDirectory", converter_registry) + match = dir_conv.match(children[0]) + assert match is not None + children = dir_conv.create_children(GeneralStore(), children[0]) + assert len(children) == 5 + print(children) + for i in range(2): + assert isinstance(children[i], Directory) + for i in range(2, 5): + assert isinstance(children[i], File) + + +def test_zipfile_minimal(converter_registry): + zipfile = File("empty.zip", os.path.join(UNITTESTDIR, "zip_minimal", "empty.zip")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ +"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 2 + + file_obj = None + dir_obj = None + for ch in children: + if isinstance(ch, File): + file_obj = ch + elif isinstance(ch, Directory): + dir_obj = ch + else: + assert False + assert file_obj is not None and dir_obj is not None + assert file_obj.name == "empty.txt" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^folder$ +"""), "TestDirectory", converter_registry) + match = dir_conv.match(dir_obj) + assert match is not None + children = dir_conv.create_children(GeneralStore(), dir_obj) + assert len(children) == 3 + for i in range(3): + assert isinstance(children[i], File)
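The converter chain exercised above generalizes beyond the ELN example: ZipFileConverter.create_children unpacks the archive and yields ordinary Directory and File structure elements, which downstream converters match exactly as they would for an on-disk tree. A minimal sketch of driving it outside pytest; the file name, path and converter name here are hypothetical, while the calls mirror the tests above:

eln = File("export.eln", "/tmp/export.eln")  # hypothetical .eln archive
zip_conv = ZipFileConverter(yaml.safe_load("""
type: ZipFile
match: .*\\.eln$
"""), "ElnArchive", converter_registry)
if zip_conv.match(eln) is not None:
    # Children are Directory/File elements extracted from the archive.
    for child in zip_conv.create_children(GeneralStore(), eln):
        print(type(child).__name__, child.name)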
diff --git a/unittests/utils.py b/unittests/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fee80e44028667b9b3c8c8f8201b1a774c46afdf --- /dev/null +++ b/unittests/utils.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2023 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2023 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +""" +utilities for tests +""" +import os +from pathlib import Path + +UNITTESTDIR = Path(__file__).parent + + +def dircheckstr(prefix, *pathcomponents): + """ + Return the debug tree identifier for a given path. + """ + if os.path.isdir(os.path.join(prefix, *pathcomponents)): + ftype = "Directory" + else: + ftype = "File" + return (f"caoscrawler.structure_elements.structure_elements.{ftype}: " + os.path.basename( + os.path.join(*pathcomponents)) + ", " + os.path.join(prefix, *pathcomponents)) diff --git a/unittests/zip_minimal/empty.zip b/unittests/zip_minimal/empty.zip new file mode 100644 index 0000000000000000000000000000000000000000..3eb2cee755e1b0265b13b1ee8f31c2aa1abe62de Binary files /dev/null and b/unittests/zip_minimal/empty.zip differ
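As a usage note on the new shared helper: dircheckstr builds the key under which a DebugTree stores a scanned structure element, so tests such as test_variable_substitutions.py above can look results up by path after binding the directory prefix once with functools.partial. A short sketch, with /path/to/unittests standing in for the real checkout location and assuming the path exists as a directory on disk (the helper falls back to "File" otherwise):

from functools import partial
from utils import dircheckstr

check = partial(dircheckstr, "/path/to/unittests/test_directories/example_substitutions")
key = check("ExperimentalData")
# key == "caoscrawler.structure_elements.structure_elements.Directory: "
#        "ExperimentalData, /path/to/unittests/test_directories/"
#        "example_substitutions/ExperimentalData"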