diff --git a/.gitlab/issue_templates/Default.md b/.gitlab/issue_templates/Default.md new file mode 100644 index 0000000000000000000000000000000000000000..aa1a65aca363b87aff50280e1a86824009d2098b --- /dev/null +++ b/.gitlab/issue_templates/Default.md @@ -0,0 +1,28 @@ +## Summary + +*Please give a short summary of what the issue is.* + +## Expected Behavior + +*What did you expect how the software should behave?* + +## Actual Behavior + +*What did the software actually do?* + +## Steps to Reproduce the Problem + +*Please describe, step by step, how others can reproduce the problem. Please try these steps for yourself on a clean system.* + +1. +2. +3. + +## Specifications + +- Version: *Which version of this software?* +- Platform: *Which operating system, which other relevant software versions?* + +## Possible fixes + +*Do you have ideas how the issue can be resolved?* diff --git a/.gitlab/merge_request_templates/Default.md b/.gitlab/merge_request_templates/Default.md new file mode 100644 index 0000000000000000000000000000000000000000..7859b7be21fb1c3eda91ee35173a8e3412a62066 --- /dev/null +++ b/.gitlab/merge_request_templates/Default.md @@ -0,0 +1,53 @@ +# Summary + +*Insert a meaningful description for this merge request here: What is the new/changed behavior? +Which bug has been fixed? Are there related issues?* + + +# Focus + +*Point the reviewer to the core of the code change. Where should they start reading? What should +they focus on (e.g. security, performance, maintainability, user-friendliness, compliance with the +specs, finding more corner cases, concrete questions)?* + + +# Test Environment + +*How to set up a test environment for manual testing?* + + +# Check List for the Author + +Please, prepare your MR for a review. Be sure to write a summary and a focus and create gitlab +comments for the reviewer. They should guide the reviewer through the changes, explain your changes +and also point out open questions. For further good practices have a look at [our review +guidelines](https://gitlab.com/caosdb/caosdb/-/blob/dev/REVIEW_GUIDELINES.md) + +- [ ] All automated tests pass +- [ ] Reference related issues +- [ ] Up-to-date CHANGELOG.md (or not necessary) +- [ ] Appropriate user and developer documentation (or not necessary) + - How do I use the software? Assume "stupid" users. + - How do I develop or debug the software? Assume novice developers. +- [ ] Annotations in code (Gitlab comments) + - Intent of new code + - Problems with old code + - Why this implementation? + + +# Check List for the Reviewer + +- [ ] I understand the intent of this MR +- [ ] All automated tests pass +- [ ] Up-to-date CHANGELOG.md (or not necessary) +- [ ] Appropriate user and developer documentation (or not necessary) +- [ ] The test environment setup works and the intended behavior is reproducible in the test + environment +- [ ] In-code documentation and comments are up-to-date. +- [ ] Check: Are there specifications? Are they satisfied? + +For further good practices have a look at [our review guidelines](https://gitlab.com/caosdb/caosdb/-/blob/dev/REVIEW_GUIDELINES.md). + + +/assign me +/target_branch dev diff --git a/CHANGELOG.md b/CHANGELOG.md index f086e1317f05277452659adf3fe20547adab2ae3..c498b9286e0977295066340a2a4172093ac10bfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,43 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.4.0] - 2023-03-22 ## +(Florian Spreckelsen) + +### Added ### + +- DateElementConverter: allows interpreting text as a date object +- the restricted_path argument allows crawling only a subtree +- logging that provides a summary of what is inserted and updated +- You can now access the file system path of a structure element (if it has one) using the variable + name ``<converter name>.path`` +- ``add_prefix`` and ``remove_prefix`` arguments for the command line interface + and the ``crawler_main`` function for the adding/removal of path prefixes when + creating file entities. + +### Changed ### + +- The definitions for the default converters were removed from crawl.py and placed into + a separate yaml file called `default_converters.yml`. There is a new test testing for + the correct loading behavior of that file. +- JSONFileConverter, YAMLFileConverter and MarkdownFileConverter now inherit from + SimpleFileConverter. Behavior is unchanged, except that the MarkdownFileConverter now raises a + ConverterValidationError when the YAML header cannot be read instead of silently not matching. + +### Deprecated ### + +- The ``prefix`` argument of `crawler_main` is deprecated. Use the new argument + ``remove_prefix`` instead. + +### Removed ### +- The command line argument ``--prefix``. Use the new argument ``--remove-prefix`` instead. + +### Fixed ### + +- an empty string as name is treated as no name (as does the server). This fixes + queries for identifiables, since they would otherwise contain "WITH name=''", + which is an impossible condition. If your cfoods contained this case, they are ill-defined. + ## [0.3.0] - 2022-01-30 ## (Florian Spreckelsen) diff --git a/CITATION.cff b/CITATION.cff index ad00d0edb29ecfe2edf4b1aeb621ff35f8304f90..9c8bf551c41a6a3447b076914741b349a8c72b9c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,6 @@ authors: given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 title: CaosDB - Crawler -version: 0.3.0 +version: 0.4.0 doi: 10.3390/data4020083 -date-released: 2023-01-30 \ No newline at end of file +date-released: 2023-03-22 \ No newline at end of file diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..ba220626460c559aeded69d360c85917e0c78066 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,57 @@ +# Installation ## + + +## Linux #### + +Make sure that Python (at least version 3.8) and pip are installed, using your system tools and +documentation. + +Then open a terminal and continue in the [Generic installation](#generic-installation) section. + +## Windows #### + +If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download +for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of the +packages you will ever need out of the box. If you prefer, you may also install the leaner +"Miniconda" installer, which allows you to install packages as you need them. + +After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic +installation](#generic-installation) section.
+ +## MacOS #### + +If there is no Python 3 installed yet, there are two main ways to +obtain it: Either get the binary package from +[python.org](https://www.python.org/downloads/) or, for advanced +users, install via [Homebrew](https://brew.sh/). After installation +from python.org, it is recommended to also update the TLS certificates +for Python (this requires administrator rights for your user): + +```sh +# Replace this with your Python version number: +cd /Applications/Python\ 3.9/ + +# This needs administrator rights: +sudo ./Install\ Certificates.command +``` + +After these steps, you may continue with the [Generic +installation](#generic-installation). + +## Generic installation #### + +The CaosDB crawler is available as [PyPI +package](https://pypi.org/project/caoscrawler/) and can simply be installed by + +```sh +pip3 install caoscrawler +``` + +Alternatively, obtain the sources from GitLab and install from there (`git` must +be installed for this option): + +```sh +git clone https://gitlab.com/caosdb/caosdb-crawler +cd caosdb-crawler +pip3 install --user . +``` diff --git a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml index 7a64d708667182b80b739812e5fdf3369fc5b462..37a34d125dcff1d121b1bded2fe959c4d30ff403 100644 --- a/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml +++ b/integrationtests/test_data/extroot/realworld_example/dataset_cfoods.yml @@ -153,6 +153,13 @@ Data: metadata_json: &metadata_json_template type: JSONFile match: metadata.json + records: + JSONFile: + parents: + - JSONFile + role: File + path: ${metadata_json.path} + file: ${metadata_json.path} validate: schema/dataset.schema.json subtree: jsondict: diff --git a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json index 01653bfa821e0a0acbb5a481bfd458e2ed784fb9..36233230ae05f9df58ae4e492ff1f709322f6e51 100644 --- a/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json +++ b/integrationtests/test_data/extroot/realworld_example/schema/dataspace.schema.json @@ -9,6 +9,7 @@ "minimum": 20000 }, "archived": { "type": "boolean" }, + "JSONFile": { "type": "object" }, "url": { "type": "string", "description": "link to folder on file system (CaosDB or cloud folder)" diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml index 6495e1828dc56e99459c162f7751951f880ea55c..c55be2157a1f079ecfb5809c3658586f9114fad1 100644 --- a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml +++ b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml @@ -25,8 +25,8 @@ extroot: parents: - mdfile role: File - path: $DataFile - file: $DataFile + path: ${DataFile.path} + file: ${DataFile.path} Experiment: mdfile: $mdfile @@ -68,8 +68,8 @@ extroot: parents: - mdfile role: File - path: $DataFile - file: $DataFile + path: ${DataFile.path} + file: ${DataFile.path} Experiment: {} diff --git a/integrationtests/test_realworld_example.py b/integrationtests/test_realworld_example.py index 4158ed22278ef5c871a22d45885e58fbfa84ea3b..cb5ed2c769945af033bc56a2d6af3bf1cec86de4 100644 --- a/integrationtests/test_realworld_example.py +++ b/integrationtests/test_realworld_example.py @@ -25,6 +25,7 @@ an integration test module that runs a test against a (close
to) real world example """ from caosdb.utils.register_tests import clear_database, set_test_key +import logging import json import os @@ -35,6 +36,7 @@ from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.structure_elements import Directory import pytest from caosadvancedtools.models.parser import parse_model_from_json_schema, parse_model_from_yaml +from caosadvancedtools.loadFiles import loadpath import sys @@ -52,6 +54,17 @@ def rfp(*pathcomponents): DATADIR = rfp("test_data", "extroot", "realworld_example") +@pytest.fixture +def addfiles(): + loadpath(path='/opt/caosdb/mnt/extroot/', + include=None, + exclude=None, + prefix="", + dryrun=False, + forceAllowSymlinks=True, + ) + + @pytest.fixture def usemodel(): # First load dataspace data model @@ -85,22 +98,21 @@ def create_identifiable_adapter(): return ident -def test_dataset(clear_database, usemodel): - ident = create_identifiable_adapter() - crawler = Crawler(identifiableAdapter=ident) - crawler_definition = crawler.load_definition( - os.path.join(DATADIR, "dataset_cfoods.yml")) - # print(json.dumps(crawler_definition, indent=3)) - # Load and register converter packages: - converter_registry = crawler.load_converters(crawler_definition) - # print("DictIntegerElement" in converter_registry) - - records = crawler.start_crawling( - Directory("data", os.path.join(DATADIR, 'data')), - crawler_definition, - converter_registry +def test_dataset(clear_database, usemodel, addfiles, caplog): + caplog.set_level(logging.DEBUG, logger="caoscrawler") + identifiable_path = os.path.join(DATADIR, "identifiables.yml") + crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") + crawler_main( + os.path.join(DATADIR, 'data'), + crawler_definition_path, + identifiable_path, + True, + os.path.join(DATADIR, "provenance.yml"), + False, + remove_prefix=DATADIR, + # this test will fail without this prefix since the crawler would try to create new files + add_prefix="/extroot/realworld_example" ) - crawler.synchronize() dataspace = db.execute_query("FIND RECORD Dataspace WITH name=35 AND dataspace_id=20002 AND " "archived=FALSE AND url='https://datacloud.de/index.php/f/7679'" @@ -119,13 +131,17 @@ def test_dataset(clear_database, usemodel): "start_datetime='2022-02-10T16:36:48+01:00'") == 1 assert db.execute_query(f"FIND Event WITH latitude=53", unique=True) + # test logging + assert "Executed inserts" in caplog.text + assert "Going to insert" in caplog.text + assert "Executed updates" in caplog.text + -def test_event_update(clear_database, usemodel): +def test_event_update(clear_database, usemodel, addfiles): identifiable_path = os.path.join(DATADIR, "identifiables.yml") crawler_definition_path = os.path.join(DATADIR, "dataset_cfoods.yml") - # TODO(fspreck): Use crawler_main crawler_main( os.path.join(DATADIR, 'data'), crawler_definition_path, @@ -133,7 +149,9 @@ def test_event_update(clear_database, usemodel): True, os.path.join(DATADIR, "provenance.yml"), False, - "" + remove_prefix=DATADIR, + # this test will fail without this prefix since the crawler would try to create new files + add_prefix="/extroot/realworld_example" ) old_dataset_rec = db.execute_query( diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py index 91c523be90a4d0117a7cc54217cae0b911511957..5fc0f6c7d85a0fce4490c72952e711fe241a0099 100644 --- a/integrationtests/test_use_case_simple_presentation.py +++ b/integrationtests/test_use_case_simple_presentation.py @@ -38,9 +38,7 
@@ DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "extroot", "use_case_simple_presentation") -def test_complete_crawler( - clear_database -): +def test_complete_crawler(clear_database): # Setup the data model: model = parser.parse_model_from_yaml(os.path.join(DATADIR, "model.yml")) model.sync_data_model(noquestion=True, verbose=False) @@ -57,13 +55,24 @@ def test_complete_crawler( dryrun=False, forceAllowSymlinks=False) + # test that a bad value for "remove_prefix" leads to runtime error + with pytest.raises(RuntimeError) as re: + crawler_main(DATADIR, + os.path.join(DATADIR, "cfood.yml"), + os.path.join(DATADIR, "identifiables.yml"), + True, + os.path.join(DATADIR, "provenance.yml"), + False, + remove_prefix="sldkfjsldf") + assert "path does not start with the prefix" in str(re.value) + crawler_main(DATADIR, os.path.join(DATADIR, "cfood.yml"), os.path.join(DATADIR, "identifiables.yml"), True, os.path.join(DATADIR, "provenance.yml"), False, - "/use_case_simple_presentation") + remove_prefix=os.path.abspath(DATADIR)) res = db.execute_query("FIND Record Experiment") assert len(res) == 1 diff --git a/setup.cfg b/setup.cfg index e16a49cbbb55699db9abd37fbc5890eca5634ef6..fbdd9d7119312e2831c77fe3e8b24bd16b5826b4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.3.0 +version = 0.4.0 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for caosdb diff --git a/src/caoscrawler/cfood-schema.yml b/src/caoscrawler/cfood-schema.yml index 5e724c83695e098ce980e1aa8e81c65ae8525e19..b0d77bbf5d7ba09df3c0c47d656fa3d22d07b6d2 100644 --- a/src/caoscrawler/cfood-schema.yml +++ b/src/caoscrawler/cfood-schema.yml @@ -27,6 +27,7 @@ cfood: - BooleanElement - Definitions - Dict + - Date - JSONFile - CSVTableConverter - XLSXTableConverter diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index d4e25f73a8a9e7dad42c50d907745dfb7329bb13..80a3728ce5b1f413d2bdd674b26a7dca1122eef5 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -43,6 +43,8 @@ from string import Template import yaml_header_tools import pandas as pd +import logging + import yaml @@ -51,6 +53,12 @@ import yaml SPECIAL_PROPERTIES = ("description", "name", "id", "path", "file", "checksum", "size") +logger = logging.getLogger(__name__) + + +class CrawlerTemplate(Template): + braceidpattern = r"(?a:[_a-z][_\.a-z0-9]*)" + def _only_max(children_with_keys): @@ -106,6 +114,19 @@ class ConverterValidationError(Exception): self.message = msg +def create_path_value(func): + """decorator for create_values functions that adds a value containing the path + + should be used for StructureElement that are associated with file system objects that have a + path, like File or Directory. 
+ """ + + def inner(self, values: GeneralStore, element: StructureElement): + func(self, values=values, element=element) + values.update({self.name + ".path": element.path}) + return inner + + def replace_variables(propvalue, values: GeneralStore): """ This function replaces variables in property values (and possibly other locations, @@ -129,7 +150,7 @@ def replace_variables(propvalue, values: GeneralStore): if isinstance(values[varname], db.Entity): return values[varname] - propvalue_template = Template(propvalue) + propvalue_template = CrawlerTemplate(propvalue) return propvalue_template.safe_substitute(**values.get_storage()) @@ -237,7 +258,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict continue # Allow replacing variables in keys / names of properties: - key_template = Template(key) + key_template = CrawlerTemplate(key) key = key_template.safe_substitute(**values.get_storage()) keys_modified.append((name, key)) @@ -325,9 +346,7 @@ class Converter(object, metaclass=ABCMeta): return converter - def create_values(self, - values: GeneralStore, - element: StructureElement): + def create_values(self, values: GeneralStore, element: StructureElement): """ Extract information from the structure element and store them as values in the general store. @@ -346,13 +365,14 @@ class Converter(object, metaclass=ABCMeta): element: StructureElement): pass - def create_records(self, values: GeneralStore, - records: RecordStore, + def create_records(self, values: GeneralStore, records: RecordStore, element: StructureElement): + # TODO why is element passed but not used??? if "records" not in self.definition: return [] + # TODO please rename due to conflict return create_records(values, records, self.definition["records"]) @@ -364,7 +384,8 @@ class Converter(object, metaclass=ABCMeta): if rule not in FILTER_FUNCTIONS: raise RuntimeError( - f"{rule} is not a known filter rule. Only {list(FILTER_FUNCTIONS.keys())} are implemented." + f"{rule} is not a known filter rule. Only " + f"{list(FILTER_FUNCTIONS.keys())} are implemented." 
) to_be_filtered = [] @@ -391,19 +412,21 @@ class Converter(object, metaclass=ABCMeta): pass @staticmethod - def _debug_matching_template(name: str, regexp: list[str], matched: list[str], result: Optional[dict]): + def _debug_matching_template(name: str, regexp: list[str], matched: list[str], + result: Optional[dict]): """ Template for the debugging output for the match function """ - print("\n--------", name, "-----------") + msg = "\n--------" + name + "-----------" for re, ma in zip(regexp, matched): - print("matching against:\n" + re) - print("matching:\n" + ma) - print("---------") + msg += "matching against:\n" + re + msg += "matching:\n" + ma + msg += "---------" if result is None: - print("No match") + msg += "No match" else: - print("Matched groups:") - print(result) - print("----------------------------------------") + msg += "Matched groups:" + msg += str(result) + msg += "----------------------------------------" + logger.debug(msg) @staticmethod def debug_matching(kind=None): @@ -471,6 +494,10 @@ class DirectoryConverter(Converter): return children + @create_path_value + def create_values(self, values: GeneralStore, element: StructureElement): + super().create_values(values=values, element=element) + def typecheck(self, element: StructureElement): return isinstance(element, Directory) @@ -518,6 +545,10 @@ class SimpleFileConverter(Converter): def create_children(self, generalStore: GeneralStore, element: StructureElement): return list() + @create_path_value + def create_values(self, values: GeneralStore, element: StructureElement): + super().create_values(values=values, element=element) + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance @@ -536,7 +567,7 @@ class FileConverter(SimpleFileConverter): super().__init__(*args, **kwargs) -class MarkdownFileConverter(Converter): +class MarkdownFileConverter(SimpleFileConverter): """ reads the yaml header of markdown files (if a such a header exists). """ @@ -546,8 +577,18 @@ class MarkdownFileConverter(Converter): if not isinstance(element, File): raise RuntimeError("A markdown file is needed to create children.") - header = yaml_header_tools.get_header_from_file( - element.path, clean=False) + try: + header = yaml_header_tools.get_header_from_file( + element.path, clean=False) + except yaml_header_tools.NoValidHeader: + if generalStore is not None and self.name in generalStore: + path = generalStore[self.name] + else: + path = "<path not set>" + raise ConverterValidationError( + "Error during the validation (yaml header cannot be read) of the markdown file " + "located at the following node in the data structure:\n" + f"{path}") children: List[StructureElement] = [] for name, entry in header.items(): @@ -560,25 +601,6 @@ class MarkdownFileConverter(Converter): "Header entry {} has incompatible type.".format(name)) return children - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not isinstance(element, File): - raise RuntimeError("Element must be a file.") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - try: - yaml_header_tools.get_header_from_file(element.path) - except yaml_header_tools.NoValidHeader: - # TODO(salexan): Raise a validation error instead of just not - # matching silently. 
- return None - return m.groupdict() - def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, msg_prefix=""): @@ -685,20 +707,7 @@ class DictDictElementConverter(DictElementConverter): super().__init__(*args, **kwargs) -class JSONFileConverter(Converter): - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not self.typecheck(element): - raise RuntimeError("Element must be a file") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - +class JSONFileConverter(SimpleFileConverter): def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, File): @@ -714,26 +723,13 @@ f"{element.path}\n" + err.message) structure_element = convert_basic_element( json_data, - name=element.name+"_child_dict", + name=element.name + "_child_dict", msg_prefix="The JSON File contained content that was parsed to a Python object" " with an unexpected type.") return [structure_element] -class YAMLFileConverter(Converter): - def typecheck(self, element: StructureElement): - return isinstance(element, File) - - @Converter.debug_matching("name") - def match(self, element: StructureElement): - # TODO: See comment on types and inheritance - if not self.typecheck(element): - raise RuntimeError("Element must be a file") - m = re.match(self.definition["match"], element.name) - if m is None: - return None - return m.groupdict() - +class YAMLFileConverter(SimpleFileConverter): def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, File): @@ -749,7 +745,7 @@ f"{element.path}\n" + err.message) structure_element = convert_basic_element( yaml_data, - name=element.name+"_child_dict", + name=element.name + "_child_dict", msg_prefix="The YAML File contained content that was parsed to a Python object" " with an unexpected type.") return [structure_element] @@ -1101,3 +1097,22 @@ class CSVTableConverter(TableConverter): child_elements.append( DictElement(str(index), row.to_dict())) return child_elements + + +class DateElementConverter(TextElementConverter): + """ + Allows converting different text formats of dates to Python date objects. + + The text to be parsed must be contained in the "date" group. The format string can be supplied + under "date_format" in the Converter definition. The library used is datetime, so see its + documentation for information on how to create the format string.
+ """ + + def match(self, element: StructureElement): + matches = super().match(element) + if matches is not None and "date" in matches: + matches.update({"date": datetime.datetime.strptime( + matches["date"], + self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d" + ).date()}) + return matches diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 6cf025a024e8cc392a7175421d47fb69059302a4..c77dcee1f29eac69732ce353e0271761eca2df13 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -49,6 +49,7 @@ from typing import Any, Optional, Type, Union import caosdb as db +from caosadvancedtools.utils import create_entity_link from caosadvancedtools.cache import UpdateCache, Cache from caosadvancedtools.crawler import Crawler as OldCrawler from caosdb.apiutils import (compare_entities, EntityMergeConflictError, @@ -303,8 +304,7 @@ class Crawler(object): # Validator is given by a path if not value.startswith('/'): # Not an absolute path - definition[key] = os.path.join( - os.path.dirname(definition_path), value) + definition[key] = os.path.join(os.path.dirname(definition_path), value) if not os.path.isfile(definition[key]): # TODO(henrik) capture this in `crawler_main` similar to # `ConverterValidationError`. @@ -312,8 +312,7 @@ class Crawler(object): f"Couldn't find validation file {definition[key]}") elif isinstance(value, dict): # Recursively resolve all validators - definition[key] = self._resolve_validator_paths( - value, definition_path) + definition[key] = self._resolve_validator_paths(value, definition_path) return definition @@ -328,74 +327,13 @@ class Crawler(object): directory: schema.yml file README.md documentation + + TODO: this function does not make use of self, so it could become static. 
""" # Defaults for the converter registry: - converter_registry: dict[str, dict[str, str]] = { - "Directory": { - "converter": "DirectoryConverter", - "package": "caoscrawler.converters"}, - "SimpleFile": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "MarkdownFile": { - "converter": "MarkdownFileConverter", - "package": "caoscrawler.converters"}, - "File": { - "converter": "SimpleFileConverter", - "package": "caoscrawler.converters"}, - "JSONFile": { - "converter": "JSONFileConverter", - "package": "caoscrawler.converters"}, - "YAMLFile": { - "converter": "YAMLFileConverter", - "package": "caoscrawler.converters"}, - "CSVTableConverter": { - "converter": "CSVTableConverter", - "package": "caoscrawler.converters"}, - "XLSXTableConverter": { - "converter": "XLSXTableConverter", - "package": "caoscrawler.converters"}, - "DictBooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "BooleanElement": { - "converter": "BooleanElementConverter", - "package": "caoscrawler.converters"}, - "DictFloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "FloatElement": { - "converter": "FloatElementConverter", - "package": "caoscrawler.converters"}, - "DictTextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, - "DictIntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "IntegerElement": { - "converter": "IntegerElementConverter", - "package": "caoscrawler.converters"}, - "DictListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "ListElement": { - "converter": "ListElementConverter", - "package": "caoscrawler.converters"}, - "DictDictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "DictElement": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - "Dict": { - "converter": "DictElementConverter", - "package": "caoscrawler.converters"}, - } + with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f: + converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f) # More converters from definition file: if "Converters" in definition: @@ -417,11 +355,16 @@ class Crawler(object): value["class"] = getattr(module, value["converter"]) return converter_registry - def crawl_directory(self, dirname: str, crawler_definition_path: str): + def crawl_directory(self, dirname: str, crawler_definition_path: str, + restricted_path: Optional[list[str]] = None): """ Crawl a single directory. Convenience function that starts the crawler (calls start_crawling) with a single directory as the StructureElement. + + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. 
""" crawler_definition = self.load_definition(crawler_definition_path) @@ -444,7 +387,9 @@ class Crawler(object): self.start_crawling(Directory(dir_structure_name, dirname), crawler_definition, - converter_registry) + converter_registry, + restricted_path=restricted_path + ) @staticmethod def initialize_converters(crawler_definition: dict, converter_registry: dict): @@ -472,7 +417,8 @@ class Crawler(object): def start_crawling(self, items: Union[list[StructureElement], StructureElement], crawler_definition: dict, - converter_registry: dict): + converter_registry: dict, + restricted_path: Optional[list[str]] = None): """ Start point of the crawler recursion. @@ -484,6 +430,9 @@ class Crawler(object): crawler_definition : dict A dictionary representing the crawler definition, possibly from a yaml file. + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. Returns ------- @@ -504,8 +453,14 @@ class Crawler(object): # This recursive crawling procedure generates the update list: self.crawled_data: list[db.Record] = [] - self._crawl(items, local_converters, self.generalStore, self.recordStore, [], []) - + self._crawl( + items=items, + local_converters=local_converters, + generalStore=self.generalStore, + recordStore=self.recordStore, + structure_elements_path=[], + converters_path=[], + restricted_path=restricted_path) if self.debug: self.debug_converters = local_converters @@ -942,16 +897,17 @@ class Crawler(object): because some changes in parents (e.g. of Files) might fail if they are not updated first. """ + logger.debug("=== Going to execute parent updates ===") Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) parent_updates = db.Container() - for record in to_be_updated: - old_entity = Crawler._get_entity_by_id(record.id) + for entity in to_be_updated: + old_entity = Crawler._get_entity_by_id(entity.id) # Check whether the parents have been changed and add them if missing # in the old entity: changes_made = False - for parent in record.parents: + for parent in entity.parents: found = False for old_parent in old_entity.parents: if old_parent.id == parent.id: @@ -1061,20 +1017,25 @@ class Crawler(object): referencing_entities) for record in to_be_updated] # Merge with existing data to prevent unwanted overwrites - to_be_updated = self._merge_properties_from_remote(to_be_updated, - identified_records) + to_be_updated = self._merge_properties_from_remote(to_be_updated, identified_records) # remove unnecessary updates from list by comparing the target records # to the existing ones - to_be_updated = self.remove_unnecessary_updates( - to_be_updated, identified_records) + to_be_updated = self.remove_unnecessary_updates(to_be_updated, identified_records) + logger.info(f"Going to insert {len(to_be_inserted)} Entities and update " + f"{len(to_be_inserted)} Entities.") if commit_changes: self.execute_parent_updates_in_list(to_be_updated, securityMode=self.securityMode, run_id=self.run_id, unique_names=unique_names) + logger.info(f"Added parent RecordTypes where necessary.") self.execute_inserts_in_list( to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names) + logger.info(f"Executed inserts:\n" + + self.create_entity_summary(to_be_inserted)) self.execute_updates_in_list( to_be_updated, self.securityMode, self.run_id, unique_names=unique_names) + logger.info(f"Executed updates:\n" + + self.create_entity_summary(to_be_updated)) update_cache = 
UpdateCache() pending_inserts = update_cache.get_inserts(self.run_id) @@ -1089,6 +1050,25 @@ class Crawler(object): return (to_be_inserted, to_be_updated) + @staticmethod + def create_entity_summary(entities: list[db.Entity]): + """ Creates a summary string reprensentation of a list of entities.""" + parents = {} + for el in entities: + for pp in el.parents: + if pp.name not in parents: + parents[pp.name] = [el] + else: + parents[pp.name].append(el) + output = "" + for key, value in parents.items(): + output += f"{key}:\n" + for el in value: + output += create_entity_link(el) + ", " + + output = output[:-2] + "\n" + return output + @staticmethod def inform_about_pending_changes(pending_changes, run_id, path, inserts=False): # Sending an Email with a link to a form to authorize updates is @@ -1156,11 +1136,14 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) with open(filename, "w") as f: f.write(yaml.dump(paths, sort_keys=False)) - def _crawl(self, items: list[StructureElement], + def _crawl(self, + items: list[StructureElement], local_converters: list[Converter], generalStore: GeneralStore, recordStore: RecordStore, - structure_elements_path: list[str], converters_path: list[str]): + structure_elements_path: list[str], + converters_path: list[str], + restricted_path: Optional[list[str]] = None): """ Crawl a list of StructureElements and apply any matching converters. @@ -1169,20 +1152,35 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) treating structure elements. A locally defined converter could be one that is only valid for a specific subtree of the originally cralwed StructureElement structure. - generalStore and recordStore: This recursion of the crawl function should only operate on copies of the - global stores of the Crawler object. + generalStore and recordStore: This recursion of the crawl function should only operate on + copies of the global stores of the Crawler object. + restricted_path: optional, list of strings, traverse the data tree only along the given + path. For example, when a directory contains files a, b and c and b is + given in restricted_path, a and c will be ignroed by the crawler. + When the end of the given path is reached, traverse the full tree as + normal. The first element of the list provided by restricted_path should + be the name of the StructureElement at this level, i.e. denoting the + respective element in the items argument. 
""" + # This path_found variable stores wether the path given by restricted_path was found in the + # data tree + path_found = False + if restricted_path is not None and len(restricted_path) == 0: + restricted_path = None + for element in items: for converter in local_converters: # type is something like "matches files", replace isinstance with "type_matches" # match function tests regexp for example - if (converter.typecheck(element) and - converter.match(element) is not None): + if (converter.typecheck(element) and ( + restricted_path is None or element.name == restricted_path[0]) + and converter.match(element) is not None): + path_found = True generalStore_copy = generalStore.create_scoped_copy() recordStore_copy = recordStore.create_scoped_copy() - # Create an entry for this matched structure element: + # Create an entry for this matched structure element that contains the path: generalStore_copy[converter.name] = ( os.path.join(*(structure_elements_path + [element.get_name()]))) @@ -1196,7 +1194,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) children = converter.create_children(generalStore_copy, element) if self.debug: - # add provenance information for each varaible + # add provenance information for each variable self.debug_tree[str(element)] = ( generalStore_copy.get_storage(), recordStore_copy.get_storage()) self.debug_metadata["copied"][str(element)] = ( @@ -1219,7 +1217,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3])) self._crawl(children, converter.converters, generalStore_copy, recordStore_copy, structure_elements_path + [element.get_name()], - converters_path + [converter.name]) + converters_path + [converter.name], + restricted_path[1:] if restricted_path is not None else None) + + if restricted_path and not path_found: + raise RuntimeError("A 'restricted_path' argument was given that is not contained in " + "the data tree") # if the crawler is running out of scope, copy all records in # the recordStore, that were created in this scope # to the general update container. @@ -1250,6 +1253,9 @@ def crawler_main(crawled_directory_path: str, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, unique_names=True, + restricted_path: Optional[list[str]] = None, + remove_prefix: Optional[str] = None, + add_prefix: Optional[str] = None, ): """ @@ -1268,11 +1274,18 @@ def crawler_main(crawled_directory_path: str, dry_run : bool do not commit any chnages to the server prefix : str - remove the given prefix from file paths + DEPRECATED, remove the given prefix from file paths securityMode : int securityMode of Crawler unique_names : bool whether or not to update or insert entities inspite of name conflicts + restricted_path: optional, list of strings + Traverse the data tree only along the given path. When the end of the given path + is reached, traverse the full tree as normal. 
+ remove_prefix : Optional[str] + remove the given prefix from file paths + add_prefix : Optional[str] + add the given prefix to file paths Returns ------- @@ -1281,19 +1294,27 @@ """ crawler = Crawler(debug=debug, securityMode=securityMode) try: - crawler.crawl_directory(crawled_directory_path, cfood_file_name) + crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path) except ConverterValidationError as err: - print(err) + logger.error(err) return 1 - if provenance_file is not None: + if provenance_file is not None and debug: crawler.save_debug_data(provenance_file) if identifiables_definition_file is not None: - ident = CaosDBIdentifiableAdapter() ident.load_from_yaml_definition(identifiables_definition_file) crawler.identifiableAdapter = ident + if prefix != "": + warnings.warn(DeprecationWarning("The prefix argument is deprecated and will be removed " + "in the future. Please use `remove_prefix` instead.")) + if remove_prefix is not None: + raise ValueError("Please do not supply the (deprecated) `prefix` and the " + "`remove_prefix` argument at the same time. Only use " + "`remove_prefix` instead.") + remove_prefix = prefix + if dry_run: ins, upd = crawler.synchronize(commit_changes=False) inserts = [str(i) for i in ins] @@ -1308,11 +1329,15 @@ if isinstance(elem, db.File): # correct the file path: # elem.file = os.path.join(args.path, elem.file) - if prefix is None: - raise RuntimeError( - "No prefix set. Prefix must be set if files are used.") - if elem.path.startswith(prefix): - elem.path = elem.path[len(prefix):] + if remove_prefix: + if elem.path.startswith(remove_prefix): + elem.path = elem.path[len(remove_prefix):] + else: + raise RuntimeError("Prefix shall be removed from file path but the path " + "does not start with the prefix:" + f"\n{remove_prefix}\n{elem.path}") + if add_prefix: + elem.path = add_prefix + elem.path elem.file = None # TODO: as long as the new file backend is not finished # we are using the loadFiles function to insert symlinks. @@ -1346,6 +1371,15 @@ formatter_class=RawTextHelpFormatter) parser.add_argument("cfood_file_name", help="Path name of the cfood yaml file to be used.") + mg = parser.add_mutually_exclusive_group() + mg.add_argument("-r", "--restrict", nargs="*", + help="Restrict the crawling to the subtree at the end of the given path. " + "I.e. for each level that is given the crawler only treats the element " + "with the given name.") + mg.add_argument("--restrict-path", help="same as restrict; instead of a list, this takes a " + "single string that is interpreted as a file system path. Note that a trailing " + "separator (e.g. '/') will be ignored. Use --restrict if you need to have " + "empty strings.") parser.add_argument("--provenance", required=False, help="Path name of the provenance yaml file. " "This file will only be generated if this option is set.") @@ -1371,18 +1405,35 @@ parser.add_argument("-u", "--unique-names", help="Insert or updates entities even if name conflicts exist.") parser.add_argument("-p", "--prefix", - help="Remove the given prefix from the paths " - "of all file objects.") + help="DEPRECATED, use --remove-prefix instead.
Remove the given prefix " + "from the paths of all file objects.") + parser.add_argument("--remove-prefix", + help="Remove the given prefix from the paths of all file objects.") + parser.add_argument("--add-prefix", + help="Add the given prefix to the paths of all file objects.") return parser.parse_args() +def split_restricted_path(path): + elements = [] + while path != "/": + path, el = os.path.split(path) + if el != "": + elements.insert(0, el) + return elements + + def main(): args = parse_args() conlogger = logging.getLogger("connection") conlogger.setLevel(level=logging.ERROR) + if args.prefix: + print("Please use '--remove-prefix' option instead of '--prefix' or '-p'.") + return -1 + # logging config for local execution logger.addHandler(logging.StreamHandler(sys.stdout)) if args.debug: @@ -1392,6 +1443,12 @@ def main(): if args.add_cwd_to_path: sys.path.append(os.path.abspath(".")) + restricted_path = None + if args.restrict_path: + restricted_path = split_restricted_path(args.restrict_path) + if args.restrict: + restricted_path = args.restrict + sys.exit(crawler_main( crawled_directory_path=args.crawled_directory_path, cfood_file_name=args.cfood_file_name, @@ -1399,11 +1456,13 @@ def main(): debug=args.debug, provenance_file=args.provenance, dry_run=args.dry_run, - prefix=args.prefix, securityMode={"retrieve": SecurityMode.RETRIEVE, "insert": SecurityMode.INSERT, "update": SecurityMode.UPDATE}[args.security_mode], unique_names=args.unique_names, + restricted_path=restricted_path, + remove_prefix=args.remove_prefix, + add_prefix=args.add_prefix, )) diff --git a/src/caoscrawler/default_converters.yml b/src/caoscrawler/default_converters.yml new file mode 100644 index 0000000000000000000000000000000000000000..e192ab1b3bae70a6772cf6defba4a4592a92e584 --- /dev/null +++ b/src/caoscrawler/default_converters.yml @@ -0,0 +1,86 @@ +# ------------------------- +# Base Types +# ------------------------- + +BooleanElement: + converter: BooleanElementConverter + package: caoscrawler.converters +Date: + converter: DateElementConverter + package: caoscrawler.converters +Dict: + converter: DictElementConverter + package: caoscrawler.converters +FloatElement: + converter: FloatElementConverter + package: caoscrawler.converters +IntegerElement: + converter: IntegerElementConverter + package: caoscrawler.converters +ListElement: + converter: ListElementConverter + package: caoscrawler.converters +TextElement: + converter: TextElementConverter + package: caoscrawler.converters + + +DictDictElement: # deprecated + converter: DictElementConverter + package: caoscrawler.converters +DictElement: # deprecated + converter: DictElementConverter + package: caoscrawler.converters +DictBooleanElement: # deprecated + converter: BooleanElementConverter + package: caoscrawler.converters +DictFloatElement: # deprecated + converter: FloatElementConverter + package: caoscrawler.converters +DictIntegerElement: # deprecated + converter: IntegerElementConverter + package: caoscrawler.converters +DictListElement: # deprecated + converter: ListElementConverter + package: caoscrawler.converters +DictTextElement: # deprecated + converter: TextElementConverter + package: caoscrawler.converters + +# ------------------------- +# Directories and Files +# ------------------------- + +Directory: + converter: DirectoryConverter + package: caoscrawler.converters + + +File: # deprecated + converter: SimpleFileConverter + package: caoscrawler.converters + + +SimpleFile: + converter: SimpleFileConverter + package: 
caoscrawler.converters + +MarkdownFile: + converter: MarkdownFileConverter + package: caoscrawler.converters + +YAMLFile: + converter: YAMLFileConverter + package: caoscrawler.converters + +JSONFile: + converter: JSONFileConverter + package: caoscrawler.converters + +CSVTableConverter: + converter: CSVTableConverter + package: caoscrawler.converters + +XLSXTableConverter: + converter: XLSXTableConverter + package: caoscrawler.converters diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 7ff7172576be08e068ba412f319b059fb349bbeb..eda113d8fc0c5fc64a620ef7540dec4004401aef 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -25,6 +25,9 @@ from datetime import datetime import json from hashlib import sha256 from typing import Union +import logging + +logger = logging.getLogger(__name__) class Identifiable(): @@ -62,6 +65,8 @@ self.path = path self.record_type = record_type self.name = name + if name == "": + self.name = None self.properties: dict = {} if properties is not None: self.properties = properties diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 40c801547a85afaf32e1ab6a668bc47d98d60b66..c410159de4364e9b0299a84a4cbc687f773d35c0 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -33,6 +33,7 @@ import caosdb as db import logging from abc import abstractmethod, ABCMeta from .utils import has_parent + logger = logging.getLogger(__name__) @@ -447,7 +448,7 @@ def get_file(self, identifiable: Identifiable): if identifiable.path is None: raise RuntimeError("Path must not be None for File retrieval.") - candidates = db.execute_query("FIND File which is stored at {}".format( + candidates = db.execute_query("FIND File which is stored at '{}'".format( identifiable.path)) if len(candidates) > 1: raise RuntimeError("Identifiable was not defined unambigiously.") diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md index 1f6e15d408e10e38bce0d9b9fe9b6197ec69bfc3..952a8c94a7dfa24110f320f5dd32b0ad2ac1df01 100644 --- a/src/doc/README_SETUP.md +++ b/src/doc/README_SETUP.md @@ -1,63 +1,10 @@ # Getting started with the CaosDB Crawler # -## Installation ## - -### How to install ### - -#### Linux #### - -Make sure that Python (at least version 3.8) and pip is installed, using your system tools and -documentation. - -Then open a terminal and continue in the [Generic installation](#generic-installation) section. - -#### Windows #### - -If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download -for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of all -packages you will ever need out of the box. If you prefer, you may also install the leaner -"Miniconda" installer, which allows you to install packages as you need them. - -After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic -installation](#generic-installation) section. - -#### MacOS #### - -If there is no Python 3 installed yet, there are two main ways to -obtain it: Either get the binary package from -[python.org](https://www.python.org/downloads/) or, for advanced -users, install via [Homebrew](https://brew.sh/).
After installation -from python.org, it is recommended to also update the TLS certificates -for Python (this requires administrator rights for your user): - -```sh -# Replace this with your Python version number: -cd /Applications/Python\ 3.9/ - -# This needs administrator rights: -sudo ./Install\ Certificates.command -``` - -After these steps, you may continue with the [Generic -installation](#generic-installation). - -#### Generic installation #### - ---- - -Obtain the sources from GitLab and install from there (`git` must be installed for -this option): - -```sh -git clone https://gitlab.com/caosdb/caosdb-crawler -cd caosdb-crawler -pip3 install --user . -``` - -**Note**: In the near future, this package will also be made available on PyPi. - +## Installation +see INSTALL.md ## Run Unit Tests +Run `pytest unittests`. ## Documentation ## We use sphinx to create the documentation. Docstrings in the code should comply diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst index 37f6a8c7d3be9298ec965c50a4ec29110988ddc6..6564ee677f0b363a52c44dd5ceabe5378c255105 100644 --- a/src/doc/cfood.rst +++ b/src/doc/cfood.rst @@ -149,6 +149,44 @@ create lists or multi properties instead of single values: .. code-block:: yaml Experiment1: - Measurement: +Measurement <- Element in List (list is cleared before run) - *Measurement <- Multi Property (properties are removed before run) - Measurement <- Overwrite + Measurement: +Measurement # Element in List (list is cleared before run) + *Measurement # Multi Property (properties are removed before run) + Measurement # Overwrite + + +File Entities +------------- + +In order to use File Entities, you must set the appropriate ``role: File``. +Additionally, the path and file keys have to be given, with values that set the +paths remotely and locally, respectively. You can use the variable +``<converter name>.path`` that is automatically created by converters that deal +with file system related StructureElements. The file object itself is stored +in a variable with the same name (as is the case for other Records). + + +.. code-block:: yaml + + somefile: + type: SimpleFile + match: ^params.*$ # match any file that starts with "params" + records: + fileEntity: + role: File # necessary to create a File Entity + path: ${somefile.path} # defines the path in CaosDB + file: ${somefile.path} # path where the file is found locally + SomeRecord: + ParameterFile: $fileEntity # creates a reference to the file + +Automatically generated keys +++++++++++++++++++++++++++++ + +Some variable names are automatically generated and can be used using the +``$<variable name>`` syntax. Those include: + +- ``<converter name>``: access the path of structure element names leading to the current element +- ``<converter name>.path``: the file system path to the structure element + (file system related converters only; you need curly brackets to use them: + ``${<converter name>.path}``) +- ``<Record key>``: all entities that are created in the ``records`` section + are available under the same key diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 89757f21958f3d94649b33e9f9112593f703191d..0881d9302b621d6b47575e171dd9e8c144e29cd4 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -1,6 +1,10 @@ Concepts )))))))) +The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is a +directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB +Crawler works.
+ Structure Elements ++++++++++++++++++ diff --git a/src/doc/conf.py b/src/doc/conf.py index b8d055abe682efcb17f960cdaabca3de4d25a16d..7719a920328c46b4453cd59413b939fcf2d45f5a 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -33,10 +33,10 @@ copyright = '2021, MPIDS' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.3.0' +version = '0.4.0' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.3.0' +release = '0.4.0' # -- General configuration --------------------------------------------------- diff --git a/src/doc/converters.rst b/src/doc/converters.rst index b4ba89ced3b5858ca2f8abe7bc724d6710d9203b..95676627d95a5cd6bbca5208b67f9689fffb6806 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -77,7 +77,7 @@ Reads a YAML header from Markdown files (if such a header exists) and creates children elements according to the structure of the header. DictElement Converter -============== +===================== Creates a child StructureElement for each key in the dictionary. Typical Subtree converters diff --git a/src/doc/getting_started/INSTALL.md b/src/doc/getting_started/INSTALL.md new file mode 120000 index 0000000000000000000000000000000000000000..95b6037c7ab329d91e3a8ed4a2b31eba675eef62 --- /dev/null +++ b/src/doc/getting_started/INSTALL.md @@ -0,0 +1 @@ +../../../INSTALL.md \ No newline at end of file diff --git a/src/doc/getting_started/helloworld.rst b/src/doc/getting_started/helloworld.rst new file mode 100644 index 0000000000000000000000000000000000000000..ef4a1398322b59d7983b7dff384534cfa501b660 --- /dev/null +++ b/src/doc/getting_started/helloworld.rst @@ -0,0 +1,5 @@ + +Prerequisites +))))))))))))) + +TODO Describe the smallest possible crawler run diff --git a/src/doc/getting_started/index.rst b/src/doc/getting_started/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..74ffa7daeff393d05605e1066a5985984c2e9751 --- /dev/null +++ b/src/doc/getting_started/index.rst @@ -0,0 +1,15 @@ +Getting Started ++++++++++++++++ + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + :hidden: + + Installation<INSTALL> + prerequisites + helloworld + +This section will help you get going! From the first installation steps to the first simple crawl. + +Let's go! diff --git a/src/doc/getting_started/prerequisites.rst b/src/doc/getting_started/prerequisites.rst new file mode 100644 index 0000000000000000000000000000000000000000..dc8022b6cad99a8508f19f47dc01c601fb676c5b --- /dev/null +++ b/src/doc/getting_started/prerequisites.rst @@ -0,0 +1,6 @@ + +Prerequisites +))))))))))))) + +TODO Describe what you need to actually do a crawler run: data, CaosDB, ... 
+ diff --git a/src/doc/index.rst b/src/doc/index.rst index b4e30e4728068cabb92626cfac986ab858a0bbb6..d319bf4d24a05a3033b1ae5bbf80433c5ef3646b 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -7,12 +7,12 @@ CaosDB-Crawler Documentation :caption: Contents: :hidden: - Getting started<README_SETUP> + Getting started<getting_started/index> + Tutorials<tutorials/index> Concepts<concepts> Converters<converters> CFoods (Crawler Definitions)<cfood> Macros<macros> - Tutorials<tutorials/index> How to upgrade<how-to-upgrade> API documentation<_apidoc/modules> diff --git a/src/doc/macros.rst b/src/doc/macros.rst index d3a3e9b9634a4e1d72228dd46692a824e1d5acfd..7685731d35afab51074bb4d12c51ede0a7ba1b75 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -195,7 +195,7 @@ The example will be expanded to: Limitation ----------- +========== Currently it is not possible to use the same macro twice in the same yaml node, but in different positions. Consider: diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst index 88d598ece284e1aad315a1e0fcae3fdf494b3aad..02371de196cc139776416882aff31bd6fa4dabbe 100644 --- a/src/doc/tutorials/index.rst +++ b/src/doc/tutorials/index.rst @@ -1,9 +1,11 @@ Tutorials +++++++++ +This chapter contains a collection of tutorials. + .. toctree:: :maxdepth: 2 :caption: Contents: - :hidden: Example CFood<example> + diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml index 74fd027563907c5ae416ca389faba0ecd64d5848..dce219b751c3e980662a1eaa4904e1163d9836a0 100644 --- a/unittests/scifolder_cfood.yml +++ b/unittests/scifolder_cfood.yml @@ -22,7 +22,7 @@ Data: # name of the converter parents: - Project # not needed as the name is equivalent date: $date - identifier: $identifier + identifier: ${identifier} subtree: measurement: # new name for folders on the 3rd level diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 5942b1e124ebd1228a619ed7a1024738c70ee0aa..4d3791fce3ceffaafe529423e4020ebd6a4231ba 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -25,15 +25,18 @@ test the converters module """ import json import yaml +import logging +import sys import importlib import os from itertools import product +import datetime import pytest import yaml from caoscrawler.converters import (Converter, ConverterValidationError, DictElementConverter, DirectoryConverter, DictIntegerElementConverter, - handle_value, MarkdownFileConverter, + handle_value, MarkdownFileConverter, DateElementConverter, FloatElementConverter, IntegerElementConverter, JSONFileConverter, YAMLFileConverter) from caoscrawler.converters import _AbstractScalarValueElementConverter @@ -55,6 +58,9 @@ def converter_registry(): "MarkdownFile": { "converter": "MarkdownFileConverter", "package": "caoscrawler.converters"}, + "Date": { + "converter": "DateElementConverter", + "package": "caoscrawler.converters"}, "DictElement": { "converter": "DictElementConverter", "package": "caoscrawler.converters"}, @@ -64,9 +70,6 @@ def converter_registry(): "ListElement": { "converter": "ListElementConverter", "package": "caoscrawler.converters"}, - "TextElement": { - "converter": "TextElementConverter", - "package": "caoscrawler.converters"}, "JSONFile": { "converter": "JSONFileConverter", "package": "caoscrawler.converters"}, @@ -127,14 +130,11 @@ def test_markdown_converter(converter_registry): ) ) - converter = MarkdownFileConverter({ - "match": "(.*)" - }, "TestMarkdownFileConverter", - converter_registry) + converter = 
MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter", + converter_registry) - m = converter.match(File("test_tool.py", rfp( - "test_tool.py"))) - assert m is None + with pytest.raises(ConverterValidationError) as err: + converter.create_children(None, File("test_tool.py", rfp("test_tool.py"))) m = converter.match(test_readme) assert m is not None @@ -370,7 +370,6 @@ def test_filter_children_of_directory(converter_registry, capsys): dc = DirectoryConverter( definition={ "match": "(.*)", - "debug_match": True, "filter": { "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", "group": "date", @@ -383,14 +382,6 @@ def test_filter_children_of_directory(converter_registry, capsys): m = dc.match(test_dir) assert m is not None - # checking debug output - captured = capsys.readouterr() - # the name - assert "examples_filter_children" in captured.out - # the regexp - assert "(.*)" in captured.out - # the empty result set - assert "{}" in captured.out # This should only contain the youngest json and the csv that doesn't match # the above filter expression. @@ -540,7 +531,8 @@ def test_converter_value_match(converter_registry): assert m is not None -def test_match_debug(converter_registry, capsys): +def test_match_debug(converter_registry, caplog): + caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") for m, mn, mv in product([".*", None], [".*", None], [".*", None]): defi = {"debug_match": True} if m: @@ -562,11 +554,70 @@ def test_match_debug(converter_registry, capsys): mtch = dc.match(IntegerElement(name="a", value=4)) if not (m is None and mn is None and mv is None): assert mtch is not None - # checking debug output - captured = capsys.readouterr() # the name - assert "a" in captured.out + assert "a" in caplog.text # the regexp - assert ".*" in captured.out + assert ".*" in caplog.text # the empty result set - assert "{}" in captured.out + assert "{}" in caplog.text + caplog.clear() + + +def test_date_converter(): + dictconverter = DateElementConverter( + definition={"match_value": "(?P<date>.*)"}, + name="conv", + converter_registry=converter_registry) + matches = dictconverter.match(TextElement("text", "2022-11-11")) + assert "date" in matches + assert isinstance(matches["date"], datetime.date) + assert matches["date"].year == 2022 + + dictconverter = DateElementConverter( + definition={"match_value": r"(?P<date>(\d|-)+)", + "date_format": "%y-%m-%d"}, + name="conv", + converter_registry=converter_registry) + matches = dictconverter.match(TextElement("text", "22-11-11")) + assert "date" in matches + assert isinstance(matches["date"], datetime.date) + assert matches["date"].year == 2022 + + matches = dictconverter.match(TextElement("text", "alve")) + assert matches is None + + +def test_load_converters(): + c = Crawler() + converter_registry = c.load_converters({}) + # The previous function call actually already asserts that all defined + # converter classes can be loaded from their respective packages. + + # Please adapt, if defaults change! + assert len(converter_registry) == 22 + + # All of them are contained in caoscrawler.converters + for conv_key, conv in converter_registry.items(): + assert conv["package"] == "caoscrawler.converters" + # ... 
and their names all end in "Converter" + assert conv["converter"].endswith("Converter") + + # Some checks: + assert "CSVTableConverter" in converter_registry + assert "SimpleFile" in converter_registry + assert "Directory" in converter_registry + assert "ListElement" in converter_registry + + +def test_create_path_value(converter_registry): + """ test whether the variable containing the path is added to the general store""" + dc = Converter.converter_factory( + definition={ + "type": "Directory", + "match": ".*" + }, + name="Test", converter_registry=converter_registry) + values = GeneralStore() + dc.create_values(values, Directory("a", "/a")) + assert "Test.path" in values + assert values["Test.path"] == "/a" diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index 1bf8f0b7d67f00f2018b5b68424d6b9cc17602eb..ac408b2dab0fa151c370d3ec6ffd1dced22c77d7 100644 --- a/unittests/test_scalars_cfood.py +++ b/unittests/test_scalars_cfood.py @@ -42,16 +42,23 @@ def test_record_structure_generation(crawler): subd = crawler.debug_tree[dircheckstr("DataAnalysis")] assert len(subd) == 2 # variables store on Data Analysis node of debug tree - assert len(subd[0]) == 3 - assert "Data" in subd[0] - assert "DataAnalysis" in subd[0] - assert "RecordThatGetsParentsLater" in subd[0] + if "Data" in subd[0]: + subddata = subd[0] + subdRTGPL = subd[1] + else: + subddata = subd[1] + subdRTGPL = subd[0] + assert len(subddata) == 5 + assert "DataAnalysis" in subddata + assert "DataAnalysis.path" in subddata + assert "Data.path" in subddata + assert "RecordThatGetsParentsLater" in subddata - prop = subd[0]["RecordThatGetsParentsLater"].get_property("someId") + prop = subddata["RecordThatGetsParentsLater"].get_property("someId") assert type(prop.value) == int assert prop.value == 23 # record store on Data Analysis node of debug tree - assert len(subd[1]) == 1 - prop2 = subd[1]["RecordThatGetsParentsLater"].get_property("someId") + assert len(subdRTGPL) == 1 + prop2 = subdRTGPL["RecordThatGetsParentsLater"].get_property("someId") assert prop == prop2 diff --git a/unittests/test_tool.py b/unittests/test_tool.py index 6a828532c1de9796008a6e51c21811f83b85657a..e15d7cb777ced4b92566df2b25b375e90be39295 100755 --- a/unittests/test_tool.py +++ b/unittests/test_tool.py @@ -25,11 +25,13 @@ Tests for the tool using pytest Adapted from check-sfs """ +import logging +from caoscrawler.stores import GeneralStore, RecordStore import os -from caoscrawler.crawl import Crawler, SecurityMode +from caoscrawler.crawl import Crawler, SecurityMode, split_restricted_path from caoscrawler.identifiable import Identifiable -from caoscrawler.structure_elements import File, DictTextElement, DictListElement +from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter from simulated_server_data import full_data from functools import partial @@ -108,15 +110,17 @@ def ident(crawler): def test_record_structure_generation(crawler): + # TODO How does this test relate to the test function in test_scalars_cfood with the same name? 
+    # There seems to be code duplication
     subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
     subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")]
     assert len(subd) == 2
     # variables store on Data Analysis node of debug tree
-    assert len(subd[0]) == 2
+    assert len(subd[0]) == 4
     # record store on Data Analysis node of debug tree
     assert len(subd[1]) == 0
     assert len(subc) == 2
-    assert len(subc[0]) == 2
+    assert len(subc[0]) == 4
     assert len(subc[1]) == 0

     # The data analysis node creates one variable for the node itself:
@@ -135,7 +139,7 @@ def test_record_structure_generation(crawler):
     assert subd[1]["Project"].get_property(
         "identifier").value == "climate-model-predict"

-    assert len(subd[0]) == 6
+    assert len(subd[0]) == 9
     assert subd[0]["date"] == "2020"
     assert subd[0]["identifier"] == "climate-model-predict"
     assert subd[0]["Project"].__class__ == db.Record
@@ -146,7 +150,7 @@ def test_record_structure_generation(crawler):
     assert subc[0]["project_dir"] is False

     # Check the copy flags for the first level in the hierarchy:
-    assert len(subc[0]) == 6
+    assert len(subc[0]) == 9
     assert len(subc[1]) == 1
     assert subc[1]["Project"] is False
     assert subc[0]["Project"] is False
@@ -159,7 +163,7 @@ def test_record_structure_generation(crawler):
     subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis",
                                                         "2020_climate-model-predict",
                                                         "2020-02-08_prediction-errors")]

-    assert len(subd[0]) == 8
+    assert len(subd[0]) == 12
     assert subd[0]["date"] == "2020-02-08"
     assert subd[0]["identifier"] == "prediction-errors"
     assert subd[0]["Project"].__class__ == db.Record
@@ -779,7 +783,8 @@ def crawler_mocked_for_backref_test(crawler):
     return crawler


-def test_validation_error_print(capsys):
+def test_validation_error_print(caplog):
+    caplog.set_level(logging.DEBUG, logger="caoscrawler.converters")
     # there should be no server interaction since we only test the behavior if a validation error
     # occurs during the data collection stage
     DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation")
@@ -789,10 +794,9 @@
         os.path.join(DATADIR, "identifiables.yml"),
         True,
         None,
-        False,
-        "/use_case_simple_presentation")
-    captured = capsys.readouterr()
-    assert "Couldn't validate" in captured.out
+        False)
+    assert "Couldn't validate" in caplog.text
+    caplog.clear()


 def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test):
@@ -867,3 +871,131 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
     assert len(update) == 2
     assert len(insert) == 1
+
+
+def mock_create_values(values, element):
+    pass
+
+
+@patch("caoscrawler.converters.IntegerElementConverter.create_values")
+def test_restricted_path(create_mock):
+    """
+    The restricted_path argument allows ignoring parts of the crawled data structure. Here, we
+    make sure that, if that argument is provided, indeed only the given path of the tree is
+    traversed.
+
+    The check is done using the mock of the create_values function of the IntegerElementConverter.
+    This function is only called if elements are being treated.
+ """ + crawler_definition = { + "DictTest": { + "type": "DictElement", + "match": "(.*)", + "subtree": { + "nextdict": { + "type": "DictElement", + "match": "(.*)", + "subtree": { + "int_element": { + "type": "IntegerElement", + "match_name": ".*", + "match_value": "(?P<int_value>.*)", + "records": { + "Dataset": { + "Subject": "$int_value" + } + } + } + } + } + } + } + } + + crawler = Crawler(debug=True) + converter_registry = crawler.load_converters(crawler_definition) + + # This structure is crawled + test_dict = { + "v1": { + "a": 1, + "b": 2, + }, + "v2": { + "c": 3, + "d": 4, + } + } + # first test without a restricted_path + restricted_path = None + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 4 + create_mock.reset_mock() + + # test with a restricted_path but one that has no effect (single root element) + # this also tests that the remainder of the tree is fully traversed + restricted_path = ["TestDict"] + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 4 + create_mock.reset_mock() + + # test with a restricted_path that restricts the tree (single root element) + restricted_path = ["TestDict", "v2"] + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + assert create_mock.call_count == 2 + create_mock.reset_mock() + + # test with a restricted_path that contains a bad element + restricted_path = ["TestDict", "v3"] + with raises(RuntimeError): + records = crawler.start_crawling( + DictElement("TestDict", test_dict), crawler_definition, converter_registry, + restricted_path + ) + + +def test_split_restricted_path(): + assert ["el"] == split_restricted_path("/el") + assert ["el"] == split_restricted_path("/el/") + assert ["el", "el"] == split_restricted_path("/el/el") + + +def test_deprecated_prefix_option(): + """Test that calling the crawler's main function with the deprecated + `prefix` option raises the correct errors and warnings. + + """ + + with pytest.deprecated_call(): + crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed") + + with raises(ValueError) as ve: + crawler_main("./", rfp("scifolder_cfood.yml"), prefix="to/be/removed", + remove_prefix="to/be/removed") + + assert "(deprecated) `prefix` and the `remove_prefix`" in str(ve.value) + + +def test_create_entity_summary(): + assert "" == Crawler.create_entity_summary([]).strip() + + entities = [ + db.Record(id=1).add_parent("A"), + db.Record(id=4, name='a').add_parent("B"), + db.Record(id=5).add_parent("A"), + db.Record(id=6, name='b').add_parent("B"), + ] + text = Crawler.create_entity_summary(entities).strip() + assert 'a' in text + assert 'b' in text + assert 'A:' in text + assert 'B:' in text + assert "<a href='/Entity/4'>a</a>, <a href='/Entity/6'>b</a>" in text