diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6aa83aca68ca2c2daa97c88784aec4987a817605..0c68188c6cc140fa49c6cb4b8f1f58189b45f8c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,12 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - DateElementConverter: allows to interpret text as a date object
 - the restricted_path argument allows to crawl only a subtree
 - logging that provides a summary of what is inserted and updated
+- You can now access the file system path of a structure element (if it has one) using the variable
+  name ``<converter name>.path``
 
 ### Changed ###
 
 - The definitions for the default converters were removed from crawl.py and placed into a separate
   yaml file called `default_converters.yml`. There is a new test testing for the correct loading
   behavior of that file.
+- JSONFileConverter, YAMLFileConverter and MarkdownFileConverter now inherit from
+  SimpleFileConverter. Behavior is unchanged, except that the MarkdownFileConverter now raises a
+  ConverterValidationError when the YAML header cannot be read instead of silently not matching.
 
 ### Deprecated ###
 
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba220626460c559aeded69d360c85917e0c78066
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,57 @@
+# Installation #
+
+
+## Linux ##
+
+Make sure that Python (at least version 3.8) and pip are installed, using your system tools and
+documentation.
+
+Then open a terminal and continue in the [Generic installation](#generic-installation) section.
+
+## Windows ##
+
+If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download
+for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most
+packages you will ever need out of the box. If you prefer, you may also install the leaner
+"Miniconda" installer, which allows you to install packages as you need them.
+
+After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic
+installation](#generic-installation) section.
+
+## MacOS ##
+
+If there is no Python 3 installed yet, there are two main ways to
+obtain it: either get the binary package from
+[python.org](https://www.python.org/downloads/) or, for advanced
+users, install via [Homebrew](https://brew.sh/). After installation
+from python.org, it is recommended to also update the TLS certificates
+for Python (this requires administrator rights for your user):
+
+```sh
+# Replace this with your Python version number:
+cd /Applications/Python\ 3.9/
+
+# This needs administrator rights:
+sudo ./Install\ Certificates.command
+```
+
+After these steps, you may continue with the [Generic
+installation](#generic-installation).
+
+## Generic installation ##
+
+The CaosDB crawler is available as a [PyPi
+package](https://pypi.org/project/caoscrawler/) and can simply be installed by
+
+```sh
+pip3 install caoscrawler
+```
+
+Alternatively, obtain the sources from GitLab and install from there (`git` must
+be installed for this option):
+
+```sh
+git clone https://gitlab.com/caosdb/caosdb-crawler
+cd caosdb-crawler
+pip3 install --user .
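+
+# Optional, for development work: an editable install keeps the installed
+# package in sync with this source checkout.
+# pip3 install --user -e .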
+```
diff --git a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml
index 6495e1828dc56e99459c162f7751951f880ea55c..c55be2157a1f079ecfb5809c3658586f9114fad1 100644
--- a/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml
+++ b/integrationtests/test_data/extroot/use_case_simple_presentation/cfood.yml
@@ -25,8 +25,8 @@ extroot:
             parents:
             - mdfile
             role: File
-            path: $DataFile
-            file: $DataFile
+            path: ${DataFile.path}
+            file: ${DataFile.path}
 
           Experiment:
             mdfile: $mdfile
@@ -68,8 +68,8 @@ extroot:
             parents:
             - mdfile
             role: File
-            path: $DataFile
-            file: $DataFile
+            path: ${DataFile.path}
+            file: ${DataFile.path}
 
           Experiment: {}
 
diff --git a/integrationtests/test_use_case_simple_presentation.py b/integrationtests/test_use_case_simple_presentation.py
index 91c523be90a4d0117a7cc54217cae0b911511957..463253b6a85cdbc95088a0fa3f64c831459e5b9e 100644
--- a/integrationtests/test_use_case_simple_presentation.py
+++ b/integrationtests/test_use_case_simple_presentation.py
@@ -63,7 +63,7 @@ def test_complete_crawler(
         True,
         os.path.join(DATADIR, "provenance.yml"),
         False,
-        "/use_case_simple_presentation")
+        os.path.abspath(DATADIR))
 
     res = db.execute_query("FIND Record Experiment")
     assert len(res) == 1
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index ed48c130c578734a1757eb59b6778814085a8bf4..1962737dddbe71869846bcd40ecd8b0905ef0907 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -56,6 +56,10 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
 logger = logging.getLogger(__name__)
 
 
+class CrawlerTemplate(Template):
+    braceidpattern = r"(?a:[_a-z][_\.a-z0-9]*)"
+
+
 def _only_max(children_with_keys):
 
     return [max(children_with_keys, key=lambda x: x[1])[0]]
@@ -110,6 +114,19 @@ class ConverterValidationError(Exception):
         self.message = msg
 
 
+def create_path_value(func):
+    """Decorator for create_values functions that adds a value containing the path.
+
+    It should be used for StructureElements that are associated with file system objects which
+    have a path, like File or Directory.
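+
+    Example (this is how the file system converters below apply it)::
+
+        @create_path_value
+        def create_values(self, values: GeneralStore, element: StructureElement):
+            super().create_values(values=values, element=element)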
+    """
+
+    def inner(self, values: GeneralStore, element: StructureElement):
+        func(self, values=values, element=element)
+        values.update({self.name + ".path": element.path})
+    return inner
+
+
 def replace_variables(propvalue, values: GeneralStore):
     """
     This function replaces variables in property values (and possibly other locations,
@@ -133,7 +150,7 @@ def replace_variables(propvalue, values: GeneralStore):
         if isinstance(values[varname], db.Entity):
             return values[varname]
 
-    propvalue_template = Template(propvalue)
+    propvalue_template = CrawlerTemplate(propvalue)
     return propvalue_template.safe_substitute(**values.get_storage())
 
 
@@ -241,7 +258,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict
                 continue
 
             # Allow replacing variables in keys / names of properties:
-            key_template = Template(key)
+            key_template = CrawlerTemplate(key)
             key = key_template.safe_substitute(**values.get_storage())
             keys_modified.append((name, key))
 
@@ -478,6 +495,10 @@ class DirectoryConverter(Converter):
 
         return children
 
+    @create_path_value
+    def create_values(self, values: GeneralStore, element: StructureElement):
+        super().create_values(values=values, element=element)
+
     def typecheck(self, element: StructureElement):
         return isinstance(element, Directory)
 
@@ -525,6 +546,10 @@ class SimpleFileConverter(Converter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
         return list()
 
+    @create_path_value
+    def create_values(self, values: GeneralStore, element: StructureElement):
+        super().create_values(values=values, element=element)
+
     @Converter.debug_matching("name")
     def match(self, element: StructureElement):
         # TODO: See comment on types and inheritance
@@ -543,7 +568,7 @@ class FileConverter(SimpleFileConverter):
         super().__init__(*args, **kwargs)
 
 
-class MarkdownFileConverter(Converter):
+class MarkdownFileConverter(SimpleFileConverter):
     """
     reads the yaml header of markdown files (if a such a header exists).
     """
@@ -553,8 +578,18 @@ class MarkdownFileConverter(Converter):
         if not isinstance(element, File):
             raise RuntimeError("A markdown file is needed to create children.")
 
-        header = yaml_header_tools.get_header_from_file(
-            element.path, clean=False)
+        try:
+            header = yaml_header_tools.get_header_from_file(
+                element.path, clean=False)
+        except yaml_header_tools.NoValidHeader:
+            if generalStore is not None and self.name in generalStore:
+                path = generalStore[self.name]
+            else:
+                path = "<path not set>"
+            raise ConverterValidationError(
+                "Error during the validation (yaml header cannot be read) of the markdown file "
+                "located at the following node in the data structure:\n"
+                f"{path}")
         children: List[StructureElement] = []
 
         for name, entry in header.items():
@@ -567,25 +602,6 @@ class MarkdownFileConverter(Converter):
                     "Header entry {} has incompatible type.".format(name))
         return children
 
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, File)
-
-    @Converter.debug_matching("name")
-    def match(self, element: StructureElement):
-        # TODO: See comment on types and inheritance
-        if not isinstance(element, File):
-            raise RuntimeError("Element must be a file.")
-        m = re.match(self.definition["match"], element.name)
-        if m is None:
-            return None
-        try:
-            yaml_header_tools.get_header_from_file(element.path)
-        except yaml_header_tools.NoValidHeader:
-            # TODO(salexan): Raise a validation error instead of just not
-            # matching silently.
-            return None
-        return m.groupdict()
-
 
 def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None,
                           msg_prefix=""):
@@ -692,20 +708,7 @@ class DictDictElementConverter(DictElementConverter):
         super().__init__(*args, **kwargs)
 
 
-class JSONFileConverter(Converter):
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, File)
-
-    @Converter.debug_matching("name")
-    def match(self, element: StructureElement):
-        # TODO: See comment on types and inheritance
-        if not self.typecheck(element):
-            raise RuntimeError("Element must be a file")
-        m = re.match(self.definition["match"], element.name)
-        if m is None:
-            return None
-        return m.groupdict()
-
+class JSONFileConverter(SimpleFileConverter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
         # TODO: See comment on types and inheritance
         if not isinstance(element, File):
@@ -727,20 +730,7 @@ class JSONFileConverter(Converter):
         return [structure_element]
 
 
-class YAMLFileConverter(Converter):
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, File)
-
-    @Converter.debug_matching("name")
-    def match(self, element: StructureElement):
-        # TODO: See comment on types and inheritance
-        if not self.typecheck(element):
-            raise RuntimeError("Element must be a file")
-        m = re.match(self.definition["match"], element.name)
-        if m is None:
-            return None
-        return m.groupdict()
-
+class YAMLFileConverter(SimpleFileConverter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
         # TODO: See comment on types and inheritance
         if not isinstance(element, File):
diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md
index 1f6e15d408e10e38bce0d9b9fe9b6197ec69bfc3..952a8c94a7dfa24110f320f5dd32b0ad2ac1df01 100644
--- a/src/doc/README_SETUP.md
+++ b/src/doc/README_SETUP.md
@@ -1,63 +1,10 @@
 # Getting started with the CaosDB Crawler #
-## Installation ##
-
-### How to install ###
-
-#### Linux ####
-
-Make sure that Python (at least version 3.8) and pip is installed, using your system tools and
-documentation.
-
-Then open a terminal and continue in the [Generic installation](#generic-installation) section.
-
-#### Windows ####
-
-If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download
-for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of all
-packages you will ever need out of the box. If you prefer, you may also install the leaner
-"Miniconda" installer, which allows you to install packages as you need them.
-
-After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic
-installation](#generic-installation) section.
-
-#### MacOS ####
-
-If there is no Python 3 installed yet, there are two main ways to
-obtain it: Either get the binary package from
-[python.org](https://www.python.org/downloads/) or, for advanced
-users, install via [Homebrew](https://brew.sh/). After installation
-from python.org, it is recommended to also update the TLS certificates
-for Python (this requires administrator rights for your user):
-
-```sh
-# Replace this with your Python version number:
-cd /Applications/Python\ 3.9/
-
-# This needs administrator rights:
-sudo ./Install\ Certificates.command
-```
-
-After these steps, you may continue with the [Generic
-installation](#generic-installation).
-
-#### Generic installation ####
-
----
-
-Obtain the sources from GitLab and install from there (`git` must be installed for
-this option):
-
-```sh
-git clone https://gitlab.com/caosdb/caosdb-crawler
-cd caosdb-crawler
-pip3 install --user .
-```
-
-**Note**: In the near future, this package will also be made available on PyPi.
-
+## Installation ##
+See `INSTALL.md` in the repository root.
 
 ## Run Unit Tests
+Run `pytest unittests`.
 
 ## Documentation ##
 
 We use sphinx to create the documentation. Docstrings in the code should comply
diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst
index 37f6a8c7d3be9298ec965c50a4ec29110988ddc6..6564ee677f0b363a52c44dd5ceabe5378c255105 100644
--- a/src/doc/cfood.rst
+++ b/src/doc/cfood.rst
@@ -149,6 +149,44 @@ create lists or multi properties instead of single values:
 .. code-block:: yaml
 
     Experiment1:
-       Measurement: +Measurement <- Element in List (list is cleared before run)
-                     *Measurement <- Multi Property (properties are removed before run)
-                     Measurement <- Overwrite
+       Measurement: +Measurement  # Element in List (list is cleared before run)
+                    *Measurement  # Multi Property (properties are removed before run)
+                    Measurement  # Overwrite
+
+
+File Entities
+-------------
+
+In order to use File Entities, you must set the appropriate ``role: File``.
+Additionally, the ``path`` and ``file`` keys have to be given, with values that set the
+remote and local paths, respectively. You can use the variable
+``<converter name>.path`` that is automatically created by converters that deal
+with file system related StructureElements. The file object itself is stored
+in a variable with the same name (as is the case for other Records).
+
+
+.. code-block:: yaml
+
+  somefile:
+    type: SimpleFile
+    match: ^params.*$ # match any file that starts with "params"
+    records:
+      fileEntity:
+        role: File # necessary to create a File Entity
+        path: ${somefile.path} # defines the path in CaosDB
+        file: ${somefile.path} # path where the file is found locally
+      SomeRecord:
+        ParameterFile: $fileEntity # creates a reference to the file
+
+Automatically generated keys
+++++++++++++++++++++++++++++
+
+Some variable names are automatically generated and can be used with the
+``$<variable name>`` syntax. Those include:
+
+- ``<converter name>``: the path of converter names leading to the current converter
+- ``<converter name>.path``: the file system path to the structure element
+  (file system related converters only; you need curly brackets to use it:
+  ``${<converter name>.path}``)
+- ``<Record key>``: all entities that are created in the ``records`` section
+  are available under the same key
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index 89757f21958f3d94649b33e9f9112593f703191d..0881d9302b621d6b47575e171dd9e8c144e29cd4 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -1,6 +1,10 @@
 Concepts
 ))))))))
 
+The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is a
+directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB
+Crawler works.
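+
+For illustration, such a hierarchy could be a directory tree like the following (the names are
+taken from the test data used in the unit tests and are purely illustrative)::
+
+   DataAnalysis/
+      2020_climate-model-predict/
+         2020-02-08_prediction-errors/
+            README.md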
+
 Structure Elements
 ++++++++++++++++++
 
diff --git a/src/doc/getting_started/INSTALL.md b/src/doc/getting_started/INSTALL.md
new file mode 120000
index 0000000000000000000000000000000000000000..95b6037c7ab329d91e3a8ed4a2b31eba675eef62
--- /dev/null
+++ b/src/doc/getting_started/INSTALL.md
@@ -0,0 +1 @@
+../../../INSTALL.md
\ No newline at end of file
diff --git a/src/doc/getting_started/helloworld.rst b/src/doc/getting_started/helloworld.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ef4a1398322b59d7983b7dff384534cfa501b660
--- /dev/null
+++ b/src/doc/getting_started/helloworld.rst
@@ -0,0 +1,5 @@
+
+Hello World
+)))))))))))
+
+TODO Describe the smallest possible crawler run
diff --git a/src/doc/getting_started/index.rst b/src/doc/getting_started/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..74ffa7daeff393d05605e1066a5985984c2e9751
--- /dev/null
+++ b/src/doc/getting_started/index.rst
@@ -0,0 +1,15 @@
+Getting Started
++++++++++++++++
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+   :hidden:
+
+   Installation<INSTALL>
+   prerequisites
+   helloworld
+
+This section will help you get going, from the first installation steps to the first simple crawl.
+
+Let's go!
diff --git a/src/doc/getting_started/prerequisites.rst b/src/doc/getting_started/prerequisites.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc8022b6cad99a8508f19f47dc01c601fb676c5b
--- /dev/null
+++ b/src/doc/getting_started/prerequisites.rst
@@ -0,0 +1,6 @@
+
+Prerequisites
+)))))))))))))
+
+TODO Describe what you need to actually do a crawler run: data, CaosDB, ...
+
diff --git a/src/doc/index.rst b/src/doc/index.rst
index b4e30e4728068cabb92626cfac986ab858a0bbb6..d319bf4d24a05a3033b1ae5bbf80433c5ef3646b 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -7,12 +7,12 @@ CaosDB-Crawler Documentation
    :caption: Contents:
    :hidden:
 
-   Getting started<README_SETUP>
+   Getting started<getting_started/index>
+   Tutorials<tutorials/index>
    Concepts<concepts>
    Converters<converters>
    CFoods (Crawler Definitions)<cfood>
    Macros<macros>
-   Tutorials<tutorials/index>
    How to upgrade<how-to-upgrade>
    API documentation<_apidoc/modules>
 
diff --git a/src/doc/macros.rst b/src/doc/macros.rst
index d3a3e9b9634a4e1d72228dd46692a824e1d5acfd..7685731d35afab51074bb4d12c51ede0a7ba1b75 100644
--- a/src/doc/macros.rst
+++ b/src/doc/macros.rst
@@ -195,7 +195,7 @@ The example will be expanded to:
 
 
 Limitation
-----------
+==========
 
 Currently it is not possible to use the same macro twice in the same yaml node, but in different
 positions. Consider:
diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst
index 88d598ece284e1aad315a1e0fcae3fdf494b3aad..02371de196cc139776416882aff31bd6fa4dabbe 100644
--- a/src/doc/tutorials/index.rst
+++ b/src/doc/tutorials/index.rst
@@ -1,9 +1,11 @@
 Tutorials
 +++++++++
 
+This chapter contains a collection of tutorials.
+
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
-   :hidden:
 
    Example CFood<example>
+
diff --git a/unittests/scifolder_cfood.yml b/unittests/scifolder_cfood.yml
index 74fd027563907c5ae416ca389faba0ecd64d5848..dce219b751c3e980662a1eaa4904e1163d9836a0 100644
--- a/unittests/scifolder_cfood.yml
+++ b/unittests/scifolder_cfood.yml
@@ -22,7 +22,7 @@ Data:  # name of the converter
         parents:
         - Project  # not needed as the name is equivalent
         date: $date
-        identifier: $identifier
+        identifier: ${identifier}
 
   subtree:
     measurement:  # new name for folders on the 3rd level
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index f72deda18152f9d12161d740e41271f90fcb848c..4d3791fce3ceffaafe529423e4020ebd6a4231ba 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -130,14 +130,11 @@ def test_markdown_converter(converter_registry):
         )
     )
 
-    converter = MarkdownFileConverter({
-        "match": "(.*)"
-    }, "TestMarkdownFileConverter",
-        converter_registry)
+    converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter",
+                                      converter_registry)
 
-    m = converter.match(File("test_tool.py", rfp(
-        "test_tool.py")))
-    assert m is None
+    with pytest.raises(ConverterValidationError) as err:
+        converter.create_children(None, File("test_tool.py", rfp("test_tool.py")))
 
     m = converter.match(test_readme)
     assert m is not None
@@ -610,3 +607,17 @@ def test_load_converters():
     assert "SimpleFile" in converter_registry
     assert "Directory" in converter_registry
    assert "ListElement" in converter_registry
+
+
+def test_create_path_value(converter_registry):
+    """Test whether the variable containing the path is added to the general store."""
+    dc = Converter.converter_factory(
+        definition={
+            "type": "Directory",
+            "match": ".*"
+        },
+        name="Test", converter_registry=converter_registry)
+    values = GeneralStore()
+    dc.create_values(values, Directory("a", "/a"))
+    assert "Test.path" in values
+    assert values["Test.path"] == "/a"
diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py
index 1bf8f0b7d67f00f2018b5b68424d6b9cc17602eb..ac408b2dab0fa151c370d3ec6ffd1dced22c77d7 100644
--- a/unittests/test_scalars_cfood.py
+++ b/unittests/test_scalars_cfood.py
@@ -42,16 +42,23 @@ def test_record_structure_generation(crawler):
     subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
     assert len(subd) == 2
     # variables store on Data Analysis node of debug tree
-    assert len(subd[0]) == 3
-    assert "Data" in subd[0]
-    assert "DataAnalysis" in subd[0]
-    assert "RecordThatGetsParentsLater" in subd[0]
+    if "Data" in subd[0]:
+        subddata = subd[0]
+        subdRTGPL = subd[1]
+    else:
+        subddata = subd[1]
+        subdRTGPL = subd[0]
+    assert len(subddata) == 5
+    assert "DataAnalysis" in subddata
+    assert "DataAnalysis.path" in subddata
+    assert "Data.path" in subddata
+    assert "RecordThatGetsParentsLater" in subddata
 
-    prop = subd[0]["RecordThatGetsParentsLater"].get_property("someId")
+    prop = subddata["RecordThatGetsParentsLater"].get_property("someId")
     assert type(prop.value) == int
     assert prop.value == 23
 
     # record store on Data Analysis node of debug tree
-    assert len(subd[1]) == 1
-    prop2 = subd[1]["RecordThatGetsParentsLater"].get_property("someId")
+    assert len(subdRTGPL) == 1
+    prop2 = subdRTGPL["RecordThatGetsParentsLater"].get_property("someId")
     assert prop == prop2
diff --git a/unittests/test_tool.py b/unittests/test_tool.py
index 23b35f2dc9228eeda9137945198c49c19bf5c474..4ac2b4577fbeea6f4bdf291c48ddaf0fa418b2a5 100755
--- a/unittests/test_tool.py
+++ b/unittests/test_tool.py
@@ -110,15 +110,17 @@ def ident(crawler):
 
 
 def test_record_structure_generation(crawler):
+    # TODO How does this test relate to the test function in test_scalars_cfood with the same name?
+    # There seems to be code duplication
     subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
     subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")]
     assert len(subd) == 2
     # variables store on Data Analysis node of debug tree
-    assert len(subd[0]) == 2
+    assert len(subd[0]) == 4
     # record store on Data Analysis node of debug tree
     assert len(subd[1]) == 0
     assert len(subc) == 2
-    assert len(subc[0]) == 2
+    assert len(subc[0]) == 4
     assert len(subc[1]) == 0
 
     # The data analysis node creates one variable for the node itself:
@@ -137,7 +139,7 @@ def test_record_structure_generation(crawler):
     assert subd[1]["Project"].get_property(
         "identifier").value == "climate-model-predict"
 
-    assert len(subd[0]) == 6
+    assert len(subd[0]) == 9
     assert subd[0]["date"] == "2020"
     assert subd[0]["identifier"] == "climate-model-predict"
     assert subd[0]["Project"].__class__ == db.Record
@@ -148,7 +150,7 @@ def test_record_structure_generation(crawler):
     assert subc[0]["project_dir"] is False
 
     # Check the copy flags for the first level in the hierarchy:
-    assert len(subc[0]) == 6
+    assert len(subc[0]) == 9
     assert len(subc[1]) == 1
     assert subc[1]["Project"] is False
     assert subc[0]["Project"] is False
@@ -161,7 +163,7 @@ def test_record_structure_generation(crawler):
     subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis",
                                                         "2020_climate-model-predict",
                                                         "2020-02-08_prediction-errors")]
-    assert len(subd[0]) == 8
+    assert len(subd[0]) == 12
     assert subd[0]["date"] == "2020-02-08"
     assert subd[0]["identifier"] == "prediction-errors"
     assert subd[0]["Project"].__class__ == db.Record