Skip to content
Snippets Groups Projects
Commit f71d820e authored by florian's avatar florian
Browse files

Merge branch 'dev' into f-create-link

parents 1590919f 6f3717df
Branches
Tags
2 merge requests!105REL: v0.4.0,!102MAINT: add logging on inserts and updates
Pipeline #33965 passed
Showing
with 230 additions and 140 deletions
...@@ -11,12 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -11,12 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- DateElementConverter: allows to interpret text as a date object - DateElementConverter: allows to interpret text as a date object
- the restricted_path argument allows to crawl only a subtree - the restricted_path argument allows to crawl only a subtree
- logging that provides a summary of what is inserted and updated - logging that provides a summary of what is inserted and updated
- You can now access the file system path of a structure element (if it has one) using the variable
name ``<converter name>.path``
### Changed ### ### Changed ###
- The definitions for the default converters were removed from crawl.py and placed into - The definitions for the default converters were removed from crawl.py and placed into
a separate yaml file called `default_converters.yml`. There is a new test testing for a separate yaml file called `default_converters.yml`. There is a new test testing for
the correct loading behavior of that file. the correct loading behavior of that file.
- JSONFileConverter, YAMLFileConverter and MarkdownFileConverter now inherit from
SimpleFileConverter. Behavior is unchanged, except that the MarkdownFileConverter now raises a
ConverterValidationError when the YAML header cannot be read instead of silently not matching.
### Deprecated ### ### Deprecated ###
......
# Installation ##
## Linux ####
Make sure that Python (at least version 3.8) and pip are installed, using your system tools and
documentation.
Then open a terminal and continue in the [Generic installation](#generic-installation) section.
## Windows ####
If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download
for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of the
packages you will ever need out of the box. If you prefer, you may also install the leaner
"Miniconda" installer, which allows you to install packages as you need them.
After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic
installation](#generic-installation) section.
## MacOS ####
If there is no Python 3 installed yet, there are two main ways to
obtain it: Either get the binary package from
[python.org](https://www.python.org/downloads/) or, for advanced
users, install via [Homebrew](https://brew.sh/). After installation
from python.org, it is recommended to also update the TLS certificates
for Python (this requires administrator rights for your user):
```sh
# Replace this with your Python version number:
cd /Applications/Python\ 3.9/
# This needs administrator rights:
sudo ./Install\ Certificates.command
```
After these steps, you may continue with the [Generic
installation](#generic-installation).
## Generic installation ####
The CaosDB crawler is available as a [PyPI
package](https://pypi.org/project/caoscrawler/) and can simply be installed with
```sh
pip3 install caoscrawler
```
Alternatively, obtain the sources from GitLab and install from there (`git` must
be installed for this option):
```sh
git clone https://gitlab.com/caosdb/caosdb-crawler
cd caosdb-crawler
pip3 install --user .
```
...@@ -25,8 +25,8 @@ extroot: ...@@ -25,8 +25,8 @@ extroot:
parents: parents:
- mdfile - mdfile
role: File role: File
path: $DataFile path: ${DataFile.path}
file: $DataFile file: ${DataFile.path}
Experiment: Experiment:
mdfile: $mdfile mdfile: $mdfile
...@@ -68,8 +68,8 @@ extroot: ...@@ -68,8 +68,8 @@ extroot:
parents: parents:
- mdfile - mdfile
role: File role: File
path: $DataFile path: ${DataFile.path}
file: $DataFile file: ${DataFile.path}
Experiment: {} Experiment: {}
......
...@@ -63,7 +63,7 @@ def test_complete_crawler( ...@@ -63,7 +63,7 @@ def test_complete_crawler(
True, True,
os.path.join(DATADIR, "provenance.yml"), os.path.join(DATADIR, "provenance.yml"),
False, False,
"/use_case_simple_presentation") os.path.abspath(DATADIR))
res = db.execute_query("FIND Record Experiment") res = db.execute_query("FIND Record Experiment")
assert len(res) == 1 assert len(res) == 1
......
...@@ -56,6 +56,10 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path", ...@@ -56,6 +56,10 @@ SPECIAL_PROPERTIES = ("description", "name", "id", "path",
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class CrawlerTemplate(Template):
braceidpattern = r"(?a:[_a-z][_\.a-z0-9]*)"
def _only_max(children_with_keys): def _only_max(children_with_keys):
return [max(children_with_keys, key=lambda x: x[1])[0]] return [max(children_with_keys, key=lambda x: x[1])[0]]
...@@ -110,6 +114,19 @@ class ConverterValidationError(Exception): ...@@ -110,6 +114,19 @@ class ConverterValidationError(Exception):
self.message = msg self.message = msg
def create_path_value(func):
    """Decorate a ``create_values`` method so that it also stores the element's path.

    Should be used for StructureElements that are associated with file system
    objects that have a path, like File or Directory.

    After the wrapped method has run, ``element.path`` is stored in ``values``
    under the key ``self.name + ".path"``.

    Parameters
    ----------
    func : callable
        The ``create_values`` method to wrap. It is called as
        ``func(self, values=values, element=element)``.

    Returns
    -------
    callable
        The wrapping method. It returns whatever ``func`` returns.
    """
    # Local import so that the decorator does not depend on the file's import block.
    import functools

    # functools.wraps keeps the name and docstring of the decorated method.
    # Annotations are quoted so they are not evaluated when ``inner`` is defined.
    @functools.wraps(func)
    def inner(self, values: "GeneralStore", element: "StructureElement"):
        result = func(self, values=values, element=element)
        # Add the path only after ``func`` ran, so the wrapped method cannot overwrite it.
        values.update({self.name + ".path": element.path})
        return result

    return inner
def replace_variables(propvalue, values: GeneralStore): def replace_variables(propvalue, values: GeneralStore):
""" """
This function replaces variables in property values (and possibly other locations, This function replaces variables in property values (and possibly other locations,
...@@ -133,7 +150,7 @@ def replace_variables(propvalue, values: GeneralStore): ...@@ -133,7 +150,7 @@ def replace_variables(propvalue, values: GeneralStore):
if isinstance(values[varname], db.Entity): if isinstance(values[varname], db.Entity):
return values[varname] return values[varname]
propvalue_template = Template(propvalue) propvalue_template = CrawlerTemplate(propvalue)
return propvalue_template.safe_substitute(**values.get_storage()) return propvalue_template.safe_substitute(**values.get_storage())
...@@ -241,7 +258,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict ...@@ -241,7 +258,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict
continue continue
# Allow replacing variables in keys / names of properties: # Allow replacing variables in keys / names of properties:
key_template = Template(key) key_template = CrawlerTemplate(key)
key = key_template.safe_substitute(**values.get_storage()) key = key_template.safe_substitute(**values.get_storage())
keys_modified.append((name, key)) keys_modified.append((name, key))
...@@ -478,6 +495,10 @@ class DirectoryConverter(Converter): ...@@ -478,6 +495,10 @@ class DirectoryConverter(Converter):
return children return children
@create_path_value
def create_values(self, values: GeneralStore, element: StructureElement):
super().create_values(values=values, element=element)
def typecheck(self, element: StructureElement): def typecheck(self, element: StructureElement):
return isinstance(element, Directory) return isinstance(element, Directory)
...@@ -525,6 +546,10 @@ class SimpleFileConverter(Converter): ...@@ -525,6 +546,10 @@ class SimpleFileConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement): def create_children(self, generalStore: GeneralStore, element: StructureElement):
return list() return list()
@create_path_value
def create_values(self, values: GeneralStore, element: StructureElement):
super().create_values(values=values, element=element)
@Converter.debug_matching("name") @Converter.debug_matching("name")
def match(self, element: StructureElement): def match(self, element: StructureElement):
# TODO: See comment on types and inheritance # TODO: See comment on types and inheritance
...@@ -543,7 +568,7 @@ class FileConverter(SimpleFileConverter): ...@@ -543,7 +568,7 @@ class FileConverter(SimpleFileConverter):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
class MarkdownFileConverter(Converter): class MarkdownFileConverter(SimpleFileConverter):
""" """
reads the yaml header of markdown files (if a such a header exists). reads the yaml header of markdown files (if a such a header exists).
""" """
...@@ -553,8 +578,18 @@ class MarkdownFileConverter(Converter): ...@@ -553,8 +578,18 @@ class MarkdownFileConverter(Converter):
if not isinstance(element, File): if not isinstance(element, File):
raise RuntimeError("A markdown file is needed to create children.") raise RuntimeError("A markdown file is needed to create children.")
try:
header = yaml_header_tools.get_header_from_file( header = yaml_header_tools.get_header_from_file(
element.path, clean=False) element.path, clean=False)
except yaml_header_tools.NoValidHeader:
if generalStore is not None and self.name in generalStore:
path = generalStore[self.name]
else:
path = "<path not set>"
raise ConverterValidationError(
"Error during the validation (yaml header cannot be read) of the markdown file "
"located at the following node in the data structure:\n"
f"{path}")
children: List[StructureElement] = [] children: List[StructureElement] = []
for name, entry in header.items(): for name, entry in header.items():
...@@ -567,25 +602,6 @@ class MarkdownFileConverter(Converter): ...@@ -567,25 +602,6 @@ class MarkdownFileConverter(Converter):
"Header entry {} has incompatible type.".format(name)) "Header entry {} has incompatible type.".format(name))
return children return children
def typecheck(self, element: StructureElement):
return isinstance(element, File)
@Converter.debug_matching("name")
def match(self, element: StructureElement):
# TODO: See comment on types and inheritance
if not isinstance(element, File):
raise RuntimeError("Element must be a file.")
m = re.match(self.definition["match"], element.name)
if m is None:
return None
try:
yaml_header_tools.get_header_from_file(element.path)
except yaml_header_tools.NoValidHeader:
# TODO(salexan): Raise a validation error instead of just not
# matching silently.
return None
return m.groupdict()
def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None,
msg_prefix=""): msg_prefix=""):
...@@ -692,20 +708,7 @@ class DictDictElementConverter(DictElementConverter): ...@@ -692,20 +708,7 @@ class DictDictElementConverter(DictElementConverter):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
class JSONFileConverter(Converter): class JSONFileConverter(SimpleFileConverter):
def typecheck(self, element: StructureElement):
return isinstance(element, File)
@Converter.debug_matching("name")
def match(self, element: StructureElement):
# TODO: See comment on types and inheritance
if not self.typecheck(element):
raise RuntimeError("Element must be a file")
m = re.match(self.definition["match"], element.name)
if m is None:
return None
return m.groupdict()
def create_children(self, generalStore: GeneralStore, element: StructureElement): def create_children(self, generalStore: GeneralStore, element: StructureElement):
# TODO: See comment on types and inheritance # TODO: See comment on types and inheritance
if not isinstance(element, File): if not isinstance(element, File):
...@@ -727,20 +730,7 @@ class JSONFileConverter(Converter): ...@@ -727,20 +730,7 @@ class JSONFileConverter(Converter):
return [structure_element] return [structure_element]
class YAMLFileConverter(Converter): class YAMLFileConverter(SimpleFileConverter):
def typecheck(self, element: StructureElement):
return isinstance(element, File)
@Converter.debug_matching("name")
def match(self, element: StructureElement):
# TODO: See comment on types and inheritance
if not self.typecheck(element):
raise RuntimeError("Element must be a file")
m = re.match(self.definition["match"], element.name)
if m is None:
return None
return m.groupdict()
def create_children(self, generalStore: GeneralStore, element: StructureElement): def create_children(self, generalStore: GeneralStore, element: StructureElement):
# TODO: See comment on types and inheritance # TODO: See comment on types and inheritance
if not isinstance(element, File): if not isinstance(element, File):
......
# Getting started with the CaosDB Crawler # # Getting started with the CaosDB Crawler #
## Installation ## ## Installation
see INSTALL.md
### How to install ###
#### Linux ####
Make sure that Python (at least version 3.8) and pip is installed, using your system tools and
documentation.
Then open a terminal and continue in the [Generic installation](#generic-installation) section.
#### Windows ####
If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download
for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of all
packages you will ever need out of the box. If you prefer, you may also install the leaner
"Miniconda" installer, which allows you to install packages as you need them.
After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic
installation](#generic-installation) section.
#### MacOS ####
If there is no Python 3 installed yet, there are two main ways to
obtain it: Either get the binary package from
[python.org](https://www.python.org/downloads/) or, for advanced
users, install via [Homebrew](https://brew.sh/). After installation
from python.org, it is recommended to also update the TLS certificates
for Python (this requires administrator rights for your user):
```sh
# Replace this with your Python version number:
cd /Applications/Python\ 3.9/
# This needs administrator rights:
sudo ./Install\ Certificates.command
```
After these steps, you may continue with the [Generic
installation](#generic-installation).
#### Generic installation ####
---
Obtain the sources from GitLab and install from there (`git` must be installed for
this option):
```sh
git clone https://gitlab.com/caosdb/caosdb-crawler
cd caosdb-crawler
pip3 install --user .
```
**Note**: In the near future, this package will also be made available on PyPi.
## Run Unit Tests ## Run Unit Tests
Run `pytest unittests`.
## Documentation ## ## Documentation ##
We use sphinx to create the documentation. Docstrings in the code should comply We use sphinx to create the documentation. Docstrings in the code should comply
......
...@@ -149,6 +149,44 @@ create lists or multi properties instead of single values: ...@@ -149,6 +149,44 @@ create lists or multi properties instead of single values:
.. code-block:: yaml .. code-block:: yaml
Experiment1: Experiment1:
Measurement: +Measurement <- Element in List (list is cleared before run) Measurement: +Measurement # Element in List (list is cleared before run)
*Measurement <- Multi Property (properties are removed before run) *Measurement # Multi Property (properties are removed before run)
Measurement <- Overwrite Measurement # Overwrite
File Entities
-------------
In order to use File Entities, you must set the appropriate ``role: File``.
Additionally, the path and file keys have to be given, with values that set the
paths remotely and locally, respectively. You can use the variable
``<converter name>.path`` that is automatically created by converters that deal
with file system related StructureElements. The file object itself is stored
in a variable with the same name (as it is the case for other Records).
.. code-block:: yaml
somefile:
type: SimpleFile
match: ^params.*$ # match any file that starts with "params"
records:
fileEntity:
role: File # necessary to create a File Entity
path: ${somefile.path} # defines the path in CaosDB
file: ${somefile.path} # path where the file is found locally
SomeRecord:
ParameterFile: $fileEntity # creates a reference to the file
Automatically generated keys
++++++++++++++++++++++++++++
Some variable names are automatically generated and can be used using the
``$<variable name>`` syntax. Those include:
- ``<converter name>``: access the path of converter names to the current converter
- ``<converter name>.path``: the file system path to the structure element
(file system related converters only; you need curly brackets to use them:
``${<converter name>.path}``)
- ``<Record key>``: all entities that are created in the ``records`` section
are available under the same key
Concepts Concepts
)))))))) ))))))))
The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is
a directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB
Crawler works.
Structure Elements Structure Elements
++++++++++++++++++ ++++++++++++++++++
......
../../../INSTALL.md
\ No newline at end of file
Prerequisites
)))))))))))))
TODO Describe the smallest possible crawler run
Getting Started
+++++++++++++++
.. toctree::
:maxdepth: 2
:caption: Contents:
:hidden:
Installation<INSTALL>
prerequisites
helloworld
This section will help you get going! From the first installation steps to the first simple crawl.
Let's go!
Prerequisites
)))))))))))))
TODO Describe what you need to actually do a crawler run: data, CaosDB, ...
...@@ -7,12 +7,12 @@ CaosDB-Crawler Documentation ...@@ -7,12 +7,12 @@ CaosDB-Crawler Documentation
:caption: Contents: :caption: Contents:
:hidden: :hidden:
Getting started<README_SETUP> Getting started<getting_started/index>
Tutorials<tutorials/index>
Concepts<concepts> Concepts<concepts>
Converters<converters> Converters<converters>
CFoods (Crawler Definitions)<cfood> CFoods (Crawler Definitions)<cfood>
Macros<macros> Macros<macros>
Tutorials<tutorials/index>
How to upgrade<how-to-upgrade> How to upgrade<how-to-upgrade>
API documentation<_apidoc/modules> API documentation<_apidoc/modules>
......
...@@ -195,7 +195,7 @@ The example will be expanded to: ...@@ -195,7 +195,7 @@ The example will be expanded to:
Limitation Limitation
---------- ==========
Currently it is not possible to use the same macro twice in the same yaml node, but in different Currently it is not possible to use the same macro twice in the same yaml node, but in different
positions. Consider: positions. Consider:
......
Tutorials Tutorials
+++++++++ +++++++++
This chapter contains a collection of tutorials.
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
:caption: Contents: :caption: Contents:
:hidden:
Example CFood<example> Example CFood<example>
...@@ -22,7 +22,7 @@ Data: # name of the converter ...@@ -22,7 +22,7 @@ Data: # name of the converter
parents: parents:
- Project # not needed as the name is equivalent - Project # not needed as the name is equivalent
date: $date date: $date
identifier: $identifier identifier: ${identifier}
subtree: subtree:
measurement: # new name for folders on the 3rd level measurement: # new name for folders on the 3rd level
......
...@@ -130,14 +130,11 @@ def test_markdown_converter(converter_registry): ...@@ -130,14 +130,11 @@ def test_markdown_converter(converter_registry):
) )
) )
converter = MarkdownFileConverter({ converter = MarkdownFileConverter({"match": "(.*)"}, "TestMarkdownFileConverter",
"match": "(.*)"
}, "TestMarkdownFileConverter",
converter_registry) converter_registry)
m = converter.match(File("test_tool.py", rfp( with pytest.raises(ConverterValidationError) as err:
"test_tool.py"))) converter.create_children(None, File("test_tool.py", rfp("test_tool.py")))
assert m is None
m = converter.match(test_readme) m = converter.match(test_readme)
assert m is not None assert m is not None
...@@ -610,3 +607,17 @@ def test_load_converters(): ...@@ -610,3 +607,17 @@ def test_load_converters():
assert "SimpleFile" in converter_registry assert "SimpleFile" in converter_registry
assert "Directory" in converter_registry assert "Directory" in converter_registry
assert "ListElement" in converter_registry assert "ListElement" in converter_registry
def test_create_path_value(converter_registry):
""" test whether the variable containing the path is added to the general store"""
dc = Converter.converter_factory(
definition={
"type": "Directory",
"match": ".*"
},
name="Test", converter_registry=converter_registry)
values = GeneralStore()
dc.create_values(values, Directory("a", "/a"))
assert "Test.path" in values
assert values["Test.path"] == "/a"
...@@ -42,16 +42,23 @@ def test_record_structure_generation(crawler): ...@@ -42,16 +42,23 @@ def test_record_structure_generation(crawler):
subd = crawler.debug_tree[dircheckstr("DataAnalysis")] subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
assert len(subd) == 2 assert len(subd) == 2
# variables store on Data Analysis node of debug tree # variables store on Data Analysis node of debug tree
assert len(subd[0]) == 3 if "Data" in subd[0]:
assert "Data" in subd[0] subddata = subd[0]
assert "DataAnalysis" in subd[0] subdRTGPL = subd[1]
assert "RecordThatGetsParentsLater" in subd[0] else:
subddata = subd[1]
subdRTGPL = subd[0]
assert len(subddata) == 5
assert "DataAnalysis" in subddata
assert "DataAnalysis.path" in subddata
assert "Data.path" in subddata
assert "RecordThatGetsParentsLater" in subddata
prop = subd[0]["RecordThatGetsParentsLater"].get_property("someId") prop = subddata["RecordThatGetsParentsLater"].get_property("someId")
assert type(prop.value) == int assert type(prop.value) == int
assert prop.value == 23 assert prop.value == 23
# record store on Data Analysis node of debug tree # record store on Data Analysis node of debug tree
assert len(subd[1]) == 1 assert len(subdRTGPL) == 1
prop2 = subd[1]["RecordThatGetsParentsLater"].get_property("someId") prop2 = subdRTGPL["RecordThatGetsParentsLater"].get_property("someId")
assert prop == prop2 assert prop == prop2
...@@ -110,15 +110,17 @@ def ident(crawler): ...@@ -110,15 +110,17 @@ def ident(crawler):
def test_record_structure_generation(crawler): def test_record_structure_generation(crawler):
# TODO How does this test relate to the test function in test_scalars_cfood with the same name?
# There seems to be code duplication
subd = crawler.debug_tree[dircheckstr("DataAnalysis")] subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")] subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")]
assert len(subd) == 2 assert len(subd) == 2
# variables store on Data Analysis node of debug tree # variables store on Data Analysis node of debug tree
assert len(subd[0]) == 2 assert len(subd[0]) == 4
# record store on Data Analysis node of debug tree # record store on Data Analysis node of debug tree
assert len(subd[1]) == 0 assert len(subd[1]) == 0
assert len(subc) == 2 assert len(subc) == 2
assert len(subc[0]) == 2 assert len(subc[0]) == 4
assert len(subc[1]) == 0 assert len(subc[1]) == 0
# The data analysis node creates one variable for the node itself: # The data analysis node creates one variable for the node itself:
...@@ -137,7 +139,7 @@ def test_record_structure_generation(crawler): ...@@ -137,7 +139,7 @@ def test_record_structure_generation(crawler):
assert subd[1]["Project"].get_property( assert subd[1]["Project"].get_property(
"identifier").value == "climate-model-predict" "identifier").value == "climate-model-predict"
assert len(subd[0]) == 6 assert len(subd[0]) == 9
assert subd[0]["date"] == "2020" assert subd[0]["date"] == "2020"
assert subd[0]["identifier"] == "climate-model-predict" assert subd[0]["identifier"] == "climate-model-predict"
assert subd[0]["Project"].__class__ == db.Record assert subd[0]["Project"].__class__ == db.Record
...@@ -148,7 +150,7 @@ def test_record_structure_generation(crawler): ...@@ -148,7 +150,7 @@ def test_record_structure_generation(crawler):
assert subc[0]["project_dir"] is False assert subc[0]["project_dir"] is False
# Check the copy flags for the first level in the hierarchy: # Check the copy flags for the first level in the hierarchy:
assert len(subc[0]) == 6 assert len(subc[0]) == 9
assert len(subc[1]) == 1 assert len(subc[1]) == 1
assert subc[1]["Project"] is False assert subc[1]["Project"] is False
assert subc[0]["Project"] is False assert subc[0]["Project"] is False
...@@ -161,7 +163,7 @@ def test_record_structure_generation(crawler): ...@@ -161,7 +163,7 @@ def test_record_structure_generation(crawler):
subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis", subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis",
"2020_climate-model-predict", "2020_climate-model-predict",
"2020-02-08_prediction-errors")] "2020-02-08_prediction-errors")]
assert len(subd[0]) == 8 assert len(subd[0]) == 12
assert subd[0]["date"] == "2020-02-08" assert subd[0]["date"] == "2020-02-08"
assert subd[0]["identifier"] == "prediction-errors" assert subd[0]["identifier"] == "prediction-errors"
assert subd[0]["Project"].__class__ == db.Record assert subd[0]["Project"].__class__ == db.Record
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment