Commit 5b9ca55f authored by Henrik tom Wörden

Merge branch 'dev' into f-update-parents-mitigation

parents 8e9a9128 d34f0bfa
2 merge requests: !53 Release 0.1, !19 F update parents mitigation
Pipeline #29019 failed
Showing changed files with 623 additions and 148 deletions
@@ -14,3 +14,6 @@ provenance.yml
*.tar.gz
*.sql
/integrationtests/test-profile/custom/other/cert/
src/doc/_apidoc/
start_caosdb_docker.sh
src/doc/_apidoc
@@ -152,11 +152,11 @@ inttest:
- CAOSDB_TAG=$CAOSDB_TAG docker-compose up -d
# Store versions of CaosDB parts
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_server_commit > hash_server
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql
- docker exec -u 0 -t docker_caosdb-server_1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_pylib_commit > hash_pylib
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_webui_commit > hash_webui
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_server_commit > hash_server
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_mysqlbackend_commit > hash_mysql
- docker exec -u 0 -t docker-caosdb-server-1 cat /opt/caosdb/git/caosdb_proto_commit > hash_proto
- cat hash_server
- cat hash_proto
- cat hash_mysql
@@ -167,8 +167,8 @@ inttest:
- /bin/sh ./run.sh
# Save logs
- docker logs docker_caosdb-server_1 &> ../caosdb_log.txt
- docker logs docker_sqldb_1 &> ../mariadb_log.txt
- docker logs docker-caosdb-server-1 &> ../caosdb_log.txt
- docker logs docker-sqldb-1 &> ../mariadb_log.txt
- cd ..
# Stop the server
@@ -13,11 +13,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Added new converters for tables: CSVTableConverter and XLSXTableConverter
* Possibility to authorize updates as in the old crawler
* Allow authorization of inserts
* Allow splitting cfoods into multiple yaml documents
* Implemented macros
* Converters can now filter the list of children
* You can now crawl data with name conflicts: `synchronize(unique_names=False)`
### Changed
* Renamed module from `newcrawler` to `caoscrawler`
* MAINT: Renamed module from `newcrawler` to `caoscrawler`
* MAINT: Removed global converters from `crawl.py`
### Deprecated
@@ -30,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* FIX: #35 Parent cannot be set from value
* [#6](https://gitlab.com/caosdb/caosdb-crawler/-/issues/6): Fixed many type
hints to be compatible with Python 3.8
* [#9](https://gitlab.com/caosdb/caosdb-crawler/-/issues/9): Scalars of types
other than string can now be given in cfood definitions
### Security
@@ -36,7 +36,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
DictFloatElement, DictDictElement,
TextElement, DictTextElement, DictElement, DictListElement)
from typing import Dict as Dict_t, List, Optional, Tuple, Union
from abc import abstractmethod
from abc import ABCMeta, abstractmethod
from string import Template
import yaml_header_tools
@@ -156,15 +156,21 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore):
propvalue = value
# variables replacement:
propvalue = [replace_variables(i, values) for i in propvalue]
propvalue = list()
for element in value:
# Do the element-wise replacement only when the element's type is string:
if type(element) == str:
propvalue.append(replace_variables(element, values))
else:
propvalue.append(element)
return (propvalue, collection_mode)
else:
# value is another simple type
# collection_mode = "single"
# propvalue = value["value"]
# return (propvalue, collection_mode)
raise RuntimeError()
collection_mode = "single"
propvalue = value
# Return it immediately, otherwise variable substitution would be done and fail:
return (propvalue, collection_mode)
propvalue = replace_variables(propvalue, values)
return (propvalue, collection_mode)
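# Illustrative examples (assuming `values` is a populated GeneralStore):
#   handle_value(23, values)             -> (23, "single"); non-string scalars
#                                           are returned without substitution
#   handle_value(["a", "$var"], values)  -> element-wise substitution, applied
#                                           only to elements of type str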
@@ -255,7 +261,7 @@ def create_records(values: GeneralStore,
return keys_modified
class Converter(object):
class Converter(object, metaclass=ABCMeta):
"""
Converters treat StructureElements contained in the hierarchical structure.
"""
@@ -283,6 +289,10 @@ class Converter(object):
def converter_factory(definition: dict,
name: str,
converter_registry: dict):
"""creates a Converter instance of the appropriate class.
The `type` key in the `definition` defines the Converter class which is being used.
"""
if "type" not in definition:
raise RuntimeError(
@@ -535,6 +545,7 @@ class DictConverter(Converter):
return {}
# TODO: difference to SimpleFileConverter? Do we need both?
class FileConverter(Converter):
def typecheck(self, element: StructureElement):
return isinstance(element, File)
@@ -566,6 +577,8 @@ class JSONFileConverter(DictConverter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
if not self.typecheck(element):
raise RuntimeError("A JSON file is needed to create children")
# TODO: either add an explicit type check for the File structure element here,
# or add a comment to suppress the mypy type warning.
with open(element.path, 'r') as json_file:
json_data = json.load(json_file)
if not isinstance(json_data, dict):
@@ -55,12 +55,17 @@ from caosdb.apiutils import compare_entities, merge_entities
from copy import deepcopy
from jsonschema import validate
logger = logging.getLogger(__name__)
from .macros import defmacro_constructor, macro_constructor
logger = logging.getLogger(__name__)
SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
SPECIAL_PROPERTIES_NOT_STRICT = ("file", "checksum", "size")
# Register the macro functions from the submodule:
yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
yaml.SafeLoader.add_constructor("!macro", macro_constructor)
def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False):
"""
@@ -160,7 +165,6 @@ class Crawler(object):
"""
def __init__(self,
converters: List[Converter] = [],
generalStore: Optional[GeneralStore] = None,
debug: bool = False,
identifiableAdapter: IdentifiableAdapter = None,
@@ -171,15 +175,14 @@
Parameters
----------
converters : List[Converter]
The set of converters used for this crawler.
generalStore : GeneralStore
An initial GeneralStore which might store e.g. environment variables.
debug : bool
Create a debugging information tree when set to True.
The debugging information tree is a variable stored in
self.debug_tree. It is a dictionary mapping directory entries
to a tuple of general stores and record stores which are valid for the directory scope.
to a tuple of general stores and record stores which are valid for
the directory scope.
Furthermore, a second tree named self.debug_copied stores whether the
objects in debug_tree have been copied from a higher level in the hierarchy
of the StructureElements.
@@ -191,7 +194,6 @@
"""
# TODO: check if this feature is really needed
self.global_converters = converters
self.identified_cache = IdentifiedCache()
self.recordStore = RecordStore()
@@ -225,7 +227,16 @@
# Load the cfood from a yaml file:
with open(crawler_definition_path, "r") as f:
crawler_definition = yaml.safe_load(f)
crawler_definitions = list(yaml.safe_load_all(f))
if len(crawler_definitions) == 1:
# Simple case, just one document:
crawler_definition = crawler_definitions[0]
elif len(crawler_definitions) == 2:
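# Two documents: the first holds metadata and macro definitions,
# the second holds the converter tree (see the CFood documentation).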
crawler_definition = crawler_definitions[1]
else:
raise RuntimeError(
"Crawler definition must not contain more than two documents.")
# TODO: at this point this function can already load the cfood schema extensions
# from the crawler definition and add them to the yaml schema that will be
@@ -376,9 +387,13 @@
converter_registry)
@staticmethod
def create_local_converters(crawler_definition: dict,
converter_registry: dict):
local_converters = []
def initialize_converters(crawler_definition: dict, converter_registry: dict):
"""
Takes the cfood as a dict (`crawler_definition`) and creates the Converter objects that
are defined at the top level. Child Converters are in turn created during the
initialization of their parent Converters.
"""
converters = []
for key, value in crawler_definition.items():
# Definitions and Converters are reserved keywords
@@ -390,10 +405,9 @@
continue
elif key == "Converters":
continue
local_converters.append(Converter.converter_factory(
value, key, converter_registry))
converters.append(Converter.converter_factory(value, key, converter_registry))
return local_converters
return converters
def start_crawling(self, items: Union[List[StructureElement], StructureElement],
crawler_definition: dict,
@@ -425,20 +439,19 @@
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.create_local_converters(crawler_definition,
converter_registry)
local_converters = Crawler.initialize_converters(
crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.target_data: List[db.Record] = []
self._crawl(items,
self.global_converters, local_converters, self.generalStore, self.recordStore,
[], [])
self._crawl(items, local_converters, self.generalStore,
self.recordStore, [], [])
if self.debug:
self.debug_converters = self.global_converters + local_converters
self.debug_converters = local_converters
return self.target_data
def synchronize(self, commit_changes: bool = True):
def synchronize(self, commit_changes: bool = True, unique_names=True):
"""
Carry out the actual synchronization.
"""
@@ -446,7 +459,7 @@
# After the crawling, the actual synchronization with the database, based on the
# update list is carried out:
return self._synchronize(self.target_data, commit_changes)
return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)
def can_be_checked_externally(self, record: db.Record):
"""
@@ -807,7 +820,8 @@
return db.Entity(name=name).retrieve()
@staticmethod
def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None):
def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
unique_names=True):
for record in to_be_inserted:
for prop in record.properties:
entity = Crawler._get_entity_by_name(prop.name)
@@ -816,7 +830,7 @@
logger.debug(to_be_inserted)
if len(to_be_inserted) > 0:
if securityMode.value > SecurityMode.RETRIEVE.value:
db.Container().extend(to_be_inserted).insert()
db.Container().extend(to_be_inserted).insert(unique=unique_names)
elif run_id is not None:
update_cache = UpdateCache()
update_cache.insert(to_be_inserted, run_id, insert=True)
@@ -834,18 +848,20 @@
_resolve_datatype(prop, entity)
@staticmethod
def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None):
def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
unique_names=True):
Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
logger.debug("UPDATE")
logger.debug(to_be_updated)
if len(to_be_updated) > 0:
if securityMode.value > SecurityMode.INSERT.value:
db.Container().extend(to_be_updated).update()
db.Container().extend(to_be_updated).update(unique=unique_names)
elif run_id is not None:
update_cache = UpdateCache()
update_cache.insert(to_be_updated, run_id)
def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True):
def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
unique_names=True):
"""
This function applies several stages:
1) Retrieve identifiables for all records in target_data.
@@ -884,9 +900,9 @@
self.execute_parent_updates_in_list(to_be_updated)
self.execute_inserts_in_list(
to_be_inserted, self.securityMode, self.run_id)
to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
self.execute_updates_in_list(
to_be_updated, self.securityMode, self.run_id)
to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
update_cache = UpdateCache()
pending_inserts = update_cache.get_inserts(self.run_id)
@@ -969,7 +985,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
f.write(yaml.dump(paths, sort_keys=False))
def _crawl(self, items: List[StructureElement],
global_converters: List[Converter],
local_converters: List[Converter],
generalStore: GeneralStore,
recordStore: RecordStore,
@@ -978,7 +993,7 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
Crawl a list of StructureElements and apply any matching converters.
items: structure_elements (e.g. files and folders on one level of the hierarchy)
global_converters and local_converters: globally or locally defined converters for
local_converters: locally defined converters for
treating structure elements. A locally defined converter could be
one that is only valid for a specific subtree of the originally
crawled StructureElement structure.
@@ -986,7 +1001,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
global stores of the Crawler object.
"""
for element in items:
for converter in global_converters + local_converters:
for converter in local_converters:
# type is something like "matches files", replace isinstance with "type_matches"
# match function tests regexp for example
if (converter.typecheck(element) and
@@ -1012,7 +1028,8 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
self.debug_tree[str(element)] = (
generalStore_copy.get_storage(), recordStore_copy.get_storage())
self.debug_metadata["copied"][str(element)] = (
generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
generalStore_copy.get_dict_copied(),
recordStore_copy.get_dict_copied())
self.debug_metadata["usage"][str(element)].add(
"/".join(converters_path + [converter.name]))
mod_info = self.debug_metadata["provenance"]
@@ -1023,10 +1040,11 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
record_identifier = record_name + \
"_" + str(internal_id)
converter.metadata["usage"].add(record_identifier)
mod_info[record_identifier][prop_name] = (structure_elements_path + [element.get_name()],
mod_info[record_identifier][prop_name] = (
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
self._crawl(children, global_converters, converter.converters,
self._crawl(children, converter.converters,
generalStore_copy, recordStore_copy,
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
@@ -1058,7 +1076,9 @@ def crawler_main(crawled_directory_path: str,
provenance_file: str = None,
dry_run: bool = False,
prefix: str = "",
securityMode: int = SecurityMode.UPDATE):
securityMode: int = SecurityMode.UPDATE,
unique_names=True,
):
"""
Parameters
@@ -1079,6 +1099,8 @@ def crawler_main(crawled_directory_path: str,
remove the given prefix from file paths
securityMode : int
securityMode of Crawler
unique_names : bool
whether or not to update or insert entities in spite of name conflicts
Returns
-------
@@ -1110,6 +1132,8 @@ def crawler_main(crawled_directory_path: str,
if isinstance(elem, db.File):
# correct the file path:
# elem.file = os.path.join(args.path, elem.file)
if prefix is None:
raise RuntimeError("No prefix set. Prefix must be set if files are used.")
if elem.path.startswith(prefix):
elem.path = elem.path[len(prefix):]
elem.file = None
@@ -1136,7 +1160,7 @@ def crawler_main(crawled_directory_path: str,
raise RuntimeError("Missing RecordTypes: {}".
format(", ".join(notfound)))
crawler.synchronize(commit_changes=True)
crawler.synchronize(commit_changes=True, unique_names=unique_names)
return 0
@@ -1154,6 +1178,7 @@ def parse_args():
help="The subtree of files below the given path will "
"be considered. Use '/' for everything.")
parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"],
default="retrieve",
help="Determines whether entities may only be read from the server, or "
"whether inserts or even updates may be done.")
parser.add_argument("-n", "--dry-run", action="store_true",
@@ -1162,9 +1187,9 @@ def parse_args():
# TODO: load identifiables is a dirty implementation currently
parser.add_argument("-i", "--load-identifiables",
help="Load identifiables from "
"the given yaml file.")
help="Load identifiables from the given yaml file.")
parser.add_argument("-u", "--unique-names",
help="Insert or updates entities even if name conflicts exist.")
parser.add_argument("-p", "--prefix",
help="Remove the given prefix from the paths "
"of all file objects.")
@@ -1186,16 +1211,17 @@ def main():
logger.setLevel(logging.INFO)
sys.exit(crawler_main(
args.crawled_directory_path,
args.cfood_file_name,
args.load_identifiables,
args.debug,
args.provenance,
args.dry_run,
args.prefix,
{"retrieve": SecurityMode.RETRIEVE,
crawled_directory_path=args.crawled_directory_path,
cfood_file_name=args.cfood_file_name,
identifiables_definition_file=args.load_identifiables,
debug=args.debug,
provenance_file=args.provenance,
dry_run=args.dry_run,
prefix=args.prefix,
securityMode={"retrieve": SecurityMode.RETRIEVE,
"insert": SecurityMode.INSERT,
"update": SecurityMode.UPDATE}[args.security_mode]
"update": SecurityMode.UPDATE}[args.security_mode],
unique_names=args.unique_names,
))
from .macro_yaml_object import defmacro_constructor, macro_constructor
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2022 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
# Function to expand a macro in yaml
# A. Schlemmer, 05/2022
from dataclasses import dataclass
from typing import Any, Dict
from copy import deepcopy
from string import Template
@dataclass
class MacroDefinition:
"""
Stores a macro definition.
name: Name of the macro
params: variables and default values to be substituted in keys or values
definition: A dictionary that will be substituted including parameters
"""
name: str
params: Dict[str, Any]
definition: Any
# This dictionary stores the macro definitions
macro_store: Dict[str, MacroDefinition] = dict()
def substitute(propvalue, values: dict):
"""
Substitution of variables in strings, using the Template class
from Python's standard library.
"""
propvalue_template = Template(propvalue)
return propvalue_template.safe_substitute(**values)
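# Example (illustrative): substitute("$name.dat", {"name": "run1"}) returns
# "run1.dat"; safe_substitute leaves unknown variables such as "$other" in place.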
def substitute_dict(sourced: Dict[str, Any], values: Dict[str, Any]):
"""
Create a copy of sourced.
Afterwards recursively do variable substitution on all keys and values.
"""
d = deepcopy(sourced)
# Changes in keys:
replace: Dict[str, str] = dict()
for k in d:
replacement = substitute(k, values)
if replacement != k:
replace[k] = replacement
for k, v in replace.items():
d[v] = d[k]
del d[k]
# Changes in values:
for k, v in d.items():
if isinstance(v, str):
d[k] = substitute(v, values)
elif isinstance(v, list):
subst_list = list()
for i in d[k]:
if isinstance(i, str):
subst_list.append(substitute(i, values))
elif isinstance(i, dict):
subst_list.append(substitute_dict(i, values))
else:
subst_list.append(i)
d[k] = subst_list
elif isinstance(v, dict):
d[k] = substitute_dict(v, values)
else:
pass
return d
def defmacro_constructor(loader, node):
"""
Function for registering macros in yaml files.
It can be registered in PyYAML using:
yaml.SafeLoader.add_constructor("!defmacro", defmacro_constructor)
"""
value = loader.construct_mapping(node, deep=True)
params = {}
if "params" in value:
params = value["params"]
macro = MacroDefinition(
value["name"], params,
value["definition"])
macro_store[macro.name] = macro
return {}
def macro_constructor(loader, node):
"""
Function for substituting macros in yaml files.
It can be registered in PyYAML using:
yaml.SafeLoader.add_constructor("!macro", macro_constructor)
"""
res = dict()
value = loader.construct_mapping(node, deep=True)
for name, params_setter in value.items():
if name in macro_store:
# If params_setter is a list, run this for every element:
if params_setter is not None and isinstance(params_setter, list):
for el in params_setter:
macro = macro_store[name]
params = deepcopy(macro.params)
if el is not None:
if isinstance(el, dict):
params.update(el)
else:
raise RuntimeError("params type not supported")
else:
raise RuntimeError("params type must not be None")
definition = substitute_dict(macro.definition, params)
res.update(definition)
else:
# This is just a single macro:
macro = macro_store[name]
params = deepcopy(macro.params)
if params_setter is not None:
if isinstance(params_setter, dict):
params.update(params_setter)
else:
raise RuntimeError("params type not supported")
definition = substitute_dict(macro.definition, params)
res.update(definition)
else:
# If there is no macro with that name, just keep that node:
res[name] = params_setter
return res
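# Usage sketch in a cfood yaml file (macro name and parameters are
# hypothetical; the !defmacro structure follows the documentation below):
#
#   macros:
#   - !defmacro
#     name: MyMacro
#     params:
#       a: default_value
#     definition:
#       node_$a:
#         type: Directory
#         match: $a
#
#   # ... later, expand the macro, optionally overriding its params:
#   subtree: !macro
#     MyMacro:
#       a: run1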
# Getting started with the CaosDB Crawler #
## Installation ##
### Requirements ###
### How to install ###
#### Linux ####
Make sure that Python (at least version 3.8) and pip are installed, using your system tools and
documentation.
Then open a terminal and continue in the [Generic installation](#generic-installation) section.
#### Windows ####
If a Python distribution is not yet installed, we recommend Anaconda Python, which you can download
for free from [https://www.anaconda.com](https://www.anaconda.com). The "Anaconda Individual Edition" provides most of the
packages you will ever need out of the box. If you prefer, you may instead install the leaner
"Miniconda" installer, which allows you to install packages as you need them.
After installation, open an Anaconda prompt from the Windows menu and continue in the [Generic
installation](#generic-installation) section.
#### MacOS ####
If there is no Python 3 installed yet, there are two main ways to
obtain it: Either get the binary package from
[python.org](https://www.python.org/downloads/) or, for advanced
users, install via [Homebrew](https://brew.sh/). After installation
from python.org, it is recommended to also update the TLS certificates
for Python (this requires administrator rights for your user):
```sh
# Replace this with your Python version number:
cd /Applications/Python\ 3.9/
# This needs administrator rights:
sudo ./Install\ Certificates.command
```
After these steps, you may continue with the [Generic
installation](#generic-installation).
#### Generic installation ####
---
Obtain the sources from GitLab and install from there (`git` must be installed for
this option):
```sh
git clone https://gitlab.com/caosdb/caosdb-crawler
cd caosdb-crawler
pip3 install --user .
```
**Note**: In the near future, this package will also be made available on PyPI.
## Configuration ##
## Try it out ##
## Run Unit Tests ##
## Documentation ##
Build documentation in `src/doc` with `make html`.
### Requirements ###
- `sphinx`
- `sphinx-autoapi`
- `recommonmark`
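These can be installed, for example, with `pip3 install --user sphinx sphinx-autoapi recommonmark` (package names as listed above; adapt the command to your Python setup).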
### Troubleshooting ###
newcrawler
==========
.. toctree::
:maxdepth: 4
newcrawler
newcrawler.converters module
============================
.. automodule:: newcrawler.converters
:members:
:undoc-members:
:show-inheritance:
newcrawler.crawl module
=======================
.. automodule:: newcrawler.crawl
:members:
:undoc-members:
:show-inheritance:
newcrawler.identifiable\_adapters module
========================================
.. automodule:: newcrawler.identifiable_adapters
:members:
:undoc-members:
:show-inheritance:
newcrawler.identified\_cache module
===================================
.. automodule:: newcrawler.identified_cache
:members:
:undoc-members:
:show-inheritance:
newcrawler package
==================
Submodules
----------
.. toctree::
:maxdepth: 4
newcrawler.converters
newcrawler.crawl
newcrawler.identifiable_adapters
newcrawler.identified_cache
newcrawler.stores
newcrawler.structure_elements
newcrawler.utils
Module contents
---------------
.. automodule:: newcrawler
:members:
:undoc-members:
:show-inheritance:
newcrawler.stores module
========================
.. automodule:: newcrawler.stores
:members:
:undoc-members:
:show-inheritance:
newcrawler.structure\_elements module
=====================================
.. automodule:: newcrawler.structure_elements
:members:
:undoc-members:
:show-inheritance:
newcrawler.utils module
=======================
.. automodule:: newcrawler.utils
:members:
:undoc-members:
:show-inheritance:
CFood-Definition
================
The crawler specification is called CFood-definition. It is stored in a yaml file, or, more precisely, in one or two yaml documents inside a yaml file.
The specification consists of three separate parts:
#. Metadata and macro definitions
#. Custom converter registrations
#. The converter tree specification
In the simplest case, there is just one yaml file with a single document including at least
the converter tree specification (see :ref:`example 1<example_1>`). The custom converter part may also be included in
this single document (for historical reasons, see :ref:`example 2<example_2>`), but it is recommended to include it in a separate
document together with the metadata and :doc:`macro<macros>` definitions (see :ref:`below<example_4>`).
If metadata and macro definitions are provided, there **must** be a second document preceding the
converter tree specification, containing these definitions.
Examples
++++++++
A single document with a converter tree specification:
.. _example_1:
.. code-block:: yaml
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
A single document with a converter tree specification, but also including a custom converters section:
.. _example_2:
.. code-block:: yaml
Converters:
CustomConverter_1:
package: mypackage.converters
converter: CustomConverter1
CustomConverter_2:
package: mypackage.converters
converter: CustomConverter2
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
A yaml multi-document, defining metadata and some macros in the first document and declaring
two custom converters in the second document (**not recommended**, see the recommended version :ref:`below<example_4>`). Please note that the two separate yaml documents are separated using the ``---`` syntax:
.. _example_3:
.. code-block:: yaml
---
metadata:
name: Datascience CFood
description: CFood for data from the local data science work group
macros:
- !defmacro
name: SimulationDatasetFile
params:
match: null
recordtype: null
nodename: null
definition:
# (...)
---
Converters:
CustomConverter_1:
package: mypackage.converters
converter: CustomConverter1
CustomConverter_2:
package: mypackage.converters
converter: CustomConverter2
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
The **recommended way** of defining metadata, custom converters, macros and the main cfood specification is shown in the following code example:
.. _example_4:
.. code-block:: yaml
---
metadata:
name: Datascience CFood
description: CFood for data from the local data science work group
macros:
- !defmacro
name: SimulationDatasetFile
params:
match: null
recordtype: null
nodename: null
definition:
# (...)
Converters:
CustomConverter_1:
package: mypackage.converters
converter: CustomConverter1
CustomConverter_2:
package: mypackage.converters
converter: CustomConverter2
---
extroot:
type: Directory
match: ^extroot$
subtree:
DataAnalysis:
type: Directory
match: DataAnalysis
# (...)
List Mode
---------
When specifying property values, two special characters can be used to automatically
create lists or multi properties instead of single values:
.. code-block:: yaml
Experiment1:
Measurement: +Measurement <- Element in List (list is cleared before run)
*Measurement <- Multi Property (properties are removed before run)
Measurement <- Overwrite
Concepts
========
Structure Elements
++++++++++++++++++
This hierarchical structure is assumed to form a tree of
StructureElements. The tree is created on the fly by so-called Converters, which
are defined in a yaml file. The tree of StructureElements is a model
of the existing data (for example, a tree of Python file objects
(StructureElements) could represent a file tree that exists on some file server).
Relevant sources in:
src/structure_elements.py
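A minimal sketch of such a tree, assuming that ``Directory`` and ``File`` take a name and a
path (constructor signatures assumed for illustration; in a real run the Converters build
the tree on the fly):

.. code-block:: python

   from newcrawler.structure_elements import Directory, File

   # Hypothetical, hand-built fragment of a StructureElement tree:
   root = Directory("extroot", "/data/extroot")
   readme = File("readme.md", "/data/extroot/readme.md")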
Converters
++++++++++
Converters treat StructureElements and thereby create the StructureElements that
are the children of the treated StructureElement. Converters therefore create
the above-named tree. The definition of a Converter also specifies which
Converters shall be used to treat the generated child StructureElements. The
definition is therefore a tree itself.
See :doc:`converters<converters>` for details.
Relevant sources in:
src/converters.py
Identifiables
+++++++++++++
Relevant sources in:
src/identifiable_adapters.py
The Crawler
+++++++++++
The crawler can be considered the main program doing the synchronization in basically two steps:
#. Based on a yaml specification, scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance.
#. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entities accordingly.
Relevant sources in:
src/crawl.py
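A minimal sketch of starting this synchronization programmatically, based on the
``crawler_main`` signature shown in the diff above (paths are placeholders and the
import location is an assumption):

.. code-block:: python

   from newcrawler.crawl import crawler_main, SecurityMode  # import path assumed

   crawler_main(
       crawled_directory_path="/data/extroot",             # directory to scan
       cfood_file_name="cfood.yml",                        # CFood definition
       identifiables_definition_file="identifiables.yml",
       dry_run=True,                                       # do not write to CaosDB
       securityMode=SecurityMode.RETRIEVE,
       unique_names=True,
   )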
Special Cases
=============
Variable Precedence
+++++++++++++++++++
Let's assume the following situation:
.. code-block:: yaml
description:
type: DictTextElement
match_value: (?P<description>.*)
match_name: description
The variable ``$description`` could refer to either of two different variables created here:
1. The structure element path.
2. The value of the matched expression.
The matched expression does take precedence over the structure element path and shadows it.
If you want to be able to use the structure element path, make sure to give the
variables unique names, like:
.. code-block:: yaml
description_text_block:
type: DictTextElement
match_value: (?P<description>.*)
match_name: description
Scopes
========
Example:
.. code-block:: yaml
DicomFile:
type: SimpleDicomFile
match: (?P<filename>.*)\.dicom
records:
DicomRecord:
name: $filename
subtree: # header of dicom file
PatientID:
type: DicomHeaderElement
match_name: PatientName
match_value: (?P<patient>.*)
records:
Patient:
name: $patient
dicom_name: $filename # $filename is in same scope!
ExperimentFile:
type: MarkdownFile
match: ^readme.md$
records:
Experiment:
dicom_name: $filename # does NOT work, because $filename is out of scope!
# can variables be used within regexp?
File Objects
============
@@ -53,6 +53,7 @@ extensions = [
'sphinx.ext.autosectionlabel',
'sphinx.ext.intersphinx',
'sphinx.ext.napoleon', # For Google style docstrings
"recommonmark", # For markdown files.
"sphinx_rtd_theme",
]
@@ -61,7 +62,7 @@ templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
source_suffix = ['.rst']
source_suffix = ['.rst', '.md']
# The master toctree document.
master_doc = 'index'
@@ -71,7 +72,7 @@ master_doc = 'index'
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -99,7 +100,7 @@ html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = [] # ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.