Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • caosdb/src/caosdb-crawler
1 result
Show changes
Commits on Source (209)
Showing
with 1035 additions and 938 deletions
FROM debian:bullseye
FROM debian:bookworm
RUN apt-get update && \
apt-get install \
curl \
git \
openjdk-11-jdk-headless \
openjdk-17-jdk-headless \
python3-autopep8 \
python3-pip \
python3-pytest \
python3-sphinx \
tox \
-y
RUN pip3 install pylint recommonmark sphinx-rtd-theme
RUN pip3 install --break-system-packages \
pylint \
recommonmark \
sphinx-rtd-theme \
;
COPY .docker/wait-for-it.sh /wait-for-it.sh
ARG PYLIB
ADD https://gitlab.indiscale.com/api/v4/projects/97/repository/commits/${PYLIB} \
pylib_version.json
RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git && \
cd caosdb-pylib && git checkout ${PYLIB} && pip3 install .
cd caosdb-pylib && git checkout ${PYLIB} && pip3 install --break-system-packages .
ARG ADVANCED
ADD https://gitlab.indiscale.com/api/v4/projects/104/repository/commits/${ADVANCED} \
advanced_version.json
RUN git clone https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git && \
cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install .[h5-crawler]
cd caosdb-advanced-user-tools && git checkout ${ADVANCED} && pip3 install --break-system-packages .[h5-crawler]
COPY . /git
# Delete .git because it is huge.
......@@ -30,7 +34,7 @@ RUN rm -r /git/.git
# Install pycaosdb.ini for the tests
RUN mv /git/.docker/tester_pycaosdb.ini /git/integrationtests/pycaosdb.ini
RUN cd /git/ && pip3 install .
RUN cd /git/ && pip3 install --break-system-packages .[h5-crawler,spss]
WORKDIR /git/integrationtests
# wait for server,
......
......@@ -113,32 +113,33 @@ info:
script:
- *env
unittest_py3.9:
unittest_py3.11:
tags: [cached-dind]
stage: test
image: $CI_REGISTRY_IMAGE
script:
- tox
- python3 -c "import sys; assert sys.version.startswith('3.11')"
- tox
unittest_py3.7:
unittest_py3.8:
tags: [cached-dind]
stage: test
image: python:3.7
image: python:3.8
script: &python_test_script
# install dependencies
- pip install pytest pytest-cov
# TODO: Use f-branch logic here
- pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
- pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev
- pip install .[h5-crawler]
- pip install .[h5-crawler,spss]
# actual test
- caosdb-crawler --help
- pytest --cov=caosdb -vv ./unittests
unittest_py3.8:
unittest_py3.9:
tags: [cached-dind]
stage: test
image: python:3.8
image: python:3.9
script: *python_test_script
unittest_py3.10:
......@@ -147,12 +148,31 @@ unittest_py3.10:
image: python:3.10
script: *python_test_script
unittest_py3.11:
unittest_py3.12:
tags: [cached-dind]
stage: test
image: python:3.11
image: python:3.12
script: *python_test_script
unittest_py3.13:
allow_failure: true
tags: [cached-dind]
stage: test
image: python:3.13-rc
script:
# TODO: Replace by '*python_test_script' as soon as 3.13 has been officially released.
# TODO Remove the "!" after 3.13 release, which serves as an xfail
- apt update && apt install -y cargo
# install dependencies
- pip install pytest pytest-cov
# TODO: Use f-branch logic here
- pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-pylib.git@dev
- (! pip install git+https://gitlab.indiscale.com/caosdb/src/caosdb-advanced-user-tools.git@dev)
- (! pip install .[h5-crawler,spss])
# actual test
- (! caosdb-crawler --help)
- (! pytest --cov=caosdb -vv ./unittests)
inttest:
tags: [docker]
services:
......@@ -287,7 +307,8 @@ code-style:
- job: build-testenv
optional: true
script:
- autopep8 -r --diff --exit-code .
- autopep8 --version
- autopep8 -r --diff --exit-code .
allow_failure: true
pylint:
......
......@@ -9,18 +9,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ###
* Support for Python 3.12 and experimental support for 3.13
* CFood macros now accept complex objects as values, not just strings.
* More options for the `CSVTableConverter`
* New converters:
* `DatetimeElementConverter`
* `SPSSConverter`
* New scripts:
* `spss_to_datamodel`
* `csv_to_datamodel`
* New transformer functions:
* `date_parse`
* `datetime_parse`
### Changed ###
* CFood macros do not render everything into strings now.
* Better internal handling of identifiable/reference resolving and merging of entities. This also
includes more understandable output for users.
* Better handling of missing imports, with nice messages for users.
* No longer use configuration of advancedtools to set to and from email addresses
### Deprecated ###
### Removed ###
* Support for Python 3.7
### Fixed ###
* [93](https://gitlab.com/linkahead/linkahead-crawler/-/issues/93) cfood.yaml does not allow umlaut in $expression
* [96](https://gitlab.com/linkahead/linkahead-crawler/-/issues/96) Do not fail silently on transaction errors
### Security ###
### Documentation ###
* General improvement of the documentation, in many small places.
* The API documentation should now also include documentation of the constructors.
## [0.7.1] - 2024-03-21 ##
### Fixed ###
......@@ -68,6 +95,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for
entities that don't exist remotely) instead of entity objects for keeping
track of references.
* Log output is either written to $SHARED_DIR/ (when this variable is set) or just to the terminal.
### Deprecated ###
......@@ -161,6 +189,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- ``add_prefix`` and ``remove_prefix`` arguments for the command line interface
and the ``crawler_main`` function for the adding/removal of path prefixes when
creating file entities.
- More strict checking of `identifiables.yaml`.
- Better error messages when server does not conform to expected data model.
### Changed ###
......
......@@ -32,7 +32,7 @@ import sys
from argparse import RawTextHelpFormatter
from pathlib import Path
import caosdb as db
import linkahead as db
import pytest
import yaml
from caosadvancedtools.crawler import Crawler as OldCrawler
......@@ -42,8 +42,8 @@ from caoscrawler.debug_tree import DebugTree
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.scanner import scan_directory
from caosdb import EmptyUniqueQueryError
from caosdb.utils.register_tests import clear_database, set_test_key
from linkahead import EmptyUniqueQueryError
from linkahead.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
......
......@@ -16,20 +16,18 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from pytest import fixture, mark, raises
import linkahead as db
from linkahead.cached import cache_clear
from caosadvancedtools.models.parser import parse_model_from_string
from caoscrawler.crawl import Crawler
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.scanner import (create_converter_registry,
scan_structure_elements)
from caoscrawler.structure_elements import DictElement
from caoscrawler.scanner import create_converter_registry, scan_structure_elements
from linkahead.cached import cache_clear
from linkahead.utils.register_tests import clear_database, set_test_key
from pytest import fixture, mark, raises
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
......@@ -171,8 +169,9 @@ def test_issue_83(clear_database):
name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1])
referencing2 = db.Record(name="Referencing2").add_parent(
name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target2])
referencing3 = db.Record(name="Referencing3").add_parent(name=referencing_type.name).add_property(
name=referenced_type.name, value=[ref_target1, ref_target2])
referencing3 = db.Record(name="Referencing3").add_parent(
name=referencing_type.name).add_property(name=referenced_type.name, value=[ref_target1,
ref_target2])
records = db.Container().extend(
[ref_target1, ref_target2, referencing1, referencing2, referencing3])
......
......@@ -27,12 +27,12 @@ import os
import pytest
from subprocess import run
import caosdb as db
import linkahead as db
from caosadvancedtools.loadFiles import loadpath
from caosdb.cached import cache_clear
from linkahead.cached import cache_clear
from caosadvancedtools.models import parser as parser
from caoscrawler.crawl import crawler_main
from caosdb.utils.register_tests import clear_database, set_test_key
from linkahead.utils.register_tests import clear_database, set_test_key
set_test_key("10b128cf8a1372f30aa3697466bb55e76974e0c16a599bb44ace88f19c8f61e2")
......
......@@ -17,15 +17,15 @@ classifiers =
package_dir =
= src
packages = find:
python_requires = >=3.7
python_requires = >=3.8
install_requires =
caosadvancedtools >= 0.7.0
importlib-resources
importlib_metadata;python_version<'3.8'
linkahead > 0.13.2
odfpy #make optional
packaging
pandas
pyarrow # Will be required by Pandas >= 3.0.
pyyaml
yaml-header-tools >= 0.2.1
......@@ -40,8 +40,12 @@ per-file-ignores = __init__.py:F401
[options.entry_points]
console_scripts =
caosdb-crawler = caoscrawler.crawl:main
spss_to_datamodel = caoscrawler.conv_impl.spss:spss_to_datamodel_main
csv_to_datamodel = caoscrawler.scripts.generators:csv_to_datamodel_main
[options.extras_require]
h5-crawler =
h5py >= 3.8
numpy
spss =
pandas[spss]
from . import converters, utils
# The SPSSConverter needs the optional `spss` extra.  If its import fails we
# bind a MissingImport placeholder instead, so that importing the package
# still works and a helpful error is raised only when the converter is used.
try:
    from .conv_impl.spss import SPSSConverter
except ImportError as err:
    SPSSConverter: type = utils.MissingImport(
        name="SPSSConverter", hint="Try installing with the `spss` extra option.",
        err=err)
from .crawl import Crawler, SecurityMode
from .version import CfoodRequiredVersionError, get_caoscrawler_version

__version__ = get_caoscrawler_version()

# Convenience members #########################################################
# mypy: disable-error-code="attr-defined"
# Also expose SPSSConverter (or its placeholder) on the converters module so
# it can be looked up there like the other converter classes.
converters.SPSSConverter = SPSSConverter
......@@ -28,9 +28,12 @@ cfood:
- Definitions
- Dict
- Date
- Datetime
- JSONFile
- YAMLFile
- CSVTableConverter
- XLSXTableConverter
- SPSSFile
- H5File
- H5Dataset
- H5Group
......
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converter for SAV files (stored by SPSS)."""
from __future__ import annotations # Can be removed with 3.10.
import argparse
from collections import OrderedDict
import numpy as np
import pandas as pd
import pyreadstat
import yaml
from .. import converters
from ..stores import GeneralStore
from ..structure_elements import (File, StructureElement)
from typing import Optional, Any
# Map readstat variable types (see ``meta.readstat_variable_types``) to
# LinkAhead datatypes.
READSTAT_TYPES = {
    "double": "DOUBLE",
    "string": "TEXT",
}
# Overrides keyed by the original SPSS variable type (see
# ``meta.original_variable_types``), e.g. EDATE8 columns are datetimes.
ORIGINAL_TYPES = {
    "EDATE8": "DATETIME",
}
class SPSSConverter(converters.TableConverter):
    """Converter for SAV files (stored by SPSS)."""

    def create_children(self, values: GeneralStore, element: StructureElement) -> list:
        """Read the SAV file behind ``element`` and return one child per row.

        Parameters
        ----------
        values: GeneralStore
            Part of the converter interface; not used by this implementation.
        element: StructureElement
            Must be a ``File`` pointing to a readable SAV file.

        Returns
        -------
        list
            One ``DictElement`` per dataframe row (via
            ``TableConverter._children_from_dataframe``).
        """
        assert isinstance(element, File)
        # The default dtype backend "numpy_nullable" does not handle dates well.
        # Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
        df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
        dtypes = read_column_types(element.path)

        # Fix datetime columns: replace missing values by None.
        # NOTE(review): ``df.loc[:, name]`` may return a copy, in which case the
        # in-place fillna/replace below would not modify ``df`` itself — confirm
        # that the original dataframe actually ends up with None values here.
        for name, dtype in dtypes.items():
            if dtype != "DATETIME":
                continue
            col = df.loc[:, name]
            col.fillna(np.nan, inplace=True)
            col.replace([np.nan], [None], inplace=True)

        return self._children_from_dataframe(df)
def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
    """Read SAV file and return the column types.

    Optionally, take data from a previous reading.

    Parameters
    ----------
    savfile : Optional[str]
        The SAV file to read.

    meta : Optional
        The meta data result from ``pyreadstat.read_sav(...)``.

    Returns
    -------
    out : dict[str, str]
        The column names and types.

    Raises
    ------
    ValueError
        If both or neither of ``savfile`` and ``meta`` are given.
    """
    if not meta:
        # Fail early with a clear message instead of passing None on to
        # pyreadstat, which would raise a confusing low-level error.
        if savfile is None:
            raise ValueError("One of `savfile` and `meta` must be given.")
        _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    elif savfile is not None:
        raise ValueError("Only one of `savfile` and `meta` must be given.")

    dtypes: dict[str, str] = {}
    for name in meta.column_names:
        # Prefer the mapping based on the original SPSS type (e.g. EDATE8 ->
        # DATETIME); fall back to the generic readstat type mapping.
        datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
                                      READSTAT_TYPES[meta.readstat_variable_types[name]])
        dtypes[name] = datatype
    return dtypes
def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
    """Parse the *.sav and create basic datamodel in ``yamlfile``.

    Parameters
    ----------
    savfile: str
        The SAV file whose metadata is read.
    yamlfile: str
        Output filename for the generated datamodel yaml.
    cfood: str
        If given, also create a cfood skeleton.
    """
    # Only the metadata is needed here (metadataonly=True): column names,
    # labels and value labels.
    _, meta = pyreadstat.read_sav(savfile, metadataonly=True)
    dtypes = read_column_types(meta=meta)

    # Skeleton for the optional cfood output; the macro calls generated from
    # the columns are appended at the end (see the `if cfood:` branch below).
    cfood_str = """
---
metadata:
macros:
- !defmacro
# Simple column value -> property rule
name: ColumnValue
params:
name: null
belongsto: BaseElement
type: TextElement
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${belongsto}:
${name}: $$val
- !defmacro
# column value -> reference property
name: ColumnValueReference
params:
name: null
reftype: null # RecordType of the reference
belongsto: BaseElement
type: TextElement # References are always text, right?
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${reftype}:
name: $$val
${belongsto}:
${name}: $$${reftype}
- !defmacro
# Same as "ColumnValue", but also give name of property.
name: ColumnValuePropname
params:
name: null
propname: null
belongsto: BaseElement
type: TextElement
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${belongsto}:
${propname}: $$val
---
directory: # corresponds to the directory given to the crawler
type: Directory
match: .* # we do not care how it is named here
subtree:
# This is the file
thisfile:
type: SPSSFile
match: ".*sav"
subtree:
entry:
type: Dict
match: .* # Name is irrelevant
records:
MyParent:
subtree: !macro
"""

    # Collect enum definitions (from variables with value labels) and one
    # property definition per column.
    enums: dict[str, list[str]] = {}
    properties = OrderedDict()

    for name in meta.column_names:
        prop = {
            "datatype": dtypes[name],
        }
        desc = meta.column_names_to_labels.get(name)
        if desc and desc != name:
            prop["description"] = desc

        # Handle categorical variables: a column with value labels becomes a
        # reference to an enum RecordType instead of a plain datatype.
        if var_label := meta.variable_to_label.get(name):
            vvl = meta.variable_value_labels[name]
            # reproducible (and sensible) order
            label_values = [vvl[key] for key in sorted(vvl.keys())]
            if label_values not in enums.values():
                enums[var_label] = label_values
            else:
                # Reuse the existing enum that has exactly the same values.
                var_label = [key for key, value in enums.items() if value == label_values][0]
            prop["datatype"] = var_label
        properties[name] = prop

    output = f"""# auto-generated data model from file "{savfile}".
# To insert a datamodel into LinkAhead, run:
#
# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
"""

    # Actual datamodel
    output += """
#########
# Enums #
#########
"""
    # One RecordType header per enum; its possible values are only listed as a
    # comment here (Record stubs for the values are written further below).
    for name, values in enums.items():
        output += f"""{name}:
description:
# possible values: {values}\n"""

    output += ("""
###############
# RecordTypes #
###############
DummyRT:
description: Note: Change name and enter description.
recommended_properties:
"""
               # NOTE(review): the join separator determines the indentation of
               # the dumped YAML under `recommended_properties` — confirm it
               # matches the intended indent.
               + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict
                                    allow_unicode=True,
                                    sort_keys=False).splitlines(keepends=True)))

    # Experimental: Enum creation
    output += """
###############
# Enum values #
###############
"""
    # A Record stub per enum value, inheriting from the enum's RecordType.
    for name, values in enums.items():
        output += f"\n# ### {name} ###\n"
        for value in values:
            output += f"""
{value}:
role: Record
inherit_from_suggested:
- {name}
"""

    with open(yamlfile, encoding="utf-8", mode="w") as myfile:
        myfile.write(output)

    # Optionally write the cfood skeleton: one `ColumnValue` macro call per
    # plain column and one `ColumnValueReference` call per enum (reference)
    # column.
    if cfood:
        defs_col_value: list[str] = []
        defs_col_value_ref: list[str] = []
        prefix = " " * 14
        for name, propdef in properties.items():
            def_str = prefix + f"- name: {name}\n"
            dtype = None
            reftype = None
            defs = defs_col_value
            # Which type?
            if propdef["datatype"] == "DOUBLE":
                dtype = "FloatElement"
            elif propdef["datatype"] in ("TEXT", "DATETIME"):
                dtype = None
            else:
                # Anything else is an enum name, i.e. a reference column.
                reftype = propdef["datatype"]
                defs = defs_col_value_ref

            # Append according to types:
            if reftype:
                def_str += prefix + f" reftype: {reftype}\n"
            if dtype:
                def_str += prefix + f" type: {dtype}\n"

            # Store result
            defs.append(def_str)
        # NOTE(review): `defs` is only bound inside the loop above; this `del`
        # raises NameError when `properties` is empty — confirm intended.
        del defs
        cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value)
                      + prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref)
                      )
        with open(cfood, encoding="utf-8", mode="w") as myfile:
            myfile.write(cfood_str)
def _parse_arguments():
"""Parse the arguments."""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('-i', '--input', help="The *.sav file.", required=True)
parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True)
parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False)
return parser.parse_args()
def spss_to_datamodel_main():
    """Entry point of this script.

    Parses the command line via ``_parse_arguments``, runs ``spss_to_yaml``
    and reports the written files on stdout.
    """
    args = _parse_arguments()
    spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood)
    messages = [f"Written datamodel to: {args.outfile}"]
    if args.cfood:
        messages.append(f"Written cfood to: {args.cfood}")
    for line in messages:
        print(line)
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2021 Henrik tom Wörden
# 2021 Alexander Schlemmer
# Copyright (C) 2021 Alexander Schlemmer
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
......@@ -19,9 +19,8 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""Converters take structure elements and create Records and new structure elements from them."""
from __future__ import annotations
......@@ -34,7 +33,7 @@ import warnings
from abc import ABCMeta, abstractmethod
from inspect import signature
from string import Template
from typing import Any, List, Optional, Tuple, Union
from typing import Any, Callable, Optional, Union
import linkahead as db
import pandas as pd
......@@ -53,12 +52,16 @@ from .utils import has_parent
# by the converters:
SPECIAL_PROPERTIES = ("description", "name", "id", "path",
"file", "checksum", "size")
SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>[0-9a-zA-Z_]+)(\})?$")
ID_PATTERN = r"\D[.\w]*"
SINGLE_VAR_RE = re.compile(r"^\$(\{)?(?P<varname>" + ID_PATTERN + r")(\})?$")
logger = logging.getLogger(__name__)
class CrawlerTemplate(Template):
braceidpattern = r"(?a:[_a-z][_\.a-z0-9]*)"
# This also adds a dot to the default pattern.
# See: https://docs.python.org/3/library/string.html#template-strings
# Default flags is re.IGNORECASE
braceidpattern = ID_PATTERN
def _only_max(children_with_keys):
......@@ -134,8 +137,8 @@ def replace_variables(propvalue: Any, values: GeneralStore):
This function replaces variables in property values (and possibly other locations,
where the crawler can replace cfood-internal variables).
If `propvalue` is a single variable name preceeded with a '$' (e.g. '$var' or '${var}'), then
the corresponding value stored in `values` is returned.
If ``propvalue`` is a single variable name preceeded by a ``$`` (e.g. ``$var`` or ``${var}``),
then the corresponding value stored in ``values`` is returned.
In any other case the variable substitution is carried out as defined by string templates
and a new string with the replaced variables is returned.
"""
......@@ -160,16 +163,16 @@ def handle_value(value: Union[dict, str, list], values: GeneralStore):
add as an additional property (multiproperty).
Variable names (starting with a "$") are replaced by the corresponding value stored in the
`values` GeneralStore.
``values`` GeneralStore.
Parameters
----------
value:
- if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc.
- if dict, must have keys "value" and "collection_mode". The returned tuple is directly
value: Union[dict, str, list]
- If *str*, the value to be interpreted. E.g. "4", "hello" or "$a" etc.
- If *dict*, must have keys ``value`` and ``collection_mode``. The returned tuple is directly
created from the corresponding values.
- if list, each element is checked for replacement and the resulting list will be used
- If *list*, each element is checked for replacement and the resulting list will be used
as (list) value for the property
Returns
......@@ -181,15 +184,15 @@ out: tuple
"""
# @review Florian Spreckelsen 2022-05-13
if type(value) == dict:
if isinstance(value, dict):
if "value" not in value:
# TODO: how do we handle this case? Just ignore?
# or disallow?
raise NotImplementedError()
raise NotImplementedError(f"This definition has no \"value\": {value}")
propvalue = value["value"]
# can be "single", "list" or "multiproperty"
collection_mode = value["collection_mode"]
elif type(value) == str:
elif isinstance(value, str):
propvalue = value
collection_mode = "single"
if propvalue.startswith("+"):
......@@ -198,7 +201,7 @@ out: tuple
elif propvalue.startswith("*"):
collection_mode = "multiproperty"
propvalue = propvalue[1:]
elif type(value) == list:
elif isinstance(value, list):
# TODO: (for review)
# This is a bit dirty right now and needed for
# being able to directly set list values. Semantics is, however, a bit
......@@ -209,7 +212,7 @@ out: tuple
propvalue = list()
for element in value:
# Do the element-wise replacement only, when its type is string:
if type(element) == str:
if isinstance(element, str):
propvalue.append(replace_variables(element, values))
else:
propvalue.append(element)
......@@ -286,9 +289,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict
propvalue = os.path.normpath(propvalue)
setattr(c_record, key.lower(), propvalue)
else:
if c_record.get_property(key) is None:
if collection_mode == "list":
c_record.add_property(name=key, value=[propvalue])
elif (collection_mode == "multiproperty" or
......@@ -322,10 +323,13 @@ class Converter(object, metaclass=ABCMeta):
Parameters
----------
definition: dict, Please refer to ``src/doc/converters.rst`` to learn about the structure
that the definition dict must have.
converter_registry: dict, A dictionary that contains converter names as keys and dicts as
values. Those value dicts have the keys 'converter' and 'package'.
definition: dict
Please refer to ``src/doc/converters.rst`` to learn about the structure that the
definition dict must have.
converter_registry: dict
A dictionary that contains converter names as keys and dicts as values. Those value dicts
have the keys 'converter', 'package' and 'class'. 'converter' is the class name,
'package' the module and 'class' the class instance of converters.
"""
self.definition = definition
......@@ -363,7 +367,7 @@ class Converter(object, metaclass=ABCMeta):
@staticmethod
def converter_factory(definition: dict, name: str, converter_registry: dict):
"""creates a Converter instance of the appropriate class.
"""Create a Converter instance of the appropriate class.
The `type` key in the `definition` defines the Converter class which is being used.
"""
......@@ -424,10 +428,11 @@ class Converter(object, metaclass=ABCMeta):
pass
"""
if not "transform" in self.definition:
if "transform" not in self.definition:
return
for transformer_key, transformer in self.definition["transform"].items():
in_value = replace_variables(transformer["in"], values)
out_value = in_value
for tr_func_el in transformer["functions"]:
if not isinstance(tr_func_el, dict):
......@@ -460,13 +465,13 @@ class Converter(object, metaclass=ABCMeta):
values[match.group('varname')] = out_value
@abstractmethod
def create_children(self, values: GeneralStore,
element: StructureElement):
def create_children(self, values: GeneralStore, element: StructureElement):
pass
def create_records(self, values: GeneralStore, records: RecordStore,
element: StructureElement):
# TODO why is element passed but not used???
# ANSWER: because it might be used by overriding child classes.
if "records" not in self.definition:
return []
......@@ -477,7 +482,7 @@ class Converter(object, metaclass=ABCMeta):
self.definition["records"])
def filter_children(self, children_with_strings:
List[Tuple[StructureElement, str]], expr: str,
list[tuple[StructureElement, str]], expr: str,
group: str, rule: str):
"""Filter children according to regexp `expr` and `rule`."""
......@@ -515,8 +520,8 @@ class Converter(object, metaclass=ABCMeta):
result: Optional[dict]):
""" Template for the debugging output for the match function """
msg = "\n--------" + name + "-----------\n"
for re, ma in zip(regexp, matched):
msg += "matching reg:\t" + re + "\n"
for exp, ma in zip(regexp, matched):
msg += "matching reg:\t" + exp + "\n"
msg += "matching val:\t" + ma + "\n"
msg += "---------\n"
if result is None:
......@@ -620,7 +625,7 @@ class DirectoryConverter(Converter):
element: A directory (of type Directory) which will be traversed.
"""
children: List[StructureElement] = []
children: list[StructureElement] = []
for name in sorted(os.listdir(element.path)):
path = os.path.join(element.path, name)
......@@ -660,7 +665,7 @@ class SimpleFileConverter(Converter):
class FileConverter(SimpleFileConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use SimpleFileConverter."))
"This class is deprecated. Please use SimpleFileConverter."))
super().__init__(*args, **kwargs)
......@@ -693,12 +698,12 @@ class MarkdownFileConverter(SimpleFileConverter):
"Error during the validation (yaml header cannot be read) of the markdown file "
"located at the following node in the data structure:\n"
"{}\nError:\n{}".format(path, err))
children: List[StructureElement] = []
children: list[StructureElement] = []
for name, entry in header.items():
if type(entry) == list:
if isinstance(entry, list):
children.append(ListElement(name, entry))
elif type(entry) == str:
elif isinstance(entry, str):
children.append(TextElement(name, entry))
else:
if generalStore is not None and self.name in generalStore:
......@@ -713,7 +718,9 @@ class MarkdownFileConverter(SimpleFileConverter):
def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None,
msg_prefix=""):
"""Convert basic Python objects to the corresponding StructureElements"""
if isinstance(element, list):
if isinstance(element, StructureElement):
return element
elif isinstance(element, list):
return ListElement(name, element)
elif isinstance(element, dict):
return DictElement(name, element)
......@@ -814,14 +821,14 @@ class DictElementConverter(Converter):
class DictConverter(DictElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use DictConverter."))
"This class is deprecated. Please use DictElementConverter."))
super().__init__(*args, **kwargs)
class DictDictElementConverter(DictElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use DictElementConverter."))
"This class is deprecated. Please use DictElementConverter."))
super().__init__(*args, **kwargs)
......@@ -886,7 +893,7 @@ out:
"""
if "match_name" in definition:
if "match" in definition:
raise RuntimeError(f"Do not supply both, 'match_name' and 'match'.")
raise RuntimeError("Do not supply both, 'match_name' and 'match'.")
m1 = re.match(definition["match_name"], name)
if m1 is None:
......@@ -1009,7 +1016,7 @@ class BooleanElementConverter(_AbstractScalarValueElementConverter):
class DictBooleanElementConverter(BooleanElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use BooleanElementConverter."))
"This class is deprecated. Please use BooleanElementConverter."))
super().__init__(*args, **kwargs)
......@@ -1025,7 +1032,7 @@ class FloatElementConverter(_AbstractScalarValueElementConverter):
class DictFloatElementConverter(FloatElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use FloatElementConverter."))
"This class is deprecated. Please use FloatElementConverter."))
super().__init__(*args, **kwargs)
......@@ -1040,7 +1047,7 @@ class TextElementConverter(_AbstractScalarValueElementConverter):
def __init__(self, definition, *args, **kwargs):
if "match" in definition:
raise ValueError("""
The 'match' key will in future be used to match a potential name of a TextElement. Please use
The 'match' key is used to match a potential name of a TextElement. Please use
the 'match_value' key to match the value of the TextElement and 'match_name' for matching the name.
""")
......@@ -1050,7 +1057,7 @@ the 'match_value' key to match the value of the TextElement and 'match_name' for
class DictTextElementConverter(TextElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use TextElementConverter."))
"This class is deprecated. Please use TextElementConverter."))
super().__init__(*args, **kwargs)
......@@ -1066,7 +1073,7 @@ class IntegerElementConverter(_AbstractScalarValueElementConverter):
class DictIntegerElementConverter(IntegerElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use IntegerElementConverter."))
"This class is deprecated. Please use IntegerElementConverter."))
super().__init__(*args, **kwargs)
......@@ -1076,7 +1083,7 @@ class ListElementConverter(Converter):
# TODO: See comment on types and inheritance
if not isinstance(element, ListElement):
raise RuntimeError(
"This converter can only process DictListElements.")
"This converter can only process ListElements.")
children: list[StructureElement] = []
for index, list_element in enumerate(element.value):
children.append(
......@@ -1108,7 +1115,7 @@ class ListElementConverter(Converter):
class DictListElementConverter(ListElementConverter):
def __init__(self, *args, **kwargs):
warnings.warn(DeprecationWarning(
"This class is depricated. Please use ListElementConverter."))
"This class is deprecated. Please use ListElementConverter."))
super().__init__(*args, **kwargs)
......@@ -1122,15 +1129,22 @@ class TableConverter(Converter):
The rows can be matched using a DictElementConverter.
"""
@abstractmethod
def get_options(self):
"""
This method needs to be overwritten by the specific table converter to provide
information about the possible options.
def get_options(self) -> dict:
"""Get specific options, e.g. from ``self.definitions``.
This method may to be overwritten by the specific table converter to provide information about the
possible options. Implementors may use ``TableConverter._get_options(...)`` to get (and convert)
options from ``self.definitions``.
Returns
-------
out: dict
An options dict.
"""
pass
return {}
def _get_options(self, possible_options):
def _get_options(self, possible_options: list[tuple[str, Callable]]) -> dict:
option_dict = dict()
for opt_name, opt_conversion in possible_options:
if opt_name in self.definition:
......@@ -1158,6 +1172,14 @@ class TableConverter(Converter):
return None
return m.groupdict()
@staticmethod
def _children_from_dataframe(dataframe: pd.DataFrame):
    """Create one DictElement child per row of ``dataframe``.

    Each child is named after the row's index (converted to ``str``) and carries
    the row's column-to-value mapping as its dict content.

    Parameters
    ----------
    dataframe: pd.DataFrame
        The table whose rows become the children.

    Returns
    -------
    list
        A list of DictElement structure elements, one per row.
    """
    return [DictElement(str(row_index), row.to_dict())
            for row_index, row in dataframe.iterrows()]
class XLSXTableConverter(TableConverter):
"""
......@@ -1187,11 +1209,7 @@ class XLSXTableConverter(TableConverter):
if not isinstance(element, File):
raise RuntimeError("Element must be a File.")
table = pd.read_excel(element.path, **self.get_options())
child_elements = list()
for index, row in table.iterrows():
child_elements.append(
DictElement(str(index), row.to_dict()))
return child_elements
return self._children_from_dataframe(table)
class CSVTableConverter(TableConverter):
......@@ -1216,22 +1234,19 @@ class CSVTableConverter(TableConverter):
if not isinstance(element, File):
raise RuntimeError("Element must be a File.")
table = pd.read_csv(element.path, **self.get_options())
child_elements = list()
for index, row in table.iterrows():
child_elements.append(
DictElement(str(index), row.to_dict()))
return child_elements
return self._children_from_dataframe(table)
class DateElementConverter(TextElementConverter):
"""allows to convert different text formats of dates to Python date objects.
The text to be parsed must be contained in the "date" group. The format string can be supplied
under "dateformat" in the Converter definition. The library used is datetime so see its
under "date_format" in the Converter definition. The library used is datetime so see its
documentation for information on how to create the format string.
"""
# TODO make `date` parameter name configurable
def match(self, element: StructureElement):
matches = super().match(element)
if matches is not None and "date" in matches:
......@@ -1240,3 +1255,24 @@ class DateElementConverter(TextElementConverter):
self.definition["date_format"] if "date_format" in self.definition else "%Y-%m-%d"
).date()})
return matches
class DatetimeElementConverter(TextElementConverter):
    """Convert text so that it is formatted in a way that LinkAhead can understand it.

    The text to be parsed must be in the ``val`` match group. The input format string can
    be supplied in the ``datetime_format`` node of the converter definition. This class
    uses the ``datetime`` module, so ``datetime_format`` must follow this specification:
    https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    """

    # TODO make `val` parameter name configurable
    def match(self, element: StructureElement):
        result = super().match(element)
        # Only normalize when the parent matched and captured a `val` group.
        if result is None or "val" not in result:
            return result
        out_format = "%Y-%m-%dT%H:%M:%S"
        in_format = self.definition.get("datetime_format", out_format)
        parsed = datetime.datetime.strptime(result["val"], in_format)
        result["val"] = parsed.strftime(out_format)
        return result
This diff is collapsed.
......@@ -8,6 +8,9 @@ BooleanElement:
Date:
converter: DateElementConverter
package: caoscrawler.converters
Datetime:
converter: DatetimeElementConverter
package: caoscrawler.converters
Dict:
converter: DictElementConverter
package: caoscrawler.converters
......@@ -24,7 +27,7 @@ TextElement:
converter: TextElementConverter
package: caoscrawler.converters
DictDictElement: # deprecated
converter: DictElementConverter
package: caoscrawler.converters
......@@ -60,7 +63,7 @@ File: # deprecated
converter: SimpleFileConverter
package: caoscrawler.converters
SimpleFile:
converter: SimpleFileConverter
package: caoscrawler.converters
......@@ -81,6 +84,10 @@ CSVTableConverter:
converter: CSVTableConverter
package: caoscrawler.converters
SPSSFile:
converter: SPSSConverter
package: caoscrawler.converters
XLSXTableConverter:
converter: XLSXTableConverter
package: caoscrawler.converters
# Lookup table for matching functions and cfood yaml node names.
submatch:
package: caoscrawler.transformer_functions
......@@ -9,3 +9,9 @@ split:
replace:
package: caoscrawler.transformer_functions
function: replace
date_parse:
package: caoscrawler.transformer_functions
function: date_parse
datetime_parse:
package: caoscrawler.transformer_functions
function: datetime_parse
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
class ForbiddenTransaction(Exception):
    """Thrown if a transaction is needed that is not allowed.

    For example an update of an entity if the security level is INSERT.
    """
    pass
class ImpossibleMergeError(Exception):
    """Thrown if, due to identifying information, two SyncNodes or two Properties of
    SyncNodes should be merged, but there is conflicting information that prevents this.

    Attributes
    ----------
    pname:
        Name of the property whose values are in conflict.
    values:
        The conflicting values.
    """

    def __init__(self, *args, pname, values, **kwargs):
        self.pname = pname
        self.values = values
        # Bug fix: do not pass `self` on to Exception.__init__ — doing so puts the
        # exception instance itself into `args`, which garbles str()/repr() output.
        super().__init__(*args, **kwargs)
class InvalidIdentifiableYAML(Exception):
    """Thrown if the YAML definition of the identifiable is invalid."""
    pass
class MissingIdentifyingProperty(Exception):
    """Thrown if a SyncNode does not have the properties required by the corresponding
    registered identifiable.
    """
    pass
class MissingRecordType(Exception):
    """Thrown if a record type cannot be found although it is expected that it exists
    on the server.
    """
    pass
class MissingReferencingEntityError(Exception):
    """Thrown if the identifiable requires that some entity references the given entity
    but there is no such reference.

    Attributes
    ----------
    rts: list, optional
        The record types of the required referencing entities.
    """

    def __init__(self, *args, rts=None, **kwargs):
        self.rts = rts
        # Bug fix: do not pass `self` on to Exception.__init__ — doing so puts the
        # exception instance itself into `args`, which garbles str()/repr() output.
        super().__init__(*args, **kwargs)
......@@ -18,6 +18,8 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from typing import Optional
try:
import h5py
except ModuleNotFoundError:
......@@ -94,8 +96,8 @@ def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str):
raise ValueError("The given element must be either a HDF5 Group or Dataset object.")
def convert_basic_element_with_nd_array(value, name: str = None,
internal_path: str = None, msg_prefix: str = ""):
def convert_basic_element_with_nd_array(value, name: Optional[str] = None,
internal_path: Optional[str] = None, msg_prefix: str = ""):
"""Convert a given object either to an ndarray structure element or to a
basic scalar structure element.
......
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2022 Henrik tom Wörden
#
......@@ -20,23 +20,27 @@
#
from __future__ import annotations
import linkahead as db
from datetime import datetime
import json
from hashlib import sha256
from typing import Union
import logging
from datetime import datetime
from hashlib import sha256
from typing import Optional, Union
import linkahead as db
from .exceptions import MissingIdentifyingProperty
from .sync_node import SyncNode
logger = logging.getLogger(__name__)
class Identifiable():
"""
The fingerprint of a Record in CaosDB.
The fingerprint of a Record in LinkAhead.
This class contains the information that is used by the CaosDB Crawler to identify Records.
On one hand, this can be the ID or a Record or the path of a File.
On the other hand, in order to check whether a Record exits in the CaosDB Server, a query can
This class contains the information that is used by the LinkAhead Crawler to identify Records.
In order to check whether a Record exits in the LinkAhead Server, a query can
be created using the information contained in the Identifiable.
Parameters
......@@ -46,23 +50,22 @@ class Identifiable():
properties: dict, keys are names of Properties; values are Property values
Note, that lists are not checked for equality but are interpreted as multiple
conditions for a single Property.
path: str, In case of files: The path where the file is stored.
backrefs: list, TODO future
"""
def __init__(self, record_id: int = None, path: str = None, record_type: str = None,
name: str = None, properties: dict = None,
backrefs: list[Union[int, str]] = None):
if (record_id is None and path is None and name is None
def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None,
name: Optional[str] = None, properties: Optional[dict] = None,
backrefs: Optional[list[Union[int, str]]] = None):
if (record_id is None and name is None
and (backrefs is None or len(backrefs) == 0)
and (properties is None or len(properties) == 0)):
raise ValueError("There is no identifying information. You need to add a path or "
"properties or other identifying attributes.")
raise ValueError(
"There is no identifying information. You need to add "
"properties or other identifying attributes.")
if properties is not None and 'name' in [k.lower() for k in properties.keys()]:
raise ValueError("Please use the separete 'name' keyword instead of the properties "
"dict for name")
self.record_id = record_id
self.path = path
self.record_type = record_type
self.name = name
if name == "":
......@@ -77,24 +80,21 @@ class Identifiable():
def get_representation(self) -> str:
    """Return the SHA-256 hex digest of this identifiable's hashable string form."""
    hashable_string = Identifiable._create_hashable_string(self)
    return sha256(hashable_string.encode('utf-8')).hexdigest()
@staticmethod
@ staticmethod
def _value_representation(value) -> str:
"""returns the string representation of property values to be used in the hash function
The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities
(Python Id only if there is no CaosDB ID) and the string representation of bool, float, int
and str.
The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID)
and the string representation of None, bool, float, int, datetime and str.
"""
if value is None:
return "None"
elif isinstance(value, db.File):
return str(value.path)
elif isinstance(value, db.Entity):
elif isinstance(value, SyncNode):
if value.id is not None:
return str(value.id)
else:
return "PyID=" + str(id(value))
raise RuntimeError("Python Entity (SyncNode) without ID not allowed")
elif isinstance(value, list):
return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]"
elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
......@@ -103,7 +103,7 @@ class Identifiable():
else:
raise ValueError(f"Unknown datatype of the value: {value}")
@staticmethod
@ staticmethod
def _create_hashable_string(identifiable: Identifiable) -> str:
"""
creates a string from the attributes of an identifiable that can be hashed
......@@ -120,27 +120,20 @@ class Identifiable():
return rec_string
def __eq__(self, other) -> bool:
"""
Identifiables are equal if they belong to the same Record. Since ID and path are on their
own enough to identify the Record it is sufficient if those attributes are equal.
1. both IDs are set (not None) -> equal if IDs are equal
2. both paths are set (not None) -> equal if paths are equal
3. equal if attribute representations are equal
"""
""" Identifiables are equal if they share the same ID or if the representation is equal """
if not isinstance(other, Identifiable):
raise ValueError("Identifiable can only be compared to other Identifiable objects.")
elif self.record_id is not None and other.record_id is not None:
if self.record_id is not None and other.record_id is not None:
return self.record_id == other.record_id
elif self.path is not None and other.path is not None:
return self.path == other.path
elif self.get_representation() == other.get_representation():
return True
else:
return False
def __repr__(self):
pstring = json.dumps(self.properties)
""" deterministic text representation of the identifiable """
pstring = json.dumps({k: str(v) for k, v in self.properties.items()})
return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; "
f"name={self.name}\n\tpath={self.path}\n"
f"name={self.name}\n"
f"\tproperties:\n{pstring}\n"
f"\tbackrefs:\n{self.backrefs}")
This diff is collapsed.
......@@ -25,12 +25,17 @@
# Function to expand a macro in yaml
# A. Schlemmer, 05/2022
import re
from dataclasses import dataclass
from typing import Any, Dict
from copy import deepcopy
from string import Template
_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$")
_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$")
@dataclass
class MacroDefinition:
"""
......@@ -53,6 +58,12 @@ def substitute(propvalue, values: dict):
Substitution of variables in strings using the variable substitution
library from python's standard library.
"""
# Simple matches are simply replaced by the raw dict entry.
if match := (_SAFE_SUBST_PAT.fullmatch(propvalue)
or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)):
key = match.group("key")
if key in values:
return values[key]
propvalue_template = Template(propvalue)
return propvalue_template.safe_substitute(**values)
......