Commit f200440c authored by Alexander Schlemmer's avatar Alexander Schlemmer

Merge branch 'dev' into f-new-debug-tree

parents 4c925e19 96ae0ada
Pipeline #59484 failed
Showing changed files with 2609 additions and 1120 deletions
@@ -18,6 +18,8 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
+from typing import Optional
+
 try:
     import h5py
 except ModuleNotFoundError:
@@ -26,16 +28,16 @@ except ModuleNotFoundError:
         "its optional `h5-crawler` dependency?"
     )
+import numpy as np
 from typing import Union
 import linkahead as db
-import numpy as np
-from .converters import (convert_basic_element, Converter, DictElementConverter,
-                         match_name_and_value, SimpleFileConverter)
-from .stores import GeneralStore, RecordStore
-from .structure_elements import DictElement, File, FloatElement, IntegerElement, StructureElement
+from ..stores import GeneralStore, RecordStore
+from ..structure_elements import (DictElement, File, FloatElement,
+                                  IntegerElement, StructureElement)
+from .converters import (Converter, DictElementConverter, SimpleFileConverter,
+                         convert_basic_element, match_name_and_value)

 def convert_attributes(elt: Union[h5py.File, h5py.Group, h5py.Dataset]):
@@ -94,8 +96,8 @@ def convert_h5_element(elt: Union[h5py.Group, h5py.Dataset], name: str):
     raise ValueError("The given element must be either a HDF5 Group or Dataset object.")

-def convert_basic_element_with_nd_array(value, name: str = None,
-                                        internal_path: str = None, msg_prefix: str = ""):
+def convert_basic_element_with_nd_array(value, name: Optional[str] = None,
+                                        internal_path: Optional[str] = None, msg_prefix: str = ""):
     """Convert a given object either to an ndarray structure element or to a
     basic scalar structure element.
...
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them.
This converter converts ro-crate files which may also be .eln-files.
"""
from __future__ import annotations
import os
import re
import tempfile
from typing import Optional
from zipfile import ZipFile
import rocrate
from rocrate.rocrate import ROCrate
from ..stores import GeneralStore
from ..structure_elements import (Directory, File, ROCrateEntity,
StructureElement)
from .converters import Converter, SimpleFileConverter, convert_basic_element
class ROCrateConverter(SimpleFileConverter):
"""Convert ro-crate files / directories.
"""
def setup(self):
self._tempdir = None
def cleanup(self):
self._tempdir.cleanup()
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, File) or isinstance(element, Directory)
def match(self, element: StructureElement) -> Optional[dict]:
m = re.match(self.definition["match"], element.name)
if m is None:
return None
return m.groupdict()
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
Loads a ROCrate from a rocrate file or directory.
Arguments:
----------
element must be a File or Directory (structure element).
Returns:
--------
A list of ROCrateEntity objects representing the contents of the file, or None
in case of errors.
"""
if isinstance(element, File):
self._tempdir = tempfile.TemporaryDirectory()
with ZipFile(element.path) as zipf:
zipf.extractall(self._tempdir.name)
crate_path = self._tempdir.name
crate = ROCrate(crate_path)
entity_ls = []
for ent in crate.get_entities():
entity_ls.append(ROCrateEntity(crate_path, ent))
return entity_ls
elif isinstance(element, Directory):
# This would be an unzipped .eln file
# As this is possible for rocrate files, I think it is reasonable
# to support it as well.
raise NotImplementedError()
else:
raise ValueError("create_children was called with wrong type of StructureElement")
return None
class ELNFileConverter(ROCrateConverter):
"""Convert .eln-Files
See: https://github.com/TheELNConsortium/TheELNFileFormat
These files are basically RO-Crates with some minor differences:
- The ro-crate metadata file is not on top-level within the .eln-zip-container,
but in a top-level subdirectory.
"""
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
Loads an ROCrate from an .eln-file or directory.
This involves unzipping the .eln-file to a temporary folder and creating an ROCrate object
from its contents.
Arguments:
----------
element must be a File or Directory (structure element).
Returns:
--------
A list of ROCrateEntity objects representing the contents of the .eln-file, or None
in case of errors.
"""
if isinstance(element, File):
self._tempdir = tempfile.TemporaryDirectory()
with ZipFile(element.path) as zipf:
zipf.extractall(self._tempdir.name)
cratep = os.listdir(self._tempdir.name)
if len(cratep) != 1:
raise RuntimeError(".eln file must contain exactly one folder")
crate_path = os.path.join(self._tempdir.name, cratep[0])
crate = ROCrate(crate_path)
entity_ls = []
for ent in crate.get_entities():
entity_ls.append(ROCrateEntity(crate_path, ent))
return entity_ls
elif isinstance(element, Directory):
# This would be an unzipped .eln file
# As this is possible for rocrate files, I think it is reasonable
# to support it as well.
raise NotImplementedError()
else:
raise ValueError("create_children was called with wrong type of StructureElement")
return None
class ROCrateEntityConverter(Converter):
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, ROCrateEntity)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, ROCrateEntity):
raise TypeError("Element must be an instance of ROCrateEntity.")
# Store the result of all individual regexp variable results:
vardict = {}
# TODO: I accidentally used "match_type" instead
# of "match_entity_type". This was completely
# unnoticed. So add it to schema and adapt tests.
if "match_entity_type" in self.definition:
entity_type = element.entity.type
if isinstance(entity_type, list):
# TODO: this seems to be a bug in kadi4mat RO-Crates
# ./ has type ['Dataset']
# instead of type 'Dataset'
entity_type = entity_type[0]
m_type = re.match(self.definition["match_entity_type"], entity_type)
if m_type is None:
return None
vardict.update(m_type.groupdict())
if not self.match_properties(element.entity.properties(), vardict):
return None
return vardict
def create_children(self, generalStore: GeneralStore, element: StructureElement):
children = []
eprops = element.entity.properties()
# Add the properties:
for name, value in eprops.items():
children.append(convert_basic_element(value, name))
# Add the files:
if isinstance(element.entity, rocrate.model.file.File):
path, name = os.path.split(eprops["@id"])
children.append(File(name, os.path.join(element.folder, path, name)))
# Parts of this entity are added as child entities:
if "hasPart" in eprops:
for p in eprops["hasPart"]:
children.append(
ROCrateEntity(element.folder, element.entity.crate.dereference(
p["@id"])))
return children
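How these converters combine is easiest to see in a cfood fragment. The following is a minimal sketch; it assumes the converters are registered under the names ``ELNFile`` and ``ROCrateEntity`` (the registration is not shown in this diff), and the record and variable names are hypothetical:

# Hypothetical cfood fragment: crawl an .eln export and create a Record
# for every Dataset entity in the contained RO-Crate.
ELNExport:
  type: ELNFile                     # assumed registry name for ELNFileConverter
  match: ^.*\.eln$
  subtree:
    Dataset:
      type: ROCrateEntity           # assumed registry name for ROCrateEntityConverter
      match_entity_type: ^Dataset$  # regexp matched against the entity's @type
      match_properties:
        "@id": (?P<dataset_id>.*)
      records:
        DatasetRecord:              # hypothetical RecordType
          identifier: $dataset_id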
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 IndiScale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converter for SAV files (stored by SPSS)."""
from __future__ import annotations # Can be removed with 3.10.
import argparse
from collections import OrderedDict
from typing import Any, Optional
import numpy as np
import pandas as pd
import pyreadstat
import yaml
from ..stores import GeneralStore
from ..structure_elements import File, StructureElement
from . import converters
READSTAT_TYPES = {
"double": "DOUBLE",
"string": "TEXT",
}
ORIGINAL_TYPES = {
"EDATE8": "DATETIME",
}
class SPSSConverter(converters.TableConverter):
"""Converter for SAV files (stored by SPSS)."""
def create_children(self, values: GeneralStore, element: StructureElement) -> list:
assert isinstance(element, File)
# The default dtype backend "numpy_nullable" does not handle dates well.
# Note that pandas.ArrowDtype is considered experimental (in Pandas 2.2).
df = pd.io.spss.read_spss(element.path, dtype_backend="pyarrow")
dtypes = read_column_types(element.path)
# Fix datetime columns
for name, dtype in dtypes.items():
if dtype != "DATETIME":
continue
col = df.loc[:, name]
col.fillna(np.nan, inplace=True)
col.replace([np.nan], [None], inplace=True)
return self._children_from_dataframe(df)
def read_column_types(savfile: Optional[str] = None, meta: Optional[Any] = None) -> dict[str, str]:
"""Read SAV file and return the column types.
Optionally, take data from a previous reading.
Parameters
----------
savfile : Optional[str]
The SAV file to read.
meta : Optional
The meta data result from `pyreadstat.read_sav(...)`.
Returns
-------
out : dict[str, str]
The column names and types.
"""
if not meta:
_, meta = pyreadstat.read_sav(savfile, metadataonly=True)
elif savfile is not None:
raise ValueError("Only one of `savfile` and `meta` must be given.")
dtypes: dict[str, str] = {}
for name in meta.column_names:
datatype = ORIGINAL_TYPES.get(meta.original_variable_types[name],
READSTAT_TYPES[meta.readstat_variable_types[name]])
dtypes[name] = datatype
return dtypes
def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None:
"""Parse the *.sav and create basic datamodel in ``yamlfile``.
Parameters
----------
cfood: str
If given, also create a cfood skeleton.
"""
_, meta = pyreadstat.read_sav(savfile, metadataonly=True)
dtypes = read_column_types(meta=meta)
cfood_str = """
---
metadata:
macros:
- !defmacro
# Simple column value -> property rule
name: ColumnValue
params:
name: null
belongsto: BaseElement
type: TextElement
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${belongsto}:
${name}: $$val
- !defmacro
# column value -> reference property
name: ColumnValueReference
params:
name: null
reftype: null # RecordType of the reference
belongsto: BaseElement
type: TextElement # References are always text, right?
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${reftype}:
name: $$val
${belongsto}:
${name}: $$${reftype}
- !defmacro
# Same as "ColumnValue", but also give name of property.
name: ColumnValuePropname
params:
name: null
propname: null
belongsto: BaseElement
type: TextElement
definition:
${name}:
type: ${type}
match_name: ^${name}$$
match_value: (?P<val>.*)
records:
${belongsto}:
${propname}: $$val
---
directory: # corresponds to the directory given to the crawler
type: Directory
match: .* # we do not care how it is named here
subtree:
# This is the file
thisfile:
type: SPSSFile
match: ".*sav"
subtree:
entry:
type: Dict
match: .* # Name is irrelevant
records:
MyParent:
subtree: !macro
"""
enums: dict[str, list[str]] = {}
properties = OrderedDict()
for name in meta.column_names:
prop = {
"datatype": dtypes[name],
}
desc = meta.column_names_to_labels.get(name)
if desc and desc != name:
prop["description"] = desc
# Handle categorial variables
if var_label := meta.variable_to_label.get(name):
vvl = meta.variable_value_labels[name]
# reproducible (and sensible) order
label_values = [vvl[key] for key in sorted(vvl.keys())]
if label_values not in enums.values():
enums[var_label] = label_values
else:
var_label = [key for key, value in enums.items() if value == label_values][0]
prop["datatype"] = var_label
properties[name] = prop
output = f"""# auto-generated data model from file "{savfile}".
# To insert a datamodel into LinkAhead, run:
#
# python3 -m caosadvancedtools.models.parser datamodel.yaml --sync
"""
# Actual datamodel
output += """
#########
# Enums #
#########
"""
for name, values in enums.items():
output += f"""{name}:
description:
# possible values: {values}\n"""
output += ("""
###############
# RecordTypes #
###############
DummyRT:
description: "Note: Change name and enter description."
recommended_properties:
"""
+ " ".join(yaml.dump(dict(properties), # from OrderedDict to dict
allow_unicode=True,
sort_keys=False).splitlines(keepends=True)))
# Experimental: Enum creation
output += """
###############
# Enum values #
###############
"""
for name, values in enums.items():
output += f"\n# ### {name} ###\n"
for value in values:
output += f"""
{value}:
role: Record
inherit_from_suggested:
- {name}
"""
with open(yamlfile, encoding="utf-8", mode="w") as myfile:
myfile.write(output)
if cfood:
defs_col_value: list[str] = []
defs_col_value_ref: list[str] = []
prefix = " " * 14
for name, propdef in properties.items():
def_str = prefix + f"- name: {name}\n"
dtype = None
reftype = None
defs = defs_col_value
# Which type?
if propdef["datatype"] == "DOUBLE":
dtype = "FloatElement"
elif propdef["datatype"] in ("TEXT", "DATETIME"):
dtype = None
else:
reftype = propdef["datatype"]
defs = defs_col_value_ref
# Append according to types:
if reftype:
def_str += prefix + f" reftype: {reftype}\n"
if dtype:
def_str += prefix + f" type: {dtype}\n"
# Store result
defs.append(def_str)
del defs
cfood_str += (prefix[2:] + "ColumnValue:\n" + "".join(defs_col_value)
+ prefix[2:] + "ColumnValueReference:\n" + "".join(defs_col_value_ref)
)
with open(cfood, encoding="utf-8", mode="w") as myfile:
myfile.write(cfood_str)
def _parse_arguments():
"""Parse the arguments."""
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('-i', '--input', help="The *.sav file.", required=True)
parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True)
parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False)
return parser.parse_args()
def spss_to_datamodel_main():
"""The main function of this script."""
args = _parse_arguments()
spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood)
print(f"Written datamodel to: {args.outfile}")
if args.cfood:
print(f"Written cfood to: {args.cfood}")
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them."""
from __future__ import annotations
import re
from typing import Optional
import lxml.etree
from ..stores import GeneralStore
from ..structure_elements import (File, StructureElement, XMLAttributeNode,
XMLTagElement, XMLTextNode)
from .converters import (Converter, ConverterValidationError,
SimpleFileConverter)
class XMLFileConverter(SimpleFileConverter):
"""Convert XML files. See
https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
for the current suggestion for the specification.
"""
def create_children(self, generalStore: GeneralStore, element: StructureElement):
# TODO: See comment on types and inheritance
if not isinstance(element, File):
raise ValueError("create_children was called with wrong type of StructureElement")
with open(element.path, 'r') as xml_file:
xml = lxml.etree.parse(xml_file)
if "validate" in self.definition and self.definition["validate"]:
try:
raise NotImplementedError("XML validation not implemented yet.")
except ConverterValidationError as err:
raise ConverterValidationError(
"Error during the validation of the XML file:\n"
f"{element.path}\n" + err.message)
return [XMLTagElement(xml.getroot())]
class XMLTagConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""Children that are generated by this function are the
result of the xpath query given in the yaml property
``xpath``. Its default (when not given) is ``child::*``, so the
direct children of the current xml node. The xpath expression
must be designed in a way that it returns xml tags (and no
attributes or texts). That means, that the axis ``attribute::``
and the function ``text()`` must not be used.
The following yaml properties can be used to generate other
types of nodes (text nodes and attribute nodes) as subtree
structure elements:
::
# _*_ marks the default:
attribs_as_children: true # true / _false_
text_as_children: true # true / _false_
tags_as_children: true # _true_ / false
The default is to generate the tags matched by the xpath expression only.
- When text_as_children is set to true, text nodes will be generated that contain the text
contained in the matched tags.
- When attribs_as_children is set to true, attribute nodes will be generated from the attributes
of the matched tags.
Notes
-----
The default is to take the namespace map from the current node and use it in xpath queries.
Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace
using the key ``default_namespace``.
The key ``nsmap`` can be used to define additional nsmap entries.
"""
if not isinstance(element, XMLTagElement):
raise TypeError("Element must be an instance of XMLTagElement.")
# Get the namespace map from the element:
nsmap = element.tag.nsmap
# The default name of the default namespace is "default".
# You can overwrite it using the attribute "default_namespace" in the converter definition:
default_namespace = self.definition.get("default_namespace", "default")
if None in nsmap:
nsmap[default_namespace] = nsmap[None]
del nsmap[None]
# Set additional nsmap entries from the converter definition:
if "nsmap" in self.definition:
for key, value in self.definition["nsmap"].items():
nsmap[key] = value
xpath = self.definition.get("xpath", "child::*")
children = element.tag.xpath(xpath, namespaces=nsmap)
el_lst = []
for el in children:
if isinstance(el, str):
raise RuntimeError(
"Only standard xml nodes are supported as results of xpath queries.")
elif isinstance(el, lxml.etree._Element):
if self.definition.get("tags_as_children", True):
el_lst.append(XMLTagElement(el))
if self.definition.get("attribs_as_children", False):
for attrib in el.attrib:
el_lst.append(XMLAttributeNode(el, attrib))
if self.definition.get("text_as_children", False):
el_lst.append(XMLTextNode(el))
else:
raise RuntimeError("Unsupported child type.")
return el_lst
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, XMLTagElement)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTagElement):
raise TypeError("Element must be an instance of XMLTagElement.")
# Store the result of all individual regexp variable results:
vardict = {}
if "match_tag" in self.definition:
m_tag = re.match(self.definition["match_tag"], element.tag.tag)
if m_tag is None:
return None
vardict.update(m_tag.groupdict())
if "match_text" in self.definition:
tagtext = element.tag.text
if element.tag.text is None:
tagtext = ""
m_text = re.match(self.definition["match_text"], tagtext, re.DOTALL)
if m_text is None:
return None
vardict.update(m_text.groupdict())
if not self.match_properties(element.tag.attrib, vardict, "match_attrib"):
return None
return vardict
class XMLTextNodeConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
This converter does not create children.
"""
return []
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, XMLTextNode)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLTextNode):
raise TypeError("Element must be an instance of XMLTextNode.")
vardict = {}
m_text = re.match(self.definition["match_text"], element.value,
re.DOTALL)
if m_text is None:
return None
vardict.update(m_text.groupdict())
return vardict
class XMLAttributeNodeConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
This converter does not create children.
"""
return []
def typecheck(self, element: StructureElement):
"""
Check whether the current structure element can be converted using
this converter.
"""
return isinstance(element, XMLAttributeNode)
def match(self, element: StructureElement) -> Optional[dict]:
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
if not isinstance(element, XMLAttributeNode):
raise TypeError("Element must be an instance of XMLAttributeNode.")
vardict = {}
m_name = re.match(self.definition["match_name"], element.key)
if m_name is None:
return None
vardict.update(m_name.groupdict())
m_value = re.match(self.definition["match_value"], element.value)
if m_value is None:
return None
vardict.update(m_value.groupdict())
return vardict
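Chained together in a cfood, these converters walk from file to tags to attributes. A sketch using the registry names added in this commit (``XMLFile``, ``XMLTag``, ``XMLAttributeNode``); the tag, record and variable names are hypothetical:

ExperimentXML:
  type: XMLFile
  match: ^.*\.xml$
  subtree:
    SampleTag:
      type: XMLTag
      match_tag: ^sample$
      xpath: child::*            # the documented default
      attribs_as_children: true  # also emit XMLAttributeNode children
      subtree:
        IdAttrib:
          type: XMLAttributeNode
          match_name: ^id$
          match_value: (?P<sample_id>.*)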
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Converters take structure elements and create Records and new structure elements from them.
This converter opens zip files, unzips them into a temporary directory and
exposes its contents as File structure elements.
"""
from __future__ import annotations
import os
import tempfile
from os.path import isdir, join
from zipfile import ZipFile
from ..stores import GeneralStore
from ..structure_elements import Directory, File, StructureElement
from .converters import SimpleFileConverter
class ZipFileConverter(SimpleFileConverter):
"""Convert zipfiles.
"""
def setup(self):
self._tempdir = None
def cleanup(self):
self._tempdir.cleanup()
def create_children(self, generalStore: GeneralStore, element: StructureElement):
"""
Unzip the given zip file into a temporary directory and expose its contents
as structure elements.
Arguments:
----------
element must be a File (structure element).
Returns:
--------
A list of File and Directory structure elements representing the contents
of the zip file, or None in case of errors.
"""
if isinstance(element, File):
self._tempdir = tempfile.TemporaryDirectory()
unzd_path = self._tempdir.name
with ZipFile(element.path) as zipf:
zipf.extractall(unzd_path)
entity_ls = []
for el in os.listdir(unzd_path):
path = join(unzd_path, el)
if isdir(path):
entity_ls.append(Directory(el, path))
else:
entity_ls.append(File(el, path))
return entity_ls
else:
raise ValueError("create_children was called with wrong type of StructureElement")
return None
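A sketch of driving the converter directly, assuming the base ``Converter`` constructor takes ``(definition, name, converter_registry)``; the definition dict, file path and empty registry are made up for illustration:

conv = ZipFileConverter({"match": r".*\.zip$"}, "zip", converter_registry={})
conv.setup()
# Turns data.zip into File/Directory structure elements, one per top-level entry:
children = conv.create_children(GeneralStore(), File("data.zip", "/tmp/data.zip"))
for child in children:
    print(type(child).__name__, child.get_name())
conv.cleanup()  # removes the temporary unzip directory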
@@ -29,35 +29,20 @@ A structure containing debug tree information.
 from __future__ import annotations

-import argparse
-import importlib
-import logging
-import os
-import sys
-import warnings
-import yaml
-from argparse import RawTextHelpFormatter
 from collections import defaultdict
-from copy import deepcopy
-from enum import Enum
-from importlib_resources import files
-from jsonschema import validate
-from typing import Any, Optional, Type, Union

 import linkahead as db
+import yaml
-from caosadvancedtools.cache import UpdateCache, Cache
-from caosadvancedtools.crawler import Crawler as OldCrawler
-from linkahead.apiutils import (compare_entities, EntityMergeConflictError,
-                                merge_entities)
+from importlib_resources import files
+from jsonschema import validate
+from linkahead.apiutils import (EntityMergeConflictError, compare_entities,
+                                merge_entities)
 from linkahead.common.datatype import is_reference

-from .converters import Converter, DirectoryConverter, ConverterValidationError
+from .converters import Converter, ConverterValidationError, DirectoryConverter
 from .macros import defmacro_constructor, macro_constructor
-from .stores import Store, GeneralStore, RecordStore
+from .stores import GeneralStore, RecordStore, Store
-from .structure_elements import StructureElement, Directory, NoneElement
+from .structure_elements import Directory, NoneElement, StructureElement
 from .version import check_cfood_version
...
@@ -8,9 +8,15 @@ BooleanElement:
 Date:
   converter: DateElementConverter
   package: caoscrawler.converters
+Datetime:
+  converter: DatetimeElementConverter
+  package: caoscrawler.converters
 Dict:
   converter: DictElementConverter
   package: caoscrawler.converters
+PropertiesFromDictElement:
+  converter: PropertiesFromDictConverter
+  package: caoscrawler.converters
 FloatElement:
   converter: FloatElementConverter
   package: caoscrawler.converters
@@ -81,6 +87,31 @@ CSVTableConverter:
   converter: CSVTableConverter
   package: caoscrawler.converters
+SPSSFile:
+  converter: SPSSConverter
+  package: caoscrawler.converters
 XLSXTableConverter:
   converter: XLSXTableConverter
   package: caoscrawler.converters
+
+# -------------------------
+# XML
+# -------------------------
+XMLFile:
+  converter: XMLFileConverter
+  package: caoscrawler.converters
+XMLTag:
+  converter: XMLTagConverter
+  package: caoscrawler.converters
+XMLTextNode:
+  converter: XMLTextNodeConverter
+  package: caoscrawler.converters
+XMLAttributeNode:
+  converter: XMLAttributeNodeConverter
+  package: caoscrawler.converters

+# Lookup table for matching functions and cfood yaml node names.
 submatch:
   package: caoscrawler.transformer_functions
@@ -9,3 +9,21 @@ split:
 replace:
   package: caoscrawler.transformer_functions
   function: replace
+date_parse:
+  package: caoscrawler.transformer_functions
+  function: date_parse
+datetime_parse:
+  package: caoscrawler.transformer_functions
+  function: datetime_parse
+cast_to_int:
+  package: caoscrawler.transformer_functions
+  function: cast_to_int
+cast_to_float:
+  package: caoscrawler.transformer_functions
+  function: cast_to_float
+cast_to_bool:
+  package: caoscrawler.transformer_functions
+  function: cast_to_bool
+cast_to_str:
+  package: caoscrawler.transformer_functions
+  function: cast_to_str
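These transformer functions are referenced by their registered names inside a converter's ``transform`` block; a sketch assuming the usual ``in``/``out``/``functions`` layout, with hypothetical variable names:

transform:
  to_int:
    in: $count_str       # hypothetical input variable
    out: $count          # hypothetical output variable
    functions:
      - cast_to_int: {}  # one of the functions registered above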
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from typing import Any
class ForbiddenTransaction(Exception):
"""Thrown if an transactions is needed that is not allowed.
For example an update of an entity if the security level is INSERT
"""
pass
class ImpossibleMergeError(Exception):
"""Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes
should be merged, but there is conflicting information that prevents this.
Parameters
----------
msg : str
A case-specific error message describing where the merge error occurred.
pname : str
The name of the property whose values caused the merge error.
value_a, value_b : Any
The two values that couldn't be merged.
Attributes
----------
message : str
A case-specific error message describing where the merge error occurred.
values : tuple[Any]
The two values that couldn't be merged.
pname : str
The name of the property whose values caused the merge error.
"""
def __init__(self, msg: str, pname: str, value_a: Any, value_b: Any):
self.pname = pname
self.values = (value_a, value_b)
self.message = msg
super().__init__(msg)
def __str__(self):
return (
f"{self.message}\n\nThe problematic property is '{self.pname}' with "
f"values '{self.values[0]}' and '{self.values[1]}'."
)
def __repr__(self):
return self.__str__()
class InvalidIdentifiableYAML(Exception):
"""Thrown if the identifiable definition is invalid."""
pass
class MissingIdentifyingProperty(Exception):
"""Thrown if a SyncNode does not have the properties required by the corresponding registered
identifiable
"""
pass
class MissingRecordType(Exception):
"""Thrown if an record type can not be found although it is expected that it exists on the
server.
"""
pass
class MissingReferencingEntityError(Exception):
"""Thrown if the identifiable requires that some entity references the given entity but there
is no such reference."""
def __init__(self, *args, rts=None, **kwargs):
self.rts = rts
super().__init__(*args, **kwargs)
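The ``__str__`` of ``ImpossibleMergeError`` combines the message with the offending property; an illustration with made-up values:

err = ImpossibleMergeError(
    "Cannot merge the two SyncNodes.",  # made-up message
    pname="temperature",
    value_a=20.0,
    value_b=22.5,
)
print(err)
# Cannot merge the two SyncNodes.
#
# The problematic property is 'temperature' with values '20.0' and '22.5'.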
 #!/usr/bin/env python3
 # encoding: utf-8
 #
-# This file is a part of the CaosDB Project.
+# This file is a part of the LinkAhead Project.
 #
 # Copyright (C) 2022 Henrik tom Wörden
 #
@@ -20,23 +20,27 @@
 #
 from __future__ import annotations

-import linkahead as db
-from datetime import datetime
 import json
-from hashlib import sha256
-from typing import Union
 import logging
+from datetime import datetime
+from hashlib import sha256
+from typing import Optional, Union
+
+import linkahead as db
+
+from .exceptions import MissingIdentifyingProperty
+from .sync_node import SyncNode

 logger = logging.getLogger(__name__)

 class Identifiable():
     """
-    The fingerprint of a Record in CaosDB.
+    The fingerprint of a Record in LinkAhead.

-    This class contains the information that is used by the CaosDB Crawler to identify Records.
-    On one hand, this can be the ID or a Record or the path of a File.
-    On the other hand, in order to check whether a Record exists in the CaosDB Server, a query can
+    This class contains the information that is used by the LinkAhead Crawler to identify Records.
+    In order to check whether a Record exists in the LinkAhead Server, a query can
     be created using the information contained in the Identifiable.

     Parameters
@@ -46,23 +50,22 @@ class Identifiable():
     properties: dict, keys are names of Properties; values are Property values
                Note that lists are not checked for equality but are interpreted as multiple
                conditions for a single Property.
-    path: str, In case of files: The path where the file is stored.
     backrefs: list, TODO future
     """

-    def __init__(self, record_id: int = None, path: str = None, record_type: str = None,
-                 name: str = None, properties: dict = None,
-                 backrefs: list[Union[int, str]] = None):
+    def __init__(self, record_id: Optional[int] = None, record_type: Optional[str] = None,
+                 name: Optional[str] = None, properties: Optional[dict] = None,
+                 backrefs: Optional[list[Union[int, str]]] = None):
-        if (record_id is None and path is None and name is None
+        if (record_id is None and name is None
                 and (backrefs is None or len(backrefs) == 0)
                 and (properties is None or len(properties) == 0)):
-            raise ValueError("There is no identifying information. You need to add a path or "
-                             "properties or other identifying attributes.")
+            raise ValueError(
+                "There is no identifying information. You need to add "
+                "properties or other identifying attributes.")
         if properties is not None and 'name' in [k.lower() for k in properties.keys()]:
             raise ValueError("Please use the separate 'name' keyword instead of the properties "
                              "dict for name")
         self.record_id = record_id
-        self.path = path
         self.record_type = record_type
         self.name = name
         if name == "":
@@ -81,20 +84,17 @@ class Identifiable():
     def _value_representation(value) -> str:
         """returns the string representation of property values to be used in the hash function

-        The string is the path of a File Entity, the CaosDB ID or Python ID of other Entities
-        (Python Id only if there is no CaosDB ID) and the string representation of bool, float, int
-        and str.
+        The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID)
+        and the string representation of None, bool, float, int, datetime and str.
         """
         if value is None:
             return "None"
-        elif isinstance(value, db.File):
-            return str(value.path)
-        elif isinstance(value, db.Entity):
+        elif isinstance(value, SyncNode):
             if value.id is not None:
                 return str(value.id)
             else:
-                return "PyID=" + str(id(value))
+                raise RuntimeError("Python Entity (SyncNode) without ID not allowed")
         elif isinstance(value, list):
             return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]"
         elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
@@ -120,27 +120,20 @@ class Identifiable():
         return rec_string

     def __eq__(self, other) -> bool:
-        """
-        Identifiables are equal if they belong to the same Record. Since ID and path are on their
-        own enough to identify the Record it is sufficient if those attributes are equal.
-        1. both IDs are set (not None) -> equal if IDs are equal
-        2. both paths are set (not None) -> equal if paths are equal
-        3. equal if attribute representations are equal
-        """
+        """ Identifiables are equal if they share the same ID or if the representation is equal """
         if not isinstance(other, Identifiable):
             raise ValueError("Identifiable can only be compared to other Identifiable objects.")
-        elif self.record_id is not None and other.record_id is not None:
+        if self.record_id is not None and other.record_id is not None:
             return self.record_id == other.record_id
-        elif self.path is not None and other.path is not None:
-            return self.path == other.path
         elif self.get_representation() == other.get_representation():
             return True
         else:
             return False

     def __repr__(self):
-        pstring = json.dumps(self.properties)
+        """ deterministic text representation of the identifiable """
+        pstring = json.dumps({k: str(v) for k, v in self.properties.items()})
         return (f"{self.__class__.__name__} for RT {self.record_type}: id={self.record_id}; "
-                f"name={self.name}\n\tpath={self.path}\n"
+                f"name={self.name}\n"
                 f"\tproperties:\n{pstring}\n"
                 f"\tbackrefs:\n{self.backrefs}")
@@ -20,29 +20,46 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.

 import logging
+import sys

-from caosadvancedtools.webui_formatter import WebUI_Formatter
 from caosadvancedtools.serverside.helper import get_shared_filename
-import sys
+from caosadvancedtools.webui_formatter import WebUI_Formatter

-def configure_server_side_logging():
+def configure_server_side_logging(max_log_level: int = logging.INFO):
     """
     Set logging up to save one plain debugging log file, one plain info log
     file (for users) and a stdout stream with messages wrapped in html elements

     returns the path to the file with debugging output
+
+    Parameters
+    ----------
+    max_log_level : int, optional
+        The maximum log level to use for SSS-logs. Default is
+        ``logging.INFO``.
+
+    Returns
+    -------
+    userlog_public, htmluserlog_public, debuglog_public: str
+        Public paths of the respective log files.
     """
     adv_logger = logging.getLogger("caosadvancedtools")
-    adv_logger.setLevel(level=logging.DEBUG)
+    # The max_<level> variables will be used to set the logger levels
+    # to the respective maximum of intended level and max_log_level,
+    # effectively cutting off logging above the specified
+    # max_log_level.
+    max_info = max(logging.INFO, max_log_level)
+    max_debug = max(logging.DEBUG, max_log_level)
+    adv_logger.setLevel(level=max_debug)

     cr_logger = logging.getLogger("caoscrawler")
-    cr_logger.setLevel(level=logging.DEBUG)
+    cr_logger.setLevel(level=max_debug)

     userlog_public, userlog_internal = get_shared_filename("userlog.txt")

     root_logger = logging.getLogger()
-    root_logger.setLevel(level=logging.INFO)
+    root_logger.setLevel(level=max_info)

     # this is a log file with INFO level for the user
     user_file_handler = logging.FileHandler(filename=userlog_internal)
...
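With the new ``max_log_level`` parameter, all server-side loggers are capped at the given severity; a usage sketch:

import logging

# Discard DEBUG and INFO messages in all server-side logs:
userlog_public, htmluserlog_public, debuglog_public = \
    configure_server_side_logging(max_log_level=logging.WARNING)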
@@ -25,10 +25,14 @@
 # Function to expand a macro in yaml
 # A. Schlemmer, 05/2022

-from dataclasses import dataclass
-from typing import Any, Dict
+import re
 from copy import deepcopy
+from dataclasses import dataclass
 from string import Template
+from typing import Any, Dict
+
+_SAFE_SUBST_PAT = re.compile(r"^\$(?P<key>\w+)$")
+_SAFE_SUBST_PAT_BRACES = re.compile(r"^\$\{(?P<key>\w+)}$")

 @dataclass
@@ -53,6 +57,12 @@ def substitute(propvalue, values: dict):
     Substitution of variables in strings using the variable substitution
     library from python's standard library.
     """
+    # Simple matches are simply replaced by the raw dict entry.
+    if match := (_SAFE_SUBST_PAT.fullmatch(propvalue)
+                 or _SAFE_SUBST_PAT_BRACES.fullmatch(propvalue)):
+        key = match.group("key")
+        if key in values:
+            return values[key]
     propvalue_template = Template(propvalue)
     return propvalue_template.safe_substitute(**values)
...
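The effect of the new fast path: a property value that consists of exactly one variable reference is replaced by the raw stored object, preserving its type, while mixed strings still go through ``Template.safe_substitute``. A behavior sketch:

substitute("$counts", {"counts": [1, 2, 3]})    # -> [1, 2, 3] (raw list, type preserved)
substitute("${counts}", {"counts": [1, 2, 3]})  # -> [1, 2, 3] (braced form, same result)
substitute("n=$counts", {"counts": [1, 2, 3]})  # -> "n=[1, 2, 3]" (string interpolation)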
@@ -25,7 +25,9 @@
 #
 """
-This is the scanner, the original "_crawl" function from crawl.py.
+This is the scanner.
+
+Formerly, this was the ``_crawl(...)`` function from ``crawl.py``.
 This is just the functionality that extracts data from the file system.
 """
@@ -37,7 +39,7 @@ import logging
 import os
 import warnings
 from collections.abc import Callable
-from typing import Any, Optional, Type, Union
+from typing import Any, Optional, Union

 import jinja2
@@ -58,29 +60,45 @@ import pathlib
 logger = logging.getLogger(__name__)

-def load_definition(crawler_definition_path: str):
+def load_definition(crawler_definition_path: str) -> dict:
     """
     Load a cfood from a crawler definition defined by
     crawler definition path and validate it using cfood-schema.yml.
+
+    Arguments:
+    ----------
+    crawler_definition_path: str
+        Path to the crawler definition file in yaml format.
+
+    Returns:
+    --------
+    dict containing the crawler definition.
     """
     # Load the cfood from a yaml file:
-    with open(crawler_definition_path, "r") as f:
+    with open(crawler_definition_path, encoding="utf-8") as f:
         crawler_definitions = list(yaml.safe_load_all(f))

-    crawler_definition = _load_definition_from_yaml_dict(
-        crawler_definitions)
+    crawler_definition = _load_definition_from_yaml_dict(crawler_definitions)

     return _resolve_validator_paths(crawler_definition, crawler_definition_path)

-def _load_definition_from_yaml_dict(crawler_definitions: list[dict]):
+def _load_definition_from_yaml_dict(crawler_definitions: list[dict]) -> dict:
     """Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
     contains either one or two documents.

     Doesn't resolve the validator paths in the cfood definition, so for
     internal and testing use only.
+
+    Arguments:
+    ----------
+    crawler_definitions: list[dict]
+        List of one or two dicts containing (optionally) metadata and the crawler definition.
+
+    Returns:
+    --------
+    dict containing the crawler definition.
     """
     if len(crawler_definitions) == 1:
         # Simple case, just one document:
@@ -134,7 +152,8 @@ def _resolve_validator_paths(definition: dict, definition_path: str):
             # Validator is given by a path
             if not value.startswith('/'):
                 # Not an absolute path
-                definition[key] = os.path.join(os.path.dirname(definition_path), value)
+                definition[key] = os.path.join(
+                    os.path.dirname(definition_path), value)
                 if not os.path.isfile(definition[key]):
                     # TODO(henrik) capture this in `crawler_main` similar to
                     # `ConverterValidationError`.
@@ -263,31 +282,31 @@ def scanner(items: list[StructureElement],
             crawled_data: Optional[list[db.Record]] = None,
             debug_tree: Optional[DebugTree] = None,
             registered_transformer_functions: Optional[dict] = None,
-            new_debug_tree: Optional[dict] = None):
+            new_debug_tree: Optional[dict] = None) -> list[db.Record]:
     """Crawl a list of StructureElements and apply any matching converters.

-    Formerly known as "_crawl".
+    Formerly known as ``_crawl(...)``.

     Parameters
     ----------
-    items:
+    items: list[StructureElement]
        structure_elements (e.g. files and folders on one level on the hierarchy)
-    converters:
+    converters: list[Converter]
        locally defined converters for treating structure elements. A locally
        defined converter could be one that is only valid for a specific subtree
        of the originally crawled StructureElement structure.
-    general_store, record_store:
+    general_store, record_store: GeneralStore, RecordStore, optional
       This recursion of the crawl function should only operate on copies of
       the global stores of the Crawler object.
-    restricted_path : list of strings, optional
+    restricted_path : list[str], optional
       traverse the data tree only along the given path. For example, when a
-      directory contains files a, b and c and b is given as restricted_path, a
-      and c will be ignroed by the crawler. When the end of the given path is
+      directory contains files a, b and c, and b is given as ``restricted_path``, a
+      and c will be ignored by the crawler. When the end of the given path is
       reached, traverse the full tree as normal. The first element of the list
-      provided by restricted_path should be the name of the StructureElement
+      provided by ``restricted_path`` should be the name of the StructureElement
       at this level, i.e. denoting the respective element in the items
       argument.
@@ -298,7 +317,8 @@ def scanner(items: list[StructureElement],
       Each function is a dictionary:

-      - The key is the name of the function to be looked up in the dictionary of registered transformer functions.
+      - The key is the name of the function to be looked up in the dictionary of registered
+        transformer functions.
      - The value is the function which needs to be of the form:
          def func(in_value: Any, in_parameters: dict) -> Any:
              pass
@@ -332,7 +352,8 @@ def scanner(items: list[StructureElement],
         pass

     for element in items:
-        element_path = os.path.join(*(structure_elements_path + [str(element.get_name())]))
+        element_path = os.path.join(
+            *(structure_elements_path + [str(element.get_name())]))
         logger.debug(f"Dealing with {element_path}")
         # Store whether this element was matched by at least one converter:
         at_least_one_matched = False
@@ -368,7 +389,8 @@ def scanner(items: list[StructureElement],
                 keys_modified = converter.create_records(
                     general_store_copy, record_store_copy, element)
-                children = converter.create_children(general_store_copy, element)
+                children = converter.create_children(
+                    general_store_copy, element)
                 if debug_tree is not None:
                     # add provenance information for each variable
@@ -380,6 +402,9 @@ def scanner(items: list[StructureElement],
                     debug_tree.debug_metadata["usage"][str(element)].add(
                         "/".join(converters_path + [converter.name]))
                     mod_info = debug_tree.debug_metadata["provenance"]
+                    # TODO: actually keys_modified must not be None. create_records should
+                    # always return a list.
+                    if keys_modified is not None:
                         for record_name, prop_name in keys_modified:
                             # TODO: check
                             internal_id = record_store_copy.get_internal_id(
@@ -445,6 +470,9 @@ def scanner(items: list[StructureElement],
             element_dictionary["matching_converters"] = matching_converters
             new_debug_tree.append(element_dictionary)

+        # Clean up converter:
+        converter.cleanup()
+
     if restricted_path and not path_found:
         raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
                            "the data tree")
@@ -463,7 +491,7 @@ def scanner(items: list[StructureElement],
 # --------------------------------------------------------------------------------

-def scan_directory(dirname: str, crawler_definition_path: str,
+def scan_directory(dirname: Union[str, list[str]], crawler_definition_path: str,
                    restricted_path: Optional[list[str]] = None,
                    debug_tree: Optional[DebugTree] = None,
                    new_debug_tree: Optional[dict] = None):
@@ -477,10 +505,12 @@ def scan_directory(dirname: str, crawler_definition_path: str,
     Parameters
     ----------
+    dirname: str or list[str]
+        directory or list of directories to be scanned
     restricted_path: optional, list of strings
-        Traverse the data tree only along the given path. When the end of the given path
-        is reached, traverse the full tree as normal. See docstring of 'scanner' for
-        more details.
+        Traverse the data tree only along the given path. When the end
+        of the given path is reached, traverse the full tree as
+        normal. See docstring of 'scanner' for more details.

     Returns
     -------
@@ -493,23 +523,31 @@ def scan_directory(dirname: str, crawler_definition_path: str,
     converter_registry = create_converter_registry(crawler_definition)

     # Load and register transformer functions:
-    registered_transformer_functions = create_transformer_registry(crawler_definition)
+    registered_transformer_functions = create_transformer_registry(
+        crawler_definition)

     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
-    dir_structure_name = os.path.basename(dirname)
+    if not isinstance(dirname, list):
+        dirname = [dirname]
+    dir_element_list = []
+    for dname in dirname:
+        dir_structure_name = os.path.basename(dname)

-    # TODO: needs to be covered somewhere else
-    crawled_directory = dirname
-    if not dir_structure_name and dirname.endswith('/'):
-        if dirname == '/':
+        # TODO: needs to be covered somewhere else
+        crawled_directory = dname
+        if not dir_structure_name and dname.endswith(os.path.sep):
+            if dname == os.path.sep:
                 # Crawling the entire file system
                 dir_structure_name = "root"
             else:
                 # dirname had a trailing '/'
-            dir_structure_name = os.path.basename(dirname[:-1])
+                dir_structure_name = os.path.basename(dname[:-1])
+        dir_element_list.append(Directory(dir_structure_name, dname))
+
+<<<<<<< HEAD
     dir_structure_element = Directory(dir_structure_name, dirname)
     return scan_structure_elements(dir_structure_element,
@@ -520,6 +558,16 @@ def scan_directory(dirname: str, crawler_definition_path: str,
                                    registered_transformer_functions=registered_transformer_functions,
                                    new_debug_tree=new_debug_tree
                                    )
+=======
+    return scan_structure_elements(
+        dir_element_list,
+        crawler_definition,
+        converter_registry,
+        restricted_path=restricted_path,
+        debug_tree=debug_tree,
+        registered_transformer_functions=registered_transformer_functions
+    )
+>>>>>>> dev

 def scan_structure_elements(items: Union[list[StructureElement], StructureElement],
@@ -527,8 +575,15 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
                             converter_registry: dict,
                             restricted_path: Optional[list[str]] = None,
                             debug_tree: Optional[DebugTree] = None,
+<<<<<<< HEAD
                             registered_transformer_functions: Optional[dict] = None,
                             new_debug_tree: Optional[dict] = None):
+=======
+                            registered_transformer_functions: Optional[dict] = None) -> (
+                                list[db.Record]):
+>>>>>>> dev
     """
     Start point of the crawler recursion.
@@ -542,14 +597,14 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen
     crawler_definition : dict
         A dictionary representing the crawler definition, possibly from a yaml
         file.
-    restricted_path: optional, list of strings
+    restricted_path: list[str], optional
         Traverse the data tree only along the given path. When the end of the
         given path is reached, traverse the full tree as normal. See docstring
         of 'scanner' for more details.

     Returns
     -------
-    crawled_data : list
+    crawled_data : list[db.Record]
         the final list with the target state of Records.
     """
...
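On the ``dev`` side of the conflict above, ``scan_directory`` also accepts a list of directories; a sketch with made-up paths and cfood file name:

# Scan several directories with one crawler definition:
records = scan_directory(["/data/run1", "/data/run2"], "cfood.yml")
# The single-directory call keeps working as before:
records = scan_directory("/data/run1", "cfood.yml")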
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Daniel Hornung <d.hornung@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Submdule containing all default and optional converters."""
from .. import utils
from .structure_elements import *
try:
from .rocrate_structure_elements import ROCrateEntity
except ImportError as err:
ROCrateEntity: type = utils.MissingImport(
name="ROCrateEntity", hint="Try installing with the `rocrate` extra option.",
err=err)
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2024 Alexander Schlemmer
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
from rocrate.model.entity import Entity
from .structure_elements import StructureElement
class ROCrateEntity(StructureElement):
"""
Store entities contained in ROCrates.
"""
def __init__(self, folder: str, entity: Entity):
"""
Initializes this ROCrateEntity.
Arguments:
----------
folder: str
The folder that contains the ROCrate data. In case of a zipped ROCrate, this
is a temporary folder that the ROCrate was unzipped to.
The folder is the folder containing the ro-crate-metadata.json.
entity: Entity
The ROCrate entity that is stored in this structure element.
The entity automatically contains an attribute ".crate"
that stores the ROCrate that this entity belongs to. It can be used
e.g. to look up links to other entities (ROCrate.dereference).
"""
super().__init__(entity.properties()["@id"])
self.folder = folder
self.entity = entity
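A construction sketch: wrap every entity of an unpacked RO-Crate (the path is made up and must contain ro-crate-metadata.json):

from rocrate.rocrate import ROCrate

crate_path = "./my_crate"  # hypothetical unzipped crate
crate = ROCrate(crate_path)
entities = [ROCrateEntity(crate_path, ent) for ent in crate.get_entities()]
for ent in entities:
    print(ent.get_name(), ent.entity.type)  # the name is the entity's @id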