diff --git a/CHANGELOG.md b/CHANGELOG.md index 240e1aff64dc555c0316606dce19589fea02a863..352311d0910bcf0c7f60183b6f58dd7ffdcb0ed4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Documentation ### +## [0.7.1] - 2024-03-21 ## + +### Fixed ### + +* `crawler_main` doesn't need the deprecated `debug=True` anymore to put out a + provenance file if the `provenance_file` parameter is provided. +* [indiscale#129](https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/129) + missing packaging dependency. + ## [0.7.0] - 2024-03-04 ## ### Added ### diff --git a/CITATION.cff b/CITATION.cff index de05ce11bc6667e508e3504629ba517a90642aef..abbd6b21e19c5a989c6d6d24f32d3946df070308 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -17,6 +17,6 @@ authors: given-names: Alexander orcid: https://orcid.org/0000-0003-4124-9649 title: CaosDB - Crawler -version: 0.7.0 +version: 0.7.1 doi: 10.3390/data9020024 -date-released: 2023-03-04 \ No newline at end of file +date-released: 2024-03-21 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 1e8abb8eeee2ba4b861a71880c96d943f813c812..88898530f7b7e049e84b230bdcbd45ff5170fabf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = caoscrawler -version = 0.7.1 +version = 0.7.2 author = Alexander Schlemmer author_email = alexander.schlemmer@ds.mpg.de description = A new crawler for caosdb @@ -19,14 +19,15 @@ package_dir = packages = find: python_requires = >=3.7 install_requires = - importlib-resources caosadvancedtools >= 0.7.0 + importlib-resources + importlib_metadata;python_version<'3.8' linkahead > 0.13.2 - yaml-header-tools >= 0.2.1 - pyyaml odfpy #make optional + packaging pandas - importlib_metadata;python_version<'3.8' + pyyaml + yaml-header-tools >= 0.2.1 [options.packages.find] where = src diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 
772f690b2ce7f8a3822d0d833f13c84cdd9d5e35..e0ca0f9bff77ba1ecc63f4d102d6d9869fb11cb0 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -389,8 +389,8 @@ class Converter(object, metaclass=ABCMeta): Extract information from the structure element and store them as values in the general store. - Parameters: - ------------ + Parameters + ---------- values: GeneralStore The GeneralStore to store values in. @@ -409,8 +409,8 @@ class Converter(object, metaclass=ABCMeta): Check if transformers are defined using the "transform" keyword. Then apply the transformers to the variables defined in GeneralStore "values". - Parameters: - ------------ + Parameters + ---------- values: GeneralStore The GeneralStore to store values in. @@ -765,6 +765,12 @@ schema_resource: class DictElementConverter(Converter): + """ +**Operates on:** :py:class:`caoscrawler.structure_elements.DictElement` + +**Generates:** :py:class:`caoscrawler.structure_elements.StructureElement` + """ + def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, DictElement): @@ -1154,6 +1160,12 @@ class TableConverter(Converter): class XLSXTableConverter(TableConverter): + """ +**Operates on:** :py:class:`caoscrawler.structure_elements.File` + +**Generates:** :py:class:`caoscrawler.structure_elements.DictElement` + """ + def get_options(self): return self._get_options([ ("sheet_name", str), diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index ffd0185604f243e119f3efd136488612276a9c05..d21e6e2521578dc407e445d8220506677be84e26 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1504,7 +1504,7 @@ def crawler_main(crawled_directory_path: str, dry_run: bool = False, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, - unique_names=True, + unique_names: bool = True, restricted_path: Optional[list[str]] = None, remove_prefix: Optional[str] = None, 
add_prefix: Optional[str] = None, @@ -1520,9 +1520,9 @@ def crawler_main(crawled_directory_path: str, identifiables_definition_file : str filename of an identifiable definition yaml file debug : bool - DEPRECATED, whether or not to run in debug mode + DEPRECATED, use a provenance file instead. provenance_file : str - provenance information will be stored in a file with given filename + Provenance information will be stored in a file with given filename dry_run : bool do not commit any chnages to the server prefix : str @@ -1562,7 +1562,7 @@ def crawler_main(crawled_directory_path: str, _fix_file_paths(crawled_data, add_prefix, remove_prefix) _check_record_types(crawled_data) - if provenance_file is not None and debug: + if provenance_file is not None: crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file) if identifiables_definition_file is not None: diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 9be539fe0842e3ce24b68060fa2288cdc4c531b2..3a9b8819b2f16ae7b6b2dc4ff06d94055da14d60 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -90,7 +90,7 @@ Some terms: - ``is_referenced_by`` statements - An *identifiable* belongs to a concrete record. It consists of identifying attributes which "fill in" the *registered identifiable*. In code, it can be represented as a Record based on the - *registered identifiable* with all the values filled in. + *registered identifiable* with all the values filled in. - An *identified record* is the result of retrieving a record from the database, based on the *identifiable* (and its values). diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index 5d92401a50d3e4aa9cc258e792be41df18ef3976..9d1f538732858ff2fbf949d45c359ebb16fe3480 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -25,7 +25,9 @@ # """ -This is the scanner, the original "_crawl" function from crawl.py. 
+This is the scanner. + +This was where formerly the ``_crawl(...)`` function from ``crawl.py`` was located. This is just the functionality that extracts data from the file system. """ @@ -257,31 +259,31 @@ def scanner(items: list[StructureElement], restricted_path: Optional[list[str]] = None, crawled_data: Optional[list[db.Record]] = None, debug_tree: Optional[DebugTree] = None, - registered_transformer_functions: Optional[dict] = None): + registered_transformer_functions: Optional[dict] = None) -> list[db.Record]: """Crawl a list of StructureElements and apply any matching converters. - Formerly known as "_crawl". + Formerly known as ``_crawl(...)``. Parameters ---------- - items: + items: list[StructureElement] structure_elements (e.g. files and folders on one level on the hierarchy) - converters: + converters: list[Converter] locally defined converters for treating structure elements. A locally defined converter could be one that is only valid for a specific subtree of the originally cralwed StructureElement structure. - general_store, record_store: + general_store, record_store: GeneralStore, RecordStore, optional This recursion of the crawl function should only operate on copies of the global stores of the Crawler object. - restricted_path : list of strings, optional + restricted_path : list[str], optional traverse the data tree only along the given path. For example, when a - directory contains files a, b and c and b is given as restricted_path, a - and c will be ignroed by the crawler. When the end of the given path is + directory contains files a, b and c, and b is given as ``restricted_path``, a + and c will be ignored by the crawler. When the end of the given path is reached, traverse the full tree as normal. The first element of the list - provided by restricted_path should be the name of the StructureElement + provided by ``restricted_path`` should be the name of the StructureElement at this level, i.e. 
denoting the respective element in the items argument. @@ -292,7 +294,8 @@ def scanner(items: list[StructureElement], Each function is a dictionary: - - The key is the name of the function to be looked up in the dictionary of registered transformer functions. + - The key is the name of the function to be looked up in the dictionary of registered + transformer functions. - The value is the function which needs to be of the form: def func(in_value: Any, in_parameters: dict) -> Any: pass @@ -457,7 +460,8 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen converter_registry: dict, restricted_path: Optional[list[str]] = None, debug_tree: Optional[DebugTree] = None, - registered_transformer_functions: Optional[dict] = None): + registered_transformer_functions: Optional[dict] = None) -> ( + list[db.Record]): """ Start point of the crawler recursion. @@ -471,14 +475,14 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen crawler_definition : dict A dictionary representing the crawler definition, possibly from a yaml file. - restricted_path: optional, list of strings + restricted_path: list[str], optional Traverse the data tree only along the given path. When the end of the given path is reached, traverse the full tree as normal. See docstring - of 'scanner' formore details. + of 'scanner' for more details. Returns ------- - crawled_data : list + crawled_data : list[db.Record] the final list with the target state of Records. """ diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py index 952f29d012f8373062ed9dfe8a830bd18c4b0baa..ff070626ebfdd580c16bbbf2dc30ab330dc162f0 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements.py @@ -28,9 +28,16 @@ import warnings class StructureElement(object): - """ base class for elements in the hierarchical data structure """ + """Base class for elements in the hierarchical data structure. 
- def __init__(self, name): +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + """ + + def __init__(self, name: str): # Used to store usage information for debugging: self.metadata: tDict[str, set[str]] = { "usage": set() @@ -46,6 +53,18 @@ class StructureElement(object): class FileSystemStructureElement(StructureElement): + """StructureElement representing an element of a file system, like a directory or a simple file. + +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + +path: str + The path to the file or directory. + """ + def __init__(self, name: str, path: str): super().__init__(name) self.path = path @@ -65,6 +84,7 @@ class Directory(FileSystemStructureElement): class File(FileSystemStructureElement): + """StructureElement representing a file.""" pass diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst index 32176b9edb895074021b3ed4eabe270ad48ae632..7100bcd1790edb3e040a1a90663a32a09b7c8eaf 100644 --- a/src/doc/concepts.rst +++ b/src/doc/concepts.rst @@ -1,18 +1,17 @@ Concepts -)))))))) +======== -The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is +The CaosDB Crawler can handle any kind of hierarchical data structure. The typical use case is a directory tree that is traversed. We use the following terms/concepts to describe how the CaosDB Crawler works. Structure Elements ++++++++++++++++++ -This hierarchical structure is assumed to be consituted of a tree of -StructureElements. The tree is created on the fly by so called Converters which -are defined in a yaml file. The tree of StructureElements is a model -of the existing data (For example could a tree of Python file objects -(StructureElements) represent a file tree that exists on some file server). +The crawled hierarchical structure is represented by a tree of *StructureElements*. 
This tree is +generated on the fly by so-called Converters which are defined in a yaml file (usually called +``cfood.yml``). This generated tree of StructureElements is a model of the existing data. For +example, a tree of Python *file objects* (StructureElements) could correspond to a file system tree. Relevant sources in: @@ -23,29 +22,28 @@ Relevant sources in: Converters ++++++++++ -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the above named tree. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is therefore a tree itself. - -See :std:doc:`converters<converters>` for details. +Converters treat a StructureElement and during this process create a number of new +StructureElements: the children of the initially treated StructureElement. Thus by treatment of +existing StructureElements, Converters create a tree of StructureElements. +.. image:: img/converter.png + :height: 170 +See :std:doc:`converters<converters>` for details. Relevant sources in: -- ``src/converters.py`` +- ``src/converters.py`` Identifiables +++++++++++++ -An Identifiable of a Record is like the fingerprint of a Record. +An *Identifiable* of a Record is like the fingerprint of a Record. -The identifiable contains the information that is used by the CaosDB Crawler to identify Records. -For example, in order to check whether a Record exits in the CaosDB Server, the CaosDB Crawler creates a query -using the information contained in the Identifiable. +The Identifiable contains the information that is used by the CaosDB Crawler to identify Records. +For example, the CaosDB Crawler may create a query using the information contained in the +Identifiable in order to check whether a Record exists in the CaosDB Server. 
Suppose a certain experiment is at most done once per day, then the identifiable could consist of the RecordType "SomeExperiment" (as a parent) and the Property "date" with the respective value. @@ -100,7 +98,9 @@ The Crawler +++++++++++ The crawler can be considered the main program doing the synchronization in basically two steps: + #. Based on a yaml-specification scan the file system (or other sources) and create a set of CaosDB Entities that are supposed to be inserted or updated in a CaosDB instance. + #. Compare the current state of the CaosDB instance with the set of CaosDB Entities created in step 1, taking into account the :ref:`registered identifiables<Identifiables>`. Insert or update entites accordingly. Relevant sources in: diff --git a/src/doc/conf.py b/src/doc/conf.py index ac5fc7f5d95c2399f4402121ba9445bc0dbc6aaa..3cce99d03728d229c848ba6374d15de9fe73ec7b 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -33,10 +33,10 @@ copyright = '2024, IndiScale' author = 'Alexander Schlemmer' # The short X.Y version -version = '0.7.1' +version = '0.7.2' # The full version, including alpha/beta/rc tags # release = '0.5.2-rc2' -release = '0.7.1-dev' +release = '0.7.2-dev' # -- General configuration --------------------------------------------------- diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 44988fbd497cdb57023b5a696f83d55e7eb5113a..9b28c9a61eec4d9707b9640720b9c6a44a8fe25e 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -1,18 +1,23 @@ Converters )))))))))) -Converters treat StructureElements and thereby create the StructureElement that -are the children of the treated StructureElement. Converters therefore create -the tree of structure elements. The definition of a Converter also contains what -Converters shall be used to treat the generated child-StructureElements. The -definition is therefore a tree itself. - -Each StructureElement in the tree has a set of data values, i.e a dictionary of -key value pairs. 
-Some of those values are set due to the kind of StructureElement. For example, -a file could have the file name as such a key value pair: 'filename': <sth>. +Converters treat a StructureElement and during this process create a number of new +StructureElements: the children of the initially treated StructureElement. Thus by treatment of +existing StructureElements, Converters create a tree of StructureElements. + +.. image:: img/converter.png + :height: 170 + +The ``cfood.yml`` definition also describes which +Converters shall be used to treat the generated child StructureElements. The +definition therefore itself also defines a tree. + +Each StructureElement in the tree has a set of properties, organized as +key-value pairs. +Some of those properties are specified by the type of StructureElement. For example, +a file could have the file name as property: ``'filename': myfile.dat``. Converters may define additional functions that create further values. For -example, a regular expresion could be used to get a date from a file name. +example, a regular expression could be used to get a date from a file name. A converter is defined via a yml file or part of it. The definition states @@ -20,7 +25,7 @@ what kind of StructureElement it treats (typically one). Also, it defines how children of the current StructureElement are created and what Converters shall be used to treat those. -The yaml definition looks like the following: +The yaml definition may look like this: TODO: outdated, see cfood-schema.yml @@ -53,8 +58,9 @@ to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_reco **records** is a dict of definitions that define the semantic structure (see details below). -Subtree contains a list of Converter defnitions that look like the one -described here. +**subtree** makes the yaml recursive: It contains a list of new Converter +definitions, which work on the StructureElements that are returned by the +current Converter. 
Transform Functions +++++++++++++++++++ @@ -108,6 +114,9 @@ them to the cfood definition (see :doc:`CFood Documentation<cfood>`). Standard Converters +++++++++++++++++++ +These are the standard converters that exist in a default installation. For writing and applying +*custom converters*, see :ref:`below <Custom Converters>`. + Directory Converter =================== The Directory Converter creates StructureElements for each File and Directory @@ -126,11 +135,14 @@ children elements according to the structure of the header. DictElement Converter ===================== + +DictElement → StructureElement + Creates a child StructureElement for each key in the dictionary. Typical Subtree converters -------------------------- -The following StructureElement are typically created: +The following StructureElement types are typically created by the DictElement converter: - BooleanElement - FloatElement @@ -155,12 +167,12 @@ behavior can be adjusted with the fields `accept_text`, `accept_int`, The following denotes what kind of StructureElements are accepted by default (they are defined in `src/caoscrawler/converters.py`): -- DictBooleanElementConverter: bool, int -- DictFloatElementConverter: int, float -- DictTextElementConverter: text, bool, int, float -- DictIntegerElementConverter: int -- DictListElementConverter: list -- DictDictElementConverter: dict +- BooleanElementConverter: bool, int +- FloatElementConverter: int, float +- TextElementConverter: text, bool, int, float +- IntegerElementConverter: int +- ListElementConverter: list +- DictElementConverter: dict YAMLFileConverter ================= @@ -180,11 +192,13 @@ JSONFileConverter TableConverter ============== +Table → DictElement + A generic converter (abstract) for files containing tables. -Currently, there are two specialized implementations for xlsx-files and csv-files. +Currently, there are two specialized implementations for XLSX files and CSV files. 
-All table converters generate a subtree that can be converted with DictDictElementConverters: -For each row in the table a DictDictElement (structure element) is generated. The key of the +All table converters generate a subtree of dicts, which in turn can be converted with DictElementConverters: +For each row in the table the TableConverter generates a DictElement (structure element). The key of the element is the row number. The value of the element is a dict containing the mapping of column names to values of the respective cell. @@ -193,21 +207,21 @@ Example: .. code-block:: yaml subtree: - TABLE: + TABLE: # Any name for the table as a whole type: CSVTableConverter match: ^test_table.csv$ records: (...) # Records edited for the whole table file subtree: - ROW: - type: DictDictElement + ROW: # Any name for a data row in the table + type: DictElement match_name: .* match_value: .* records: (...) # Records edited for each row subtree: - COLUMN: - type: DictFloatElement + COLUMN: # Any name for a specific type of column in the table + type: FloatElement match_name: measurement # Name of the column in the table file match_value: (?P<column_value).*) records: @@ -217,9 +231,13 @@ Example: XLSXTableConverter ================== +XLSX File → DictElement + CSVTableConverter ================= +CSV File → DictElement + Further converters ++++++++++++++++++ @@ -322,11 +340,15 @@ file in a text property, the name of which can be configured with the Custom Converters +++++++++++++++++ -It was previously mentioned that it is possible to create custom converters. +As mentioned before it is possible to create custom converters. These custom converters can be used to integrate arbitrary data extraction and ETL capabilities -into the caosdb-crawler and make these extensions available to any yaml specification. +into the LinkAhead crawler and make these extensions available to any yaml specification. 
+ +Tell the crawler about a custom converter +========================================= -The basic syntax for adding a custom converter to a yaml cfood definition file is: +To use a custom converter, it must be defined in the ``Converters`` section of the CFood yaml file. +The basic syntax for adding a custom converter to a definition file is: .. code-block:: yaml @@ -335,7 +357,7 @@ The basic syntax for adding a custom converter to a yaml cfood definition file i package: <python>.<module>.<name> converter: <PythonClassName> -The Converters-section can be either put into the first or second document of the cfood yaml file. +The Converters section can be either put into the first or the second document of the cfood yaml file. It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. Details: @@ -344,9 +366,16 @@ Details: - **<python>.<module>.<name>**: The name of the module where the converter class resides. - **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. +Implementing a custom converter +=============================== + +Converters inherit from the :py:class:`~caoscrawler.converters.Converter` class. + The following methods are abstract and need to be overwritten by your custom converter to make it work: -- :py:meth:`~caoscrawler.converters.Converter.create_children` +:py:meth:`~caoscrawler.converters.Converter.create_children`: + Return a list of child StructureElement objects. 
+ - :py:meth:`~caoscrawler.converters.Converter.match` - :py:meth:`~caoscrawler.converters.Converter.typecheck` diff --git a/src/doc/getting_started/furtherreading.rst b/src/doc/getting_started/furtherreading.rst new file mode 100644 index 0000000000000000000000000000000000000000..eb600416c1fce3857d28fc2e856ceabebb3a8bb7 --- /dev/null +++ b/src/doc/getting_started/furtherreading.rst @@ -0,0 +1,8 @@ +Further reading +=============== + +- A simple `documented example <https://gitlab.com/caosdb/documented-crawler-example>`_ which + demonstrates the crawler usage. +- Some useful examples can be found in the `integration tests + <https://gitlab.com/caosdb/caosdb-crawler/-/tree/main/integrationtests>`_ (and to a certain extent + in the unit tests). diff --git a/src/doc/getting_started/index.rst b/src/doc/getting_started/index.rst index 490c705f2feb9eeedc399e8c1d91e28abcd7fd12..86b34d069391b146d15599228067df2e9e41d642 100644 --- a/src/doc/getting_started/index.rst +++ b/src/doc/getting_started/index.rst @@ -10,6 +10,7 @@ Getting Started prerequisites helloworld optionalfeatures + furtherreading This section will help you get going! From the first installation steps to the first simple crawl. 
diff --git a/src/doc/img/converter.png b/src/doc/img/converter.png new file mode 100644 index 0000000000000000000000000000000000000000..c11517a32ceb164510a7731ff0516d19db71801a Binary files /dev/null and b/src/doc/img/converter.png differ diff --git a/src/doc/img/converter.svg b/src/doc/img/converter.svg new file mode 100644 index 0000000000000000000000000000000000000000..af32ff69cdd6c25805f929458556310b3ee34f41 --- /dev/null +++ b/src/doc/img/converter.svg @@ -0,0 +1,442 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<svg + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns="http://www.w3.org/2000/svg" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + width="72.854424mm" + height="33.470383mm" + viewBox="0 0 72.854423 33.470383" + version="1.1" + id="svg13434" + inkscape:version="1.0.2 (e86c870879, 2021-01-15)" + sodipodi:docname="converter.svg" + inkscape:export-filename="/home/daniel/indiscale/software/linkahead/caosdb-crawler/src/doc/img/converter.png" + inkscape:export-xdpi="299.83078" + inkscape:export-ydpi="299.83078"> + <defs + id="defs13428"> + <marker + style="overflow:visible;" + id="marker1559" + refX="0.0" + refY="0.0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(0.6) rotate(180) translate(0,0)" + d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z " + style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round;stroke:#000000;stroke-opacity:1;fill:#000000;fill-opacity:1" + id="path1557" /> + </marker> + <marker + style="overflow:visible" + id="marker1266" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + 
transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1264" /> + </marker> + <marker + style="overflow:visible" + id="marker1218" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true" + inkscape:collect="always"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1216" /> + </marker> + <marker + style="overflow:visible" + id="Arrow2Mend" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true" + inkscape:collect="always"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path909" /> + </marker> + <marker + style="overflow:visible" + id="Arrow1Lend" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow1Lend" + inkscape:isstock="true"> + <path + transform="matrix(-0.8,0,0,-0.8,-10,0)" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:1pt;stroke-opacity:1" + d="M 0,0 5,-5 -12.5,0 5,5 Z" + id="path885" /> + </marker> + <marker + style="overflow:visible" + id="marker1559-2" + refX="0" + refY="0" + orient="auto" + inkscape:stockid="Arrow2Mend" + inkscape:isstock="true"> + <path + transform="scale(-0.6)" + d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 
-6e-7,8.035443 z" + style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:#000000;stroke-width:0.625;stroke-linejoin:round;stroke-opacity:1" + id="path1557-9" /> + </marker> + </defs> + <sodipodi:namedview + id="base" + pagecolor="#ffffff" + bordercolor="#666666" + borderopacity="1.0" + inkscape:pageopacity="0.0" + inkscape:pageshadow="2" + inkscape:zoom="2.8" + inkscape:cx="120.68286" + inkscape:cy="23.831081" + inkscape:document-units="mm" + inkscape:current-layer="g1411" + inkscape:document-rotation="0" + showgrid="false" + inkscape:snap-global="false" + inkscape:window-width="1920" + inkscape:window-height="1135" + inkscape:window-x="0" + inkscape:window-y="0" + inkscape:window-maximized="1" + lock-margins="true" + fit-margin-top="2" + fit-margin-left="2" + fit-margin-right="2" + fit-margin-bottom="2" /> + <metadata + id="metadata13431"> + <rdf:RDF> + <cc:Work + rdf:about=""> + <dc:format>image/svg+xml</dc:format> + <dc:type + rdf:resource="http://purl.org/dc/dcmitype/StillImage" /> + <dc:title></dc:title> + </cc:Work> + </rdf:RDF> + </metadata> + <g + inkscape:label="Ebene 1" + inkscape:groupmode="layer" + id="layer1" + transform="translate(-8.1569115,-36.221295)"> + <g + id="g1411" + transform="translate(32.258972,-4.0381556)"> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 26.22787,46.991961 -0.04324,7.85981" + id="path870" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 27.268191,47.234524 6.5917,7.093847" + id="path872" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="M 17.211264,56.167197 12.543075,64.49543" + id="path874" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers 
fill stroke;stop-color:#000000" + d="m 19.403188,56.222309 1.865426,8.356695" + id="path876" + sodipodi:nodetypes="cc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#Arrow2Mend);paint-order:markers fill stroke;stop-color:#000000" + d="m 34.590338,55.360048 c 1.051358,-1.820435 1.974353,-2.426981 3.317324,-2.31217 0.956924,0.08181 1.647835,1.289889 2.049783,2.024833" + id="path880" + sodipodi:nodetypes="cac" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + d="m 25.076267,47.179412 -6.5917,7.093847" + id="path14001" + sodipodi:nodetypes="cc" /> + <rect + style="opacity:1;fill:#25e325;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13997" + width="4.4276514" + height="3.9112766" + x="23.986937" + y="44.075451" /> + <rect + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13999" + width="4.4276514" + height="3.9112766" + x="15.955473" + y="53.282654" /> + <path + sodipodi:type="star" + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14003" + sodipodi:sides="3" + sodipodi:cx="26.161613" + sodipodi:cy="55.658291" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 28.579928,57.054505 -4.836629,0 2.418314,-4.188643 z" + inkscape:transform-center-y="-0.69810795" /> + <path + style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-miterlimit:4;stroke-dasharray:0.529166, 0.529166;stroke-dashoffset:0;paint-order:markers fill stroke;stop-color:#000000" + d="M 11.791704,65.225482 9.0065326,70.566411" + id="path1467" + 
sodipodi:nodetypes="cc" /> + <path + style="fill:none;stroke:#000000;stroke-width:0.264583;stroke-miterlimit:4;stroke-dasharray:0.529166, 0.529166;stroke-dashoffset:0;paint-order:markers fill stroke;stop-color:#000000" + d="m 13.983628,65.280594 1.865426,5.369391" + id="path1469" + sodipodi:nodetypes="cc" /> + <circle + style="opacity:1;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14861" + cx="12.714239" + cy="65.343147" + r="2.3446827" /> + <path + sodipodi:type="star" + style="opacity:1;fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14863" + sodipodi:sides="3" + sodipodi:cx="33.771244" + sodipodi:cy="55.658291" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 36.189559,57.054505 -4.83663,0 2.418315,-4.188643 z" + inkscape:transform-center-y="-0.69810795" /> + <path + sodipodi:type="star" + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="path14865" + sodipodi:sides="3" + sodipodi:cx="31.079979" + sodipodi:cy="69.469734" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + d="m 33.498294,70.865949 -4.83663,0 2.418315,-4.188644 z" + inkscape:transform-center-y="-0.69810795" /> + <circle + 
style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="circle14867" + cx="21.223957" + cy="65.343147" + r="2.3446827" /> + <rect + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + id="rect14869" + width="4.4276514" + height="3.9112766" + x="36.216988" + y="66.749847" /> + <path + id="path82" + inkscape:connector-curvature="0" + d="m 41.515562,54.871846 -0.417806,1.20398 -1.168202,0.279888 -0.960641,-0.870246 -1.138707,0.832408 0.537693,1.179427 -0.621192,1.02818 -1.273991,0.03274 -0.218785,1.407938 1.203979,0.417805 0.279889,1.168202 -0.870245,0.960643 0.832407,1.138704 1.179427,-0.537691 1.028181,0.621191 0.03274,1.273992 1.407938,0.218785 0.417806,-1.20398 1.168202,-0.279888 0.96064,0.870244 1.138706,-0.832406 -0.537691,-1.179427 0.621192,-1.028182 1.273992,-0.03274 0.218784,-1.407938 -1.20398,-0.417805 -0.279888,-1.168203 0.870246,-0.96064 -0.83241,-1.138707 -1.179425,0.537693 -1.028181,-0.621192 -0.03274,-1.273992 z" + style="fill:#d0dbf5;fill-opacity:1;stroke:#0f2d59;stroke-width:0.284967;stroke-linecap:round;stroke-linejoin:round" + sodipodi:nodetypes="ccccccccccccccccccccccccccccccccc" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#marker1218);paint-order:markers fill stroke;stop-color:#000000" + d="m 36.505382,61.212129 c -1.732593,0.460546 -2.239587,0.94846 -3.054171,1.805942 -0.855057,0.900086 -1.291029,1.914968 -1.728787,3.298907" + 
id="path1214" + sodipodi:nodetypes="cac" /> + <path + style="opacity:1;fill:none;stroke:#000000;stroke-width:0.265;stroke-miterlimit:4;stroke-dasharray:none;marker-end:url(#marker1266);paint-order:markers fill stroke;stop-color:#000000" + d="m 39.119283,63.680684 c -0.561579,0.349977 -1.171361,1.831472 -1.388934,2.468193" + id="path1262" + sodipodi:nodetypes="cc" /> + <g + id="g1624-1" + transform="translate(-24.776227,-7.0250037)"> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path14865-5-2" + sodipodi:sides="3" + sodipodi:cx="66.174721" + sodipodi:cy="64.759911" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + inkscape:transform-center-x="2.0589357e-06" + d="m 67.216088,65.361144 -2.082734,0 1.041367,-1.8037 z" /> + <path + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.15;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker1559-2);paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + d="M 66.173282,65.809672 V 67.26045" + id="path1555-0" /> + <g + id="g1807" + transform="translate(0.32991862)"> + <rect + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="rect14869-6-7" + 
width="1.906621" + height="1.6842613" + x="66.594307" + y="67.911743" /> + <path + sodipodi:type="star" + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path1803" + sodipodi:sides="3" + sodipodi:cx="64.271751" + sodipodi:cy="69.082977" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + inkscape:transform-center-x="2.0589357e-06" + d="m 65.313118,69.684211 -2.082733,0 1.041366,-1.803701 z" /> + </g> + </g> + </g> + <g + id="g1374" + transform="translate(-49.214304,-4.5219647)"> + <circle + style="font-variation-settings:normal;vector-effect:none;fill:#6abfff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="circle14867-5" + cx="61.878521" + cy="49.767113" + r="2.3446827" /> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.125046" + x="59.173122" + y="60.298515" + id="text14855"><tspan + sodipodi:role="line" + id="tspan14853" + x="59.173122" + y="60.298515" + style="font-size:3.52778px;stroke-width:0.125046">Converter</tspan></text> + <text + xml:space="preserve" + style="font-style:normal;font-weight:normal;font-size:3.52778px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.125046" + x="59.173122" + 
y="45.423546" + id="text14859"><tspan + sodipodi:role="line" + id="tspan14857" + x="59.173122" + y="45.423546" + style="font-size:3.52778px;stroke-width:0.125046">StructureElement</tspan></text> + <rect + style="fill:#25e325;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="rect13997-6" + width="4.4276514" + height="3.9112766" + x="65.556091" + y="47.811474" /> + <path + sodipodi:type="star" + style="fill:#ff8cff;fill-opacity:1;stroke:#000000;stroke-width:0.264583;paint-order:markers fill stroke;stop-color:#000000" + id="path14003-7" + sodipodi:sides="3" + sodipodi:cx="73.831802" + sodipodi:cy="50.531364" + sodipodi:r1="2.7924292" + sodipodi:r2="1.3962145" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.69810795" + d="m 76.250117,51.927579 -4.836629,0 2.418314,-4.188644 z" /> + <g + id="g1649"> + <path + id="path82-3" + inkscape:connector-curvature="0" + d="m 66.342602,61.715213 -0.417806,1.20398 -1.168202,0.279888 -0.960641,-0.870246 -1.138707,0.832408 0.537693,1.179427 -0.621192,1.02818 -1.273991,0.03274 -0.218785,1.407938 1.203979,0.417805 0.279889,1.168202 -0.870245,0.960643 0.832407,1.138704 1.179427,-0.537691 1.028181,0.621191 0.03274,1.273992 1.407938,0.218785 0.417806,-1.20398 1.168202,-0.279888 0.96064,0.870244 1.138706,-0.832406 -0.537691,-1.179427 0.621192,-1.028182 1.273992,-0.03274 0.218784,-1.407938 -1.20398,-0.417805 -0.279888,-1.168203 0.870246,-0.96064 -0.83241,-1.138707 -1.179425,0.537693 -1.028181,-0.621192 -0.03274,-1.273992 z" + style="fill:#d0dbf5;fill-opacity:1;stroke:#0f2d59;stroke-width:0.284967;stroke-linecap:round;stroke-linejoin:round" + sodipodi:nodetypes="ccccccccccccccccccccccccccccccccc" /> + <g + id="g1624" + transform="translate(-0.23034383)"> + <path + sodipodi:type="star" + 
style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="path14865-5" + sodipodi:sides="3" + sodipodi:cx="66.489288" + sodipodi:cy="64.759911" + sodipodi:r1="1.2024668" + sodipodi:r2="0.60123342" + sodipodi:arg1="0.52359878" + sodipodi:arg2="1.5707963" + inkscape:flatsided="true" + inkscape:rounded="0" + inkscape:randomized="0" + inkscape:transform-center-y="-0.30061479" + d="m 67.530655,65.361144 -2.082734,0 1.041367,-1.8037 z" + inkscape:transform-center-x="2.0589357e-06" /> + <rect + style="font-variation-settings:normal;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.113934;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:markers fill stroke;stop-color:#000000" + id="rect14869-6" + width="1.906621" + height="1.6842613" + x="65.535973" + y="67.911743" /> + <path + style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:none;fill-opacity:1;stroke:#000000;stroke-width:0.15;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#marker1559);paint-order:markers fill stroke;stop-color:#000000;stop-opacity:1" + d="M 66.487845,65.809672 V 67.26045" + id="path1555" /> + </g> + </g> + </g> + </g> +</svg> diff --git a/src/doc/macros.rst b/src/doc/macros.rst index 5329ca6ddde49dbef439659d4904b07ed3f2bef9..d093d9b69f5d2c14b5bfbb2fe292545fc7943ca7 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -1,7 +1,8 @@ Macros ------ -Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example: +Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. 
Consider the following common +example: .. _example_files: .. code-block:: yaml diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst index b6f0fab511f3646f3ec6a7a320299e72a2c20038..2d4e8be7172f3d1ea8f5f154c4d8013891312309 100644 --- a/src/doc/tutorials/index.rst +++ b/src/doc/tutorials/index.rst @@ -9,4 +9,4 @@ This chapter contains a collection of tutorials. Parameter File<parameterfile> Scientific Data Folder<scifolder> - + WIP: Single Structured File <single_file> diff --git a/src/doc/tutorials/single_file.rst b/src/doc/tutorials/single_file.rst new file mode 100644 index 0000000000000000000000000000000000000000..35c6c999e8ed422ac3b8caf6ae92b31d91f182c6 --- /dev/null +++ b/src/doc/tutorials/single_file.rst @@ -0,0 +1,129 @@ +WIP Tutorial: Single structured file +==================================== + +.. warning:: + + This tutorial is still work in progress. It may be better than nothing, but it is still + incomplete and probably contains serious errors. + + Use at your own risk. + +In this tutorial, we will create a crawler that reads a single structured file, such as an XLSX +file. + +Declarations +------------ + +``identifiables.yml`` + +.. code-block:: yaml + + Präventionsmaßnahme: + - Organisation + - titel + - Laufzeit + + +``cfood.yml`` + +.. code-block:: yaml + + --- + metadata: + crawler-version: 0.6.1 + --- + + Präventionsmaßnahme der Organisation: # Eine Excel-Datei mit Präventionsmaßnahmen + type: XLSXTableConverter + match: ".*xlsx$" # Any xlsx file. 
+ subtree: + Maßnahme: # Eine Zeile in der Datei + type: DictElement + match_name: .* + match_value: .* + records: + Präventionsmaßnahme: # Records edited for each row + name: "" + subtree: + MaßnahmenArt: # Spalte mit Art der Maßnahme + type: IntegerElement + match_name: Art der Maßnahme # Name of the column in the table file + match_value: (?P<column_value>.*) + MaßnahmenTitel: + type: TextElement + match_name: Titel der Maßnahme # Name of the column in the table file + match_value: (?P<column_value>.*) + records: # Records edited for each cell + Präventionsmaßnahme: + titel: $column_value + + +Python code +----------- + + +.. code-block:: python + + #!/usr/bin/env python3 + + # Crawler für Präventionsmaßnahme + # + # Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> + # + # This program is free software: you can redistribute it and/or modify + # it under the terms of the GNU Affero General Public License as + # published by the Free Software Foundation, either version 3 of the + # License, or (at your option) any later version. + # + # This program is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU Affero General Public License for more details. + # + # You should have received a copy of the GNU Affero General Public License + # along with this program. If not, see <https://www.gnu.org/licenses/>. + + """Crawler für Präventionsmaßnahmen""" + + import argparse + + from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements + from caoscrawler.structure_elements import File + + + def crawl_file(filename: str, dry_run: bool = False): + """Read an XLSX file into a LinkAhead container. + + Parameters + ---------- + filename : str + The name of the XLSX file. + + dry_run : bool + If True, do not modify the database.
+ """ + definition = load_definition("cfood.yml") + converter_registry = create_converter_registry(definition) + + records = scan_structure_elements(items=File(name="somename.xlsx", path=filename), + crawler_definition=definition, + converter_registry=converter_registry) + from IPython import embed + embed() + + def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description='Crawler für Präventionsmaßnahme') + parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true") + parser.add_argument('xlsx_file', metavar="XLSX file", help="The xlsx file to be crawled.") + return parser.parse_args() + + + def main(): + """Main function.""" + args = _parse_arguments() + crawl_file(args.xlsx_file, dry_run=args.dry_run) + + + if __name__ == '__main__': + main()