From 3b8e99cc53d986b146564c19158dcb55168f0681 Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Wed, 13 Sep 2023 16:36:13 +0200
Subject: [PATCH] DOC WIP: Tutorial: Single structured file

For issue #80.
---
 src/doc/tutorials/index.rst       |   2 +-
 src/doc/tutorials/single_file.rst | 129 ++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 src/doc/tutorials/single_file.rst

diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst
index b6f0fab5..2d4e8be7 100644
--- a/src/doc/tutorials/index.rst
+++ b/src/doc/tutorials/index.rst
@@ -9,4 +9,4 @@ This chapter contains a collection of tutorials.
 
    Parameter File<parameterfile>
    Scientific Data Folder<scifolder>
-
+   WIP: Single Structured File <single_file>
diff --git a/src/doc/tutorials/single_file.rst b/src/doc/tutorials/single_file.rst
new file mode 100644
index 00000000..8a4e1b92
--- /dev/null
+++ b/src/doc/tutorials/single_file.rst
@@ -0,0 +1,129 @@
+WIP Tutorial: Single structured file
+====================================
+
+.. warning::
+
+   This tutorial is still work in progress. It may be better than nothing, but it is still
+   incomplete and probably contains serious errors.
+
+   Use at your own risk.
+
+In this tutorial, we will create a crawler that reads a single structured file, such as an XLSX
+file.
+
+Declarations
+------------
+
+``identifiables.yml``
+
+.. code-block:: yaml
+
+   Präventionsmaßnahme:
+   - Organisation
+   - titel
+   - Laufzeit
+
+
+``cfood.yml``
+
+.. code-block:: yaml
+
+   ---
+   metadata:
+     crawler-version: 0.6.1
+   ---
+
+   Präventionsmaßnahme der Organisation:  # Eine Excel-Datei mit Präventionsmaßnahmen
+     type: XLSXTableConverter
+     match: ".*xlsx$"  # Any xlsx file.
+     subtree:
+       Maßnahme:  # Eine Zeile in der Datei
+         type: DictElement
+         match_name: .*
+         match_value: .*
+         records:
+           Präventionsmaßnahme:  # Records edited for each row
+             name: ""
+         subtree:
+           MaßnahmenArt:  # Spalte mit Art der Maßnahme
+             type: IntegerElement
+             match_name: Art der Maßnahme  # Name of the column in the table file
+             match_value: (?P<column_value>.*)
+           MaßnahmenTitel:
+             type: TextElement
+             match_name: Titel der Maßnahme  # Name of the column in the table file
+             match_value: (?P<column_value>.*)
+             records:  # Records edited for each cell
+               Präventionsmaßnahme:
+                 titel: $column_value
+
+
+Python code
+-----------
+
+
+.. code-block:: python
+
+   #!/usr/bin/env python3
+
+   # Crawler für Präventionsmaßnahme
+   #
+   # Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
+   #
+   # This program is free software: you can redistribute it and/or modify
+   # it under the terms of the GNU Affero General Public License as
+   # published by the Free Software Foundation, either version 3 of the
+   # License, or (at your option) any later version.
+   #
+   # This program is distributed in the hope that it will be useful,
+   # but WITHOUT ANY WARRANTY; without even the implied warranty of
+   # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   # GNU Affero General Public License for more details.
+   #
+   # You should have received a copy of the GNU Affero General Public License
+   # along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+   """Crawler für Präventionsmaßnahmen"""
+
+   import argparse
+
+   from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements
+   from caoscrawler.structure_elements import File
+
+
+   def crawl_file(filename: str, dry_run: bool = False):
+       """Read an XLSX file into a LinkAhead container.
+
+       Parameters
+       ----------
+       filename : str
+           The name of the XLSX file.
+
+       dry_run : bool
+           If True, do not modify the database.
+       """
+       definition = load_definition("cfood.yml")
+       converter_registry = create_converter_registry(definition)
+
+       records = scan_structure_elements(items=File(name="somename.xlsx", path=filename),
+                                         crawler_definition=definition,
+                                         converter_registry=converter_registry)
+       from IPython import embed
+       embed()
+
+   def _parse_arguments():
+       """Parse the arguments."""
+       parser = argparse.ArgumentParser(description='Crawler für Präventionsmaßnahme')
+       parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true")
+       parser.add_argument('xlsx_file', metavar="XLSX file", help="The xlsx file to be crawled.")
+       return parser.parse_args()
+
+
+   def main():
+       """Main function."""
+       args = _parse_arguments()
+       crawl_file(args.xlsx_file, dry_run=args.dry_run)
+
+
+   if __name__ == '__main__':
+       main()
-- 
GitLab