diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst index b6f0fab511f3646f3ec6a7a320299e72a2c20038..2d4e8be7172f3d1ea8f5f154c4d8013891312309 100644 --- a/src/doc/tutorials/index.rst +++ b/src/doc/tutorials/index.rst @@ -9,4 +9,4 @@ This chapter contains a collection of tutorials. Parameter File<parameterfile> Scientific Data Folder<scifolder> - + WIP: Single Structured File <single_file> diff --git a/src/doc/tutorials/single_file.rst b/src/doc/tutorials/single_file.rst new file mode 100644 index 0000000000000000000000000000000000000000..8a4e1b92378e89539db698db27a4996bd9e5ce38 --- /dev/null +++ b/src/doc/tutorials/single_file.rst @@ -0,0 +1,129 @@ +WIP Tutorial: Single structured file +==================================== + +.. warning:: + + This tutorial is still work in progress. It may be better than nothing, but it is still + incomplete and probably contains serious errors. + + Use at your own risk. + +In this tutorial, we will create a crawler that reads a single structured file, such as an XLSX +file. + +Declarations +------------ + +``identifiables.yml`` + +.. code-block:: yaml + + Präventionsmaßnahme: + - Organisation + - titel + - Laufzeit + + +``cfood.yml`` + +.. code-block:: yaml + + --- + metadata: + crawler-version: 0.6.1 + --- + + Präventionsmaßnahme der Organisation: # Eine Excel-Datei mit Präventionsmaßnahmen + type: XLSXTableConverter + match: ".*xlsx$" # Any xlsx file. 
+ subtree: + Maßnahme: # Eine Zeile in der Datei + type: DictElement + match_name: .* + match_value: .* + records: + Präventionsmaßnahme: # Records edited for each row + name: "" + subtree: + MaßnahmenArt: # Spalte mit Art der Maßnahme + type: IntegerElement + match_name: Art der Maßnahme # Name of the column in the table file + match_value: (?P<column_value>.*) + MaßnahmenTitel: + type: TextElement + match_name: Titel der Maßnahme # Name of the column in the table file + match_value: (?P<column_value>.*) + records: # Records edited for each cell + Präventionsmaßnahme: + titel: $column_value + + +Python code +----------- + + +.. code-block:: python + + #!/usr/bin/env python3 + + # Crawler für Präventionsmaßnahme + # + # Copyright (C) 2023 IndiScale GmbH <info@indiscale.com> + # + # This program is free software: you can redistribute it and/or modify + # it under the terms of the GNU Affero General Public License as + # published by the Free Software Foundation, either version 3 of the + # License, or (at your option) any later version. + # + # This program is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU Affero General Public License for more details. + # + # You should have received a copy of the GNU Affero General Public License + # along with this program. If not, see <https://www.gnu.org/licenses/>. + + """Crawler für Präventionsmaßnahmen""" + + import argparse + + from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements + from caoscrawler.structure_elements import File + + + def crawl_file(filename: str, dry_run: bool = False): + """Read an XLSX file into a LinkAhead container. + + Parameters + ---------- + filename : str + The name of the XLSX file. + + dry_run : bool + If True, do not modify the database. 
+ """ + definition = load_definition("cfood.yml") + converter_registry = create_converter_registry(definition) + + records = scan_structure_elements(items=File(name="somename.xlsx", path=filename), + crawler_definition=definition, + converter_registry=converter_registry) + from IPython import embed + embed() + + def _parse_arguments(): + """Parse the arguments.""" + parser = argparse.ArgumentParser(description='Crawler für Präventionsmaßnahme') + parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true") + parser.add_argument('xlsx_file', metavar="XLSX file", help="The xlsx file to be crawled.") + return parser.parse_args() + + + def main(): + """Main function.""" + args = _parse_arguments() + crawl_file(args.xlsx_file, dry_run=args.dry_run) + + + if __name__ == '__main__': + main()