diff --git a/src/doc/tutorials/index.rst b/src/doc/tutorials/index.rst
index 2d4e8be7172f3d1ea8f5f154c4d8013891312309..412d29f01018f05b84e0fe8e43fa631b61b91d04 100644
--- a/src/doc/tutorials/index.rst
+++ b/src/doc/tutorials/index.rst
@@ -9,4 +9,4 @@ This chapter contains a collection of tutorials.
 
    Parameter File<parameterfile>
    Scientific Data Folder<scifolder>
-   WIP: Single Structured File <single_file>
+   Single Structured File <single_file>
diff --git a/src/doc/tutorials/single_file.rst b/src/doc/tutorials/single_file.rst
index 35c6c999e8ed422ac3b8caf6ae92b31d91f182c6..824a658985b9375e140df7fb63a1fc9e7f6a7563 100644
--- a/src/doc/tutorials/single_file.rst
+++ b/src/doc/tutorials/single_file.rst
@@ -1,129 +1,222 @@
-WIP Tutorial: Single structured file
+Tutorial: Single structured file
 ====================================
 
-.. warning::
+In this tutorial, we will create a crawler that reads a single structured file,
+such as a CSV file.
 
-   This tutorial is still work in progress. It may be better than nothing, but it is still
-   incomplete and probably contains serious errors.
+Declarations
+------------
+This tutorial is based on the following simple data model:
 
-   Use at your own risk.
+``model.yml``
 
-In this tutorial, we will create a crawler that reads a single structured file, such as an XLSX
-file.
+.. code-block:: yaml
 
-Declarations
-------------
+   Fish:
+     recommended_properties:
+       date:
+         datatype: DATETIME
+       number:
+         datatype: INTEGER
+       weight:
+         datatype: DOUBLE
+       species:
+         datatype: TEXT
+
+You can insert this model with the following command:
+
+.. code-block:: shell
+
+   python -m caosadvancedtools.models.parser model.yml --sync
+
+
+We will identify `Fish` Records in LinkAhead using the following two
+properties.
 
 ``identifiables.yml``
 
 .. code-block:: yaml
 
-   Präventionsmaßnahme:
-     - Organisation
-     - titel
-     - Laufzeit
+   Fish:
+     - date
+     - number
 
+We will use the following crawler configuration:
 
 ``cfood.yml``
 
 .. code-block:: yaml
 
-   ---
-   metadata:
-     crawler-version: 0.6.1
-   ---
-
-   Präventionsmaßnahme der Organisation:  # Eine Excel-Datei mit Präventionsmaßnahmen
-     type: XLSXTableConverter
-     match: ".*xlsx$"  # Any xlsx file.
-     subtree:
-       Maßnahme:  # Eine Zeile in der Datei
-         type: DictElement
-         match_name: .*
-         match_value: .*
-         records:
-           Präventionsmaßnahme:  # Records edited for each row
-             name: ""
-         subtree:
-           MaßnahmenArt:  # Spalte mit Art der Maßnahme
-             type: IntegerElement
-             match_name: Art der Maßnahme  # Name of the column in the table file
-             match_value: (?P<column_value).*)
-           MaßnahmenTitel:
-             type: TextElement
-             match_name: Titel der Maßnahme  # Name of the column in the table file
-             match_value: (?P<column_value).*)
-             records:  # Records edited for each cell
-               Präventionsmaßnahme:
-                 titel: $column_value
+   ---
+   metadata:
+     crawler-version: 0.9.1
+   ---
+
+   fish_data_file:  # Root file
+     type: CSVTableConverter
+     match: '^fish_data_.*\.csv$'  # Match CSV files whose names start with "fish_data_"
+     subtree:
+       table_row:  # One row in the CSV file
+         type: DictElement
+         match_name: .*  # we want to treat every row, so match anything
+         match_value: .*
+         records:
+           Fish:  # Record for the current row; information from the statements
+                  # below is added to this Record
+         subtree:
+           date:  # Element for the date column
+             type: TextElement
+             match_name: date  # Name of the column in the table file
+             match_value: (?P<column_value>.*)  # We match any value of the row in this
+                                                # column and assign it to the
+                                                # ``column_value`` variable
+             records:  # Records edited for each cell
+               Fish:
+                 date: $column_value
+           species:
+             type: TextElement
+             match_name: species
+             match_value: (?P<column_value>.*)
+             records:
+               Fish:
+                 species: $column_value
+           number:
+             type: TextElement
+             match_name: identifier  # Name of the column in the table file
+             match_value: (?P<column_value>.*)
+             records:
+               Fish:
+                 number: $column_value
+           weight:
+             type: TextElement
+             match_name: weight
+             match_value: (?P<column_value>.*)
+             records:
+               Fish:
+                 weight: $column_value
 
 Python code
 -----------
+The following code (``crawl.py``) allows us to read the CSV file, create
+corresponding `Fish` Records, and synchronize them with LinkAhead.
 
 .. code-block:: python
 
-   #!/usr/bin/env python3
-
-   # Crawler für Präventionsmaßnahme
-   #
-   # Copyright (C) 2023 IndiScale GmbH <info@indiscale.com>
-   #
-   # This program is free software: you can redistribute it and/or modify
-   # it under the terms of the GNU Affero General Public License as
-   # published by the Free Software Foundation, either version 3 of the
-   # License, or (at your option) any later version.
-   #
-   # This program is distributed in the hope that it will be useful,
-   # but WITHOUT ANY WARRANTY; without even the implied warranty of
-   # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-   # GNU Affero General Public License for more details.
-   #
-   # You should have received a copy of the GNU Affero General Public License
-   # along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-   """Crawler für Präventionsmaßnahmen"""
-
-   import argparse
-
-   from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements
-   from caoscrawler.structure_elements import File
-
-
-   def crawl_file(filename: str, dry_run: bool = False):
-       """Read an XLSX file into a LinkAhead container.
-
-       Parameters
-       ----------
-       filename : str
-           The name of the XLSX file.
-
-       dry_run : bool
-           If True, do not modify the database.
-       """
-       definition = load_definition("cfood.yml")
-       converter_registry = create_converter_registry(definition)
-
-       records = scan_structure_elements(items=File(name="somename.xlsx", path=filename),
-                                         crawler_definition=definition,
-                                         converter_registry=converter_registry)
-       from IPython import embed
-       embed()
-
-   def _parse_arguments():
-       """Parse the arguments."""
-       parser = argparse.ArgumentParser(description='Crawler für Präventionsmaßnahme')
-       parser.add_argument('-n', '--dry-run', help="Do not modify the database.", action="store_true")
-       parser.add_argument('xlsx_file', metavar="XSLX file", help="The xlsx file to be crawled.")
-       return parser.parse_args()
-
-
-   def main():
-       """Main function."""
-       args = _parse_arguments()
-       crawl_file(args.xlsx_file, dry_run=args.dry_run)
-
-
-   if __name__ == '__main__':
-       main()
+   #!/usr/bin/env python3
+
+   # Copyright (C) 2023-2024 IndiScale GmbH <info@indiscale.com>
+   #
+   # This program is free software: you can redistribute it and/or modify
+   # it under the terms of the GNU Affero General Public License as
+   # published by the Free Software Foundation, either version 3 of the
+   # License, or (at your option) any later version.
+   #
+   # This program is distributed in the hope that it will be useful,
+   # but WITHOUT ANY WARRANTY; without even the implied warranty of
+   # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   # GNU Affero General Public License for more details.
+   #
+   # You should have received a copy of the GNU Affero General Public License
+   # along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+   """Crawler for fish data"""
+
+   import os
+   import argparse
+   import sys
+   import logging
+
+   from caoscrawler.scanner import load_definition, create_converter_registry, scan_structure_elements
+   from caoscrawler.structure_elements import File
+   from caoscrawler import Crawler, SecurityMode
+   from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
+
+
+   def crawl_file(filename: str, dry_run: bool = False):
+       """Read a CSV file into a LinkAhead container.
+
+       Parameters
+       ----------
+       filename : str
+           The name of the CSV file.
+
+       dry_run : bool
+           If True, do not modify the database.
+       """
+       # setup logging
+       logger = logging.getLogger("caoscrawler")
+       logger.setLevel(logging.DEBUG)
+       logger.addHandler(logging.StreamHandler(stream=sys.stdout))
+
+       # load crawler configuration
+       definition = load_definition("cfood.yml")
+       converter_registry = create_converter_registry(definition)
+
+       # crawl the CSV file
+       records = scan_structure_elements(items=File(name=os.path.basename(filename), path=filename),
+                                         crawler_definition=definition,
+                                         converter_registry=converter_registry)
+       logger.debug(records)
+
+       crawler = Crawler(securityMode=SecurityMode.UPDATE)
+       # This defines how Records on the server are identified with the ones we have locally
+       ident = CaosDBIdentifiableAdapter()
+       ident.load_from_yaml_definition("identifiables.yml")
+       crawler.identifiableAdapter = ident
+
+       # Here we synchronize the data; in a dry run, nothing is committed to the server
+       inserts, updates = crawler.synchronize(commit_changes=not dry_run, unique_names=True,
+                                              crawled_data=records)
+       logger.info("Inserted %d and updated %d Records.", len(inserts), len(updates))
+
+
+   def _parse_arguments():
+       """Parse the arguments."""
+       parser = argparse.ArgumentParser(description='Crawler for fish data')
+       parser.add_argument('-n', '--dry-run', help="Do not modify the database.",
+                           action="store_true")
+       parser.add_argument('csv_file', metavar="CSV_FILE", help="The CSV file to be crawled.")
+       return parser.parse_args()
+
+
+   def main():
+       """Main function."""
+       args = _parse_arguments()
+       crawl_file(args.csv_file, dry_run=args.dry_run)
+
+
+   if __name__ == '__main__':
+       main()
+
+Running it
+----------
+This is an example of a data file that we can crawl:
+
+``fish_data_1.csv``
+
+.. code-block::
+
+   identifier,date,species,weight
+   1,2022-01-02,pike,3.4
+   2,2022-01-02,guppy,2.3
+   3,2022-01-02,pike,2.2
+   3,2022-01-06,pike,2.1
+
+
+If you have created all the files, you can run:
+
+.. code-block:: shell
+
+   python3 crawl.py fish_data_1.csv
+
+Note that if you run the same script again, you will not see any further
+changes to the data in LinkAhead: the `Fish` Records already exist and are
+found again via their identifying properties.
+
+
+You may play around with changing data in the data table. Changes will
+propagate into LinkAhead when you run the Crawler again. If you change one of
+the identifying properties, the Crawler will consider the data that it reads as
+new and create new `Fish` Records.
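+
+If you first want to see what the crawler would do without touching the data
+in LinkAhead, you can pass the script's ``-n``/``--dry-run`` option; the
+script above then scans the file and logs the crawled Records, but does not
+commit any inserts or updates:
+
+.. code-block:: shell
+
+   python3 crawl.py --dry-run fish_data_1.csv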
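+
+To inspect what has actually been stored, you can also query the `Fish`
+Records directly. The following is only a minimal sketch using the LinkAhead
+Python client; it assumes that the ``linkahead`` module is installed and
+configured for your LinkAhead instance:
+
+.. code-block:: python
+
+   import linkahead as db
+
+   # Retrieve all Fish Records from the server and print their properties
+   fishes = db.execute_query("FIND RECORD Fish")
+   for fish in fishes:
+       print(fish.get_property("number").value,
+             fish.get_property("date").value,
+             fish.get_property("species").value,
+             fish.get_property("weight").value)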