From fbdf229364159a0eef3e239bef1698d9d49e9603 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Mon, 16 Oct 2023 09:09:56 +0200 Subject: [PATCH] DOC: More documentation changes. --- src/caoscrawler/crawl.py | 8 ++++---- src/caoscrawler/scanner.py | 5 ++--- src/caoscrawler/structure_elements.py | 24 +++++++++++++++++++++-- src/doc/converters.rst | 28 ++++++++++++++++++++------- 4 files changed, 49 insertions(+), 16 deletions(-) diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index da721ec3..f9ecfd77 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -1254,7 +1254,7 @@ def crawler_main(crawled_directory_path: str, dry_run: bool = False, prefix: str = "", securityMode: SecurityMode = SecurityMode.UPDATE, - unique_names=True, + unique_names: bool = True, restricted_path: Optional[list[str]] = None, remove_prefix: Optional[str] = None, add_prefix: Optional[str] = None, @@ -1270,9 +1270,9 @@ def crawler_main(crawled_directory_path: str, identifiables_definition_file : str filename of an identifiable definition yaml file debug : bool - DEPRECATED, whether or not to run in debug mode + DEPRECATED, use a provenance file instead. 
provenance_file : str - provenance information will be stored in a file with given filename + Provenance information will be stored in a file with given filename dry_run : bool do not commit any chnages to the server prefix : str @@ -1312,7 +1312,7 @@ def crawler_main(crawled_directory_path: str, _fix_file_paths(crawled_data, add_prefix, remove_prefix) _check_record_types(crawled_data) - if provenance_file is not None and debug: + if provenance_file is not None: crawler.save_debug_data(debug_tree=debug_tree, filename=provenance_file) if identifiables_definition_file is not None: diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index f1cad055..53fb7ccc 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -288,7 +288,6 @@ Parameters # extracts values from structure element and stores them in the # variable store converter.create_values(general_store_copy, element) - keys_modified = converter.create_records( general_store_copy, record_store_copy, element) @@ -396,7 +395,7 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen crawler_definition: dict, converter_registry: dict, restricted_path: Optional[list[str]] = None, - debug_tree: Optional[DebugTree] = None): + debug_tree: Optional[DebugTree] = None) -> list[db.Record]: """ Start point of the crawler recursion. @@ -417,7 +416,7 @@ def scan_structure_elements(items: Union[list[StructureElement], StructureElemen Returns ------- - crawled_data : list + crawled_data : list[db.Record] the final list with the target state of Records. """ diff --git a/src/caoscrawler/structure_elements.py b/src/caoscrawler/structure_elements.py index 952f29d0..ff070626 100644 --- a/src/caoscrawler/structure_elements.py +++ b/src/caoscrawler/structure_elements.py @@ -28,9 +28,16 @@ import warnings class StructureElement(object): - """ base class for elements in the hierarchical data structure """ + """Base class for elements in the hierarchical data structure. 
- def __init__(self, name): +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + """ + + def __init__(self, name: str): # Used to store usage information for debugging: self.metadata: tDict[str, set[str]] = { "usage": set() @@ -46,6 +53,18 @@ class StructureElement(object): class FileSystemStructureElement(StructureElement): + """StructureElement representing an element of a file system, like a directory or a simple file. + +Parameters +---------- + +name: str + The name of the StructureElement. May be used for pattern matching by CFood rules. + +path: str + The path to the file or directory. + """ + def __init__(self, name: str, path: str): super().__init__(name) self.path = path @@ -65,6 +84,7 @@ class Directory(FileSystemStructureElement): class File(FileSystemStructureElement): + """StructureElement representing a file.""" pass diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 119cee84..0dde61d5 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -58,13 +58,16 @@ to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_reco **records** is a dict of definitions that define the semantic structure (see details below). -Subtree contains a list of Converter defnitions that look like the one -described here. +**subtree** makes the yaml recursive: It contains a list of new Converter definitions, which work on + the StructureElements that are returned by the current Converter. Standard Converters +++++++++++++++++++ +These are the standard converters that exist in a default installation. For writing and applying +*custom converters*, see :ref:`below <Custom Converters>`. + Directory Converter =================== The Directory Converter creates StructureElements for each File and Directory @@ -189,11 +192,15 @@ CSV File → DictElement Custom Converters +++++++++++++++++ -It was previously mentioned that it is possible to create custom converters. 
+As mentioned before it is possible to create custom converters. These custom converters can be used to integrate arbitrary data extraction and ETL capabilities -into the caosdb-crawler and make these extensions available to any yaml specification. +into the LinkAhead crawler and make these extensions available to any yaml specification. + +Tell the crawler about a custom converter +========================================= -The basic syntax for adding a custom converter to a yaml cfood definition file is: +To use a custom converter, it must be defined in the ``Converters`` section of the CFood yaml file. +The basic syntax for adding a custom converter to a definition file is: .. code-block:: yaml @@ -202,7 +209,7 @@ The basic syntax for adding a custom converter to a yaml cfood definition file i package: <python>.<module>.<name> converter: <PythonClassName> -The Converters-section can be either put into the first or second document of the cfood yaml file. +The Converters section can be either put into the first or the second document of the cfood yaml file. It can be also part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details. Details: @@ -211,9 +218,16 @@ Details: - **<python>.<module>.<name>**: The name of the module where the converter class resides. - **<PythonClassName>**: Within this specified module there must be a class inheriting from base class :py:class:`caoscrawler.converters.Converter`. +Implementing a custom converter +=============================== + +Converters inherit from the :py:class:`~caoscrawler.converters.Converter` class. + The following methods are abstract and need to be overwritten by your custom converter to make it work: -- :py:meth:`~caoscrawler.converters.Converter.create_children` +:py:meth:`~caoscrawler.converters.Converter.create_children`: + Return a list of child StructureElement objects. 
+ - :py:meth:`~caoscrawler.converters.Converter.match` - :py:meth:`~caoscrawler.converters.Converter.typecheck` -- GitLab