diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 80a3728ce5b1f413d2bdd674b26a7dca1122eef5..dcb5d5fe12ac12806c40b18f4b5d3e0d15a0e87d 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -115,10 +115,11 @@ class ConverterValidationError(Exception): def create_path_value(func): - """decorator for create_values functions that adds a value containing the path + """Decorator for create_values functions that adds a value containing the path. should be used for StructureElement that are associated with file system objects that have a path, like File or Directory. + """ def inner(self, values: GeneralStore, element: StructureElement): @@ -155,22 +156,28 @@ def replace_variables(propvalue, values: GeneralStore): def handle_value(value: Union[dict, str, list], values: GeneralStore): - """ - determines whether the given value needs to set a property, be added to an existing value (create a list) or + """Determine whether the given value needs to set a property, be added to an existing value (create a list) or add as an additional property (multiproperty). Variable names (starting with a "$") are replaced by the corresponding value stored in the `values` GeneralStore. - Parameters: - - value: if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. - if dict, must have keys "value" and "collection_mode". The returned tuple is directly - created from the corresponding values. - if list, each element is checked for replacement and the resulting list will be used - as (list) value for the property - Returns a tuple: - - the final value of the property; variable names contained in `values` are replaced. - - the collection mode (can be single, list or multiproperty) +Parameters +---------- + +value: + - if str, the value to be interpreted. E.g. "4", "hallo" or "$a" etc. + - if dict, must have keys "value" and "collection_mode". The returned tuple is directly + created from the corresponding values. 
+ - if list, each element is checked for replacement and the resulting list will be used + as (list) value for the property + +Returns +------- + +out: tuple + - the final value of the property; variable names contained in `values` are replaced. + - the collection mode (can be single, list or multiproperty) """ # @review Florian Spreckelsen 2022-05-13 @@ -302,9 +309,7 @@ def create_records(values: GeneralStore, records: RecordStore, def_records: dict class Converter(object, metaclass=ABCMeta): - """ - Converters treat StructureElements contained in the hierarchical sturcture. - """ + """Converters treat StructureElements contained in the hierarchical structure.""" def __init__(self, definition: dict, name: str, converter_registry: dict): self.definition = definition @@ -535,9 +540,7 @@ class DirectoryConverter(Converter): class SimpleFileConverter(Converter): - """ - Just a file, ignore the contents. - """ + """Just a file, ignore the contents.""" def typecheck(self, element: StructureElement): return isinstance(element, File) @@ -568,9 +571,7 @@ class FileConverter(SimpleFileConverter): class MarkdownFileConverter(SimpleFileConverter): - """ - reads the yaml header of markdown files (if a such a header exists). 
- """ + """Read the yaml header of markdown files (if such a header exists).""" def create_children(self, generalStore: GeneralStore, element: StructureElement): # TODO: See comment on types and inheritance @@ -604,7 +605,7 @@ class MarkdownFileConverter(SimpleFileConverter): def convert_basic_element(element: Union[list, dict, bool, int, float, str, None], name=None, msg_prefix=""): - """converts basic Python objects to the corresponding StructureElements """ + """Convert basic Python objects to the corresponding StructureElements.""" if isinstance(element, list): return ListElement(name, element) elif isinstance(element, dict): @@ -628,12 +629,16 @@ def convert_basic_element(element: Union[list, dict, bool, int, float, str, None def validate_against_json_schema(instance, schema_resource: Union[dict, str]): - """validates given ``instance`` against given ``schema_resource``. + """Validate given ``instance`` against given ``schema_resource``. + +Parameters +---------- - Args: - instance: instance to be validated, typically ``dict`` but can be ``list``, ``str``, etc. - schema_resource: Either a path to the JSON file containing the schema or a ``dict`` with - the schema +instance: + Instance to be validated, typically ``dict`` but can be ``list``, ``str``, etc. + +schema_resource: + Either a path to the JSON file containing the schema or a ``dict`` with the schema. """ if isinstance(schema_resource, dict): schema = schema_resource @@ -752,15 +757,19 @@ class YAMLFileConverter(SimpleFileConverter): def match_name_and_value(definition, name, value): - """ - takes match definitions from the definition argument and applies regular expressiion to name - and possibly value + """Take match definitions from the definition argument and apply regular expression to name and + possibly value one of the keys 'match_name' and "match' needs to be available in definition 'match_value' is optional - Returns None, if match_name or match lead to no match. 
Otherwise, returns a dictionary with the - matched groups, possibly including matches from using match_value +Returns +------- + +out: + None, if match_name or match lead to no match. Otherwise, returns a dictionary with the + matched groups, possibly including matches from using match_value + """ if "match_name" in definition: if "match" in definition: @@ -796,11 +805,11 @@ def match_name_and_value(definition, name, value): class _AbstractScalarValueElementConverter(Converter): - """ - A base class for all converters that have a scalar value that can be matched using a regular + """A base class for all converters that have a scalar value that can be matched using a regular expression. values must have one of the following type: str, bool, int, float + """ default_matches = { @@ -840,15 +849,14 @@ class _AbstractScalarValueElementConverter(Converter): return match_name_and_value(self.definition, element.name, element.value) def _typecheck(self, element: StructureElement, allowed_matches: dict): - """ - returns whether the type of StructureElement is accepted. + """Return whether the type of StructureElement is accepted. - Parameters: - element: StructureElement, the element that is checked - allowed_matches: Dict, a dictionary that defines what types are allowed. It must have the - keys 'accept_text', 'accept_bool', 'accept_int', and 'accept_float'. + Parameters: element: StructureElement, the element that is checked allowed_matches: Dict, a + dictionary that defines what types are allowed. It must have the keys 'accept_text', + 'accept_bool', 'accept_int', and 'accept_float'. 
 returns: whether or not the converter allows the type of element + """ if (bool(allowed_matches["accept_text"]) and isinstance(element, TextElement)): return True @@ -995,14 +1003,14 @@ class DictListElementConverter(ListElementConverter): class TableConverter(Converter): - """ - This converter reads tables in different formats line by line and + """This converter reads tables in different formats line by line and allows matching the corresponding rows. The subtree generated by the table converter consists of DictElements, each being a row. The corresponding header elements will become the dictionary keys. The rows can be matched using a DictElementConverter. + """ @abstractmethod def get_options(self): @@ -1100,12 +1108,12 @@ class CSVTableConverter(TableConverter): class DateElementConverter(TextElementConverter): - """ - allows to convert different text formats of dates to Python date objects. + """Allows converting different text formats of dates to Python date objects. The text to be parsed must be contained in the "date" group. The format string can be supplied under "dateformat" in the Converter definition. The library used is datetime so see its documentation for information on how to create the format string. 
+ """ def match(self, element: StructureElement): diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index cadd7798d93b94bf4f11c76d18fe8431e61c5d0a..7b9119caa1cd4dd4623a9141de4a70abb4da5946 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -91,20 +91,22 @@ yaml.SafeLoader.add_constructor("!macro", macro_constructor) def check_identical(record1: db.Entity, record2: db.Entity, ignore_id=False): - """ - This function uses compare_entities to check whether to entities are identical - in a quite complex fashion: - - If one of the entities has additional parents or additional properties -> not identical - - If the value of one of the properties differs -> not identical - - If datatype, importance or unit are reported different for a property by compare_entities - return "not_identical" only if these attributes are set explicitely by record1. - Ignore the difference otherwise. - - If description, name, id or path appear in list of differences -> not identical. - - If file, checksum, size appear -> Only different, if explicitely set by record1. - - record1 serves as the reference, so datatype, importance and unit checks are carried - out using the attributes from record1. In that respect, the function is not symmetrical - in its arguments. + """Check whether two entities are identical. + +This function uses compare_entities to check whether two entities are identical +in a quite complex fashion: + +- If one of the entities has additional parents or additional properties -> not identical +- If the value of one of the properties differs -> not identical +- If datatype, importance or unit are reported different for a property by compare_entities + return "not_identical" only if these attributes are set explicitly by record1. + Ignore the difference otherwise. +- If description, name, id or path appear in list of differences -> not identical. +- If file, checksum, size appear -> Only different, if explicitly set by record1. 
+ +record1 serves as the reference, so datatype, importance and unit checks are carried +out using the attributes from record1. In that respect, the function is not symmetrical +in its arguments. """ comp = compare_entities(record1, record2) diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index eb9333f73a79d5dd0dedc47b570b2934d4baf339..ceeb4750bb3be82d7bc76b0ca95c038af632921e 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -66,29 +66,29 @@ def convert_value(value: Any): class IdentifiableAdapter(metaclass=ABCMeta): - """ - Base class for identifiable adapters. + """Base class for identifiable adapters. + +Some terms: - Some terms: - - Registered identifiable is the definition of an identifiable which is: - - A record type as the parent - - A list of properties - - A list of referenced by statements +- Registered identifiable is the definition of an identifiable which is: + - A record type as the parent + - A list of properties + - A list of referenced by statements +- Identifiable is the concrete identifiable, e.g. the Record based on + the registered identifiable with all the values filled in. +- Identified record is the result of retrieving a record based on the + identifiable from the database. - - Identifiable is the concrete identifiable, e.g. the Record based on - the registered identifiable with all the values filled in. +General question to clarify: - - Identified record is the result of retrieving a record based on the - identifiable from the database. +- Do we want to support multiple identifiables per RecordType? +- Current implementation supports only one identifiable per RecordType. - General question to clarify: - Do we want to support multiple identifiables per RecordType? - Current implementation supports only one identifiable per RecordType. +The list of referenced by statements is currently not implemented. 
 - The list of referenced by statements is currently not implemented. +The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registered +identifiable, identifiable and identified record) for a Record. - The IdentifiableAdapter can be used to retrieve the three above mentioned objects (registred - identifiabel, identifiable and identified record) for a Record. """ @staticmethod diff --git a/src/caoscrawler/scanner.py b/src/caoscrawler/scanner.py index ff6156aed3bde639435219a705d6d7d2124f7f38..400e182bec9e562f63c9c57245915523c3fc1355 100644 --- a/src/caoscrawler/scanner.py +++ b/src/caoscrawler/scanner.py @@ -26,7 +26,8 @@ """ This is the scanner, the original "_crawl" function from crawl.py. -This is just the functionality, that extracts data from the file system. + +This is just the functionality that extracts data from the file system. """ from __future__ import annotations @@ -234,18 +235,20 @@ def scanner(items: list[StructureElement], restricted_path: Optional[list[str]] = None, crawled_data: Optional[list[db.Record]] = None, debug_tree: Optional[DebugTree] = None): - """ - Crawl a list of StructureElements and apply any matching converters. + """Crawl a list of StructureElements and apply any matching converters. Formerly known as "_crawl". items: structure_elements (e.g. files and folders on one level on the hierarchy) + converters: locally defined converters for - treating structure elements. A locally defined converter could be - one that is only valid for a specific subtree of the originally - cralwed StructureElement structure. + treating structure elements. A locally defined converter could be + one that is only valid for a specific subtree of the originally + crawled StructureElement structure. + general_store and record_store: This recursion of the crawl function should only operate on copies of the global stores of the Crawler object. 
+ restricted_path: optional, list of strings, traverse the data tree only along the given path. For example, when a directory contains files a, b and c and b is given in restricted_path, a and c will be ignroed by the crawler. @@ -253,6 +256,7 @@ def scanner(items: list[StructureElement], normal. The first element of the list provided by restricted_path should be the name of the StructureElement at this level, i.e. denoting the respective element in the items argument. + """ # This path_found variable stores wether the path given by restricted_path was found in the # data tree diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 95676627d95a5cd6bbca5208b67f9689fffb6806..98849609f0cab2afba037a82fe4ae6802caa5956 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -483,6 +483,7 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here) keys_modified = create_records(values, records, record_def) + Debugging ========= diff --git a/src/doc/getting_started/helloworld.md b/src/doc/getting_started/helloworld.md index aa8f72ceda16398d4b8541eb5fceb65dd6106105..f26caf9e2484dcc8423f4fbc95aefe84e0aa0861 100644 --- a/src/doc/getting_started/helloworld.md +++ b/src/doc/getting_started/helloworld.md @@ -51,7 +51,7 @@ print(f"Updated {len(updates)} Records") ``` You also need a file called `identifiables.yml` with the following content: -```yml +```yaml HelloWorld: - name ``` diff --git a/src/doc/macros.rst b/src/doc/macros.rst index 7685731d35afab51074bb4d12c51ede0a7ba1b75..5d8a411607af223c5b8d65b1553e710553d998f0 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -24,7 +24,7 @@ Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider t This example just inserts a file called ``README.md`` contained in Folder ``ExpreimentalData/`` into CaosDB, assigns the parent (RecordType) ``MarkdownFile`` and allows for later referencing this entity within the cfood. 
As file objects are created in the cfood specification using the ``records`` section with the special role ``File``, defining and using many files can become very cumbersome and make the cfood file difficult to read. The same version using cfood macros could be defined as follows: - + .. _example_files_2: .. code-block:: yaml @@ -79,7 +79,7 @@ The expanded version of `ExperimentalData` will look like: type: SimpleFile type: Directory -This :ref:`example<_example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`). +This :ref:`example<example_files_2>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_2`). Complex Example @@ -117,7 +117,7 @@ of macro variable substitutions that generate crawler variable substitutions: Simulation: $recordtype: +$File -The expanded version of :ref:`example<_example_1>` can be seen in :ref:`example<_example_1_expanded>`. +The expanded version of :ref:`example<example_1>` can be seen in :ref:`example<example_1_expanded>`. .. _example_1_expanded: @@ -140,7 +140,7 @@ The expanded version of :ref:`example<_example_1>` can be seen in :ref:`example< type: SimpleFile type: Directory -This :ref:`example<_example_1>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_1`). +This :ref:`example<example_1>` can also be found in the macro unit tests (see :func:`unittests.test_macros.test_documentation_example_1`). @@ -173,7 +173,7 @@ To use the same macro multiple times in the same yaml node, lists can be used: - {} # <- This is the third one, just using default arguments -This :ref:`example<_example_multiple>` is taken from the macro unit tests (see :func:`unittests.test_macros.test_use_macro_twice`). +This :ref:`example<example_multiple>` is taken from the macro unit tests (see :func:`unittests.test_macros.test_use_macro_twice`). The example will be expanded to: