diff --git a/.gitignore b/.gitignore
index 9af5ee22fdd68c1c25e98614ab516bf4d384d577..5599d7d263c8927025e128c37eabb185025bf96b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@ provenance.yml
 *.tar.gz
 *.sql
 /integrationtests/test-profile/custom/other/cert/
+src/doc/_apidoc/
+start_caosdb_docker.sh
+src/doc/_apidoc
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 63e69dacc4e0d00b208829c7d8bb1b9566233804..f54912c4d869518f770e86a8a3b0c9054dd25146 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Allow splitting cfoods into multiple yaml documents
 * Implemented macros
 * Converters can now filter the list of children
+* You can now crawl data with name conflicts: `synchronize(unique_names=False)`
 
 ### Changed
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index 3fbf9939664af20f35150e5fff95854634ea3040..97b81cd2e28faf2310e84abd5bb98aba9b60b308 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -36,7 +36,7 @@ from .structure_elements import (StructureElement, Directory, File, Dict, JSONFi
                                  DictFloatElement, DictDictElement,
                                  TextElement, DictTextElement, DictElement, DictListElement)
 from typing import Dict as Dict_t, List, Optional, Tuple, Union
-from abc import abstractmethod
+from abc import ABCMeta, abstractmethod
 from string import Template
 import yaml_header_tools
@@ -255,7 +255,7 @@ def create_records(values: GeneralStore,
     return keys_modified
 
 
-class Converter(object):
+class Converter(object, metaclass=ABCMeta):
     """
     Converters treat StructureElements contained in the hierarchical sturcture.
     """
@@ -283,6 +283,10 @@ class Converter(object):
     def converter_factory(definition: dict,
                           name: str,
                           converter_registry: dict):
+        """Creates a Converter instance of the appropriate class.
+
+        The `type` key in the `definition` determines which Converter class is used.
+        """
 
         if "type" not in definition:
             raise RuntimeError(
@@ -535,6 +539,7 @@ class DictConverter(Converter):
         return {}
 
 
+# TODO: difference to SimpleFileConverter? Do we need both?
class FileConverter(Converter):
     def typecheck(self, element: StructureElement):
         return isinstance(element, File)
@@ -566,6 +571,8 @@ class JSONFileConverter(DictConverter):
     def create_children(self, generalStore: GeneralStore, element: StructureElement):
         if not self.typecheck(element):
             raise RuntimeError("A JSON file is needed to create children")
+        # TODO: either add an explicit type check for the File structure element here,
+        # or add a comment to suppress the mypy type warning.
         with open(element.path, 'r') as json_file:
             json_data = json.load(json_file)
         if not isinstance(json_data, dict):
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index c3381ed5f2a1ad746ab79208e9376e04dc28137a..93196413c7ef16160fb8e528eeb24f4514074436 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -390,9 +390,13 @@ class Crawler(object):
                                                  converter_registry)
 
     @staticmethod
-    def create_local_converters(crawler_definition: dict,
-                                converter_registry: dict):
-        local_converters = []
+    def initialize_converters(crawler_definition: dict, converter_registry: dict):
+        """
+        Takes the cfood as a dict (`crawler_definition`) and creates the converter objects
+        that are defined on the highest level. Child converters will in turn be created
+        during the initialization of their parent converters.
+ """ + converters = [] for key, value in crawler_definition.items(): # Definitions and Converters are reserved keywords @@ -404,10 +408,10 @@ class Crawler(object): continue elif key == "Converters": continue - local_converters.append(Converter.converter_factory( + converters.append(Converter.converter_factory( value, key, converter_registry)) - return local_converters + return converters def start_crawling(self, items: Union[List[StructureElement], StructureElement], crawler_definition: dict, @@ -439,8 +443,7 @@ class Crawler(object): items = [items] self.run_id = uuid.uuid1() - local_converters = Crawler.create_local_converters(crawler_definition, - converter_registry) + local_converters = Crawler.initialize_converters(crawler_definition, converter_registry) # This recursive crawling procedure generates the update list: self.target_data: List[db.Record] = [] self._crawl(items, @@ -452,7 +455,7 @@ class Crawler(object): return self.target_data - def synchronize(self, commit_changes: bool = True): + def synchronize(self, commit_changes: bool = True, unique_names=True): """ Carry out the actual synchronization. """ @@ -460,7 +463,7 @@ class Crawler(object): # After the crawling, the actual synchronization with the database, based on the # update list is carried out: - return self._synchronize(self.target_data, commit_changes) + return self._synchronize(self.target_data, commit_changes, unique_names=unique_names) def can_be_checked_externally(self, record: db.Record): """ @@ -780,7 +783,8 @@ class Crawler(object): return db.Entity(name=name).retrieve() @staticmethod - def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None): + def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None, + unique_names=True): for record in to_be_inserted: for prop in record.properties: entity = Crawler._get_entity_by_name(prop.name) @@ -789,7 +793,7 @@ class Crawler(object): logger.debug(to_be_inserted) if len(to_be_inserted) > 0: if securityMode.value > SecurityMode.RETRIEVE.value: - db.Container().extend(to_be_inserted).insert() + db.Container().extend(to_be_inserted).insert(unique=unique_names) elif run_id is not None: update_cache = UpdateCache() update_cache.insert(to_be_inserted, run_id, insert=True) @@ -807,18 +811,20 @@ class Crawler(object): _resolve_datatype(prop, entity) @staticmethod - def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None): + def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None, + unique_names=True): Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) logger.debug("UPDATE") logger.debug(to_be_updated) if len(to_be_updated) > 0: if securityMode.value > SecurityMode.INSERT.value: - db.Container().extend(to_be_updated).update() + db.Container().extend(to_be_updated).update(unique=unique_names) elif run_id is not None: update_cache = UpdateCache() update_cache.insert(to_be_updated, run_id) - def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True): + def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True, + unique_names=True): """ This function applies several stages: 1) Retrieve identifiables for all records in target_data. 
@@ -854,9 +860,9 @@ class Crawler(object):
 
         if commit_changes:
             self.execute_inserts_in_list(
-                to_be_inserted, self.securityMode, self.run_id)
+                to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
             self.execute_updates_in_list(
-                to_be_updated, self.securityMode, self.run_id)
+                to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
 
         update_cache = UpdateCache()
         pending_inserts = update_cache.get_inserts(self.run_id)
@@ -1030,7 +1036,9 @@ def crawler_main(crawled_directory_path: str,
                  provenance_file: str = None,
                  dry_run: bool = False,
                  prefix: str = "",
-                 securityMode: int = SecurityMode.UPDATE):
+                 securityMode: int = SecurityMode.UPDATE,
+                 unique_names=True,
+                 ):
     """
 
     Parameters
@@ -1051,6 +1059,8 @@ def crawler_main(crawled_directory_path: str,
         remove the given prefix from file paths
     securityMode : int
         securityMode of Crawler
+    unique_names : bool
+        whether to insert or update entities despite name conflicts
 
     Returns
     -------
@@ -1108,7 +1118,7 @@ def crawler_main(crawled_directory_path: str,
             raise RuntimeError("Missing RecordTypes: {}".
                                format(", ".join(notfound)))
 
-    crawler.synchronize(commit_changes=True)
+    crawler.synchronize(commit_changes=True, unique_names=unique_names)
     return 0
@@ -1126,6 +1136,7 @@ def parse_args():
                         help="The subtree of files below the given path will "
                         "be considered. Use '/' for everything.")
     parser.add_argument("-s", "--security-mode", choices=["retrieve", "insert", "update"],
+                        default="retrieve",
                         help="Determines whether entities may only be read from the server, or "
                         "whether inserts or even updates may be done.")
     parser.add_argument("-n", "--dry-run", action="store_true",
@@ -1134,9 +1145,9 @@ def parse_args():
 
     # TODO: load identifiables is a dirty implementation currently
     parser.add_argument("-i", "--load-identifiables",
-                        help="Load identifiables from "
-                        "the given yaml file.")
-
+                        help="Load identifiables from the given yaml file.")
+    parser.add_argument("-u", "--unique-names", action="store_true",
+                        help="Insert or update entities even if name conflicts exist.")
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
                         "of all file objects.")
@@ -1158,16 +1169,17 @@ def main():
         logger.setLevel(logging.INFO)
 
     sys.exit(crawler_main(
-        args.crawled_directory_path,
-        args.cfood_file_name,
-        args.load_identifiables,
-        args.debug,
-        args.provenance,
-        args.dry_run,
-        args.prefix,
-        {"retrieve": SecurityMode.RETRIEVE,
-         "insert": SecurityMode.INSERT,
-         "update": SecurityMode.UPDATE}[args.security_mode]
+        crawled_directory_path=args.crawled_directory_path,
+        cfood_file_name=args.cfood_file_name,
+        identifiables_definition_file=args.load_identifiables,
+        debug=args.debug,
+        provenance_file=args.provenance,
+        dry_run=args.dry_run,
+        prefix=args.prefix,
+        securityMode={"retrieve": SecurityMode.RETRIEVE,
+                      "insert": SecurityMode.INSERT,
+                      "update": SecurityMode.UPDATE}[args.security_mode],
+        unique_names=args.unique_names,
     ))
diff --git a/src/doc/README_SETUP.md b/src/doc/README_SETUP.md
index 4311ce5f2ed3e25dbc7dc3be6b06d9f34a69b4e5..b6995c9a2d950ecd1e832d5b49dac9ed88a7e455 100644
--- a/src/doc/README_SETUP.md
+++ b/src/doc/README_SETUP.md
@@ -9,7 +9,7 @@
 
 #### Linux ####
 
-Make sure that Python (at least version 3.6) and pip is installed, using your system tools and
+Make sure that Python (at least version 3.8) and pip are installed, using your system tools and
 documentation.
 
 Then open a terminal and continue in the [Generic installation](#generic-installation) section.
@@ -57,6 +57,8 @@
 cd caosdb-crawler
 pip3 install --user .
 ```
 
+**Note**: In the near future, this package will also be made available on PyPI.
+
 ## Configuration ##
diff --git a/src/doc/_apidoc/modules.rst b/src/doc/_apidoc/modules.rst
deleted file mode 100644
index 17f187982981ffbf7bcc857056d10644c2bd422b..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/modules.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler
-==========
-
-.. toctree::
-   :maxdepth: 4
-
-   newcrawler
diff --git a/src/doc/_apidoc/newcrawler.converters.rst b/src/doc/_apidoc/newcrawler.converters.rst
deleted file mode 100644
index 893391c229b94baeed9a44c57877ed33f37b2f5e..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.converters.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.converters module
-============================
-
-.. automodule:: newcrawler.converters
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.crawl.rst b/src/doc/_apidoc/newcrawler.crawl.rst
deleted file mode 100644
index b00a6ab6498a0482cea3e9faa54d66d66991dc2d..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.crawl.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.crawl module
-=======================
-
-.. automodule:: newcrawler.crawl
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.identifiable_adapters.rst b/src/doc/_apidoc/newcrawler.identifiable_adapters.rst
deleted file mode 100644
index d8926f41b72d2c54931f045d75f9fe59b21e6076..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.identifiable_adapters.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.identifiable\_adapters module
-========================================
-
-.. automodule:: newcrawler.identifiable_adapters
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.identified_cache.rst b/src/doc/_apidoc/newcrawler.identified_cache.rst
deleted file mode 100644
index 6f697362ad44d1fec01f328550dc8667cc889019..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.identified_cache.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.identified\_cache module
-===================================
-
-.. automodule:: newcrawler.identified_cache
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.rst b/src/doc/_apidoc/newcrawler.rst
deleted file mode 100644
index 202444a5efbde248e52d712575ade49f6dd50601..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-newcrawler package
-==================
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   newcrawler.converters
-   newcrawler.crawl
-   newcrawler.identifiable_adapters
-   newcrawler.identified_cache
-   newcrawler.stores
-   newcrawler.structure_elements
-   newcrawler.utils
-
-Module contents
----------------
-
-.. automodule:: newcrawler
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.stores.rst b/src/doc/_apidoc/newcrawler.stores.rst
deleted file mode 100644
index 7d446c1cd45a6bf1c4b6cf1b1d33e9a2a5ad9751..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.stores.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.stores module
-========================
-
-.. automodule:: newcrawler.stores
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.structure_elements.rst b/src/doc/_apidoc/newcrawler.structure_elements.rst
deleted file mode 100644
index 4613e1d58b0ef9c7cc38096aa25270f469836ce5..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.structure_elements.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.structure\_elements module
-=====================================
-
-.. automodule:: newcrawler.structure_elements
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/_apidoc/newcrawler.utils.rst b/src/doc/_apidoc/newcrawler.utils.rst
deleted file mode 100644
index 4df55a234fd85072068e41d1ce7bb3b17fd1a698..0000000000000000000000000000000000000000
--- a/src/doc/_apidoc/newcrawler.utils.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-newcrawler.utils module
-=======================
-
-.. automodule:: newcrawler.utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/src/doc/cfood.rst b/src/doc/cfood.rst
index 9b0701f7cbffa112e9f1267afd761d8777019cf8..677cadc55709c6c25d16ff547b311102ee78699a 100644
--- a/src/doc/cfood.rst
+++ b/src/doc/cfood.rst
@@ -134,3 +134,16 @@ The **recommended way** of defining metadata, custom converters, macros and the
        match: DataAnalysis
        # (...)
+
+List Mode
+---------
+
+When specifying property values, two special prefix characters can be used to automatically
+create lists or multi properties instead of single values:
+
+.. code-block:: yaml
+
+    Experiment1:
+        Measurement: +Measurement  <- Element in List (list is cleared before run)
+                     *Measurement  <- Multi Property (properties are removed before run)
+                      Measurement  <- Overwrite
diff --git a/src/doc/concepts.rst b/src/doc/concepts.rst
index cf1a1e096a99593835ceaa75019d8630cc8be6f4..c0f21cbaa322caddabed8e045f7b6fc4253d2959 100644
--- a/src/doc/concepts.rst
+++ b/src/doc/concepts.rst
@@ -20,97 +20,12 @@
 Converters treat StructureElements and thereby create the StructureElement that
 are the children of the treated StructureElement. Converters therefore create
 the above named tree. The definition of a Converter also contains what
 Converters shall be used to treat the generated child-StructureElements. The
-definition is there a tree itself. (Question: Should there be global Converters
-that are always checked when treating a StructureElement? Should Converters be
-associated with generated child-StructureElements? Currently, all children are
-created and checked against all Converters. It could be that one would like to
-check file-StructureElements against one set of Converters and
-directory-StructureElements against another)
+definition is therefore a tree itself.
 
-Each StructureElement in the tree has a set of data values, i.e a dictionary of
-key value pairs.
-Some of those values are set due to the kind of StructureElement. For example,
-a file could have the file name as such a key value pair: 'filename': <sth>.
-Converters may define additional functions that create further values. For
-example, a regular expresion could be used to get a date from a file name.
+See :doc:`converters <converters>` for details.
 
-
-A converter is defined via a yml file or part of it. The definition states
-what kind of StructureElement it treats (typically one).
-Also, it defines how children of the current StructureElement are
-created and what Converters shall be used to treat those.
-
-The yaml definition looks like the following:
-
-TODO: outdated, see cfood-schema.yml
-
-.. code-block:: yaml
-
-   converter-name:
-       type: <StructureElement Type>
-       match: ".*"
-       records:
-           Experiment1:
-               parents:
-               - Experiment
-               - Blablabla
-               date: $DATUM
-               (...)
-           Experiment2:
-               parents:
-               - Experiment
-       subtree:
-           (...)
-
-   records:
-       Measurement: <- wird automatisch ein value im valueStore
-           run_number: 25
-       Experiment1:
-           Measurement: +Measurement <- Element in List (list is cleared before run)
-                        *Measurement <- Multi Property (properties are removed before run)
-                        Measurement <- Overwrite
-
-UPDATE-Stage prüft ob es z.B. Gleichheit zwischen Listen gibt (die dadurch definiert sein
-kann, dass alle Elemente vorhanden, aber nicht zwingend in der richtigen Reihenfolge sind)
-evtl. brauchen wir das nicht, weil crawler eh schon deterministisch ist.
-
-The converter-name is a description of what it represents (e.g.
-'experiment-folder') and is used as identifier.
-
-The type restricts what kind of StructureElements are treated.
-The match is by default a regular expression, that is matche against the
-name of StructureElements. Discussion: StructureElements might not have a
-name (e.g. a dict) or should a name be created artificially if necessary
-(e.g. "root-dict")? It might make sense to allow keywords like "always" and
-other kinds of checks. For example a dictionary could be checked against a
-json-schema definition.
-
-recordtypes is a list of definitions that define the semantic structure
-(see details below).
-
-valuegenerators allow to provide additional functionality that creates
-data values in addition to the ones given by default via the
-StructureElement. This can be for example a match group of a regular
-expression applied to the filename.
-It should be possible to access the values of parent nodes. For example,
-the name of a parent node could be accessed with $converter-name.name.
-Discussion: This can introduce conflicts, if the key <converver-name>
-already exists. An alternative would be to identify those lookups. E.g.
-$$converter-name.name (2x$).
-
-childrengenerators denotes how StructureElements shall be created that are
-children of the current one.
-
-subtree contains a list of Converter defnitions that look like the one
-described here.
-
-those keywords should be allowed but not required. I.e. if no
-valuegenerators shall be defined, the keyword may be omitted.
-
-
 Relevant sources in:
 src/converters.py
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 28b81947587f68381d52a380cc54748e58b9c794..a30a7d92850f90be14f82a4e563fb56df5fcde88 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -1,6 +1,64 @@
 Converters
 ))))))))))
 
+Converters treat StructureElements and thereby create the StructureElements that
+are the children of the treated StructureElement. Converters therefore create
+the tree of structure elements. The definition of a Converter also contains what
+Converters shall be used to treat the generated child-StructureElements. The
+definition is therefore a tree itself.
+
+Each StructureElement in the tree has a set of data values, i.e. a dictionary of
+key value pairs.
+Some of those values are set due to the kind of StructureElement. For example,
+a file could have the file name as such a key value pair: 'filename': <sth>.
+Converters may define additional functions that create further values. For
+example, a regular expression could be used to get a date from a file name.
+
+A converter is defined via a yaml file or part of it. The definition states
+what kind of StructureElement it treats (typically one).
+Also, it defines how children of the current StructureElement are
+created and what Converters shall be used to treat those.
+
+The yaml definition looks like the following:
+
+TODO: outdated, see cfood-schema.yml
+
+.. code-block:: yaml
+
+    <NodeName>:
+        type: <ConverterName>
+        match: ".*"
+        records:
+            Experiment1:
+                parents:
+                - Experiment
+                - Blablabla
+                date: $DATUM
+                (...)
+            Experiment2:
+                parents:
+                - Experiment
+        subtree:
+            (...)
+
+The **<NodeName>** is a description of what it represents (e.g.
+'experiment-folder') and is used as identifier.
+
+**<type>** selects the converter that is going to be matched against the current structure
+element. If the structure element matches (this is a combination of a typecheck and a detailed
+match, see :py:class:`~caoscrawler.converters.Converter` for details) the converter is used
+to generate records (see :py:meth:`~caoscrawler.converters.Converter.create_records`) and to
+possibly process a subtree, as defined by the function :func:`caoscrawler.converters.create_children`.
+
+**records** is a dict of definitions that define the semantic structure
+(see details below).
+
+**subtree** contains a list of Converter definitions that look like the one
+described here.
+
 
 Standard Converters
 +++++++++++++++++++
@@ -41,9 +99,10 @@ JSONFileConverter
 
 TextElementConverter
+====================
 
 TableConverter
-=================
+==============
 
 A generic converter (abstract) for files containing tables.
 Currently, there are two specialized implementations for xlsx-files and csv-files.
@@ -80,7 +139,7 @@ Example:
 
 XLSXTableConverter
-=================
+==================
 
 CSVTableConverter
 =================
@@ -88,3 +147,162 @@ CSVTableConverter
 
 Custom Converters
 +++++++++++++++++
+
+It was previously mentioned that it is possible to create custom converters.
+These custom converters can be used to integrate arbitrary data extraction and ETL capabilities
+into the caosdb-crawler and make these extensions available to any yaml specification.
+
+The basic syntax for adding a custom converter to a yaml cfood definition file is:
+
+.. code-block:: yaml
+
+    Converters:
+      <NameOfTheConverterInYamlFile>:
+        package: <python>.<module>.<name>
+        converter: <PythonClassName>
+
+The Converters section can be put into either the first or the second document of the cfood yaml file.
+It can also be part of a single-document yaml cfood file. Please refer to :doc:`the cfood documentation<cfood>` for more details.
+
+Details:
+
+- **<NameOfTheConverterInYamlFile>**: This is the name of the converter as it is going to be used in the present yaml file.
+- **<python>.<module>.<name>**: The name of the module where the converter class resides.
+- **<PythonClassName>**: Within this specified module there must be a class inheriting from the base class :py:class:`caoscrawler.converters.Converter`.
+
+The following methods are abstract and need to be overridden by your custom converter to make it work:
+
+- :py:meth:`~caoscrawler.converters.Converter.create_children`
+- :py:meth:`~caoscrawler.converters.Converter.match`
+- :py:meth:`~caoscrawler.converters.Converter.typecheck`
+
+
+Example
+=======
+
+In the following, we will explain the process of adding a custom converter to a yaml file using
+a SourceResolver that is able to attach a source element to another entity.
+
+**Note**: This example might become a standard crawler soon, as part of the scifolder specification.
+See https://doi.org/10.3390/data5020043 for details.
+In this documentation example we will, therefore, add it to a package called "scifolder".
+
+First we will create our package and module structure, which might be:
+
+.. code-block::
+
+    scifolder_package/
+      README.md
+      setup.cfg
+      setup.py
+      Makefile
+      tox.ini
+      src/
+        scifolder/
+          __init__.py
+          converters/
+            __init__.py
+            sources.py  # <- the actual file containing
+                        #    the converter class
+      doc/
+      unittests/
+
+Now we need to create a class called "SourceResolver" in the file "sources.py". In this
+more advanced example, we will not inherit our converter directly from
+:py:class:`~caoscrawler.converters.Converter`, but use
+:py:class:`~caoscrawler.converters.TextElementConverter`. The latter already implements
+:py:meth:`~caoscrawler.converters.Converter.match` and
+:py:meth:`~caoscrawler.converters.Converter.typecheck`, so only an implementation for
+:py:meth:`~caoscrawler.converters.Converter.create_children` has to be provided by us.
+Furthermore we will customize the method
+:py:meth:`~caoscrawler.converters.Converter.create_records` that allows us to specify a more
+complex record generation procedure than provided in the standard implementation. One specific
+limitation of the standard implementation is that only a fixed number of records can be
+generated by the yaml definition. So for any application that, like this one, requires an
+arbitrary number of records to be created, a customized implementation of
+:py:meth:`~caoscrawler.converters.Converter.create_records` is recommended.
+In this context it is recommended to make use of the function
+:func:`caoscrawler.converters.create_records` that implements the creation of record objects
+from python dictionaries of the same structure as those given in a yaml definition.
+
+.. code-block:: python
+
+    import re
+    from caoscrawler.stores import GeneralStore, RecordStore
+    from caoscrawler.converters import TextElementConverter, create_records
+    from caoscrawler.structure_elements import StructureElement, TextElement
+
+
+    class SourceResolver(TextElementConverter):
+        """
+        This resolver uses a source list element (e.g. from the markdown readme file)
+        to link sources correctly.
+        """
+
+        def __init__(self, definition: dict, name: str,
+                     converter_registry: dict):
+            """
+            Initialize a new source resolver.
+ """ + super().__init__(definition, name, converter_registry) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + + # The source resolver does not create children: + + return [] + + def create_records(self, values: GeneralStore, + records: RecordStore, + element: StructureElement, + file_path_prefix): + if not isinstance(element, TextElement): + raise RuntimeError() + + # This function must return a list containing tuples, each one for a modified + # property: (name_of_entity, name_of_property) + keys_modified = [] + + # This is the name of the entity where the source is going to be attached: + attach_to_scientific_activity = self.definition["scientific_activity"] + rec = records[attach_to_scientific_activity] + + # The "source" is a path to a source project, so it should have the form: + # /<Category>/<project>/<scientific_activity>/ + # obtain these information from the structure element: + val = element.value + regexp = (r'/(?P<category>(SimulationData)|(ExperimentalData)|(DataAnalysis))' + '/(?P<project_date>.*?)_(?P<project_identifier>.*)' + '/(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2})(_(?P<identifier>.*))?/') + + res = re.match(regexp, val) + if res is None: + raise RuntimeError("Source cannot be parsed correctly.") + + # Mapping of categories on the file system to corresponding record types in CaosDB: + cat_map = { + "SimulationData": "Simulation", + "ExperimentalData": "Experiment", + "DataAnalysis": "DataAnalysis"} + linkrt = cat_map[res.group("category")] + + keys_modified.extend(create_records(values, records, { + "Project": { + "date": res.group("project_date"), + "identifier": res.group("project_identifier"), + }, + linkrt: { + "date": res.group("date"), + "identifier": res.group("identifier"), + "project": "$Project" + }, + attach_to_scientific_activity: { + "sources": "+$" + linkrt + }}, file_path_prefix)) + + # Process the records section of the yaml definition: + keys_modified.extend( + super().create_records(values, records, element, file_path_prefix)) + + # The create_records function must return the modified keys to make it compatible + # to the crawler functions: + return keys_modified + + +If the recommended (python) package structure is used, the package containing the converter +definition can just be installed using `pip install .` or `pip install -e .` from the +`scifolder_package` directory. + +The following yaml block will register the converter in a yaml file: + +.. code-block:: yaml + + Converters: + SourceResolver: + package: scifolder.converters.sources + converter: SourceResolver diff --git a/src/doc/macros.rst b/src/doc/macros.rst index d8e819464b6af55cf6ea4e133b9b4110406f98dc..3d995c1fbc67b155a6df606ac2f84a0cec26d1a5 100644 --- a/src/doc/macros.rst +++ b/src/doc/macros.rst @@ -1,7 +1,7 @@ Macros ------ -Macros highly facilitate the writing of complex :doc:`CFoods<cfoods>`. Consider the following prevalent example: +Macros highly facilitate the writing of complex :doc:`CFoods<cfood>`. Consider the following prevalent example: .. _example_files: .. code-block:: yaml