diff --git a/CHANGELOG.md b/CHANGELOG.md index 34ce18520e245cbc82c558967f549c72071f62ac..d04329ed20eb29f51a4f9d149c2dca7a4cf6f240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,8 +36,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 their contents were last modified before that datetime. ### Changed ### + - Registered identifiables can also be used by children of the given RecordType if no registered identifiable is defined for them. +- `None` and other NA values (i.e., values where `pandas.isna` is + `True`) are now interpreted as empty strings in + `converters.match_name_and_value` instead of being cast to string naïvely. ### Deprecated ### @@ -48,10 +52,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `spss_to_datamodel` script works again. - The cfood now supports bi-directional references when defining records on the same level. (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) +- [#112](https://gitlab.com/linkahead/linkahead-crawler/-/issues/112) + Children of CSVTableConverter match despite match_value: ".+" and + empty cell. This has been fixed by treating None and NA values as + empty strings in `converters.match_name_and_value` (see above). ### Security ### ### Documentation ### +- Added documentation for ROCrateConverter, ELNFileConverter, and ROCrateEntityConverter ## [0.10.1] - 2024-11-13 ## diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 09942918a3818978b1b4b0c3ded1635f5f9053fc..e16b2c0fbaeeee419b0e3f235339dc18cd4da885 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1295,17 +1295,17 @@ class YAMLFileConverter(SimpleFileConverter): def match_name_and_value(definition, name, value): """Take match definitions from the definition argument and apply regular expression to name and - possibly value + possibly value. 
- one of the keys 'match_name' and "match' needs to be available in definition - 'match_value' is optional + Exactly one of the keys ``match_name`` and ``match`` must exist in ``definition``, + ``match_value`` is optional Returns ------- out: None, if match_name or match lead to no match. Otherwise, returns a dictionary with the - matched groups, possibly including matches from using match_value + matched groups, possibly including matches from using `definition["match_value"]` """ if "match_name" in definition: @@ -1327,7 +1327,10 @@ out: m1 = {} if "match_value" in definition: - m2 = re.match(definition["match_value"], str(value), re.DOTALL) + # None values will be interpreted as empty strings for the + # matcher. + m_value = str(value) if (value is not None and not pd.isna(value)) else "" + m2 = re.match(definition["match_value"], m_value, re.DOTALL) if m2 is None: return None else: diff --git a/src/doc/converters/further_converters.rst b/src/doc/converters/further_converters.rst index a334c8778f440e108fd141b0fc53ec06765deb8c..0fffc2e7de1bd23327194c6379cca94bd7c72a29 100644 --- a/src/doc/converters/further_converters.rst +++ b/src/doc/converters/further_converters.rst @@ -98,3 +98,90 @@ given ``recordname``, this record can be used within the cfood. Most importantly, this record stores the internal path of this array within the HDF5 file in a text property, the name of which can be configured with the ``internal_path_property_name`` option which defaults to ``internal_hdf5_path``. + + + +ROCrateConverter +================ + +The ROCrateConverter unpacks ro-crate files, and creates one instance of the +``ROCrateEntity`` structure element for each contained object. Currently only +zipped ro-crate files are supported. The created ROCrateEntities wrap a +``rocrate.model.entity.Entity`` with a path to the folder the ROCrate data +is saved in. They are appended as children and can then be accessed via the +subtree and treated using the :ref:`ROCrateEntityConverter`. 
+ +To use the ROCrateConverter, you need to install the LinkAhead crawler with its +optional ``rocrate`` dependency. + +ELNFileConverter +---------------- + +As .eln files are zipped ro-crate files, the ELNFileConverter works analogously +to the ROCrateConverter and also creates ROCrateEntities for contained objects. + +ROCrateEntityConverter +---------------------- + +The ROCrateEntityConverter unpacks the ``rocrate.model.entity.Entity`` wrapped +within a ROCrateEntity, and appends all properties, contained files, and parts +as children. Properties are converted to a basic element matching their value +(``BooleanElement``, ``IntegerElement``, etc.) and can be matched using +``match_properties``. Each ``rocrate.model.file.File`` is converted to a crawler +File object, which can be matched with ``SimpleFile``. Each subpart of the +ROCrateEntity is also converted to a ROCrateEntity, which can then again be +treated using this converter. + +The ``match_entity_type`` keyword can be used to match a ROCrateEntity using its +``entity_type``. With the ``match_properties`` keyword, properties of a ROCrateEntity +can be either matched or extracted, as seen in the cfood example below: +* with ``match_properties: "@id": ro-crate-metadata.json`` the ROCrateEntities +can be filtered to only match the metadata json files. +* with ``match_properties: dateCreated: (?P<dateCreated>.*)`` the ``dateCreated`` +entry of that metadata json file is extracted and accessible through the +``dateCreated`` variable. +* the example could then be extended to use any other entry present in the metadata +json to filter the results, or insert the extracted information into generated records. + +Example cfood +------------- + +One short cfood to generate records for each .eln file in a directory and +their metadata files could be: + +.. 
code-block:: yaml + + --- + metadata: + crawler-version: 0.9.0 + --- + Converters: + ELNFile: + converter: ELNFileConverter + package: caoscrawler.converters.rocrate + ROCrateEntity: + converter: ROCrateEntityConverter + package: caoscrawler.converters.rocrate + + ParentDirectory: + type: Directory + match: (.*) + subtree: + ELNFile: + type: ELNFile + match: (?P<filename>.*)\.eln + records: + ELNExampleRecord: + filename: $filename + subtree: + ROCrateEntity: + type: ROCrateEntity + match_properties: + "@id": ro-crate-metadata.json + dateCreated: (?P<dateCreated>.*) + records: + MDExampleRecord: + parent: $ELNFile + filename: ro-crate-metadata.json + time: $dateCreated + diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv new file mode 100644 index 0000000000000000000000000000000000000000..be25239a6d96ecde3876a7bbabdae8769994b455 --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv @@ -0,0 +1,4 @@ +event,date +event_a,2025-02-06 +event_b, +event_c,2025-02-06T09:00:00 diff --git a/unittests/test_directories/examples_tables/crawler_for_issue_112.yml b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml new file mode 100644 index 0000000000000000000000000000000000000000..4bab5adabcf889d7784583a80dcbb94b714fd3fc --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml @@ -0,0 +1,27 @@ +ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + CSVTable: + type: CSVTableConverter + match: "test_with_empty\\.csv" + subtree: + Row: + type: DictElement + records: + Event: + subtree: + EventName: + type: TextElement + match_name: "event" + match_value: "(?P<name>.*)" + records: + Event: + name: $name + Date: + type: Datetime + match_name: "date" + match_value: "(?P<date>.+)" + records: + Event: + event_time: $date diff --git 
a/unittests/test_issues.py b/unittests/test_issues.py index a6de65400f42018c3fdcde7b2f29d4fd200bf62b..779f77711fe18df2433f03580e7e3e4f2035f0f4 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -20,14 +20,44 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import mark +import importlib -from caoscrawler.converters import CrawlerTemplate, replace_variables +from pathlib import Path +from pytest import fixture, mark + +from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter) from caoscrawler.crawl import Crawler -from caoscrawler.scanner import (create_converter_registry, +from caoscrawler.scanner import (create_converter_registry, scan_directory, scan_structure_elements) from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import DictElement +from caoscrawler.structure_elements import DictElement, TextElement + + +UNITTESTDIR = Path(__file__).parent + + +@fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "Datetime": { + "converter": "DatetimeElementConverter", + "package": "caoscrawler.converters" + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry def test_issue_10(): @@ -148,3 +178,49 @@ def test_issue_93(): propvalue_template = CrawlerTemplate(propvalue) assert (propvalue_template.safe_substitute(**values.get_storage()) == f"some text before >> This is {exp} << some text after") + + +def test_issue_112(converter_registry): + """Test that empty table cells are not matched in 
case of + ``match_value: ".+"``. + + See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112. + + """ + tec = TextElementConverter( + name="TestTextConverter", + definition={ + "match_name": ".*", + "match_value": "(?P<content>.+)" + }, + converter_registry=converter_registry + ) + + empty = TextElement(name="empty", value='') + assert tec.match(empty) is None + + empty_none = TextElement(name="empty", value=None) + assert tec.match(empty_none) is None + + non_empty = TextElement(name="empty", value=' ') + matches = tec.match(non_empty) + assert "content" in matches + assert matches["content"] == ' ' + + # Cfood definition for CSV example file + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml") + assert records + for rec in records: + print(rec.name) + assert len(rec.parents.filter_by_identity(name="Event")) > 0 + assert rec.name in ["event_a", "event_b", "event_c"] + if rec.name == "event_a": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06" + if rec.name == "event_b": + # `date` field is empty, so there must be no match + assert rec.get_property("event_time") is None + if rec.name == "event_c": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06T09:00:00"