From 1a3fc544d9a657b195e6f2f9b4a4680557c5ff2a Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 6 Feb 2025 13:06:25 +0100 Subject: [PATCH 1/5] FIX: Add unit tests and fix for https://gitlab.com/linkahead/linkahead-crawler/-/issues/112 --- src/caoscrawler/converters/converters.py | 5 +- unittests/test_issues.py | 84 ++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 09942918..5fcdfda6 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1327,7 +1327,10 @@ out: m1 = {} if "match_value" in definition: - m2 = re.match(definition["match_value"], str(value), re.DOTALL) + # None values will be interpreted as empty strings for the + # matcher. + m_value = str(value) if (value is not None and not pd.isna(value)) else "" + m2 = re.match(definition["match_value"], m_value, re.DOTALL) if m2 is None: return None else: diff --git a/unittests/test_issues.py b/unittests/test_issues.py index a6de6540..b7cde016 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -20,14 +20,44 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import mark +import importlib -from caoscrawler.converters import CrawlerTemplate, replace_variables +from pathlib import Path +from pytest import fixture, mark + +from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter) from caoscrawler.crawl import Crawler -from caoscrawler.scanner import (create_converter_registry, +from caoscrawler.scanner import (create_converter_registry, scan_directory, scan_structure_elements) from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import DictElement +from caoscrawler.structure_elements import DictElement, TextElement + + +UNITTESTDIR = Path(__file__).parent + + +@fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "Datetime": { + "converter": "DatetimeElementConverter", + "package": "caoscrawler.converters" + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry def test_issue_10(): @@ -148,3 +178,49 @@ def test_issue_93(): propvalue_template = CrawlerTemplate(propvalue) assert (propvalue_template.safe_substitute(**values.get_storage()) == f"some text before >> This is {exp} << some text after") + + +def test_issue_112(converter_registry): + """Test that empty table cells are not matched in case of + ``match_value: ".+"``. + + See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112. + + """ + tec = TextElementConverter( + name="TestTextConverter", + definition={ + "match_name": ".*", + "match_value": "(?P<content>.+)" + }, + converter_registry=converter_registry + ) + + empty = TextElement(name="empty", value='') + assert tec.match(empty) is None + + empty_none = TextElement(name="empty", value=None) + assert tec.match(empty_none) is None + + non_empty = TextElement(name="empty", value=' ') + matches = tec.match(non_empty) + assert "content" in matches + assert matches["content"] == ' ' + + # Cfood definition for CSV example file + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml") + assert records + for rec in records: + print(rec.name) + assert len(rec.parents.filter_by_identity(name="Event")) > 0 + assert rec.name in ["event_a", "event_b", "event_c"] + if rec.name == "event_a": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06" + elif rec.name == "event_b": + # This should not have matched + assert rec.get_property("event_time") is None + if rec.name == "event_c": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06T09:00:00" -- GitLab From d8cc3a465a384b07c138eaa01340982883479eb0 Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 6 Feb 2025 13:11:57 +0100 Subject: [PATCH 2/5] DOC: Update changelog --- CHANGELOG.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a0dea9a..e843e99f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,8 +36,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 their contents were last modified before that datetime. ### Changed ### + - Registered identifiables can also be used by children of the given RecordType if no registered identifiable is defined for them. +- `None` and other NA values (i.e., values where `pandas.isna` is + `True`) are now interpreted as empty strings in + `converters.match_name_and_value` instead of being cast to string naïvely ### Deprecated ### @@ -48,6 +52,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `spss_to_datamodel` script works again. - The cfood now supports bi-directional references when defining records on the same level. (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) +- [#112](https://gitlab.com/linkahead/linkahead-crawler/-/issues/112) + Children of CSVTableConverter match despite match_value: ".+" and + empty cell. This has been fixed by treating None and NA values in + `converters.match_name_and_value` (see above). ### Security ### @@ -79,9 +87,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Units for properties. They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` - Support for Python 3.13 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files -- GitLab From 3ff3181e7aee2dd8bdf39ee74c49a0afadf99a7c Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Thu, 6 Feb 2025 13:12:53 +0100 Subject: [PATCH 3/5] TST: Add test csv and cfood for new unit tests --- .../ExperimentalData/test_with_empty.csv | 4 +++ .../examples_tables/crawler_for_issue_112.yml | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv create mode 100644 unittests/test_directories/examples_tables/crawler_for_issue_112.yml diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv new file mode 100644 index 00000000..be25239a --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv @@ -0,0 +1,4 @@ +event,date +event_a,2025-02-06 +event_b, +event_c,2025-02-06T09:00:00 diff --git a/unittests/test_directories/examples_tables/crawler_for_issue_112.yml b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml new file mode 100644 index 00000000..4bab5ada --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml @@ -0,0 +1,27 @@ +ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + CSVTable: + type: CSVTableConverter + match: "test_with_empty\\.csv" + subtree: + Row: + type: DictElement + records: + Event: + subtree: + EventName: + type: TextElement + match_name: "event" + match_value: "(?P<name>.*)" + records: + Event: + name: $name + Date: + type: Datetime + match_name: "date" + match_value: "(?P<date>.+)" + records: + Event: + event_time: $date -- GitLab From 9f92638120ddc730c2124aa7670c129a5a631c92 Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Thu, 6 Feb 2025 16:19:55 +0100 Subject: [PATCH 4/5] Revert whitespace changes. --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e843e99f..d04329ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,9 +87,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Units for properties. They can be specified by giving the property as a dict in the form ```yaml MyRecord: - my_prop: - value: 5 - unit: m + my_prop: + value: 5 + unit: m ``` - Support for Python 3.13 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files -- GitLab From 22b53f7a8c96ff894e1cdc227b35ce21753fa9af Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Thu, 6 Feb 2025 16:35:16 +0100 Subject: [PATCH 5/5] STY, DOC: A few small changes to documentation and regression test. --- src/caoscrawler/converters/converters.py | 8 ++++---- unittests/test_issues.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 5fcdfda6..e16b2c0f 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1295,17 +1295,17 @@ class YAMLFileConverter(SimpleFileConverter): def match_name_and_value(definition, name, value): """Take match definitions from the definition argument and apply regular expression to name and - possibly value + possibly value. - one of the keys 'match_name' and "match' needs to be available in definition - 'match_value' is optional + Exactly one of the keys ``match_name`` and ``match`` must exist in ``definition``, + ``match_value`` is optional Returns ------- out: None, if match_name or match lead to no match. Otherwise, returns a dictionary with the - matched groups, possibly including matches from using match_value + matched groups, possibly including matches from using `definition["match_value"]` """ if "match_name" in definition: diff --git a/unittests/test_issues.py b/unittests/test_issues.py index b7cde016..779f7771 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -218,8 +218,8 @@ def test_issue_112(converter_registry): if rec.name == "event_a": assert rec.get_property("event_time") is not None assert rec.get_property("event_time").value == "2025-02-06" - elif rec.name == "event_b": - # This should not have matched + if rec.name == "event_b": + # `date` field is empty, so there must be no match assert rec.get_property("event_time") is None if rec.name == "event_c": assert rec.get_property("event_time") is not None -- GitLab