diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a0dea9a118f25b21fb7e8216839746f0dcb256a..d04329ed20eb29f51a4f9d149c2dca7a4cf6f240 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,8 +36,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 their contents were last modified before that datetime. ### Changed ### + - Registered identifiables can also be used by children of the given RecordType if no registered identifiable is defined for them. +- `None` and other NA values (i.e., values where `pandas.isna` is + `True`) are now interpreted as empty strings in + `converters.match_name_and_value` instead of being cast to string naïvely. ### Deprecated ### @@ -48,6 +52,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `spss_to_datamodel` script works again. - The cfood now supports bi-directional references when defining records on the same level. (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175) +- [#112](https://gitlab.com/linkahead/linkahead-crawler/-/issues/112) + Children of CSVTableConverter match despite match_value: ".+" and + empty cell. This has been fixed by treating `None` and NA values as + empty strings in `converters.match_name_and_value` (see above). ### Security ### diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 09942918a3818978b1b4b0c3ded1635f5f9053fc..e16b2c0fbaeeee419b0e3f235339dc18cd4da885 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -1295,17 +1295,17 @@ class YAMLFileConverter(SimpleFileConverter): def match_name_and_value(definition, name, value): """Take match definitions from the definition argument and apply regular expression to name and - possibly value + possibly value. 
- one of the keys 'match_name' and "match' needs to be available in definition - 'match_value' is optional + Exactly one of the keys ``match_name`` and ``match`` must exist in ``definition``; + ``match_value`` is optional. Returns ------- out: None, if match_name or match lead to no match. Otherwise, returns a dictionary with the - matched groups, possibly including matches from using match_value + matched groups, possibly including matches from using `definition["match_value"]` """ if "match_name" in definition: @@ -1327,7 +1327,10 @@ out: m1 = {} if "match_value" in definition: - m2 = re.match(definition["match_value"], str(value), re.DOTALL) + # None and other NA values (``pd.isna``) are interpreted as + # empty strings for the matcher. + m_value = str(value) if (value is not None and not pd.isna(value)) else "" + m2 = re.match(definition["match_value"], m_value, re.DOTALL) if m2 is None: return None else: diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv new file mode 100644 index 0000000000000000000000000000000000000000..be25239a6d96ecde3876a7bbabdae8769994b455 --- /dev/null +++ b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv @@ -0,0 +1,4 @@ +event,date +event_a,2025-02-06 +event_b, +event_c,2025-02-06T09:00:00 diff --git a/unittests/test_directories/examples_tables/crawler_for_issue_112.yml b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml new file mode 100644 index 0000000000000000000000000000000000000000..4bab5adabcf889d7784583a80dcbb94b714fd3fc --- /dev/null +++ b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml @@ -0,0 +1,27 @@ +ExperimentalData: + type: Directory + match: ExperimentalData + subtree: + CSVTable: + type: CSVTableConverter + match: "test_with_empty\\.csv" + subtree: + Row: + type: DictElement + records: + Event: + subtree: + EventName: + type: TextElement 
match_name: "event" + match_value: "(?P<name>.*)" + records: + Event: + name: $name + Date: + type: Datetime + match_name: "date" + match_value: "(?P<date>.+)" + records: + Event: + event_time: $date diff --git a/unittests/test_issues.py b/unittests/test_issues.py index a6de65400f42018c3fdcde7b2f29d4fd200bf62b..779f77711fe18df2433f03580e7e3e4f2035f0f4 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -20,14 +20,44 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # -from pytest import mark +import importlib -from caoscrawler.converters import CrawlerTemplate, replace_variables +from pathlib import Path +from pytest import fixture, mark + +from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter) from caoscrawler.crawl import Crawler -from caoscrawler.scanner import (create_converter_registry, +from caoscrawler.scanner import (create_converter_registry, scan_directory, scan_structure_elements) from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import DictElement +from caoscrawler.structure_elements import DictElement, TextElement + + +UNITTESTDIR = Path(__file__).parent + + +@fixture +def converter_registry(): + converter_registry: dict[str, dict[str, str]] = { + "TextElement": { + "converter": "TextElementConverter", + "package": "caoscrawler.converters"}, + "Directory": { + "converter": "DirectoryConverter", + "package": "caoscrawler.converters"}, + "CSVTableConverter": { + "converter": "CSVTableConverter", + "package": "caoscrawler.converters"}, + "Datetime": { + "converter": "DatetimeElementConverter", + "package": "caoscrawler.converters" + } + } + + for key, value in converter_registry.items(): + module = importlib.import_module(value["package"]) + value["class"] = getattr(module, value["converter"]) + return converter_registry def test_issue_10(): @@ -148,3 +178,49 @@ def test_issue_93(): propvalue_template = CrawlerTemplate(propvalue) assert 
(propvalue_template.safe_substitute(**values.get_storage()) == f"some text before >> This is {exp} << some text after") + + +def test_issue_112(converter_registry): + """Test that empty table cells are not matched in case of + ``match_value: ".+"``. + + See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112. + + """ + tec = TextElementConverter( + name="TestTextConverter", + definition={ + "match_name": ".*", + "match_value": "(?P<content>.+)" + }, + converter_registry=converter_registry + ) + + empty = TextElement(name="empty", value='') + assert tec.match(empty) is None + + empty_none = TextElement(name="empty", value=None) + assert tec.match(empty_none) is None + + non_empty = TextElement(name="empty", value=' ') + matches = tec.match(non_empty) + assert "content" in matches + assert matches["content"] == ' ' + + # Cfood definition for CSV example file + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", + UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml") + assert records + for rec in records: + print(rec.name) + assert len(rec.parents.filter_by_identity(name="Event")) > 0 + assert rec.name in ["event_a", "event_b", "event_c"] + if rec.name == "event_a": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06" + if rec.name == "event_b": + # `date` field is empty, so there must be no match + assert rec.get_property("event_time") is None + if rec.name == "event_c": + assert rec.get_property("event_time") is not None + assert rec.get_property("event_time").value == "2025-02-06T09:00:00"