From 1a3fc544d9a657b195e6f2f9b4a4680557c5ff2a Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Thu, 6 Feb 2025 13:06:25 +0100
Subject: [PATCH 1/5] FIX: Add unit tests and fix for
 https://gitlab.com/linkahead/linkahead-crawler/-/issues/112

---
 src/caoscrawler/converters/converters.py |  5 +-
 unittests/test_issues.py                 | 84 ++++++++++++++++++++++--
 2 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py
index 09942918..5fcdfda6 100644
--- a/src/caoscrawler/converters/converters.py
+++ b/src/caoscrawler/converters/converters.py
@@ -1327,7 +1327,10 @@ out:
         m1 = {}
 
     if "match_value" in definition:
-        m2 = re.match(definition["match_value"], str(value), re.DOTALL)
+        # None values will be interpreted as empty strings for the
+        # matcher.
+        m_value = str(value) if (value is not None and not pd.isna(value)) else ""
+        m2 = re.match(definition["match_value"], m_value, re.DOTALL)
         if m2 is None:
             return None
         else:
diff --git a/unittests/test_issues.py b/unittests/test_issues.py
index a6de6540..b7cde016 100644
--- a/unittests/test_issues.py
+++ b/unittests/test_issues.py
@@ -20,14 +20,44 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 
-from pytest import mark
+import importlib
 
-from caoscrawler.converters import CrawlerTemplate, replace_variables
+from pathlib import Path
+from pytest import fixture, mark
+
+from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter)
 from caoscrawler.crawl import Crawler
-from caoscrawler.scanner import (create_converter_registry,
+from caoscrawler.scanner import (create_converter_registry, scan_directory,
                                  scan_structure_elements)
 from caoscrawler.stores import GeneralStore
-from caoscrawler.structure_elements import DictElement
+from caoscrawler.structure_elements import DictElement, TextElement
+
+
+UNITTESTDIR = Path(__file__).parent
+
+
+@fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "TextElement": {
+            "converter": "TextElementConverter",
+            "package": "caoscrawler.converters"},
+        "Directory": {
+            "converter": "DirectoryConverter",
+            "package": "caoscrawler.converters"},
+        "CSVTableConverter": {
+            "converter": "CSVTableConverter",
+            "package": "caoscrawler.converters"},
+        "Datetime": {
+            "converter": "DatetimeElementConverter",
+            "package": "caoscrawler.converters"
+        }
+    }
+
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
 
 
 def test_issue_10():
@@ -148,3 +178,49 @@ def test_issue_93():
         propvalue_template = CrawlerTemplate(propvalue)
         assert (propvalue_template.safe_substitute(**values.get_storage())
                 == f"some text before >> This is {exp} << some text after")
+
+
+def test_issue_112(converter_registry):
+    """Test that empty table cells are not matched in case of
+    ``match_value: ".+"``.
+
+    See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112.
+
+    """
+    tec = TextElementConverter(
+        name="TestTextConverter",
+        definition={
+            "match_name": ".*",
+            "match_value": "(?P<content>.+)"
+        },
+        converter_registry=converter_registry
+    )
+
+    empty = TextElement(name="empty", value='')
+    assert tec.match(empty) is None
+
+    empty_none = TextElement(name="empty", value=None)
+    assert tec.match(empty_none) is None
+
+    non_empty = TextElement(name="empty", value=' ')
+    matches = tec.match(non_empty)
+    assert "content" in matches
+    assert matches["content"] == ' '
+
+    # Cfood definition for CSV example file
+    records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData",
+                             UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml")
+    assert records
+    for rec in records:
+        print(rec.name)
+        assert len(rec.parents.filter_by_identity(name="Event")) > 0
+        assert rec.name in ["event_a", "event_b", "event_c"]
+        if rec.name == "event_a":
+            assert rec.get_property("event_time") is not None
+            assert rec.get_property("event_time").value == "2025-02-06"
+        elif rec.name == "event_b":
+            # This should not have matched
+            assert rec.get_property("event_time") is None
+        if rec.name == "event_c":
+            assert rec.get_property("event_time") is not None
+            assert rec.get_property("event_time").value == "2025-02-06T09:00:00"
-- 
GitLab


From d8cc3a465a384b07c138eaa01340982883479eb0 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Thu, 6 Feb 2025 13:11:57 +0100
Subject: [PATCH 2/5] DOC: Update changelog

---
 CHANGELOG.md | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a0dea9a..e843e99f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -36,8 +36,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   their contents were last modified before that datetime.
 
 ### Changed ###
+
 - Registered identifiables can also be used by children of the given RecordType
   if no registered identifiable is defined for them.
+- `None` and other NA values (i.e., values where `pandas.isna` is
+  `True`) are now interpreted as empty strings in
+  `converters.match_name_and_value` instead of being cast to string naïvely
 
 ### Deprecated ###
 
@@ -48,6 +52,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `spss_to_datamodel` script works again.
 - The cfood now supports bi-directional references when defining records on the same level.
   (See: https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/175)
+- [#112](https://gitlab.com/linkahead/linkahead-crawler/-/issues/112)
+  Children of CSVTableConverter match despite match_value: ".+" and
+  empty cell. This has been fixed by treating None and NA values in
+  `converters.match_name_and_value` (see above).
 
 ### Security ###
 
@@ -79,9 +87,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Units for properties. They can be specified by giving the property as a dict in the form
   ```yaml
   MyRecord:
-    my_prop:
-      value: 5
-      unit: m
+	my_prop:
+	  value: 5
+	  unit: m
   ```
 - Support for Python 3.13
 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files
-- 
GitLab


From 3ff3181e7aee2dd8bdf39ee74c49a0afadf99a7c Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Thu, 6 Feb 2025 13:12:53 +0100
Subject: [PATCH 3/5] TST: Add test csv and cfood for new unit tests

---
 .../ExperimentalData/test_with_empty.csv      |  4 +++
 .../examples_tables/crawler_for_issue_112.yml | 27 +++++++++++++++++++
 2 files changed, 31 insertions(+)
 create mode 100644 unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv
 create mode 100644 unittests/test_directories/examples_tables/crawler_for_issue_112.yml

diff --git a/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv
new file mode 100644
index 00000000..be25239a
--- /dev/null
+++ b/unittests/test_directories/examples_tables/ExperimentalData/test_with_empty.csv
@@ -0,0 +1,4 @@
+event,date
+event_a,2025-02-06
+event_b,
+event_c,2025-02-06T09:00:00
diff --git a/unittests/test_directories/examples_tables/crawler_for_issue_112.yml b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml
new file mode 100644
index 00000000..4bab5ada
--- /dev/null
+++ b/unittests/test_directories/examples_tables/crawler_for_issue_112.yml
@@ -0,0 +1,27 @@
+ExperimentalData:
+  type: Directory
+  match: ExperimentalData
+  subtree:
+    CSVTable:
+      type: CSVTableConverter
+      match: "test_with_empty\\.csv"
+      subtree:
+        Row:
+          type: DictElement
+          records:
+            Event:
+          subtree:
+            EventName:
+              type: TextElement
+              match_name: "event"
+              match_value: "(?P<name>.*)"
+              records:
+                Event:
+                  name: $name
+            Date:
+              type: Datetime
+              match_name: "date"
+              match_value: "(?P<date>.+)"
+              records:
+                Event:
+                  event_time: $date
-- 
GitLab


From 9f92638120ddc730c2124aa7670c129a5a631c92 Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Thu, 6 Feb 2025 16:19:55 +0100
Subject: [PATCH 4/5] Revert whitespace changes.

---
 CHANGELOG.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e843e99f..d04329ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -87,9 +87,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Units for properties. They can be specified by giving the property as a dict in the form
   ```yaml
   MyRecord:
-	my_prop:
-	  value: 5
-	  unit: m
+    my_prop:
+      value: 5
+      unit: m
   ```
 - Support for Python 3.13
 - ROCrateConverter, ELNFileConverter and ROCrateEntityConverter for crawling ROCrate and .eln files
-- 
GitLab


From 22b53f7a8c96ff894e1cdc227b35ce21753fa9af Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Thu, 6 Feb 2025 16:35:16 +0100
Subject: [PATCH 5/5] STY, DOC: A few small changes to documentation and
 regression test.

---
 src/caoscrawler/converters/converters.py | 8 ++++----
 unittests/test_issues.py                 | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py
index 5fcdfda6..e16b2c0f 100644
--- a/src/caoscrawler/converters/converters.py
+++ b/src/caoscrawler/converters/converters.py
@@ -1295,17 +1295,17 @@ class YAMLFileConverter(SimpleFileConverter):
 
 def match_name_and_value(definition, name, value):
     """Take match definitions from the definition argument and apply regular expression to name and
-    possibly value
+    possibly value.
 
-    one of the keys 'match_name' and "match' needs to be available in definition
-    'match_value' is optional
+    Exactly one of the keys ``match_name`` and ``match`` must exist in ``definition``,
+    ``match_value`` is optional
 
 Returns
 -------
 
 out:
   None, if match_name or match lead to no match. Otherwise, returns a dictionary with the
-  matched groups, possibly including matches from using match_value
+  matched groups, possibly including matches from using `definition["match_value"]`
 
     """
     if "match_name" in definition:
diff --git a/unittests/test_issues.py b/unittests/test_issues.py
index b7cde016..779f7771 100644
--- a/unittests/test_issues.py
+++ b/unittests/test_issues.py
@@ -218,8 +218,8 @@ def test_issue_112(converter_registry):
         if rec.name == "event_a":
             assert rec.get_property("event_time") is not None
             assert rec.get_property("event_time").value == "2025-02-06"
-        elif rec.name == "event_b":
-            # This should not have matched
+        if rec.name == "event_b":
+            # `date` field is empty, so there must be no match
             assert rec.get_property("event_time") is None
         if rec.name == "event_c":
             assert rec.get_property("event_time") is not None
-- 
GitLab