diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py
index 09942918a3818978b1b4b0c3ded1635f5f9053fc..5fcdfda69970b73fc85782bfd78943577b379c39 100644
--- a/src/caoscrawler/converters/converters.py
+++ b/src/caoscrawler/converters/converters.py
@@ -1327,7 +1327,10 @@ out:
         m1 = {}
 
     if "match_value" in definition:
-        m2 = re.match(definition["match_value"], str(value), re.DOTALL)
+        # None and missing (NaN/NA) values are interpreted as empty
+        # strings for the matcher.
+        m_value = str(value) if (value is not None and not pd.isna(value)) else ""
+        m2 = re.match(definition["match_value"], m_value, re.DOTALL)
         if m2 is None:
             return None
         else:
diff --git a/unittests/test_issues.py b/unittests/test_issues.py
index a6de65400f42018c3fdcde7b2f29d4fd200bf62b..b7cde0168738eb4546d87e688d4056323951fa41 100644
--- a/unittests/test_issues.py
+++ b/unittests/test_issues.py
@@ -20,14 +20,44 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 
-from pytest import mark
+import importlib
 
-from caoscrawler.converters import CrawlerTemplate, replace_variables
+from pathlib import Path
+from pytest import fixture, mark
+
+from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter)
 from caoscrawler.crawl import Crawler
-from caoscrawler.scanner import (create_converter_registry,
+from caoscrawler.scanner import (create_converter_registry, scan_directory,
                                  scan_structure_elements)
 from caoscrawler.stores import GeneralStore
-from caoscrawler.structure_elements import DictElement
+from caoscrawler.structure_elements import DictElement, TextElement
+
+
+UNITTESTDIR = Path(__file__).parent
+
+
+@fixture
+def converter_registry():
+    converter_registry: dict[str, dict[str, str]] = {
+        "TextElement": {
+            "converter": "TextElementConverter",
+            "package": "caoscrawler.converters"},
+        "Directory": {
+            "converter": "DirectoryConverter",
+            "package": "caoscrawler.converters"},
+        "CSVTableConverter": {
+            "converter": "CSVTableConverter",
+            "package": "caoscrawler.converters"},
+        "Datetime": {
+            "converter": "DatetimeElementConverter",
+            "package": "caoscrawler.converters"
+        }
+    }
+
+    for key, value in converter_registry.items():
+        module = importlib.import_module(value["package"])
+        value["class"] = getattr(module, value["converter"])
+    return converter_registry
 
 
 def test_issue_10():
@@ -148,3 +178,49 @@ def test_issue_93():
     propvalue_template = CrawlerTemplate(propvalue)
     assert (propvalue_template.safe_substitute(**values.get_storage())
             == f"some text before >> This is {exp} << some text after")
+
+
+def test_issue_112(converter_registry):
+    """Test that empty table cells are not matched in case of
+    ``match_value: ".+"``.
+
+    See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112.
+
+    """
+    tec = TextElementConverter(
+        name="TestTextConverter",
+        definition={
+            "match_name": ".*",
+            "match_value": "(?P<content>.+)"
+        },
+        converter_registry=converter_registry
+    )
+
+    empty = TextElement(name="empty", value='')
+    assert tec.match(empty) is None
+
+    empty_none = TextElement(name="empty", value=None)
+    assert tec.match(empty_none) is None
+
+    non_empty = TextElement(name="empty", value=' ')
+    matches = tec.match(non_empty)
+    assert "content" in matches
+    assert matches["content"] == ' '
+
+    # Cfood definition for CSV example file
+    records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData",
+                             UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml")
+    assert records
+    for rec in records:
+        print(rec.name)
+        assert len(rec.parents.filter_by_identity(name="Event")) > 0
+        assert rec.name in ["event_a", "event_b", "event_c"]
+        if rec.name == "event_a":
+            assert rec.get_property("event_time") is not None
+            assert rec.get_property("event_time").value == "2025-02-06"
+        elif rec.name == "event_b":
+            # This should not have matched
+            assert rec.get_property("event_time") is None
+        elif rec.name == "event_c":
+            assert rec.get_property("event_time") is not None
+            assert rec.get_property("event_time").value == "2025-02-06T09:00:00"