Skip to content
Snippets Groups Projects
Commit 1a3fc544 authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files
parent 3c836b10
No related branches found
No related tags found
2 merge requests: !217 "TST: Make NamedTemporaryFiles Windows-compatible", !216 "TST: Make NamedTemporaryFiles Windows-compatible"
...@@ -1327,7 +1327,10 @@ out: ...@@ -1327,7 +1327,10 @@ out:
m1 = {} m1 = {}
if "match_value" in definition: if "match_value" in definition:
m2 = re.match(definition["match_value"], str(value), re.DOTALL) # None values will be interpreted as empty strings for the
# matcher.
m_value = str(value) if (value is not None and not pd.isna(value)) else ""
m2 = re.match(definition["match_value"], m_value, re.DOTALL)
if m2 is None: if m2 is None:
return None return None
else: else:
......
...@@ -20,14 +20,44 @@ ...@@ -20,14 +20,44 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
# #
from pytest import mark import importlib
from caoscrawler.converters import CrawlerTemplate, replace_variables from pathlib import Path
from pytest import fixture, mark
from caoscrawler.converters import (CrawlerTemplate, replace_variables, TextElementConverter)
from caoscrawler.crawl import Crawler from caoscrawler.crawl import Crawler
from caoscrawler.scanner import (create_converter_registry, from caoscrawler.scanner import (create_converter_registry, scan_directory,
scan_structure_elements) scan_structure_elements)
from caoscrawler.stores import GeneralStore from caoscrawler.stores import GeneralStore
from caoscrawler.structure_elements import DictElement from caoscrawler.structure_elements import DictElement, TextElement
UNITTESTDIR = Path(__file__).parent
@fixture
def converter_registry():
    """Return a small converter registry for the tests in this module.

    Each entry maps a registry name to a dict holding the converter's class
    name (``"converter"``), the package it is imported from (``"package"``)
    and the class object itself (``"class"``).

    """
    package = "caoscrawler.converters"
    # registry name -> name of the converter class inside ``package``
    class_names = {
        "TextElement": "TextElementConverter",
        "Directory": "DirectoryConverter",
        "CSVTableConverter": "CSVTableConverter",
        "Datetime": "DatetimeElementConverter",
    }

    module = importlib.import_module(package)
    registry = {}
    for entry_name, class_name in class_names.items():
        registry[entry_name] = {
            "converter": class_name,
            "package": package,
            # Resolve the class object from the imported module.
            "class": getattr(module, class_name),
        }
    return registry
def test_issue_10(): def test_issue_10():
...@@ -148,3 +178,49 @@ def test_issue_93(): ...@@ -148,3 +178,49 @@ def test_issue_93():
propvalue_template = CrawlerTemplate(propvalue) propvalue_template = CrawlerTemplate(propvalue)
assert (propvalue_template.safe_substitute(**values.get_storage()) assert (propvalue_template.safe_substitute(**values.get_storage())
== f"some text before >> This is {exp} << some text after") == f"some text before >> This is {exp} << some text after")
def test_issue_112(converter_registry):
    """Test that empty table cells are not matched in case of
    ``match_value: ".+"``.

    First, a ``TextElementConverter`` is checked directly against an empty
    string, a ``None`` value and a single blank. Afterwards, an example
    directory is scanned with a dedicated cfood and the ``event_time``
    property of the resulting records is checked.

    See https://gitlab.com/linkahead/linkahead-crawler/-/issues/112.

    """
    tec = TextElementConverter(
        name="TestTextConverter",
        definition={
            "match_name": ".*",
            "match_value": "(?P<content>.+)"
        },
        converter_registry=converter_registry
    )

    empty = TextElement(name="empty", value='')
    assert tec.match(empty) is None

    empty_none = TextElement(name="empty", value=None)
    assert tec.match(empty_none) is None

    # A single blank is a non-empty value, so ``.+`` has to match it.
    non_empty = TextElement(name="empty", value=' ')
    matches = tec.match(non_empty)
    # Fail with a plain assertion instead of a TypeError raised by
    # ``"content" in None`` in case the converter did not match.
    assert matches is not None
    assert "content" in matches
    assert matches["content"] == ' '

    # Cfood definition for CSV example file
    tables_dir = UNITTESTDIR / "test_directories" / "examples_tables"
    records = scan_directory(tables_dir / "ExperimentalData",
                             tables_dir / "crawler_for_issue_112.yml")
    assert records
    for rec in records:
        # The record name is used as assertion message to identify the
        # offending record on failure.
        assert len(rec.parents.filter_by_identity(name="Event")) > 0, rec.name
        assert rec.name in ["event_a", "event_b", "event_c"], rec.name

        event_time = rec.get_property("event_time")
        if rec.name == "event_a":
            assert event_time is not None, rec.name
            assert event_time.value == "2025-02-06"
        elif rec.name == "event_b":
            # This should not have matched
            assert event_time is None, rec.name
        elif rec.name == "event_c":
            assert event_time is not None, rec.name
            assert event_time.value == "2025-02-06T09:00:00"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment