From 3052a97bc071f34cd73ac9c59203cdc879c8aba5 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Thu, 18 Nov 2021 12:31:16 +0100 Subject: [PATCH] FIX: fixed some bugs in markdown conversion --- src/newcrawler/converters.py | 45 ++++++++++++++++++------------- tests/scifolder_cfood.yml | 17 ++++++++++-- tests/test_tool.py | 52 +++++++++++++++++++++++++++++++----- 3 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index 97017e50..3feb0a99 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -29,7 +29,7 @@ import caosdb as db from .stores import GeneralStore, RecordStore from .structure_elements import (StructureElement, Directory, File, TextElement, DictTextElement, DictListElement) -from typing import Type, Union, Literal +from typing import Type, Optional from abc import abstractmethod import yaml_header_tools @@ -160,11 +160,7 @@ class Converter(object): m = self.match(element) if m is None: raise RuntimeError("Condition does not match.") - if type(m) == bool: - if m == False: - raise RuntimeError("Result of match must not be False, use None instead.") - return - values.update(m.groupdict()) + values.update(m) @abstractmethod def create_children(self, values: GeneralStore, @@ -205,7 +201,7 @@ class Converter(object): pass @abstractmethod - def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + def match(self, element: StructureElement) -> Optional[dict]: pass class DirectoryConverter(Converter): @@ -229,7 +225,10 @@ class DirectoryConverter(Converter): def match(self, element: StructureElement): if not isinstance(element, Directory): raise RuntimeError("Element must be a directory.") - return re.match(self.definition["match"], element.name) + m = re.match(self.definition["match"], element.name) + if m is None: + return None + return m.groupdict() @staticmethod def create_children_from_directory(element: Directory): @@ -263,7 +262,7 @@ class MarkdownFileConverter(Converter): if not isinstance(element, File): raise RuntimeError("A markdown file is needed to create children.") - header = yaml_header_tools.get_header_from_file(element.path) + header = yaml_header_tools.get_header_from_file(element.path, clean=False) children: list[StructureElement] = [] for name, entry in header.items(): @@ -283,11 +282,13 @@ class MarkdownFileConverter(Converter): if not isinstance(element, File): raise RuntimeError("Element must be a file.") m = re.match(self.definition["match"], element.name) + if m is None: + return None try: yaml_header_tools.get_header_from_file(element.path) except yaml_header_tools.NoValidHeader: return None - return m + return m.groupdict() class DictTextElementConverter(Converter): def create_children(self, generalStore: GeneralStore, @@ -298,13 +299,19 @@ class DictTextElementConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, DictTextElement) - def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + def match(self, element: StructureElement): if not isinstance(element, DictTextElement): raise RuntimeError("Element must be a DictTextElement.") - if self.name != element.name: + m1 = re.match(self.definition["match_name"], self.name) + if m1 is None: return None - m = re.match(self.definition["match"], element.value) - return m + m2 = re.match(self.definition["match_value"], element.value) + if m2 is None: + return None + values = dict() + values.update(m1.groupdict()) + values.update(m2.groupdict()) + return values class DictListElementConverter(Converter): def create_children(self, generalStore: GeneralStore, @@ -316,14 +323,14 @@ class DictListElementConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, DictListElement) - def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + def match(self, element: StructureElement): if not isinstance(element, DictListElement): raise RuntimeError("Element must be a DictListElement.") if self.name != element.name: return None if "match" in self.definition: raise NotImplementedError("Match is not implemented for DictListElement.") - return True + return dict() class TextElementConverter(Converter): def create_children(self, generalStore: GeneralStore, @@ -333,8 +340,10 @@ class TextElementConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, TextElement) - def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + def match(self, element: StructureElement): if not isinstance(element, TextElement): raise RuntimeError("Element must be a TextElement.") m = re.match(self.definition["match"], element.value) - return m + if m is None: + return None + return m.groupdict() diff --git a/tests/scifolder_cfood.yml b/tests/scifolder_cfood.yml index ac460da3..252f30c8 100644 --- a/tests/scifolder_cfood.yml +++ b/tests/scifolder_cfood.yml @@ -29,12 +29,25 @@ DataAnalysis: # name of the converter subtree: description: type: DictTextElement - match: (?P<description>.*) + match_value: (?P<description>.*) + match_name: description records: Measurement: description: $description - responsible: + responsible_single: + type: DictTextElement + match_name: responsible + match_value: (?P<first_name>.+) (?P<last_name>.+) + records: + Person: + first_name: $first_name + last_name: $last_name + Measurement: # this uses the reference to the above defined record + responsible: +$Person + + responsible_list: type: DictListElement + match_name: responsible subtree: Person: type: TextElement diff --git a/tests/test_tool.py b/tests/test_tool.py index 778c753a..ef4b5929 100755 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -5,7 +5,7 @@ from newcrawler import Crawler from newcrawler.converters import MarkdownFileConverter -from newcrawler.structure_elements import File +from newcrawler.structure_elements import File, DictTextElement, DictListElement from os.path import join, dirname, basename import caosdb as db @@ -72,16 +72,54 @@ def test_crawler(): assert subd[1]["Measurement"].get_property("project").value == subd[0]["Project"] def test_markdown_converter(): + test_readme = File("README.md", rfp( + "test_directories", "examples_article", "DataAnalysis", + "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md")) + converter = MarkdownFileConverter({ "match": "(.*)" }, "TestMarkdownFileConverter") - m = converter.match(File("README.md", rfp( - "test_directories", "examples_article", "DataAnalysis", - "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md"))) - assert m is not None - assert len(m.groups()) == 1 - m = converter.match(File("test_tool.py", rfp( "test_tool.py"))) assert m is None + + m = converter.match(test_readme) + assert m is not None + assert m.__class__ == dict + assert len(m) == 0 + + converter = MarkdownFileConverter({ + "match": "README.md" + }, "TestMarkdownFileConverter") + + m = converter.match(test_readme) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, test_readme) + assert len(children) == 5 + assert children[1].__class__ == DictTextElement + assert children[1].name == "description" + assert children[1].value.__class__ == str + + assert children[0].__class__ == DictTextElement + assert children[0].name == "responsible" + assert children[0].value.__class__ == str + + test_readme2 = File("README.md", rfp("test_directories", "examples_article", "ExperimentalData", "2020_SpeedOfLight", "2020-01-01_TimeOfFlight", "README.md")) + + m = converter.match(test_readme2) + assert m is not None + assert len(m) == 0 + + children = converter.create_children(None, test_readme2) + assert len(children) == 2 + assert children[1].__class__ == DictTextElement + assert children[1].name == "description" + assert children[1].value.__class__ == str + + assert children[0].__class__ == DictListElement + assert children[0].name == "responsible" + assert children[0].value.__class__ == list + -- GitLab