diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index b790c970b3bc6eae3fe11e05001989f81ebdacc1..97017e502a806fff5fb73fd302a05a40355fc10e 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -27,8 +27,9 @@ import os import re import caosdb as db from .stores import GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, File -from typing import Type, Optional +from .structure_elements import (StructureElement, Directory, File, + TextElement, DictTextElement, DictListElement) +from typing import Type, Union, Literal from abc import abstractmethod import yaml_header_tools @@ -159,6 +160,10 @@ class Converter(object): m = self.match(element) if m is None: raise RuntimeError("Condition does not match.") + if type(m) == bool: + if m == False: + raise RuntimeError("Result of match must not be False, use None instead.") + return values.update(m.groupdict()) @abstractmethod @@ -200,7 +205,7 @@ class Converter(object): pass @abstractmethod - def match(self, element: StructureElement) -> Optional[re.Match]: + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: pass class DirectoryConverter(Converter): @@ -258,10 +263,20 @@ class MarkdownFileConverter(Converter): if not isinstance(element, File): raise RuntimeError("A markdown file is needed to create children.") - return + header = yaml_header_tools.get_header_from_file(element.path) + children: list[StructureElement] = [] + + for name, entry in header.items(): + if type(entry) == list: + children.append(DictListElement(name, entry)) + elif type(entry) == str: + children.append(DictTextElement(name, entry)) + else: + raise RuntimeError("Header entry {} has incompatible type.".format(name)) + return children + def typecheck(self, element: StructureElement): - return False return isinstance(element, File) def match(self, element: StructureElement): @@ -275,10 +290,51 @@ class MarkdownFileConverter(Converter): return m class DictTextElementConverter(Converter): - pass + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return [] + + + def typecheck(self, element: StructureElement): + return isinstance(element, DictTextElement) + + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + if not isinstance(element, DictTextElement): + raise RuntimeError("Element must be a DictTextElement.") + if self.name != element.name: + return None + m = re.match(self.definition["match"], element.value) + return m class DictListElementConverter(Converter): - pass + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, DictListElement): + raise RuntimeError("This converter can only process DictListElements.") + return [TextElement(list_element) for list_element in element.value] + + def typecheck(self, element: StructureElement): + return isinstance(element, DictListElement) + + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + if not isinstance(element, DictListElement): + raise RuntimeError("Element must be a DictListElement.") + if self.name != element.name: + return None + if "match" in self.definition: + raise NotImplementedError("Match is not implemented for DictListElement.") + return True class TextElementConverter(Converter): - pass + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return [] + + def typecheck(self, element: StructureElement): + return isinstance(element, TextElement) + + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + if not isinstance(element, TextElement): + raise RuntimeError("Element must be a TextElement.") + m = re.match(self.definition["match"], element.value) + return m diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 4e65fb082bc65cd0cf324aaadb3065bb48eaa38a..b19a44dde10cde687463c8be4bf89f171ceabf46 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -70,11 +70,6 @@ from .structure_elements import StructureElement, Directory, File from .converters import Converter, DirectoryConverter - - - - - class Crawler(object): """ Crawler class that encapsulates crawling functions. diff --git a/src/newcrawler/structure_elements.py b/src/newcrawler/structure_elements.py index cb3cbac885d3707df2a4f8312cc684afef324267..00a137bea80bcc6a95fbce492beabd3438218722 100644 --- a/src/newcrawler/structure_elements.py +++ b/src/newcrawler/structure_elements.py @@ -28,7 +28,7 @@ class StructureElement(object): pass class FileSystemStructureElement(StructureElement): - def __init__(self, name, path): + def __init__(self, name: str, path: str): self.name = name self.path = path @@ -42,3 +42,17 @@ class Directory(FileSystemStructureElement): class File(FileSystemStructureElement): pass + +class DictTextElement(StructureElement): + def __init__(self, name: str, value: str): + self.name = name + self.value = value + +class DictListElement(StructureElement): + def __init__(self, name: str, value: list): + self.name = name + self.value = value + +class TextElement(StructureElement): + def __init__(self, value: str): + self.value = value