From 521d8d05ce1da48f02f2694c45c9090b0c8ad582 Mon Sep 17 00:00:00 2001 From: Alexander Schlemmer <alexander@mail-schlemmer.de> Date: Thu, 18 Nov 2021 12:01:24 +0100 Subject: [PATCH] ENH: implemented markdown converter --- src/newcrawler/converters.py | 72 ++++++++++++++++++++++++---- src/newcrawler/crawl.py | 5 -- src/newcrawler/structure_elements.py | 16 ++++++- 3 files changed, 79 insertions(+), 14 deletions(-) diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index b790c970..97017e50 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -27,8 +27,9 @@ import os import re import caosdb as db from .stores import GeneralStore, RecordStore -from .structure_elements import StructureElement, Directory, File -from typing import Type, Optional +from .structure_elements import (StructureElement, Directory, File, + TextElement, DictTextElement, DictListElement) +from typing import Type, Union, Literal from abc import abstractmethod import yaml_header_tools @@ -159,6 +160,10 @@ class Converter(object): m = self.match(element) if m is None: raise RuntimeError("Condition does not match.") + if type(m) == bool: + if m == False: + raise RuntimeError("Result of match must not be False, use None instead.") + return values.update(m.groupdict()) @abstractmethod @@ -200,7 +205,7 @@ class Converter(object): pass @abstractmethod - def match(self, element: StructureElement) -> Optional[re.Match]: + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: pass class DirectoryConverter(Converter): @@ -258,10 +263,20 @@ class MarkdownFileConverter(Converter): if not isinstance(element, File): raise RuntimeError("A markdown file is needed to create children.") - return + header = yaml_header_tools.get_header_from_file(element.path) + children: list[StructureElement] = [] + + for name, entry in header.items(): + if type(entry) == list: + children.append(DictListElement(name, entry)) + elif type(entry) == str: + children.append(DictTextElement(name, entry)) + else: + raise RuntimeError("Header entry {} has incompatible type.".format(name)) + return children + def typecheck(self, element: StructureElement): - return False return isinstance(element, File) def match(self, element: StructureElement): @@ -275,10 +290,51 @@ class MarkdownFileConverter(Converter): return m class DictTextElementConverter(Converter): - pass + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return [] + + + def typecheck(self, element: StructureElement): + return isinstance(element, DictTextElement) + + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + if not isinstance(element, DictTextElement): + raise RuntimeError("Element must be a DictTextElement.") + if self.name != element.name: + return None + m = re.match(self.definition["match"], element.value) + return m class DictListElementConverter(Converter): - pass + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, DictListElement): + raise RuntimeError("This converter can only process DictListElements.") + return [TextElement(list_element) for list_element in element.value] + + def typecheck(self, element: StructureElement): + return isinstance(element, DictListElement) + + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + if not isinstance(element, DictListElement): + raise RuntimeError("Element must be a DictListElement.") + if self.name != element.name: + return None + if "match" in self.definition: + raise NotImplementedError("Match is not implemented for DictListElement.") + return True class TextElementConverter(Converter): - pass + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + return [] + + def typecheck(self, element: StructureElement): + return isinstance(element, TextElement) + + def match(self, element: StructureElement) -> Union[re.Match, Literal[True], None]: + if not isinstance(element, TextElement): + raise RuntimeError("Element must be a TextElement.") + m = re.match(self.definition["match"], element.value) + return m diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index 4e65fb08..b19a44dd 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -70,11 +70,6 @@ from .structure_elements import StructureElement, Directory, File from .converters import Converter, DirectoryConverter - - - - - class Crawler(object): """ Crawler class that encapsulates crawling functions. diff --git a/src/newcrawler/structure_elements.py b/src/newcrawler/structure_elements.py index cb3cbac8..00a137be 100644 --- a/src/newcrawler/structure_elements.py +++ b/src/newcrawler/structure_elements.py @@ -28,7 +28,7 @@ class StructureElement(object): pass class FileSystemStructureElement(StructureElement): - def __init__(self, name, path): + def __init__(self, name: str, path: str): self.name = name self.path = path @@ -42,3 +42,17 @@ class Directory(FileSystemStructureElement): class File(FileSystemStructureElement): pass + +class DictTextElement(StructureElement): + def __init__(self, name: str, value: str): + self.name = name + self.value = value + +class DictListElement(StructureElement): + def __init__(self, name: str, value: list): + self.name = name + self.value = value + +class TextElement(StructureElement): + def __init__(self, value: str): + self.value = value -- GitLab