Skip to content
Snippets Groups Projects
Commit 497378ea authored by Alexander Schlemmer
Browse files

ENH: started implementation of markdown file converter and added a unit test

parent b1d84fc2
Branches
Tags
1 merge request: !53 Release 0.1
from .crawl import * from .crawl import Crawler
...@@ -28,8 +28,9 @@ import re ...@@ -28,8 +28,9 @@ import re
import caosdb as db import caosdb as db
from .stores import GeneralStore, RecordStore from .stores import GeneralStore, RecordStore
from .structure_elements import StructureElement, Directory, File from .structure_elements import StructureElement, Directory, File
from typing import Type from typing import Type, Optional
from abc import abstractmethod from abc import abstractmethod
import yaml_header_tools
class Converter(object): class Converter(object):
""" """
...@@ -145,7 +146,6 @@ class Converter(object): ...@@ -145,7 +146,6 @@ class Converter(object):
converter = converter_registry[definition["type"]](definition, name) converter = converter_registry[definition["type"]](definition, name)
return converter return converter
@abstractmethod
def create_values(self, def create_values(self,
values: GeneralStore, values: GeneralStore,
element: StructureElement): element: StructureElement):
...@@ -156,7 +156,10 @@ class Converter(object): ...@@ -156,7 +156,10 @@ class Converter(object):
values: The GeneralStore to store values in. values: The GeneralStore to store values in.
element: The StructureElement to extract values from. element: The StructureElement to extract values from.
""" """
pass m = self.match(element)
if m is None:
raise RuntimeError("Condition does not match.")
values.update(m.groupdict())
@abstractmethod @abstractmethod
def create_children(self, values: GeneralStore, def create_children(self, values: GeneralStore,
...@@ -197,7 +200,7 @@ class Converter(object): ...@@ -197,7 +200,7 @@ class Converter(object):
pass pass
@abstractmethod @abstractmethod
def match(self, element: StructureElement): def match(self, element: StructureElement) -> Optional[re.Match]:
pass pass
class DirectoryConverter(Converter): class DirectoryConverter(Converter):
...@@ -208,16 +211,6 @@ class DirectoryConverter(Converter): ...@@ -208,16 +211,6 @@ class DirectoryConverter(Converter):
""" """
super().__init__(definition, name) super().__init__(definition, name)
def create_values(self,
values: GeneralStore,
element: StructureElement):
if not isinstance(element, Directory):
raise RuntimeError("Element must be a directory.")
m = re.match(self.definition["match"], element.name)
if m is None:
raise RuntimeError("Condition does not match.")
values.update(m.groupdict())
def create_children(self, generalStore: GeneralStore, def create_children(self, generalStore: GeneralStore,
element: StructureElement): element: StructureElement):
if not isinstance(element, Directory): if not isinstance(element, Directory):
...@@ -231,8 +224,7 @@ class DirectoryConverter(Converter): ...@@ -231,8 +224,7 @@ class DirectoryConverter(Converter):
def match(self, element: StructureElement): def match(self, element: StructureElement):
if not isinstance(element, Directory): if not isinstance(element, Directory):
raise RuntimeError("Element must be a directory.") raise RuntimeError("Element must be a directory.")
m = re.match(self.definition["match"], element.name) return re.match(self.definition["match"], element.name)
return m is not None
@staticmethod @staticmethod
def create_children_from_directory(element: Directory): def create_children_from_directory(element: Directory):
...@@ -255,7 +247,32 @@ class DirectoryConverter(Converter): ...@@ -255,7 +247,32 @@ class DirectoryConverter(Converter):
return children return children
class MarkdownFileConverter(Converter): class MarkdownFileConverter(Converter):
pass def __init__(self, definition: dict, name: str):
"""
Initialize a new markdown file converter.
"""
super().__init__(definition, name)
def create_children(self, generalStore: GeneralStore,
                    element: StructureElement):
    """
    Create the children of a markdown file element.

    At present no children are produced: once the type check has passed,
    the method returns None.

    Raises a RuntimeError if element is not a File.
    """
    if isinstance(element, File):
        return None
    raise RuntimeError("A markdown file is needed to create children.")
def typecheck(self, element: StructureElement):
return False
return isinstance(element, File)
def match(self, element: StructureElement) -> Optional[re.Match]:
    """
    Match a file element by name and require a valid YAML header.

    The regular expression stored under the "match" key of the definition
    is applied to element.name. Only if it matches, the file at
    element.path is read with yaml_header_tools to check for a header.

    Returns the match object of the name match, or None if either the
    name does not match or the file has no valid header.

    Raises a RuntimeError if element is not a File.
    """
    if not isinstance(element, File):
        raise RuntimeError("Element must be a file.")
    m = re.match(self.definition["match"], element.name)
    if m is None:
        # The name already rules this file out, so the result is None in
        # any case; skip reading and parsing the file.
        return None
    try:
        # Only the success of the call matters here, the header content
        # itself is not used.
        yaml_header_tools.get_header_from_file(element.path)
    except yaml_header_tools.NoValidHeader:
        return None
    return m
class DictTextElementConverter(Converter): class DictTextElementConverter(Converter):
pass pass
......
...@@ -167,7 +167,7 @@ class Crawler(object): ...@@ -167,7 +167,7 @@ class Crawler(object):
# type is something like "matches files", replace isinstance with "type_matches" # type is something like "matches files", replace isinstance with "type_matches"
# match function tests regexp for example # match function tests regexp for example
if (converter.typecheck(element) and if (converter.typecheck(element) and
converter.match(element)): converter.match(element) is not None):
generalStore_copy = generalStore.create_scoped_copy() generalStore_copy = generalStore.create_scoped_copy()
recordStore_copy = recordStore.create_scoped_copy() recordStore_copy = recordStore.create_scoped_copy()
# extracts values from structure element and stores them in the converter # extracts values from structure element and stores them in the converter
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
# A. Schlemmer, 06/2021 # A. Schlemmer, 06/2021
from newcrawler import Crawler from newcrawler import Crawler
from newcrawler.converters import MarkdownFileConverter
from newcrawler.structure_elements import File
from os.path import join, dirname, basename from os.path import join, dirname, basename
import caosdb as db import caosdb as db
...@@ -68,3 +70,18 @@ def test_crawler(): ...@@ -68,3 +70,18 @@ def test_crawler():
assert subd[1]["Measurement"].get_property("project").value != "$Project" assert subd[1]["Measurement"].get_property("project").value != "$Project"
assert subd[1]["Measurement"].get_property("project").value.__class__ == db.Record assert subd[1]["Measurement"].get_property("project").value.__class__ == db.Record
assert subd[1]["Measurement"].get_property("project").value == subd[0]["Project"] assert subd[1]["Measurement"].get_property("project").value == subd[0]["Project"]
def test_markdown_converter():
    """
    Check MarkdownFileConverter.match on two files.

    The pattern "(.*)" is applied to both file names; a match object with
    one group is expected for the README.md example file and None is
    expected for test_tool.py.
    """
    definition = {"match": "(.*)"}
    converter = MarkdownFileConverter(definition, "TestMarkdownFileConverter")

    # Markdown example file: a match object is expected.
    readme_path = rfp(
        "test_directories", "examples_article", "DataAnalysis",
        "2020_climate-model-predict", "2020-02-08_prediction-errors",
        "README.md")
    readme_match = converter.match(File("README.md", readme_path))
    assert readme_match is not None
    assert len(readme_match.groups()) == 1

    # Python source file: no match is expected.
    script_path = rfp("test_tool.py")
    script_match = converter.match(File("test_tool.py", script_path))
    assert script_match is None
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment