diff --git a/src/newcrawler/__init__.py b/src/newcrawler/__init__.py index c6a81199838c281938c3b2e0e820212570a43bd7..28ef97d421023ad41be65d9d0e6abac76fbef6fe 100644 --- a/src/newcrawler/__init__.py +++ b/src/newcrawler/__init__.py @@ -1 +1 @@ -from .crawl import * +from .crawl import Crawler diff --git a/src/newcrawler/converters.py b/src/newcrawler/converters.py index d251707cd4a91834007e04f11fcc66e337828be6..b790c970b3bc6eae3fe11e05001989f81ebdacc1 100644 --- a/src/newcrawler/converters.py +++ b/src/newcrawler/converters.py @@ -28,8 +28,9 @@ import re import caosdb as db from .stores import GeneralStore, RecordStore from .structure_elements import StructureElement, Directory, File -from typing import Type +from typing import Type, Optional from abc import abstractmethod +import yaml_header_tools class Converter(object): """ @@ -145,7 +146,6 @@ class Converter(object): converter = converter_registry[definition["type"]](definition, name) return converter - @abstractmethod def create_values(self, values: GeneralStore, element: StructureElement): @@ -156,7 +156,10 @@ class Converter(object): values: The GeneralStore to store values in. element: The StructureElement to extract values from. 
""" - pass + m = self.match(element) + if m is None: + raise RuntimeError("Condition does not match.") + values.update(m.groupdict()) @abstractmethod def create_children(self, values: GeneralStore, @@ -197,7 +200,7 @@ class Converter(object): pass @abstractmethod - def match(self, element: StructureElement): + def match(self, element: StructureElement) -> Optional[re.Match]: pass class DirectoryConverter(Converter): @@ -208,16 +211,6 @@ class DirectoryConverter(Converter): """ super().__init__(definition, name) - def create_values(self, - values: GeneralStore, - element: StructureElement): - if not isinstance(element, Directory): - raise RuntimeError("Element must be a directory.") - m = re.match(self.definition["match"], element.name) - if m is None: - raise RuntimeError("Condition does not match.") - values.update(m.groupdict()) - def create_children(self, generalStore: GeneralStore, element: StructureElement): if not isinstance(element, Directory): @@ -231,8 +224,7 @@ class DirectoryConverter(Converter): def match(self, element: StructureElement): if not isinstance(element, Directory): raise RuntimeError("Element must be a directory.") - m = re.match(self.definition["match"], element.name) - return m is not None + return re.match(self.definition["match"], element.name) @staticmethod def create_children_from_directory(element: Directory): @@ -255,7 +247,32 @@ class DirectoryConverter(Converter): return children class MarkdownFileConverter(Converter): - pass + def __init__(self, definition: dict, name: str): + """ + Initialize a new markdown file converter. 
+ """ + super().__init__(definition, name) + + def create_children(self, generalStore: GeneralStore, + element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("A markdown file is needed to create children.") + + return + + def typecheck(self, element: StructureElement): + return False + return isinstance(element, File) + + def match(self, element: StructureElement): + if not isinstance(element, File): + raise RuntimeError("Element must be a file.") + m = re.match(self.definition["match"], element.name) + try: + yaml_header_tools.get_header_from_file(element.path) + except yaml_header_tools.NoValidHeader: + return None + return m class DictTextElementConverter(Converter): pass diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index a20f0d880530765eb676f6c994cfecb127822f81..4e65fb082bc65cd0cf324aaadb3065bb48eaa38a 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -167,7 +167,7 @@ class Crawler(object): # type is something like "matches files", replace isinstance with "type_matches" # match function tests regexp for example if (converter.typecheck(element) and - converter.match(element)): + converter.match(element) is not None): generalStore_copy = generalStore.create_scoped_copy() recordStore_copy = recordStore.create_scoped_copy() # extracts values from structure element and stores them in the converter diff --git a/tests/test_tool.py b/tests/test_tool.py index d1f0f211260ba7ded3beb76701484a5235a9966e..778c753a134874f6bc345c24d767051319b14893 100755 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -4,6 +4,8 @@ # A. 
Schlemmer, 06/2021 from newcrawler import Crawler +from newcrawler.converters import MarkdownFileConverter +from newcrawler.structure_elements import File from os.path import join, dirname, basename import caosdb as db @@ -68,3 +70,18 @@ def test_crawler(): assert subd[1]["Measurement"].get_property("project").value != "$Project" assert subd[1]["Measurement"].get_property("project").value.__class__ == db.Record assert subd[1]["Measurement"].get_property("project").value == subd[0]["Project"] + +def test_markdown_converter(): + converter = MarkdownFileConverter({ + "match": "(.*)" + }, "TestMarkdownFileConverter") + + m = converter.match(File("README.md", rfp( + "test_directories", "examples_article", "DataAnalysis", + "2020_climate-model-predict", "2020-02-08_prediction-errors", "README.md"))) + assert m is not None + assert len(m.groups()) == 1 + + m = converter.match(File("test_tool.py", rfp( + "test_tool.py"))) + assert m is None