Commit eb752ae9 authored by Henrik tom Wörden, committed by Timm Fitschen

Create a non file based crawler

parent 3bb35450
@@ -4,6 +4,9 @@ RUN apt-get update && \
curl \
python3 \
python3-pip \
python3-requests \
python3-pandas \
python3-html2text \
git \
openjdk-11-jdk-headless \
python-autopep8 \
......
@@ -31,11 +31,12 @@ stages:
- setup
- cert
- style
- test
- unittest
- integrationtest
test:
tags: [docker]
stage: test
stage: integrationtest
image: $CI_REGISTRY_IMAGE_BASE
script:
- if [[ "$CAOSDB_TAG" == "" ]]; then
@@ -106,3 +107,11 @@ style:
script:
- autopep8 -ar --diff --exit-code .
allow_failure: true
unittest:
tags: [docker]
stage: unittest
image: $CI_REGISTRY_IMAGE
script:
- cd src
- python3 -m pytest ../unittests
@@ -8,7 +8,7 @@ tox
# Run Integration Tests Locally
1. Mount `integrationtests/full_test/extroot` to the folder that will be used as
extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/custom/extroot`
extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`
2. Start an empty CaosDB instance
3. run test.sh
......
@@ -29,7 +29,8 @@ import sys
from argparse import RawTextHelpFormatter
import caosdb as db
from caosadvancedtools.crawler import Crawler
from caosadvancedtools.crawler import FileCrawler
from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
from caosadvancedtools.utils import set_log_level
from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
@@ -59,12 +60,13 @@ if __name__ == "__main__":
args = parser.parse_args()
logger.info("Starting query...")
files = Crawler.query_files(args.path)
files = FileCrawler.query_files(args.path)
logger.info("Query done...")
config = db.configuration.get_config()
c = Crawler(use_cache=True, access=access,
food=[ProjectCFood,
ExperimentCFood, AnalysisCFood,
PublicationCFood, SimulationCFood,
])
c.crawl(files, interactive=False, security_level=INSERT, hideKnown=True)
c = FileCrawler(files=files, use_cache=True, access=access,
interactive=False, hideKnown=True,
food=[ProjectCFood,
ExperimentCFood, AnalysisCFood,
PublicationCFood, SimulationCFood,
])
c.crawl(security_level=UPDATE)
@@ -18,7 +18,10 @@ Person:
lastName:
datatype: TEXT
description: 'LastName of a Person.'
responsible:
recommended_properties:
email:
datatype: TEXT
description: 'Email of a Person.'
responsible:
datatype: REFERENCE
revisionOf:
......
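The hunk above adds a recommended `email` property to `Person`. For illustration only (this snippet is not part of the commit and assumes the updated model has been synchronized with the server), a conforming Record could be created like this:

import caosdb as db

# create a Person using the newly recommended email property
p = db.Record()
p.add_parent("Person")
p.add_property("lastName", "tom Wörden")
p.add_property("email", "mail@example.com")
p.insert()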
@@ -3,6 +3,7 @@ rm -rf cache.db
echo "Filling the database"
./filldb.sh
echo "Testing the crawler database"
py.test-3 test_crawler.py
python3 -m pytest test_crawler.py
# TODO the following test deletes lots of the data inserted by the crawler
echo "Testing im and export"
python3 test_im_und_export.py
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import logging
import sys
from argparse import RawTextHelpFormatter
import caosdb as db
import pandas as pd
from caosadvancedtools.crawler import TableCrawler
from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
from caosadvancedtools.utils import set_log_level
if __name__ == "__main__":
logger = logging.getLogger("caosadvancedtools")
conlogger = logging.getLogger("connection")
conlogger.setLevel(level=logging.ERROR)
logger.setLevel(level=logging.DEBUG)
table = pd.read_csv("example_table.csv")
assert 0 == len(db.execute_query("FIND Person with firstname=Henrik"))
first = table.loc[table.firstName == "Henrik"]
tcr = TableCrawler(table=first, unique_cols=["firstName", "lastName"],
recordtype="Person", interactive=False)
tcr.crawl(security_level=UPDATE)
assert 1 == len(db.execute_query("FIND Person with firstname=Henrik"))
tcr = TableCrawler(table=table, unique_cols=["firstName", "lastName"],
recordtype="Person", interactive=False)
tcr.crawl(security_level=UPDATE)
assert 1 == len(db.execute_query("FIND Person with firstname=Henrik"))
assert 1 == len(db.execute_query("FIND Person with firstname=Max"))
@@ -27,7 +27,7 @@ import sys
import caosmodels
from caosmodels.parser import parse_model_from_yaml
from caosadvancedtools.converter import labfolder
from caosadvancedtools.converter import labfolder_export as labfolder
def main(args):
......
@@ -7,6 +7,7 @@
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
# Copyright (C) 2019 Henrik tom Wörden
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
@@ -62,46 +63,176 @@ def get_entity(name):
class AbstractCFood(object):
def __init__(self):
""" Abstract base class for Crawler food (CFood)."""
self.to_be_updated = db.Container()
self.identifiables = db.Container()
def create_identifiables(self):
"""
should fill the instance variable identifiables (a db.Container) with the identifiables of this CFood
"""
raise NotImplementedError()
def update_identifiables(self):
""" Changes the identifiables as needed and adds changed identifiables
to self.to_be_updated
"""
raise NotImplementedError()
def push_identifiables_to_CaosDB(self):
""" Updates the self.to_be_updated Container, i.e. pushes the changes
to CaosDB
"""
if len(self.to_be_updated) == 0:
return
get_ids_for_entities_with_names(self.to_be_updated)
# remove duplicates
tmp = db.Container()
for el in self.to_be_updated:
if el not in tmp:
tmp.append(el)
self.to_be_updated = tmp
logger.info("UPDATE: updating the following entities")
for el in self.to_be_updated:
logger.info("\t" + el.name if el.name is not None else el.id)
logger.debug(self.to_be_updated)
guard.safe_update(self.to_be_updated)
@staticmethod
# move to api?
def set_parents(entity, names):
entity.parents.clear()
for n in names:
entity.add_parent(get_entity(n))
@staticmethod
# move to api?
def remove_property(entity, prop):
# TODO only do something when it is necessary?
if isinstance(prop, db.Entity):
name = prop.name
else:
name = prop
while entity.get_property(name) is not None:
entity.remove_property(name)
@staticmethod
# move to api?
def set_property(entity, prop, value, datatype=None):
AbstractCFood.remove_property(entity, prop)
if datatype is not None:
entity.add_property(prop, value, datatype=datatype)
else:
entity.add_property(prop, value)
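Note that set_property enforces replace semantics: remove_property first deletes every occurrence of the property, so the new value ends up exactly once on the entity. A short sketch (not part of the commit):

import caosdb as db

rec = db.Record()
rec.add_property("temperature", 20)
rec.add_property("temperature", 22)
# removes both occurrences, then adds the property exactly once
AbstractCFood.set_property(rec, "temperature", 25)
assert rec.get_property("temperature").value == 25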
class CMeal(object):
"""
CMeal groups equivalent Files and allows their collected insertion.

Sometimes there is no single file that can be used to trigger the creation
of some Record. E.g. if a collection of images shall be referenced from one
Record that groups them, it is unclear which image should trigger the
creation of the Record.

CMeals are grouped based on the groups in the used regular expression. If,
in the above example, all the images reside in one folder, all groups
except the one for the file name should match. The groups that shall match
need to be listed in the matching_groups class property. Subclasses will
overwrite this property.

This class works through the cook function of a CFood: instead of directly
instantiating a CFood, the cook function is used. If the CFood is also a
child of CMeal, the cook function checks (using get_suitable_cfood) whether
a new CFood should be created or whether the file match should be added to
an existing one. In order to allow this, all instances of a CFood class are
tracked in the existing_instances class member.
"""
existing_instances = []
matching_groups = []
def __init__(self, *args, **kwargs):
self.__class__.existing_instances.append(self)
self.crawled_files = []
def add(self, crawled_file):
self.crawled_files.append(crawled_file)
@classmethod
def get_suitable_cfood(cls, match):
for cfood in cls.existing_instances:
suitable = True
for group in cls.matching_groups:
if (group not in match.groupdict() or
group not in cfood.match.groupdict() or
match.group(group) != cfood.match.group(group)):
suitable = False
if suitable:
return cfood
return None
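The following sketch shows how a CFood might combine the new AbstractFileCFood with CMeal; the class name, the regular expression and the cook implementation are invented here for illustration and simplified compared to real CFoods:

class ImageSeriesCFood(AbstractFileCFood, CMeal):
    # every subclass keeps its own registry of instances
    existing_instances = []
    # files whose "project" and "date" groups agree end up in one CMeal
    matching_groups = ["project", "date"]

    @staticmethod
    def get_re():
        return (r".*/(?P<project>[^/]+)/(?P<date>\d{4}-\d{2}-\d{2})/"
                r"(?P<image>[^/]+\.png)$")

    def __init__(self, crawled_path, **kwargs):
        AbstractFileCFood.__init__(self, crawled_path, **kwargs)
        CMeal.__init__(self)

    @classmethod
    def cook(cls, crawled_path, **kwargs):
        # reuse an existing CFood when the matching groups agree;
        # otherwise create a new one
        match = cls.match_file(crawled_path)
        existing = cls.get_suitable_cfood(match)

        if existing is not None:
            existing.add(crawled_path)

            return None

        return cls(crawled_path, **kwargs)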
def get_entity_for_path(path):
try:
q = "FIND FILE WHICH IS STORED AT '{}'".format(path)
return db.execute_query(q, unique=True)
except EntityDoesNotExistError:
path_prefix = "**"
if not path.startswith("/"):
path_prefix = path_prefix + "/"
q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path_prefix, path)
logger.debug(q)
return db.execute_query(q, unique=True)
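get_entity_for_path first queries for the exact stored path and, if no entity is found, retries with a `**` glob. With an illustrative relative path:

# first attempt:
#   FIND FILE WHICH IS STORED AT '2019-03-03/README.md'
# if that raises EntityDoesNotExistError, the fallback is:
#   FIND FILE WHICH IS STORED AT '**/2019-03-03/README.md'
f = get_entity_for_path("2019-03-03/README.md")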
class AbstractFileCFood(AbstractCFood):
# contains the compiled regular expression after the first execution of the
# function match()
_pattern = None
def __init__(self, crawled_file, access=lambda x: x):
""" Abstract base class for Crawler food (CFood).
def __init__(self, crawled_path, access=lambda x: x):
""" Abstract base class for file based Crawler food (CFood).
Parameters
----------
crawled_file : The file that the crawler is currently matching. Its
crawled_path : The file that the crawler is currently matching. Its
path should match against the pattern of this class
access : callable, optional
A function that takes a CaosDB path and returns a local path
"""
super().__init__()
self.access = access
self._crawled_file = None
self.crawled_path = crawled_file
self.match = type(self).match(crawled_file)
self.to_be_updated = db.Container()
self.identifiables = db.Container()
self.crawled_path = crawled_path
self.match = type(self).match_file(crawled_path)
self.attached_ones = []
self.attached_filenames = []
@property
def crawled_file(self):
if self._crawled_file is None:
try:
q = "FIND FILE WHICH IS STORED AT '{}'".format(
self.crawled_path)
self._crawled_file = db.execute_query(q, unique=True)
except EntityDoesNotExistError:
path = "**"
if not self.crawled_path.startswith("/"):
path = path + "/"
q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path,
self.crawled_path)
logger.debug(q)
self._crawled_file = db.execute_query(q, unique=True)
self._crawled_file = get_entity_for_path(self.crawled_path)
return self._crawled_file
@@ -146,7 +277,7 @@ class AbstractCFood(object):
return cls(crawled_file, **kwargs)
@classmethod
def match(cls, string):
def match_file(cls, string):
""" Matches the regular expression of this class against file names
Parameters
@@ -164,48 +295,12 @@ class AbstractCFood(object):
return re.match(cls.get_re(), string)
def create_identifiables(self):
"""
should fill the instance variable identifiables (a db.Container) with the identifiables of this CFood
"""
raise NotImplementedError()
def update_identifiables(self):
""" Changes the identifiables as needed and adds changed identifiables
to self.to_be_updated
"""
raise NotImplementedError()
def push_identifiables_to_CaosDB(self):
""" Updates the self.to_be_updated Container, i.e. pushes the changes
to CaosDB
"""
if len(self.to_be_updated) == 0:
return
get_ids_for_entities_with_names(self.to_be_updated)
# remove duplicates
tmp = db.Container()
for el in self.to_be_updated:
if el not in tmp:
tmp.append(el)
self.to_be_updated = tmp
logger.info("UPDATE: updating the following entities")
for el in self.to_be_updated:
logger.info("\t" + el.name if el.name is not None else el.id)
logger.debug(self.to_be_updated)
guard.safe_update(self.to_be_updated)
def attach(self, crawled_file):
self.attached_ones.append(crawled_file)
# TODO looking_for should `attach` the files itself. This would allow
# grouping them right away and make it unnecessary to check matches again
# later.
def looking_for(self, crawled_file):
"""
returns True if crawled_file can be added to this CFood.
@@ -218,6 +313,8 @@ class AbstractCFood(object):
This function can be used to define what files shall be 'attached'.
"""
# TODO rename to filenames_to_be_attached
if crawled_file in self.attached_filenames:
return True
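Since looking_for also returns True for anything listed in attached_filenames, a CFood can pre-register sidecar files it expects. A hypothetical example (names invented):

class MeasurementCFood(AbstractFileCFood):
    @staticmethod
    def get_re():
        return r".*/measurement\.dat$"

    def __init__(self, crawled_path, **kwargs):
        super().__init__(crawled_path, **kwargs)
        # the crawler will attach this file to the CFood instead of
        # reporting it as unmatched
        self.attached_filenames = [
            self.crawled_path.replace("measurement.dat", "metadata.json")]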
@@ -280,7 +377,6 @@ def assure_object_is_in_list(obj, containing_object, property_name,
if not isinstance(containing_object.get_property(property_name).value, list):
containing_object.get_property(property_name).value = [containing_object.get_property(property_name).value]
containing_object.get_property(property_name).value
containing_object.get_property(property_name).datatype = datatype
current_list = containing_object.get_property(property_name).value
@@ -418,6 +514,9 @@ def assure_has_property(entity, name, value, to_be_updated=None,
name.lower()]
contained = False
if isinstance(value, db.Entity):
value = value.id
for el in possible_properties:
if el.value == value:
contained = True
@@ -464,50 +563,28 @@ def get_ids_for_entities_with_names(entities):
insert_id_based_on_name(ent)
class CMeal(object):
"""
CMeal groups equivalent CFoods and allows their collected insertion.

Sometimes there is no single file that can be used to trigger the creation of
some Record. E.g. if a collection of images shall be referenced from one
Record that groups them, it is unclear which image should trigger the
creation of the Record.

CMeals are grouped based on the groups in the used regular expression. If,
in the above example, all the images reside in one folder, all groups
except the one for the file name should match. The groups that shall match
need to be listed in the matching_groups class property. Subclasses will
overwrite this property.

This class works through the cook function of a CFood: instead of directly
instantiating a CFood, the cook function is used. If the CFood is also a
child of CMeal, the cook function checks (using get_suitable_cfood) whether
a new CFood should be created or whether the file match should be added to
an existing one. In order to allow this, all instances of a CFood class are
tracked in the existing_instances class member.
"""
existing_instances = []
matching_groups = []

def __init__(self, *args, **kwargs):
self.existing_instances.append(self)
self.crawled_files = []

def add(self, crawled_file):
self.crawled_files.append(crawled_file)

@classmethod
def get_suitable_cfood(cls, match):
for cfood in cls.existing_instances:
suitable = True

for group in cls.matching_groups:
if (group not in match.groupdict() or
group not in cfood.match.groupdict() or
match.group(group) != cfood.match.group(group)):
suitable = False

if suitable:
return cfood

return None

class RowCFood(AbstractCFood):
def __init__(self, row, unique_cols, recordtype):
"""
row : one row (a pandas Series) of the table
"""
super().__init__()
self.row = row
self.unique_cols = unique_cols
self.recordtype = recordtype

def create_identifiables(self):
rec = db.Record()
rec.add_parent(self.recordtype)

for col in self.unique_cols:
rec.add_property(col, self.row.loc[col])
self.identifiables.append(rec)

def update_identifiables(self):
rec = self.identifiables[0]

for key, value in self.row.iteritems():
if key in self.unique_cols:
continue
rec.add_property(key, value)
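For one row, the split between create_identifiables and update_identifiables works as follows (a sketch with invented data):

import pandas as pd

row = pd.Series({"firstName": "Henrik", "lastName": "tom Wörden",
                 "email": "mail@example.com"})
cfood = RowCFood(row, unique_cols=["firstName", "lastName"],
                 recordtype="Person")
cfood.create_identifiables()   # identifiable: Person with firstName, lastName
# ... the crawler finds or inserts the identifiable ...
cfood.update_identifiables()   # the remaining column, email, becomes a property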
@@ -28,7 +28,6 @@ import time
import html2text
import caosdb as db
import labfolder.connection
from labfolder.connection import configure_connection
......
@@ -6,6 +6,7 @@
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
# Copyright (C) 2020 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
@@ -45,7 +46,8 @@ import caosdb as db
from caosdb.exceptions import TransactionError
from .cache import Cache
from .guard import INSERT, RETRIEVE, UPDATE
from .cfood import RowCFood
from .guard import RETRIEVE
from .guard import global_guard as guard
logger = logging.getLogger(__name__)
@@ -80,11 +82,13 @@ class UnknownCache(object):
class Crawler(object):
def __init__(self, food, access=lambda x: x, use_cache=False,
abort_on_exception=True):
def __init__(self, food=None, access=lambda x: x, use_cache=False,
abort_on_exception=True, interactive=True):
"""
Parameters
----------
food : list of CFood classes, optional
The Crawler will use those CFoods when crawling.
pattern : str
The regex pattern for matching against file names.
@@ -94,8 +98,17 @@ class Crawler(object):
access : callable, optional
A function that takes a CaosDB path and returns a local path
interactive : boolean, optional
If true, questions will be posed during execution of the
crawl function.
"""
self.food = food
if food is None:
self.food = []
else:
self.food = food
self.interactive = interactive
self.access = access
self.report = db.Container()
self.use_cache = use_cache
@@ -104,98 +117,54 @@ class Crawler(object):
if self.use_cache:
self.cache = Cache()
def match(self, files, interactive, hideKnown=False):
errors_occured = False
tbs = []
cfoods = []
matches = {f: [] for f in files}

logger.info(separated("Matching files against CFoods"))

for Cfood in self.food:
logger.debug("Matching against {}...".format(Cfood.__name__))

for crawled_file in files:
if Cfood.match(crawled_file) is not None:
matches[crawled_file].append(Cfood.__name__)
logger.debug("{} matched\n{}.".format(
Cfood.__class__.__name__,
crawled_file))

try:
cfood = Cfood.cook(crawled_file, access=self.access)

if cfood is not None:
cfoods.append(cfood)
except Exception as e:
traceback.print_exc()
print(e)

if self.abort_on_exception:
raise e
errors_occured = True
tbs.append(e)

logger.info(separated("CFoods are collecting information..."))

for cfood in cfoods:
cfood.collect_information()

logger.info(separated("Trying to attach files to created CFoods"))

for cfood in cfoods:
logger.debug("Matching against {}...".format(Cfood.__name__))

for crawled_file in files:
if cfood.looking_for(crawled_file):
logger.debug("{} matched\n{}.".format(
Cfood.__class__.__name__,
crawled_file))
cfood.attach(crawled_file)
matches[crawled_file].append(Cfood.__name__)

# possibly load previously encountered "Missing matches" and
# "Multiple matches"
ucache = UnknownCache(interactive=interactive, load=hideKnown)

for crawled_file in files:
if len(matches[crawled_file]) == 0:
msg = ("ATTENTION: No matching cfood!\n"
"Tried to match {}\n".format(crawled_file))

if crawled_file in ucache.filenames:
logger.debug(msg)
else:
logger.warn(msg)
ucache.add(crawled_file)

if len(matches[crawled_file]) > 1:
msg = ("Attention: More than one matching cfood!\n"
+ "Tried to match {}\n".format(crawled_file)
+ "\tRecordTypes:\t" + ", ".join(
matches[crawled_file])+"\n")

if crawled_file in ucache.filenames:
logger.debug(msg)
else:
logger.warn(msg)
ucache.add(crawled_file)

# Save the encountered problem matches
ucache.save()

return cfoods, matches, tbs, errors_occured

def collect_cfoods(self):
"""
to be overwritten by subclasses.

should return cfoods, tbs and errors_occured.
# TODO do this via logging?
tbs: text returned from traceback
errors_occured: True if at least one error occurred
"""
cfoods = []
tbs = []
errors_occured = False

for food in self.food:
try:
cfood = food()

if cfood is not None:
cfoods.append(cfood)
except Exception as e:
traceback.print_exc()
print(e)

if self.abort_on_exception:
raise e
errors_occured = True
tbs.append(e)

return cfoods, tbs, errors_occured

def cached_find_identifiables(self, identifiables):
if self.use_cache:
hashes = self.cache.update_ids_from_cache(identifiables)

self.find_or_insert_identifiables(identifiables)

if self.use_cache:
self.cache.insert_list(hashes, identifiables)

def crawl(self, files, interactive=True, hideKnown=False,
security_level=RETRIEVE):
guard.set_level(level=security_level)
files = sorted([f.path for f in files])
cfoods, matches, tbs, errors_occured = self.match(files, interactive,
hideKnown=hideKnown)
if interactive and "y" != input("Do you want to continue? (y)"):
return

def crawl(self, security_level=RETRIEVE):
guard.set_level(level=security_level)
cfoods, tbs, errors_occured = self.collect_cfoods()
if self.interactive and "y" != input("Do you want to continue? (y)"):
return
logger.info(separated("Creating and updating Identifiables"))
@@ -204,14 +173,7 @@ class Crawler(object):
try:
cfood.create_identifiables()
if self.use_cache:
hashes = self.cache.update_ids_from_cache(
cfood.identifiables)
self.find_or_insert_identifiables(cfood.identifiables)
if self.use_cache:
self.cache.insert_list(hashes, cfood.identifiables)
self.cached_find_identifiables(cfood.identifiables)
cfood.update_identifiables()
cfood.push_identifiables_to_CaosDB()
@@ -225,8 +187,8 @@ class Crawler(object):
tbs.append(e)
if errors_occured:
logger.warn("Crawler terminated with failures!")
logger.warn(tbs)
logger.warning("Crawler terminated with failures!")
logger.warning(tbs)
else:
logger.info("Crawler terminated successfully!")
@@ -322,6 +284,152 @@ class Crawler(object):
return files
class FileCrawler(Crawler):
def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs):
"""
Parameters
----------
files : list of file entities to be crawled (their path is matched against the CFoods)
access : callable, optional
A function that takes a CaosDB path and returns a local path
"""
super().__init__(**kwargs)
self.files = files
self.access = access
self.hideKnown = hideKnown
def match(self):
files = sorted([f.path for f in self.files])
errors_occured = False
tbs = []
cfoods = []
matches = {f: [] for f in files}
logger.info(separated("Matching files against CFoods"))
for Cfood in self.food:
logger.debug("Matching against {}...".format(Cfood.__name__))
for crawled_file in files:
if Cfood.match_file(crawled_file) is not None:
matches[crawled_file].append(Cfood.__name__)
logger.debug("{} matched\n{}.".format(
Cfood.__name__,
crawled_file))
try:
cfood = Cfood.cook(crawled_file, access=self.access)
if cfood is not None:
cfoods.append(cfood)
except Exception as e:
traceback.print_exc()
print(e)
if self.abort_on_exception:
raise e
errors_occured = True
tbs.append(e)
logger.info(separated("CFoods are collecting information..."))
for cfood in cfoods:
cfood.collect_information()
logger.info(separated("Trying to attach files to created CFoods"))
for cfood in cfoods:
logger.debug("Matching against {}...".format(Cfood.__name__))
for crawled_file in files:
if cfood.looking_for(crawled_file):
logger.debug("{} matched\n{}.".format(
cfood.__class__.__name__,
crawled_file))
cfood.attach(crawled_file)
matches[crawled_file].append(Cfood.__name__)
# possibly load previously encountered "Missing matches" and
# "Multiple matches"
ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
for crawled_file in files:
if len(matches[crawled_file]) == 0:
msg = ("ATTENTION: No matching cfood!\n"
"Tried to match {}\n".format(crawled_file))
if crawled_file in ucache.filenames:
logger.debug(msg)
else:
logger.warning(msg)
ucache.add(crawled_file)
if len(matches[crawled_file]) > 1:
msg = ("Attention: More than one matching cfood!\n"
+ "Tried to match {}\n".format(crawled_file)
+ "\tRecordTypes:\t" + ", ".join(
matches[crawled_file])+"\n")
if crawled_file in ucache.filenames:
logger.debug(msg)
else:
logger.warning(msg)
ucache.add(crawled_file)
# Save the encountered problem matches
ucache.save()
return cfoods, tbs, errors_occured
def collect_cfoods(self):
cfoods, tbs, errors_occured = self.match()
return cfoods, tbs, errors_occured
class TableCrawler(Crawler):
def __init__(self, table, unique_cols, recordtype, **kwargs):
"""
Parameters
----------
table : pandas DataFrame
unique_cols : the columns that provide the properties for the
identifiable
recordtype : Record Type of the Records to be created
"""
super().__init__(**kwargs)
self.table = table
self.unique_cols = unique_cols
self.recordtype = recordtype
def collect_cfoods(self):
cfoods = []
tbs = []
errors_occured = False
for _, row in self.table.iterrows():
try:
cfood = RowCFood(row, self.unique_cols, self.recordtype)
if cfood is not None:
cfoods.append(cfood)
except Exception as e:
traceback.print_exc()
print(e)
if self.abort_on_exception:
raise e
errors_occured = True
tbs.append(e)
return cfoods, tbs, errors_occured
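Together with the crawl() refactoring above, driving the TableCrawler takes only a few lines (a sketch mirroring the integration test earlier in this commit):

import pandas as pd
from caosadvancedtools.crawler import TableCrawler
from caosadvancedtools.guard import UPDATE

table = pd.read_csv("example_table.csv")
tcr = TableCrawler(table=table, unique_cols=["firstName", "lastName"],
                   recordtype="Person", interactive=False)
tcr.crawl(security_level=UPDATE)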
def get_value(prop):
""" Returns the value of a Property
......
@@ -22,10 +22,10 @@
import caosdb as db
from .cfood import AbstractCFood, assure_has_property
from .cfood import AbstractFileCFood, assure_has_property
class ExampleCFood(AbstractCFood):
class ExampleCFood(AbstractFileCFood):
@staticmethod
def get_re():
return (r".*/(?P<species>[^/]+)/"
......
@@ -24,14 +24,15 @@
import unittest
import caosdb as db
from caosadvancedtools.cfood import (AbstractCFood, assure_has_parent,
from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
assure_object_is_in_list)
from caosadvancedtools.example_cfood import ExampleCFood
PATTERN = "h.*"
class TestCFood(AbstractCFood):
class TestCFood(AbstractFileCFood):
@staticmethod
def get_re():
@@ -42,11 +43,11 @@ class CFoodReTest(unittest.TestCase):
def test(self):
self.assertEquals(TestCFood.get_re(), PATTERN)
self.assertEqual(TestCFood._pattern, None)
self.assertIsNotNone(TestCFood.match("hallo"))
self.assertIsNotNone(TestCFood.match_file("hallo"))
# TODO the caching of compiled re is disabled currently
# self.assertIsNotNone(TestCFood._pattern)
self.assertIsNotNone(TestCFood.match("hallo"))
self.assertIsNone(TestCFood.match("allo"))
self.assertIsNotNone(TestCFood.match_file("hallo"))
self.assertIsNone(TestCFood.match_file("allo"))
class InsertionTest(unittest.TestCase):
@@ -77,7 +78,7 @@ class InsertionTest(unittest.TestCase):
class ExampleTest(unittest.TestCase):
def test(self):
path = "/data/rabbit/2019-03-03/README.md"
cf = ExampleCFood(crawled_file=path)
self.assertIsNotNone(ExampleCFood.match(path))
cf = ExampleCFood(crawled_path=path)
self.assertIsNotNone(ExampleCFood.match_file(path))
self.assertEqual(cf.match.group('species'), 'rabbit')
self.assertEqual(cf.match.group('date'), '2019-03-03')