Commit aea9144a authored by Henrik tom Wörden

Merge branch 'f-refactor' into 'master'

Refactor Crawler

See merge request caosdb/caosdb-advanced-user-tools!19
parents 75b703dc a5ec4027
 __pycache__
 .tox
 .coverage
-cache.db
+*cache.db
 *.egg-info
 .docker/cert
@@ -7,10 +7,12 @@ pip install tox --user
 tox
 
 # Run Integration Tests Locally
-1. Mount `integrationtests/full_test/extroot` to the folder that will be used as
-   extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`
-2. Start an empty CaosDB instance
-3. run test.sh
+1. Change directory to `integrationtests/full_test/`.
+2. Mount `extroot` to the folder that will be used as extroot. E.g. `sudo mount
+   -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`.
+3. Start an empty CaosDB instance (with the mounted extroot).
+4. Run `test.sh`.
 
 # Code Formatting
 autopep8 -i -r ./
@@ -25,14 +25,13 @@
 import argparse
 import logging
-import sys
 from argparse import RawTextHelpFormatter
 
 import caosdb as db
+from caosadvancedtools.cfood import fileguide
 from caosadvancedtools.crawler import FileCrawler
-from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
-from caosadvancedtools.utils import set_log_level
+from caosadvancedtools.guard import UPDATE
 from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
                        PublicationCFood, SimulationCFood)
@@ -54,8 +53,8 @@ if __name__ == "__main__":
     logger = logging.getLogger("caosadvancedtools")
     conlogger = logging.getLogger("connection")
     conlogger.setLevel(level=logging.ERROR)
-    logger.setLevel(level=logging.WARN)
+    logger.setLevel(level=logging.DEBUG)
+    fileguide.access = access
 
     parser = get_parser()
     args = parser.parse_args()
@@ -63,9 +62,9 @@ if __name__ == "__main__":
     files = FileCrawler.query_files(args.path)
     logger.info("Query done...")
     config = db.configuration.get_config()
-    c = FileCrawler(files=files, use_cache=True, access=access,
+    c = FileCrawler(files=files, use_cache=True,
                     interactive=False, hideKnown=True,
-                    food=[ProjectCFood,
+                    cfood_types=[ProjectCFood,
                           ExperimentCFood, AnalysisCFood,
                           PublicationCFood, SimulationCFood,
                           ])
...
@@ -4,6 +4,7 @@ echo "Filling the database"
 ./filldb.sh
 echo "Testing the crawler database"
 python3 -m pytest test_crawler.py
+python3 test_table.py
 # TODO the following test deletes lots of the data inserted by the crawler
 echo "Testing im and export"
 python3 test_im_und_export.py
 #!/usr/bin/env python3
 import os
-import unittest
 from tempfile import TemporaryDirectory
 
 import caosdb as db
...
@@ -18,17 +18,13 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
-import argparse
 import logging
-import sys
-from argparse import RawTextHelpFormatter
 
 import caosdb as db
 import pandas as pd
 from caosadvancedtools.crawler import TableCrawler
-from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
-from caosadvancedtools.utils import set_log_level
+from caosadvancedtools.guard import UPDATE
 
 if __name__ == "__main__":
     logger = logging.getLogger("caosadvancedtools")
...
@@ -25,18 +25,21 @@
 # ** end header
 """ Defines how something that shall be inserted into CaosDB is treated.
 
-CaosDB can automatically be filled with Records based on some file structure.
-The Crawler will iterate over the files and test for each file whether a CFood
-exists that matches the file path. If one does, it is instanciated to treat the
-match. This occurs in basically three steps:
-1. create a list of identifiables, i.e. unique representation of CaosDB Records
-(such as an experiment belonging to a project and a date/time)
-2. the identifiables are either found in CaosDB or they are created.
-3. the identifiables are update based on the date in the file structure
+CaosDB can automatically be filled with Records based on some structure: a file
+structure, a table or similar.
+The Crawler will iterate over the respective items and test for each item
+whether a CFood class exists that matches it, i.e. whether a CFood class wants
+to treat that particular item. If one does, it is instantiated to treat the
+match. This occurs in basically three steps:
+1. Create a list of identifiables, i.e. unique representations of CaosDB
+Records (such as an experiment belonging to a project and a date/time).
+2. The identifiables are either found in CaosDB or they are created.
+3. The identifiables are updated based on the data in the file structure.
 """
 
 import logging
 import re
+from abc import ABCMeta, abstractmethod
 
 import caosdb as db
 from caosdb.exceptions import EntityDoesNotExistError
@@ -62,24 +65,37 @@ def get_entity(name):
     return ENTITIES[name]
 
 
-class AbstractCFood(object):
+class FileGuide(object):
+    def access(path):
+        """ should be replaced by a function that adds
+        a prefix to paths to allow to access caosdb files locally"""
+        raise NotImplementedError()
+
+
+fileguide = FileGuide()
+
+
+class AbstractCFood(object, metaclass=ABCMeta):
 
-    def __init__(self):
+    def __init__(self, item):
         """ Abstract base class for Crawler food (CFood)."""
         self.to_be_updated = db.Container()
         self.identifiables = db.Container()
+        self.item = item
+        self.attached_items = []
 
+    @abstractmethod
     def create_identifiables(self):
         """
         should set the instance variable Container with the identifiables
         """
-        raise NotImplementedError()
 
+    @abstractmethod
     def update_identifiables(self):
         """ Changes the identifiables as needed and adds changed identifiables
         to self.to_be_updated
         """
-        raise NotImplementedError()
 
     def push_identifiables_to_CaosDB(self):
         """ Updates the self.to_be_updated Container, i.e. pushes the changes
@@ -108,6 +124,56 @@ class AbstractCFood(object):
         logger.debug(self.to_be_updated)
         guard.safe_update(self.to_be_updated)
 
+    @classmethod
+    def match_item(cls, item):
+        """ Matches an item found by the crawler against this class. Returns
+        True if the item shall be treated by this class, i.e. if this class
+        matches the item.
+
+        Parameters
+        ----------
+        item : object
+               iterated by the crawler
+
+        To be overwritten by subclasses!
+        """
+        return True
+
+    def collect_information(self):
+        """ The CFood collects information for further processing.
+
+        Often CFoods need information from files or even from the database in
+        order to make processing decisions. It is intended that this function
+        is called after match. Thus match can be used without connecting to
+        the database.
+
+        To be overwritten by subclasses.
+        """
+        pass
+
+    def attach(self, item):
+        self.attached_items.append(item)
+
+    # TODO looking_for should `attach` the files itself. This would allow to
+    # group them right away and makes it unnecessary to check matches later
+    # again.
+    def looking_for(self, item):
+        """
+        Returns True if item can be added to this CFood.
+
+        Typically a CFood exists for a file and defines how to deal with that
+        file. However, sometimes additional files "belong" to a CFood. E.g. an
+        experiment CFood might match against a README file, but labnotes.txt
+        shall also be treated by that cfood (and not by a special cfood
+        created for labnotes.txt).
+        This function can be used to define which files shall be 'attached'.
+
+        To be overwritten by subclasses.
+        """
+        return False
+
     @staticmethod
     # move to api?
     def set_parents(entity, names):
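The looking_for/attach pair lets one CFood pull in additional files after the
initial match. A sketch of the README/labnotes example from the docstring
(class name and pattern are invented):

from caosadvancedtools.cfood import AbstractFileCFood

class ReadmeCFoodSketch(AbstractFileCFood):
    @staticmethod
    def get_re():
        return r"(?P<dir>.*)/README\.md"

    def looking_for(self, item):
        # also claim the labnotes.txt next to the matched README
        return item == self.match.group("dir") + "/labnotes.txt"

    def create_identifiables(self):
        pass

    def update_identifiables(self):
        # the crawler has called attach() for every item we were looking for,
        # so the labnotes path is now available in self.attached_items
        pass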
@@ -140,55 +206,6 @@ class AbstractCFood(object):
         entity.add_property(prop, value)
 
 
-class CMeal(object):
-    """
-    CMeal groups equivalent Files and allow their collected insertion.
-
-    Sometimes there is no one file that can be used to trigger the creation of
-    some Record. E.g. if a collection of images shall be referenced from one
-    Record that groups them, it is unclear which image should trigger the
-    creation of the Record.
-
-    CMeals are grouped based on the groups in the used regular expression. If,
-    in the above example, all the images reside in one folder, all groups
-    except that for the file name should match. The groups that shall match
-    need to be listed in the matching_groups class property. Subclasses will
-    overwrite this property.
-
-    The cook function of a cfood allows this class to work. Instead of directly
-    instantiating a CFood the cook function is used. If the CFood is also a
-    child of CMeal, it will be checked (using get_suitable_cfood) in the cook
-    function whether a new CFood should be created or if the file match should
-    be added to an existing one. In order to allow this all instances of a
-    CFood class are tracked in the existing_instances class member.
-    """
-    existing_instances = []
-    matching_groups = []
-
-    def __init__(self, *args, **kwargs):
-        self.__class__.existing_instances.append(self)
-        self.crawled_files = []
-
-    def add(self, crawled_file):
-        self.crawled_files.append(crawled_file)
-
-    @classmethod
-    def get_suitable_cfood(cls, match):
-        for cfood in cls.existing_instances:
-            suitable = True
-
-            for group in cls.matching_groups:
-                if (group not in match.groupdict() or
-                        group not in cfood.match.groupdict() or
-                        match.group(group) != cfood.match.group(group)):
-                    suitable = False
-
-            if suitable:
-                return cfood
-
-        return None
-
-
 def get_entity_for_path(path):
     try:
         q = "FIND FILE WHICH IS STORED AT '{}'".format(path)
@@ -210,7 +227,7 @@ class AbstractFileCFood(AbstractCFood):
     # function match()
     _pattern = None
 
-    def __init__(self, crawled_path, access=lambda x: x):
+    def __init__(self, crawled_path, *args, **kwargs):
         """ Abstract base class for file based Crawler food (CFood).
 
         Parameters
@@ -218,15 +235,11 @@
         crawled_path : The file that the crawler is currently matching. Its
                        path should match against the pattern of this class
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
         """
-        super().__init__()
-        self.access = access
+        super().__init__(*args, item=crawled_path, **kwargs)
         self._crawled_file = None
         self.crawled_path = crawled_path
-        self.match = type(self).match_file(crawled_path)
-        self.attached_ones = []
+        self.match = re.match(type(self).get_re(), crawled_path)
         self.attached_filenames = []
 
     @property
@@ -236,18 +249,6 @@ class AbstractFileCFood(AbstractCFood):
         return self._crawled_file
 
-    def collect_information(self):
-        """ The CFood collects information for further processing.
-
-        Often CFoods need information from files or even from the database in
-        order to make processing decision. It is intended that this function is
-        called after match. Thus match can be used without connecting to the
-        database.
-
-        To be overwritten by subclasses
-        """
-        pass
-
     @staticmethod
     def get_re():
         """ Returns the regular expression used to identify files that shall be
@@ -258,45 +259,16 @@ class AbstractFileCFood(AbstractCFood):
         raise NotImplementedError()
 
     @classmethod
-    def cook(cls, crawled_file, **kwargs):
-        """ possibly checks for existing CFoods whether the match should be
-        added or whether a new CFood instance needs to be returned
-
-        This function should typically be used to create CFoods in order to
-        prevent the creation of unnecessary instances.
-
-        This standard implementation does not do a check but may be overwritten
-        by subclasses.
-
-        Retruns
-        -------------
-        CFood: if a new instance was created
-        None: otherwise
-        """
-
-        return cls(crawled_file, **kwargs)
-
-    @classmethod
-    def match_file(cls, string):
+    def match_item(cls, path):
         """ Matches the regular expression of this class against file names
 
         Parameters
         ----------
-        string : str
+        path : str
                  The path of the file that shall be matched.
         """
 
-        # TODO this does not quite work. Sometimes the wrong expression is in
-        # _pattern; FIX
-        # if cls._pattern is None:
-        #     cls._pattern = re.compile(cls.get_re())
-        # return cls._pattern.match(string)
-        return re.match(cls.get_re(), string)
-
-    def attach(self, crawled_file):
-        self.attached_ones.append(crawled_file)
+        return re.match(cls.get_re(), path) is not None
 
     # TODO looking_for should `attach` the files itself. This would allow to
     # group them right away and makes it unnecessary to check matches later
@@ -320,37 +292,6 @@ class AbstractFileCFood(AbstractCFood):
         return False
 
-    @staticmethod
-    # move to api?
-    def set_parents(entity, names):
-        entity.parents.clear()
-
-        for n in names:
-            entity.add_parent(get_entity(n))
-
-    @staticmethod
-    # move to api?
-    def remove_property(entity, prop):
-        # TODO only do something when it is necessary?
-
-        if isinstance(prop, db.Entity):
-            name = prop.name
-        else:
-            name = prop
-
-        while entity.get_property(name) is not None:
-            entity.remove_property(name)
-
-    @staticmethod
-    # move to api?
-    def set_property(entity, prop, value, datatype=None):
-        AbstractCFood.remove_property(entity, prop)
-
-        if datatype is not None:
-            entity.add_property(prop, value, datatype=datatype)
-        else:
-            entity.add_property(prop, value)
-
 
 def assure_object_is_in_list(obj, containing_object, property_name,
                              to_be_updated, datatype=None):
@@ -564,12 +505,11 @@ def get_ids_for_entities_with_names(entities):
 
 class RowCFood(AbstractCFood):
 
-    def __init__(self, row, unique_cols, recordtype):
+    def __init__(self, item, unique_cols, recordtype, **kwargs):
         """
         table : pandas table
         """
-        super().__init__()
-        self.row = row
+        super().__init__(item, **kwargs)
         self.unique_cols = unique_cols
         self.recordtype = recordtype
@@ -578,13 +518,79 @@ class RowCFood(AbstractCFood):
         rec.add_parent(self.recordtype)
 
         for col in self.unique_cols:
-            rec.add_property(col, self.row.loc[col])
+            rec.add_property(col, self.item.loc[col])
         self.identifiables.append(rec)
 
     def update_identifiables(self):
         rec = self.identifiables[0]
 
-        for key, value in self.row.iteritems():
+        for key, value in self.item.iteritems():
             if key in self.unique_cols:
                 continue
             rec.add_property(key, value)
+
+
+class CMeal(object):
+    """
+    CMeal groups equivalent items and allows their collected insertion.
+
+    Sometimes there is no single item that can be used to trigger the creation
+    of some Record. E.g. if a collection of image files shall be referenced
+    from one Record that groups them, it is unclear which image should trigger
+    the creation of the Record.
+
+    CMeals are grouped based on the named groups in the used regular
+    expression. If, in the above example, all the images reside in one folder,
+    all groups of the filename match except the one for the file name itself
+    should agree. The groups that shall match need to be listed in the
+    matching_groups class property. Subclasses will overwrite this property.
+
+    This allows has_suitable_cfood to be used in the match_item function of a
+    CFood to check whether the necessary CFood was already created.
+    In order to allow this, all instances of a CFood class are tracked in the
+    existing_instances class member.
+
+    Subclasses must have a cls.get_re function and a match member variable
+    (see AbstractFileCFood).
+    """
+    existing_instances = []
+    matching_groups = []
+
+    def __init__(self):
+        self.__class__.existing_instances.append(self)
+
+    @classmethod
+    def all_groups_equal(cls, m1, m2):
+        equal = True
+
+        for group in cls.matching_groups:
+            if (group not in m1.groupdict() or
+                    group not in m2.groupdict() or
+                    m1.group(group) != m2.group(group)):
+                equal = False
+
+        return equal
+
+    @classmethod
+    def has_suitable_cfood(cls, item):
+        """ checks whether the required cfood object already exists.
+
+        item : the crawled item
+        """
+        match = re.match(cls.get_re(), item)
+
+        for cfood in cls.existing_instances:
+            if cls.all_groups_equal(match, cfood.match):
+                return True
+
+        return False
+
+    def belongs_to_meal(self, item):
+        # This is already the main item
+        if item == self.item:
+            return False
+        match = re.match(self.get_re(), item)
+
+        return self.all_groups_equal(match, self.match)
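The grouping that CMeal performs can be seen in isolation in a small sketch
(pattern and group names are invented): two matches whose listed groups agree
belong to the same meal, regardless of the file name group.

import re
from caosadvancedtools.cfood import CMeal

class ImageMealSketch(CMeal):
    matching_groups = ["project", "date"]

pattern = r"/(?P<project>[^/]+)/(?P<date>\d{4}-\d{2}-\d{2})/(?P<name>[^/]+)"
m1 = re.match(pattern, "/climate/2020-01-01/image_001.png")
m2 = re.match(pattern, "/climate/2020-01-01/image_002.png")

assert ImageMealSketch.all_groups_equal(m1, m2)  # same folder, same meal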
@@ -82,44 +82,56 @@ class UnknownCache(object):
 
 class Crawler(object):
-    def __init__(self, food=None, access=lambda x: x, use_cache=False,
-                 abort_on_exception=True, interactive=True):
+    def __init__(self, cfood_types, use_cache=False,
+                 abort_on_exception=True, interactive=True, hideKnown=False):
         """
         Parameters
        ----------
-        food : list of CFood classes, optional
+        cfood_types : list of CFood classes
              The Crawler will use those CFoods when crawling.
-        pattern : str
-                  The regex pattern for matching against file names.
         use_cache : bool, optional
                     Whether to use caching (not re-inserting probably existing
                     objects into CaosDB), defaults to False.
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
+        abort_on_exception : if true, exceptions are raised.
+                 Otherwise the crawler continues if an exception occurs.
         interactive : boolean, optional
                       If true, questions will be posed during execution of the
                       crawl function.
         """
 
-        if food is None:
-            self.food = []
-        else:
-            self.food = food
+        self.cfood_types = cfood_types
         self.interactive = interactive
-        self.access = access
         self.report = db.Container()
         self.use_cache = use_cache
+        self.hideKnown = hideKnown
        self.abort_on_exception = abort_on_exception
 
         if self.use_cache:
            self.cache = Cache()
 
+    def iteritems(self):
+        """ generates items to be crawled with an index"""
+        yield 0, None
+
     def collect_cfoods(self):
         """
-        to be overwritten by subclasses.
+        This is the first phase of the crawl. It collects all cfoods that
+        shall be processed. The second phase is iterating over the cfoods and
+        updating CaosDB. This separate first step is necessary in order to
+        allow a single cfood to be influenced by multiple crawled items. E.g.
+        the FileCrawler can have a single cfood treat multiple files.
+
+        This is a very basic implementation and this function should be
+        overwritten by subclasses.
+
+        The basic structure of this function should be that whatever is
+        being processed is iterated and each cfood is checked whether the
+        item 'matches'. If it does, a cfood is instantiated passing the item
+        as an argument.
+        The match can depend on the cfoods already created, i.e. a file
+        might no longer match because it is already treated by an earlier
+        cfood.
 
         should return cfoods, tbs and errors_occured.
         # TODO do this via logging?
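A subclass therefore only has to provide the items and the CFood classes; the
two-phase logic lives in the base class. A minimal sketch (the class name and
the data source are invented):

from caosadvancedtools.crawler import Crawler

class DictCrawlerSketch(Crawler):
    """Crawls an in-memory list of dicts instead of files."""

    def __init__(self, records, cfood_types, **kwargs):
        super().__init__(cfood_types=cfood_types, **kwargs)
        self.records = records

    def iteritems(self):
        # yield (index, item); the index is used to record which CFoods
        # matched which item
        for idx, rec in enumerate(self.records):
            yield idx, rec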
@@ -129,16 +141,21 @@
         cfoods = []
         tbs = []
         errors_occured = False
+        matches = {idx: [] for idx, _ in self.iteritems()}
 
-        for food in self.food:
-            cfoods.append(food())
+        logger.info(separated("Matching files against CFoods"))
 
-        for Cfood in self.food:
-            try:
-                cfood = Cfood()
-                if cfood is not None:
-                    cfoods.append(cfood)
+        for Cfood in self.cfood_types:
+            logger.debug("Matching against {}...".format(Cfood.__name__))
+
+            for idx, item in self.iteritems():
+                if Cfood.match_item(item):
+                    try:
+                        cfoods.append(Cfood(item))
+                        matches[idx].append(Cfood.__name__)
+                        logger.debug("{} matched\n{}.".format(
+                            Cfood.__name__,
+                            item))
                     except Exception as e:
                         traceback.print_exc()
                         print(e)
@@ -148,8 +165,59 @@
                         errors_occured = True
                         tbs.append(e)
 
+        logger.info(separated("CFoods are collecting information..."))
+
+        for cfood in cfoods:
+            cfood.collect_information()
+
+        logger.info(separated("Trying to attach further items to created CFoods"))
+
+        for cfood in cfoods:
+            logger.debug("Matching against {}...".format(Cfood.__name__))
+
+            for idx, item in self.iteritems():
+                if cfood.looking_for(item):
+                    logger.debug("{} matched\n{}.".format(
+                        cfood.__class__.__name__,
+                        item))
+                    cfood.attach(item)
+                    matches[idx].append(Cfood.__name__)
+
+        self.check_matches(matches)
+
         return cfoods, tbs, errors_occured
 
+    def check_matches(self, matches):
+        # possibly load previously encountered "Missing matches" and
+        # "Multiple matches"
+        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
+
+        for idx, item in self.iteritems():
+            if len(matches[idx]) == 0:
+                msg = ("ATTENTION: No matching cfood!\n"
+                       "Tried to match {}\n".format(item))
+
+                if item in ucache.filenames:
+                    logger.debug(msg)
+                else:
+                    logger.warning(msg)
+
+                ucache.add(item)
+
+            if len(matches[idx]) > 1:
+                msg = ("Attention: More than one matching cfood!\n"
+                       + "Tried to match {}\n".format(item)
+                       + "\tRecordTypes:\t" + ", ".join(
+                           matches[idx]) + "\n")
+
+                if item in ucache.filenames:
+                    logger.debug(msg)
+                else:
+                    logger.warning(msg)
+
+                ucache.add(item)
+
+        # Save the encountered problem matches
+        ucache.save()
+
     def cached_find_identifiables(self, identifiables):
         if self.use_cache:
             hashes = self.cache.update_ids_from_cache(identifiables)
@@ -273,122 +341,30 @@ class Crawler(object):
         return r
 
-    @staticmethod
-    def query_files(path):
-        query_str = "FIND FILE WHICH IS STORED AT " + \
-            (path if path.endswith("/") else path + "/") + "**"
-        logger.info("FILES QUERY: " + query_str)
-        files = db.execute_query(query_str)
-        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
-
-        return files
-
 
 class FileCrawler(Crawler):
-    def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs):
+    def __init__(self, files, **kwargs):
         """
         Parameters
         ----------
         files : files to be crawled
-        access : callable, optional
-                 A function that takes a CaosDB path and returns a local path
         """
 
         super().__init__(**kwargs)
         self.files = files
-        self.access = access
-        self.hideKnown = hideKnown
 
-    def match(self):
-
-        files = sorted([f.path for f in self.files])
-        errors_occured = False
-        tbs = []
-        cfoods = []
-        matches = {f: [] for f in files}
-
-        logger.info(separated("Matching files against CFoods"))
-
-        for Cfood in self.food:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if Cfood.match_file(crawled_file) is not None:
-                    matches[crawled_file].append(Cfood.__name__)
-                    logger.debug("{} matched\n{}.".format(
-                        Cfood.__name__,
-                        crawled_file))
-
-                    try:
-                        cfood = Cfood.cook(crawled_file, access=self.access)
-
-                        if cfood is not None:
-                            cfoods.append(cfood)
-                    except Exception as e:
-                        traceback.print_exc()
-                        print(e)
-
-                        if self.abort_on_exception:
-                            raise e
-                        errors_occured = True
-                        tbs.append(e)
-
-        logger.info(separated("CFoods are collecting information..."))
-
-        for cfood in cfoods:
-            cfood.collect_information()
-
-        logger.info(separated("Trying to attach files to created CFoods"))
-
-        for cfood in cfoods:
-            logger.debug("Matching against {}...".format(Cfood.__name__))
-
-            for crawled_file in files:
-                if cfood.looking_for(crawled_file):
-                    logger.debug("{} matched\n{}.".format(
-                        cfood.__class__.__name__,
-                        crawled_file))
-                    cfood.attach(crawled_file)
-                    matches[crawled_file].append(Cfood.__name__)
-
-        # possibly load previously encountered "Missing matches" and
-        # "Multiple matches"
-        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)
-
-        for crawled_file in files:
-            if len(matches[crawled_file]) == 0:
-                msg = ("ATTENTION: No matching cfood!\n"
-                       "Tried to match {}\n".format(crawled_file))
-
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warning(msg)
-
-                ucache.add(crawled_file)
-
-            if len(matches[crawled_file]) > 1:
-                msg = ("Attention: More than one matching cfood!\n"
-                       + "Tried to match {}\n".format(crawled_file)
-                       + "\tRecordTypes:\t" + ", ".join(
-                           matches[crawled_file]) + "\n")
-
-                if crawled_file in ucache.filenames:
-                    logger.debug(msg)
-                else:
-                    logger.warning(msg)
-
-                ucache.add(crawled_file)
-
-        # Save the encountered prblem matches
-        ucache.save()
-
-        return cfoods, tbs, errors_occured
-
-    def collect_cfoods(self):
-        cfoods, tbs, errors_occured = self.match()
-
-        return cfoods, tbs, errors_occured
+    def iteritems(self):
+        for idx, p in enumerate(sorted([f.path for f in self.files])):
+            yield idx, p
+
+    @staticmethod
+    def query_files(path):
+        query_str = "FIND FILE WHICH IS STORED AT " + (path if path.endswith("/") else path + "/") + "**"
+        logger.info("FILES QUERY: " + query_str)
+        files = db.execute_query(query_str)
+        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))
+
+        return files
 
 
 class TableCrawler(Crawler):
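Putting the refactored pieces together, a crawl script now looks roughly like
crawl.py above (a sketch; the extroot prefix and the query path are
placeholders, the CFood list is left empty, and a configured CaosDB connection
is assumed):

from caosadvancedtools.cfood import fileguide
from caosadvancedtools.crawler import FileCrawler

fileguide.access = lambda path: "/opt/caosdb/mnt/extroot" + path

files = FileCrawler.query_files("/ExperimentalData")
crawler = FileCrawler(files=files, use_cache=True, interactive=False,
                      hideKnown=True, cfood_types=[])  # add CFood classes here
cfoods, tbs, errors_occured = crawler.collect_cfoods()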
@@ -402,32 +378,18 @@ class TableCrawler(Crawler):
                        identifiable
         recordtype : Record Type of the Records to be created
         """
-        super().__init__(**kwargs)
         self.table = table
-        self.unique_cols = unique_cols
-        self.recordtype = recordtype
 
-    def collect_cfoods(self):
-        cfoods = []
-        tbs = []
-        errors_occured = False
-
-        for _, row in self.table.iterrows():
-            try:
-                cfood = RowCFood(row, self.unique_cols, self.recordtype)
-
-                if cfood is not None:
-                    cfoods.append(cfood)
-            except Exception as e:
-                traceback.print_exc()
-                print(e)
-
-                if self.abort_on_exception:
-                    raise e
-                errors_occured = True
-                tbs.append(e)
-
-        return cfoods, tbs, errors_occured
+        # TODO I do not like this yet, but I do not see a better way so far.
+        class ThisRowCF(RowCFood):
+            def __init__(self, item):
+                super().__init__(item, unique_cols, recordtype)
+
+        super().__init__(cfood_types=[ThisRowCF], **kwargs)
+
+    def iteritems(self):
+        for idx, row in self.table.iterrows():
+            yield idx, row
 
 def get_value(prop):
...
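The refactored TableCrawler can then be fed a pandas table directly, roughly
like this (a sketch; column names, the record type and the connection setup
are assumptions, and the constructor arguments follow the docstring shown
above):

import pandas as pd
from caosadvancedtools.crawler import TableCrawler

table = pd.DataFrame([{"experiment_id": 1, "temperature": 23.5},
                      {"experiment_id": 2, "temperature": 24.1}])
crawler = TableCrawler(table=table, unique_cols=["experiment_id"],
                       recordtype="Measurement", interactive=False)
cfoods, tbs, errors_occured = crawler.collect_cfoods()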
@@ -21,33 +21,90 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 # ** end header
+import re
 import unittest
 
 import caosdb as db
-from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent,
+from caosadvancedtools.cfood import (AbstractCFood, AbstractFileCFood, CMeal,
+                                     assure_has_parent,
                                      assure_object_is_in_list)
 from caosadvancedtools.example_cfood import ExampleCFood
 
 PATTERN = "h.*"
 
 
-class TestCFood(AbstractFileCFood):
+class ExampleCFoodMeal(AbstractFileCFood, CMeal):
+    matching_groups = ["test"]
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # add the constructor of CMeal
+        CMeal.__init__(self)
+
+    @classmethod
+    def match_item(cls, item):
+        """ standard match_item, but returns False if a suitable cfood exists """
+
+        if cls.has_suitable_cfood(item):
+            return False
+
+        return re.match(cls.get_re(), item) is not None
+
+    def looking_for(self, crawled_file):
+        """ standard looking_for, but returns True if the file matches all
+        groups"""
+
+        if self.belongs_to_meal(crawled_file):
+            return True
+
+        return super().looking_for(crawled_file)
+
+    @staticmethod
+    def get_re():
+        return r"/(?P<test>[a-z]*)/"
+
+    def create_identifiables(self):
+        pass
+
+    def update_identifiables(self):
+        pass
+
+
+class SimpleCFood(AbstractFileCFood):
 
     @staticmethod
     def get_re():
         return PATTERN
 
 
+class DependendCFood(AbstractCFood):
+    existing = []
+
+    @classmethod
+    def match_item(cls, item):
+        if len(cls.existing) == 0:
+            return True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        DependendCFood.existing.append(self)
+
+    def create_identifiables(self):
+        pass
+
+    def update_identifiables(self):
+        pass
+
+
 class CFoodReTest(unittest.TestCase):
     def test(self):
-        self.assertEquals(TestCFood.get_re(), PATTERN)
-        self.assertEqual(TestCFood._pattern, None)
-        self.assertIsNotNone(TestCFood.match_file("hallo"))
+        self.assertEqual(SimpleCFood.get_re(), PATTERN)
+        self.assertEqual(SimpleCFood._pattern, None)
+        self.assertTrue(SimpleCFood.match_item("hallo"))
         # TODO the caching of compiled re is disabled currently
-        # self.assertIsNotNone(TestCFood._pattern)
-        self.assertIsNotNone(TestCFood.match_file("hallo"))
-        self.assertIsNone(TestCFood.match_file("allo"))
+        # self.assertIsNotNone(SimpleCFood._pattern)
+        self.assertTrue(SimpleCFood.match_item("hallo"))
+        self.assertFalse(SimpleCFood.match_item("allo"))
 
 
 class InsertionTest(unittest.TestCase):
@@ -75,10 +132,31 @@ class InsertionTest(unittest.TestCase):
         assert len(to_be_updated) == 0
 
 
+class DependendTest(unittest.TestCase):
+    def test(self):
+        self.assertTrue(DependendCFood.match_item(None))
+        cf = DependendCFood(None)
+        self.assertFalse(DependendCFood.match_item(None))
+
+
 class ExampleTest(unittest.TestCase):
     def test(self):
         path = "/data/rabbit/2019-03-03/README.md"
         cf = ExampleCFood(crawled_path=path)
-        self.assertIsNotNone(ExampleCFood.match_file(path))
+        self.assertIsNotNone(ExampleCFood.match_item(path))
         self.assertEqual(cf.match.group('species'), 'rabbit')
         self.assertEqual(cf.match.group('date'), '2019-03-03')
+
+
+class MealTest(unittest.TestCase):
+    def test(self):
+        # the path should match
+        self.assertTrue(ExampleCFoodMeal.match_item("/this/file"))
+        # create an instance
+        c = ExampleCFoodMeal("/this/file")
+        # the same prefix should no longer match
+        self.assertFalse(ExampleCFoodMeal.match_item("/this/other"))
+        # but the instance should be looking for this prefix
+        self.assertTrue(c.looking_for("/this/other"))
+        # the class should still match other prefixes
+        self.assertTrue(ExampleCFoodMeal.match_item("/that/file"))