From a5ec4027e5b9c88f5d2579e6e1f9f0201db118c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Tue, 5 May 2020 10:58:48 +0000 Subject: [PATCH] Refactor Crawler --- .gitignore | 2 +- README_SETUP.md | 10 +- integrationtests/full_test/crawl.py | 19 +- integrationtests/full_test/test.sh | 1 + .../full_test/test_im_und_export.py | 1 - integrationtests/full_test/test_table.py | 6 +- src/caosadvancedtools/cfood.py | 302 +++++++++--------- src/caosadvancedtools/crawler.py | 270 +++++++--------- unittests/test_cfood.py | 98 +++++- 9 files changed, 376 insertions(+), 333 deletions(-) diff --git a/.gitignore b/.gitignore index c68adb8f..30f02d1c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ __pycache__ .tox .coverage -cache.db +*cache.db *.egg-info .docker/cert diff --git a/README_SETUP.md b/README_SETUP.md index cba784d3..526fe900 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -7,10 +7,12 @@ pip install tox --user tox # Run Integration Tests Locally -1. Mount `integrationtests/full_test/extroot` to the folder that will be used as - extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot` -2. Start an empty CaosDB instance -3. run test.sh + +1. Change directory to `integrationtests/full_test/`. +2. Mount `extroot` to the folder that will be used as extroot. E.g. `sudo mount + -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`. +3. Start an empty CaosDB instance (with the mounted extroot). +4. Run `test.sh`. # Code Formatting autopep8 -i -r ./ diff --git a/integrationtests/full_test/crawl.py b/integrationtests/full_test/crawl.py index 51ad24b9..66005fd7 100755 --- a/integrationtests/full_test/crawl.py +++ b/integrationtests/full_test/crawl.py @@ -25,14 +25,13 @@ import argparse import logging -import sys from argparse import RawTextHelpFormatter import caosdb as db +from caosadvancedtools.cfood import fileguide from caosadvancedtools.crawler import FileCrawler -from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard -from caosadvancedtools.utils import set_log_level +from caosadvancedtools.guard import UPDATE from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood, PublicationCFood, SimulationCFood) @@ -54,8 +53,8 @@ if __name__ == "__main__": logger = logging.getLogger("caosadvancedtools") conlogger = logging.getLogger("connection") conlogger.setLevel(level=logging.ERROR) - logger.setLevel(level=logging.WARN) - + logger.setLevel(level=logging.DEBUG) + fileguide.access = access parser = get_parser() args = parser.parse_args() @@ -63,10 +62,10 @@ if __name__ == "__main__": files = FileCrawler.query_files(args.path) logger.info("Query done...") config = db.configuration.get_config() - c = FileCrawler(files=files, use_cache=True, access=access, + c = FileCrawler(files=files, use_cache=True, interactive=False, hideKnown=True, - food=[ProjectCFood, - ExperimentCFood, AnalysisCFood, - PublicationCFood, SimulationCFood, - ]) + cfood_types=[ProjectCFood, + ExperimentCFood, AnalysisCFood, + PublicationCFood, SimulationCFood, + ]) c.crawl(security_level=UPDATE) diff --git a/integrationtests/full_test/test.sh b/integrationtests/full_test/test.sh index 25906da8..249f7342 100755 --- a/integrationtests/full_test/test.sh +++ b/integrationtests/full_test/test.sh @@ -4,6 +4,7 @@ echo "Filling the database" ./filldb.sh echo "Testing the crawler database" python3 -m pytest test_crawler.py +python3 test_table.py # TODO the following test deletes lots of the data inserted by the crawler echo "Testing im and export" python3 test_im_und_export.py diff --git a/integrationtests/full_test/test_im_und_export.py b/integrationtests/full_test/test_im_und_export.py index d6fe43eb..5c7584e6 100644 --- a/integrationtests/full_test/test_im_und_export.py +++ b/integrationtests/full_test/test_im_und_export.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import os -import unittest from tempfile import TemporaryDirectory import caosdb as db diff --git a/integrationtests/full_test/test_table.py b/integrationtests/full_test/test_table.py index 91b99471..15b851fb 100644 --- a/integrationtests/full_test/test_table.py +++ b/integrationtests/full_test/test_table.py @@ -18,17 +18,13 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -import argparse import logging -import sys -from argparse import RawTextHelpFormatter import caosdb as db import pandas as pd from caosadvancedtools.crawler import TableCrawler -from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard -from caosadvancedtools.utils import set_log_level +from caosadvancedtools.guard import UPDATE if __name__ == "__main__": logger = logging.getLogger("caosadvancedtools") diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 2128d6b1..6142e68c 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -25,18 +25,21 @@ # ** end header """ Defines how something that shall be inserted into CaosDB is treated. -CaosDB can automatically be filled with Records based on some file structure. -The Crawler will iterate over the files and test for each file whether a CFood -exists that matches the file path. If one does, it is instanciated to treat the -match. This occurs in basically three steps: -1. create a list of identifiables, i.e. unique representation of CaosDB Records -(such as an experiment belonging to a project and a date/time) -2. the identifiables are either found in CaosDB or they are created. -3. the identifiables are update based on the date in the file structure +CaosDB can automatically be filled with Records based on some structure, a file +structure, a table or similar. +The Crawler will iterate over the respective items and test for each item +whether a CFood class exists that matches the file path, i.e. whether CFood +class wants to treat that pariticular item. If one does, it is instanciated to +treat the match. This occurs in basically three steps: +1. Create a list of identifiables, i.e. unique representation of CaosDB Records +(such as an experiment belonging to a project and a date/time). +2. The identifiables are either found in CaosDB or they are created. +3. The identifiables are update based on the date in the file structure. """ import logging import re +from abc import ABCMeta, abstractmethod import caosdb as db from caosdb.exceptions import EntityDoesNotExistError @@ -62,24 +65,37 @@ def get_entity(name): return ENTITIES[name] -class AbstractCFood(object): +class FileGuide(object): + def access(path): + """ should be replaced by a function that adds + a prefix to paths to allow to access caosdb files locally""" - def __init__(self): + raise NotImplementedError() + + +fileguide = FileGuide() + + +class AbstractCFood(object, metaclass=ABCMeta): + + def __init__(self, item): """ Abstract base class for Crawler food (CFood).""" self.to_be_updated = db.Container() self.identifiables = db.Container() + self.item = item + self.attached_items = [] + @abstractmethod def create_identifiables(self): """ should set the instance variable Container with the identifiables """ - raise NotImplementedError() + @abstractmethod def update_identifiables(self): """ Changes the identifiables as needed and adds changed identifiables to self.to_be_updated """ - raise NotImplementedError() def push_identifiables_to_CaosDB(self): """ Updates the self.to_be_updated Container, i.e. pushes the changes @@ -108,6 +124,56 @@ class AbstractCFood(object): logger.debug(self.to_be_updated) guard.safe_update(self.to_be_updated) + @classmethod + def match_item(cls, item): + """ Matches an item found by the crawler against this class. Returns + True if the item shall be treated by this class, i.e. if this class + matches the item. + + Parameters + ---------- + item : object + iterated by the crawler + + To be overwritten by subclasses! + """ + + return True + + def collect_information(self): + """ The CFood collects information for further processing. + + Often CFoods need information from files or even from the database in + order to make processing decision. It is intended that this function is + called after match. Thus match can be used without connecting to the + database. + + To be overwritten by subclasses + """ + pass + + def attach(self, item): + self.attached_items.append(item) + + # TODO looking for should `attach` the files itsself. This would allow to + # group them right away and makes it unnecessary to check matches later + # again. + def looking_for(self, item): + """ + returns True if item can be added to this CFood. + + Typically a CFood exists for a file and defines how to deal with the + file. However, sometimes additional files "belong" to a CFood. E.g. an + experiment CFood might match against a README file but labnotes.txt + also shall be treated by the cfood (and not a special cfood created for + labnotes.txt) + This function can be used to define what files shall be 'attached'. + + To be overwritten by subclasses + """ + + return False + @staticmethod # move to api? def set_parents(entity, names): @@ -140,55 +206,6 @@ class AbstractCFood(object): entity.add_property(prop, value) -class CMeal(object): - """ - CMeal groups equivalent Files and allow their collected insertion. - - Sometimes there is no one file that can be used to trigger the creation of - some Record. E.g. if a collection of images shall be referenced from one - Record that groups them, it is unclear which image should trigger the - creation of the Record. - - CMeals are grouped based on the groups in the used regular expression. If, - in the above example, all the images reside in one folder, all groups - except that for the file name should match. The groups that shall match - need to be listed in the matching_groups class property. Subclasses will - overwrite this property. - - The cook function of a cfood allows this class to work. Instead of directly - instantiating a CFood the cook function is used. If the CFood is also a - child of CMeal, it will be checked (using get_suitable_cfood) in the cook - function whether a new CFood should be created or if the file match should - be added to an existing one. In order to allow this all instances of a - CFood class are tracked in the existing_instances class member. - """ - existing_instances = [] - matching_groups = [] - - def __init__(self, *args, **kwargs): - self.__class__.existing_instances.append(self) - self.crawled_files = [] - - def add(self, crawled_file): - self.crawled_files.append(crawled_file) - - @classmethod - def get_suitable_cfood(cls, match): - for cfood in cls.existing_instances: - suitable = True - - for group in cls.matching_groups: - if (group not in match.groupdict() or - group not in cfood.match.groupdict() or - match.group(group) != cfood.match.group(group)): - suitable = False - - if suitable: - return cfood - - return None - - def get_entity_for_path(path): try: q = "FIND FILE WHICH IS STORED AT '{}'".format(path) @@ -210,7 +227,7 @@ class AbstractFileCFood(AbstractCFood): # function match() _pattern = None - def __init__(self, crawled_path, access=lambda x: x): + def __init__(self, crawled_path, *args, **kwargs): """ Abstract base class for file based Crawler food (CFood). Parameters @@ -218,15 +235,11 @@ class AbstractFileCFood(AbstractCFood): crawled_path : The file that the crawler is currently matching. Its path should match against the pattern of this class - access : callable, optional - A function that takes a CaosDB path and returns a local path """ - super().__init__() - self.access = access + super().__init__(*args, item=crawled_path, **kwargs) self._crawled_file = None self.crawled_path = crawled_path - self.match = type(self).match_file(crawled_path) - self.attached_ones = [] + self.match = re.match(type(self).get_re(), crawled_path) self.attached_filenames = [] @property @@ -236,18 +249,6 @@ class AbstractFileCFood(AbstractCFood): return self._crawled_file - def collect_information(self): - """ The CFood collects information for further processing. - - Often CFoods need information from files or even from the database in - order to make processing decision. It is intended that this function is - called after match. Thus match can be used without connecting to the - database. - - To be overwritten by subclasses - """ - pass - @staticmethod def get_re(): """ Returns the regular expression used to identify files that shall be @@ -258,45 +259,16 @@ class AbstractFileCFood(AbstractCFood): raise NotImplementedError() @classmethod - def cook(cls, crawled_file, **kwargs): - """ possibly checks for existing CFoods whether the match should be - added or whether a new CFood instance needs to be returned - - This function should typically be used to create CFoods in order to - prevent the creation of unnecessary instances. - - This standard implementation does not do a check but may be overwritten - by subclasses. - - Retruns - ------------- - CFood: if a new instance was created - None: otherwise - """ - - return cls(crawled_file, **kwargs) - - @classmethod - def match_file(cls, string): + def match_item(cls, path): """ Matches the regular expression of this class against file names Parameters ---------- - string : str + path : str The path of the file that shall be matched. """ - # TODO this does not quite work. Sometimes the wrong expression is in - # _pattern; FIX - # if cls._pattern is None: - # cls._pattern = re.compile(cls.get_re()) - - # return cls._pattern.match(string) - - return re.match(cls.get_re(), string) - - def attach(self, crawled_file): - self.attached_ones.append(crawled_file) + return re.match(cls.get_re(), path) is not None # TODO looking for should `attach` the files itsself. This would allow to # group them right away and makes it unnecessary to check matches later @@ -320,37 +292,6 @@ class AbstractFileCFood(AbstractCFood): return False - @staticmethod - # move to api? - def set_parents(entity, names): - entity.parents.clear() - - for n in names: - entity.add_parent(get_entity(n)) - - @staticmethod - # move to api? - def remove_property(entity, prop): - # TODO only do something when it is necessary? - - if isinstance(prop, db.Entity): - name = prop.name - else: - name = prop - - while entity.get_property(name) is not None: - entity.remove_property(name) - - @staticmethod - # move to api? - def set_property(entity, prop, value, datatype=None): - AbstractCFood.remove_property(entity, prop) - - if datatype is not None: - entity.add_property(prop, value, datatype=datatype) - else: - entity.add_property(prop, value) - def assure_object_is_in_list(obj, containing_object, property_name, to_be_updated, datatype=None): @@ -564,12 +505,11 @@ def get_ids_for_entities_with_names(entities): class RowCFood(AbstractCFood): - def __init__(self, row, unique_cols, recordtype): + def __init__(self, item, unique_cols, recordtype, **kwargs): """ table : pandas table """ - super().__init__() - self.row = row + super().__init__(item, **kwargs) self.unique_cols = unique_cols self.recordtype = recordtype @@ -578,13 +518,79 @@ class RowCFood(AbstractCFood): rec.add_parent(self.recordtype) for col in self.unique_cols: - rec.add_property(col, self.row.loc[col]) + rec.add_property(col, self.item.loc[col]) self.identifiables.append(rec) def update_identifiables(self): rec = self.identifiables[0] - for key, value in self.row.iteritems(): + for key, value in self.item.iteritems(): if key in self.unique_cols: continue rec.add_property(key, value) + + +class CMeal(object): + """ + CMeal groups equivalent items and allow their collected insertion. + + Sometimes there is no one item that can be used to trigger the creation of + some Record. E.g. if a collection of image files shall be referenced from one + Record that groups them, it is unclear which image should trigger the + creation of the Record. + + CMeals are grouped based on the groups in the used regular expression. If, + in the above example, all the images reside in one folder, all groups of + the filename match except that for the file name should match. + The groups that shall match + need to be listed in the matching_groups class property. Subclasses will + overwrite this property. + + This allows to use has_suitable_cfood in the match_item function of a CFood + to check whether the necessary CFood was already created. + In order to allow this all instances of a + CFood class are tracked in the existing_instances class member. + + Subclasses must have a cls.get_re function and a match member variable + (see AbstractFileCFood) + """ + existing_instances = [] + matching_groups = [] + + def __init__(self): + self.__class__.existing_instances.append(self) + + @classmethod + def all_groups_equal(cls, m1, m2): + equal = True + + for group in cls.matching_groups: + if (group not in m1.groupdict() or + group not in m2.groupdict() or + m1.group(group) != m2.group(group)): + equal = False + + return equal + + @classmethod + def has_suitable_cfood(cls, item): + """ checks whether the required cfood object already exists. + + item : the crawled item + """ + match = re.match(cls.get_re(), item) + + for cfood in cls.existing_instances: + if cls.all_groups_equal(match, cfood.match): + return True + + return False + + def belongs_to_meal(self, item): + # This is already the main item + + if item == self.item: + return False + match = re.match(self.get_re(), item) + + return self.all_groups_equal(match, self.match) diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index db89551d..2877cd7f 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -82,44 +82,56 @@ class UnknownCache(object): class Crawler(object): - def __init__(self, food=None, access=lambda x: x, use_cache=False, - abort_on_exception=True, interactive=True): + def __init__(self, cfood_types, use_cache=False, + abort_on_exception=True, interactive=True, hideKnown=False): """ Parameters ---------- - food : list of CFood classes, optional + cfood_types : list of CFood classes The Crawler will use those CFoods when crawling. - pattern : str - The regex pattern for matching against file names. - use_cache : bool, optional Whether to use caching (not re-inserting probably existing objects into CaosDB), defaults to False. - - access : callable, optional - A function that takes a CaosDB path and returns a local path + abort_on_exception : if true, exceptions are raise. + Otherwise the crawler continues if an exception occurs. interactive : boolean, optional If true, questions will be posed during execution of the crawl function. """ - if food is None: - self.food = [] - else: - self.food = food + self.cfood_types = cfood_types self.interactive = interactive - self.access = access self.report = db.Container() self.use_cache = use_cache + self.hideKnown = hideKnown self.abort_on_exception = abort_on_exception if self.use_cache: self.cache = Cache() + def iteritems(self): + """ generates items to be crawled with an index""" + yield 0, None + def collect_cfoods(self): """ - to be overwritten by subclasses. + This is the first phase of the crawl. It collects all cfoods that shall + be processed. The second phase is iterating over cfoods and updating + CaosDB. This separate first step is necessary in order to allow a + single cfood being influenced by multiple crawled items. E.g. the + FileCrawler can have a single cfood treat multiple files. + + This is a very basic implementation and this function should be + overwritten by subclasses. + + The basic structure of this function should be, that what ever is + being processed is iterated and each cfood is checked whether the + item 'matches'. If it does, a cfood is instantiated passing the item + as an argument. + The match can depend on the cfoods already being created, i.e. a file + migth no longer match because it is already treaded by an earlier + cfood. should return cfoods, tbs and errors_occured. # TODO do this via logging? @@ -129,27 +141,83 @@ class Crawler(object): cfoods = [] tbs = [] errors_occured = False + matches = {idx: [] for idx, _ in self.iteritems()} - for food in self.food: - cfoods.append(food()) + logger.info(separated("Matching files against CFoods")) - for Cfood in self.food: - try: - cfood = Cfood() + for Cfood in self.cfood_types: + logger.debug("Matching against {}...".format(Cfood.__name__)) - if cfood is not None: - cfoods.append(cfood) - except Exception as e: - traceback.print_exc() - print(e) + for idx, item in self.iteritems(): + if Cfood.match_item(item): + try: + cfoods.append(Cfood(item)) + matches[idx].append(Cfood.__name__) + logger.debug("{} matched\n{}.".format( + Cfood.__name__, + item)) + except Exception as e: + traceback.print_exc() + print(e) - if self.abort_on_exception: - raise e - errors_occured = True - tbs.append(e) + if self.abort_on_exception: + raise e + errors_occured = True + tbs.append(e) + + logger.info(separated("CFoods are collecting information...")) + + for cfood in cfoods: + cfood.collect_information() + + logger.info(separated("Trying to attach further items to created CFoods")) + + for cfood in cfoods: + logger.debug("Matching against {}...".format(Cfood.__name__)) + + for idx, item in self.iteritems(): + if cfood.looking_for(item): + logger.debug("{} matched\n{}.".format( + cfood.__class__.__name__, + item)) + cfood.attach(item) + matches[idx].append(Cfood.__name__) + + self.check_matches(matches) return cfoods, tbs, errors_occured + def check_matches(self, matches): + # possibly load previously encountered "Missing matches" and + # "Multiple matches" + ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown) + + for idx, item in self.iteritems(): + if len(matches[idx]) == 0: + msg = ("ATTENTION: No matching cfood!\n" + "Tried to match {}\n".format(item)) + + if item in ucache.filenames: + logger.debug(msg) + else: + logger.warning(msg) + ucache.add(item) + + if len(matches[idx]) > 1: + msg = ("Attention: More than one matching cfood!\n" + + "Tried to match {}\n".format(item) + + "\tRecordTypes:\t" + ", ".join( + matches[idx])+"\n") + + if item in ucache.filenames: + logger.debug(msg) + else: + logger.warning(msg) + ucache.add(item) + + # Save the encountered prblem matches + ucache.save() + def cached_find_identifiables(self, identifiables): if self.use_cache: hashes = self.cache.update_ids_from_cache(identifiables) @@ -273,122 +341,30 @@ class Crawler(object): return r - @staticmethod - def query_files(path): - query_str = "FIND FILE WHICH IS STORED AT " + \ - (path if path.endswith("/") else path + "/") + "**" - logger.info("FILES QUERY: " + query_str) - files = db.execute_query(query_str) - logger.info("{} FILES TO BE PROCESSED.".format(len(files))) - - return files - class FileCrawler(Crawler): - def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs): + def __init__(self, files, **kwargs): """ Parameters ---------- files : files to be crawled - access : callable, optional - A function that takes a CaosDB path and returns a local path - """ super().__init__(**kwargs) self.files = files - self.access = access - self.hideKnown = hideKnown - - def match(self): - - files = sorted([f.path for f in self.files]) - errors_occured = False - tbs = [] - cfoods = [] - matches = {f: [] for f in files} - - logger.info(separated("Matching files against CFoods")) - for Cfood in self.food: - logger.debug("Matching against {}...".format(Cfood.__name__)) - - for crawled_file in files: - if Cfood.match_file(crawled_file) is not None: - matches[crawled_file].append(Cfood.__name__) - - logger.debug("{} matched\n{}.".format( - Cfood.__name__, - crawled_file)) - try: - cfood = Cfood.cook(crawled_file, access=self.access) - - if cfood is not None: - cfoods.append(cfood) - except Exception as e: - traceback.print_exc() - print(e) - - if self.abort_on_exception: - raise e - errors_occured = True - tbs.append(e) - - logger.info(separated("CFoods are collecting information...")) - - for cfood in cfoods: - cfood.collect_information() + def iteritems(self): + for idx, p in enumerate(sorted([f.path for f in self.files])): + yield idx, p - logger.info(separated("Trying to attach files to created CFoods")) - - for cfood in cfoods: - logger.debug("Matching against {}...".format(Cfood.__name__)) - - for crawled_file in files: - if cfood.looking_for(crawled_file): - logger.debug("{} matched\n{}.".format( - cfood.__class__.__name__, - crawled_file)) - cfood.attach(crawled_file) - matches[crawled_file].append(Cfood.__name__) - - # possibly load previously encountered "Missing matches" and - # "Multiple matches" - ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown) - - for crawled_file in files: - if len(matches[crawled_file]) == 0: - msg = ("ATTENTION: No matching cfood!\n" - "Tried to match {}\n".format(crawled_file)) - - if crawled_file in ucache.filenames: - logger.debug(msg) - else: - logger.warning(msg) - ucache.add(crawled_file) - - if len(matches[crawled_file]) > 1: - msg = ("Attention: More than one matching cfood!\n" - + "Tried to match {}\n".format(crawled_file) - + "\tRecordTypes:\t" + ", ".join( - matches[crawled_file])+"\n") - - if crawled_file in ucache.filenames: - logger.debug(msg) - else: - logger.warning(msg) - ucache.add(crawled_file) - - # Save the encountered prblem matches - ucache.save() - - return cfoods, tbs, errors_occured - - def collect_cfoods(self): - - cfoods, tbs, errors_occured = self.match() + @staticmethod + def query_files(path): + query_str = "FIND FILE WHICH IS STORED AT " + (path if path.endswith("/") else path + "/") + "**" + logger.info("FILES QUERY: " + query_str) + files = db.execute_query(query_str) + logger.info("{} FILES TO BE PROCESSED.".format(len(files))) - return cfoods, tbs, errors_occured + return files class TableCrawler(Crawler): @@ -402,32 +378,18 @@ class TableCrawler(Crawler): identifiable recordtype : Record Type of the Records to be created """ - super().__init__(**kwargs) self.table = table - self.unique_cols = unique_cols - self.recordtype = recordtype - - def collect_cfoods(self): - cfoods = [] - tbs = [] - errors_occured = False - - for _, row in self.table.iterrows(): - try: - cfood = RowCFood(row, self.unique_cols, self.recordtype) - if cfood is not None: - cfoods.append(cfood) - except Exception as e: - traceback.print_exc() - print(e) + # TODO I do not like this yet, but I do not see a better way so far. + class ThisRowCF(RowCFood): + def __init__(self, item): + super().__init__(item, unique_cols, recordtype) - if self.abort_on_exception: - raise e - errors_occured = True - tbs.append(e) + super().__init__(cfood_types=[ThisRowCF], **kwargs) - return cfoods, tbs, errors_occured + def iteritems(self): + for idx, row in self.table.iterrows(): + yield idx, row def get_value(prop): diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py index dcea2cbd..bfa3443a 100644 --- a/unittests/test_cfood.py +++ b/unittests/test_cfood.py @@ -21,33 +21,90 @@ # along with this program. If not, see <https://www.gnu.org/licenses/>. # # ** end header +import re import unittest import caosdb as db - -from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent, +from caosadvancedtools.cfood import (AbstractCFood, AbstractFileCFood, CMeal, + assure_has_parent, assure_object_is_in_list) from caosadvancedtools.example_cfood import ExampleCFood PATTERN = "h.*" -class TestCFood(AbstractFileCFood): +class ExampleCFoodMeal(AbstractFileCFood, CMeal): + matching_groups = ["test"] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # add the constructor of CMeal + CMeal.__init__(self) + + @classmethod + def match_item(cls, item): + """ standard match_match, but returns False if a suitable cfood exists """ + + if cls.has_suitable_cfood(item): + return False + + return re.match(cls.get_re(), item) is not None + + def looking_for(self, crawled_file): + """ standard looking_for, but returns True if the file matches all + groups""" + + if self.belongs_to_meal(crawled_file): + return True + + return super().looking_for(crawled_file) + + @staticmethod + def get_re(): + return r"/(?P<test>[a-z]*)/" + + def create_identifiables(self): + pass + + def update_identifiables(self): + pass + + +class SimpleCFood(AbstractFileCFood): @staticmethod def get_re(): return PATTERN +class DependendCFood(AbstractCFood): + existing = [] + + @classmethod + def match_item(cls, item): + if len(cls.existing) == 0: + return True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + DependendCFood.existing.append(self) + + def create_identifiables(self): + pass + + def update_identifiables(self): + pass + + class CFoodReTest(unittest.TestCase): def test(self): - self.assertEquals(TestCFood.get_re(), PATTERN) - self.assertEqual(TestCFood._pattern, None) - self.assertIsNotNone(TestCFood.match_file("hallo")) + self.assertEqual(SimpleCFood.get_re(), PATTERN) + self.assertEqual(SimpleCFood._pattern, None) + self.assertTrue(SimpleCFood.match_item("hallo")) # TODO the caching is of compiled re is disabled currently - # self.assertIsNotNone(TestCFood._pattern) - self.assertIsNotNone(TestCFood.match_file("hallo")) - self.assertIsNone(TestCFood.match_file("allo")) + # self.assertIsNotNone(SimpleCFood._pattern) + self.assertTrue(SimpleCFood.match_item("hallo")) + self.assertFalse(SimpleCFood.match_item("allo")) class InsertionTest(unittest.TestCase): @@ -75,10 +132,31 @@ class InsertionTest(unittest.TestCase): assert len(to_be_updated) == 0 +class DependendTest(unittest.TestCase): + def test(self): + self.assertTrue(DependendCFood.match_item(None)) + cf = DependendCFood(None) + self.assertFalse(DependendCFood.match_item(None)) + + class ExampleTest(unittest.TestCase): def test(self): path = "/data/rabbit/2019-03-03/README.md" cf = ExampleCFood(crawled_path=path) - self.assertIsNotNone(ExampleCFood.match_file(path)) + self.assertIsNotNone(ExampleCFood.match_item(path)) self.assertEqual(cf.match.group('species'), 'rabbit') self.assertEqual(cf.match.group('date'), '2019-03-03') + + +class MealTest(unittest.TestCase): + def test(self): + # path should match + self.assertTrue(ExampleCFoodMeal.match_item("/this/file")) + # create an instance + c = ExampleCFoodMeal("/this/file") + # same prefix should no longer match + self.assertFalse(ExampleCFoodMeal.match_item("/this/other")) + # but instance should be looking for this prefix + self.assertTrue(c.looking_for("/this/other")) + # class should still match other prefixes + self.assertTrue(ExampleCFoodMeal.match_item("/that/file")) -- GitLab