diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 9d2c5aedb1dbab515acce88e7736ed3f6f5ec72f..9daad27ef1bdb4da6659b0ad5402f8356c820742 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -4,6 +4,9 @@ RUN apt-get update && \ curl \ python3 \ python3-pip \ + python3-requests \ + python3-pandas \ + python3-html2text \ git \ openjdk-11-jdk-headless \ python-autopep8 \ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4cf9b51f775fd5c6a4ed5a92f0861089518e6293..699b03e22f9b315eccdd9ba038cfb840da9cb516 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -31,11 +31,12 @@ stages: - setup - cert - style - - test + - unittest + - integrationtest test: tags: [docker] - stage: test + stage: integrationtest image: $CI_REGISTRY_IMAGE_BASE script: - if [[ "$CAOSDB_TAG" == "" ]]; then @@ -106,3 +107,11 @@ style: script: - autopep8 -ar --diff --exit-code . allow_failure: true + +unittest: + tags: [docker] + stage: unittest + image: $CI_REGISTRY_IMAGE + script: + - cd src + - python3 -m pytest ../unittests diff --git a/README_SETUP.md b/README_SETUP.md index afea75279d347b1b7925a5972ec0b5b15274e708..cba784d3f7acb70ce20f0e06a55df8ad40c5a6d9 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -8,7 +8,7 @@ tox # Run Integration Tests Locally 1. Mount `integrationtests/full_test/extroot` to the folder that will be used as - extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/custom/extroot` + extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot` 2. Start an empty CaosDB instance 3. run test.sh diff --git a/integrationtests/full_test/crawl.py b/integrationtests/full_test/crawl.py index 622213ec8f97ca2c068665a7f49b434a2a6e1587..51ad24b9b3b1c8e9375f7c2e970b001bd777a77f 100755 --- a/integrationtests/full_test/crawl.py +++ b/integrationtests/full_test/crawl.py @@ -29,7 +29,8 @@ import sys from argparse import RawTextHelpFormatter import caosdb as db -from caosadvancedtools.crawler import Crawler + +from caosadvancedtools.crawler import FileCrawler from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard from caosadvancedtools.utils import set_log_level from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood, @@ -59,12 +60,13 @@ if __name__ == "__main__": args = parser.parse_args() logger.info("Starting query...") - files = Crawler.query_files(args.path) + files = FileCrawler.query_files(args.path) logger.info("Query done...") config = db.configuration.get_config() - c = Crawler(use_cache=True, access=access, - food=[ProjectCFood, - ExperimentCFood, AnalysisCFood, - PublicationCFood, SimulationCFood, - ]) - c.crawl(files, interactive=False, security_level=INSERT, hideKnown=True) + c = FileCrawler(files=files, use_cache=True, access=access, + interactive=False, hideKnown=True, + food=[ProjectCFood, + ExperimentCFood, AnalysisCFood, + PublicationCFood, SimulationCFood, + ]) + c.crawl(security_level=UPDATE) diff --git a/integrationtests/full_test/model.yml b/integrationtests/full_test/model.yml index e698909bfbad21196b239f60123013887bb4524b..a01836b364ae539f5500da72d8685a6061f4f97d 100644 --- a/integrationtests/full_test/model.yml +++ b/integrationtests/full_test/model.yml @@ -18,7 +18,10 @@ Person: lastName: datatype: TEXT description: 'LastName of a Person.' - responsible: + recommended_properties: + email: + datatype: TEXT + description: 'Email of a Person.' responsible: datatype: REFERENCE revisionOf: diff --git a/integrationtests/full_test/test.sh b/integrationtests/full_test/test.sh index f7e7191b7a26702ab0742ef8ab53ec4efe7518be..25906da86f45237be5bebee45c7d976fa5fac43b 100755 --- a/integrationtests/full_test/test.sh +++ b/integrationtests/full_test/test.sh @@ -3,6 +3,7 @@ rm -rf cache.db echo "Filling the database" ./filldb.sh echo "Testing the crawler database" -py.test-3 test_crawler.py +python3 -m pytest test_crawler.py +# TODO the following test deletes lots of the data inserted by the crawler echo "Testing im and export" python3 test_im_und_export.py diff --git a/integrationtests/full_test/test_table.py b/integrationtests/full_test/test_table.py new file mode 100644 index 0000000000000000000000000000000000000000..91b9947103b2e8830f879205ea8941e229a1ae1d --- /dev/null +++ b/integrationtests/full_test/test_table.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2020 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +import argparse +import logging +import sys +from argparse import RawTextHelpFormatter + +import caosdb as db +import pandas as pd + +from caosadvancedtools.crawler import TableCrawler +from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard +from caosadvancedtools.utils import set_log_level + +if __name__ == "__main__": + logger = logging.getLogger("caosadvancedtools") + conlogger = logging.getLogger("connection") + conlogger.setLevel(level=logging.ERROR) + logger.setLevel(level=logging.DEBUG) + + table = pd.read_csv("example_table.csv") + + assert 0 == len(db.execute_query("FIND Person with firstname=Henrik")) + first = table.loc[table.firstName == "Henrik"] + tcr = TableCrawler(table=first, unique_cols=["firstName", "lastName"], + recordtype="Person", interactive=False) + tcr.crawl(security_level=UPDATE) + assert 1 == len(db.execute_query("FIND Person with firstname=Henrik")) + tcr = TableCrawler(table=table, unique_cols=["firstName", "lastName"], + recordtype="Person", interactive=False) + tcr.crawl(security_level=UPDATE) + assert 1 == len(db.execute_query("FIND Person with firstname=Henrik")) + assert 1 == len(db.execute_query("FIND Person with firstname=Max")) diff --git a/unittests/test_labfolder_import.py b/manual_tests/test_labfolder_import.py similarity index 95% rename from unittests/test_labfolder_import.py rename to manual_tests/test_labfolder_import.py index 0508f3e2e2bd716a4f33a9317bdd48ebd54d37dc..e1e9d3266478900b7fae02b3493fbc3d41ea2bd5 100644 --- a/unittests/test_labfolder_import.py +++ b/manual_tests/test_labfolder_import.py @@ -27,7 +27,7 @@ import sys import caosmodels from caosmodels.parser import parse_model_from_yaml -from caosadvancedtools.converter import labfolder +from caosadvancedtools.converter import labfolder_export as labfolder def main(args): diff --git a/unittests/test_labfolder_retrieve.py b/manual_tests/test_labfolder_retrieve.py similarity index 100% rename from unittests/test_labfolder_retrieve.py rename to manual_tests/test_labfolder_retrieve.py diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 59c3a4ee9bd7a33239ef35086daf0f1b904e1003..2128d6b1c24610c423ec4534b5b1dd20ada631cd 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -7,6 +7,7 @@ # Copyright (C) 2018 Research Group Biomedical Physics, # Max-Planck-Institute for Dynamics and Self-Organization Göttingen # Copyright (C) 2019 Henrik tom Wörden +# Copyright (C) 2020 Henrik tom Wörden # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -62,46 +63,176 @@ def get_entity(name): class AbstractCFood(object): + + def __init__(self): + """ Abstract base class for Crawler food (CFood).""" + self.to_be_updated = db.Container() + self.identifiables = db.Container() + + def create_identifiables(self): + """ + should set the instance variable Container with the identifiables + """ + raise NotImplementedError() + + def update_identifiables(self): + """ Changes the identifiables as needed and adds changed identifiables + to self.to_be_updated + """ + raise NotImplementedError() + + def push_identifiables_to_CaosDB(self): + """ Updates the self.to_be_updated Container, i.e. pushes the changes + to CaosDB + """ + + if len(self.to_be_updated) == 0: + return + + get_ids_for_entities_with_names(self.to_be_updated) + + # remove duplicates + tmp = db.Container() + + for el in self.to_be_updated: + if el not in tmp: + tmp.append(el) + + self.to_be_updated = tmp + + logger.info("UPDATE: updating the following entities") + + for el in self.to_be_updated: + logger.info("\t" + el.name if el.name is not None else el.id) + + logger.debug(self.to_be_updated) + guard.safe_update(self.to_be_updated) + + @staticmethod + # move to api? + def set_parents(entity, names): + entity.parents.clear() + + for n in names: + entity.add_parent(get_entity(n)) + + @staticmethod + # move to api? + def remove_property(entity, prop): + # TODO only do something when it is necessary? + + if isinstance(prop, db.Entity): + name = prop.name + else: + name = prop + + while entity.get_property(name) is not None: + entity.remove_property(name) + + @staticmethod + # move to api? + def set_property(entity, prop, value, datatype=None): + AbstractCFood.remove_property(entity, prop) + + if datatype is not None: + entity.add_property(prop, value, datatype=datatype) + else: + entity.add_property(prop, value) + + +class CMeal(object): + """ + CMeal groups equivalent Files and allow their collected insertion. + + Sometimes there is no one file that can be used to trigger the creation of + some Record. E.g. if a collection of images shall be referenced from one + Record that groups them, it is unclear which image should trigger the + creation of the Record. + + CMeals are grouped based on the groups in the used regular expression. If, + in the above example, all the images reside in one folder, all groups + except that for the file name should match. The groups that shall match + need to be listed in the matching_groups class property. Subclasses will + overwrite this property. + + The cook function of a cfood allows this class to work. Instead of directly + instantiating a CFood the cook function is used. If the CFood is also a + child of CMeal, it will be checked (using get_suitable_cfood) in the cook + function whether a new CFood should be created or if the file match should + be added to an existing one. In order to allow this all instances of a + CFood class are tracked in the existing_instances class member. + """ + existing_instances = [] + matching_groups = [] + + def __init__(self, *args, **kwargs): + self.__class__.existing_instances.append(self) + self.crawled_files = [] + + def add(self, crawled_file): + self.crawled_files.append(crawled_file) + + @classmethod + def get_suitable_cfood(cls, match): + for cfood in cls.existing_instances: + suitable = True + + for group in cls.matching_groups: + if (group not in match.groupdict() or + group not in cfood.match.groupdict() or + match.group(group) != cfood.match.group(group)): + suitable = False + + if suitable: + return cfood + + return None + + +def get_entity_for_path(path): + try: + q = "FIND FILE WHICH IS STORED AT '{}'".format(path) + + return db.execute_query(q, unique=True) + except EntityDoesNotExistError: + path_prefix = "**" + + if not path.startswith("/"): + path_prefix = path_prefix + "/" + q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path_prefix, path) + logger.debug(q) + + return db.execute_query(q, unique=True) + + +class AbstractFileCFood(AbstractCFood): # contains the compiled regular expression after the first execution of the # function match() _pattern = None - def __init__(self, crawled_file, access=lambda x: x): - """ Abstract base class for Crawler food (CFood). + def __init__(self, crawled_path, access=lambda x: x): + """ Abstract base class for file based Crawler food (CFood). Parameters ---------- - crawled_file : The file that the crawler is currently matching. Its + crawled_path : The file that the crawler is currently matching. Its path should match against the pattern of this class access : callable, optional A function that takes a CaosDB path and returns a local path """ + super().__init__() self.access = access self._crawled_file = None - self.crawled_path = crawled_file - self.match = type(self).match(crawled_file) - self.to_be_updated = db.Container() - self.identifiables = db.Container() + self.crawled_path = crawled_path + self.match = type(self).match_file(crawled_path) self.attached_ones = [] self.attached_filenames = [] @property def crawled_file(self): if self._crawled_file is None: - try: - q = "FIND FILE WHICH IS STORED AT '{}'".format( - self.crawled_path) - self._crawled_file = db.execute_query(q, unique=True) - except EntityDoesNotExistError: - path = "**" - - if not self.crawled_path.startswith("/"): - path = path + "/" - q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path, - self.crawled_path) - logger.debug(q) - self._crawled_file = db.execute_query(q, unique=True) + self._crawled_file = get_entity_for_path(self.crawled_path) return self._crawled_file @@ -146,7 +277,7 @@ class AbstractCFood(object): return cls(crawled_file, **kwargs) @classmethod - def match(cls, string): + def match_file(cls, string): """ Matches the regular expression of this class against file names Parameters @@ -164,48 +295,12 @@ class AbstractCFood(object): return re.match(cls.get_re(), string) - def create_identifiables(self): - """ - should set the instance variable Container with the identifiables - """ - raise NotImplementedError() - - def update_identifiables(self): - """ Changes the identifiables as needed and adds changed identifiables - to self.to_be_updated - """ - raise NotImplementedError() - - def push_identifiables_to_CaosDB(self): - """ Updates the self.to_be_updated Container, i.e. pushes the changes - to CaosDB - """ - - if len(self.to_be_updated) == 0: - return - - get_ids_for_entities_with_names(self.to_be_updated) - - # remove duplicates - tmp = db.Container() - - for el in self.to_be_updated: - if el not in tmp: - tmp.append(el) - - self.to_be_updated = tmp - - logger.info("UPDATE: updating the following entities") - - for el in self.to_be_updated: - logger.info("\t" + el.name if el.name is not None else el.id) - - logger.debug(self.to_be_updated) - guard.safe_update(self.to_be_updated) - def attach(self, crawled_file): self.attached_ones.append(crawled_file) + # TODO looking for should `attach` the files itsself. This would allow to + # group them right away and makes it unnecessary to check matches later + # again. def looking_for(self, crawled_file): """ returns True if crawled_file can be added to this CFood. @@ -218,6 +313,8 @@ class AbstractCFood(object): This function can be used to define what files shall be 'attached'. """ + # TODO rename to filenames_to_be_attached + if crawled_file in self.attached_filenames: return True @@ -280,7 +377,6 @@ def assure_object_is_in_list(obj, containing_object, property_name, if not isinstance(containing_object.get_property(property_name).value, list): containing_object.get_property(property_name).value = [containing_object.get_property(property_name).value] - containing_object.get_property(property_name).value containing_object.get_property(property_name).datatype = datatype current_list = containing_object.get_property(property_name).value @@ -418,6 +514,9 @@ def assure_has_property(entity, name, value, to_be_updated=None, name.lower()] contained = False + if isinstance(value, db.Entity): + value = value.id + for el in possible_properties: if el.value == value: contained = True @@ -464,50 +563,28 @@ def get_ids_for_entities_with_names(entities): insert_id_based_on_name(ent) -class CMeal(object): - """ - CMeal groups equivalent CFoods and allow their collected insertion. - - Sometimes there is no one file that can be used to trigger the creation of - some Record. E.g. if a collection of images shall be referenced from one - Record that groups them, it is unclear which image should trigger the - creation of the Record. - - CMeals are grouped based on the groups in the used regular expression. If, - in the above example, all the images reside in one folder, all groups - except that for the file name should match. The groups that shall match - need to be listed in the matching_groups class property. Subclasses will - overwrite this property. - - The cook function of a cfood allows this class to work. Instead of directly - instantiating a CFood the cook function is used. If the CFood is also a - child of CMeal, it will be checked (using get_suitable_cfood) in the cook - function whether a new CFood should be created or if the file match should - be added to an existing one. In order to allow this all instances of a - CFood class are tracked in the existing_instances class member. - """ - existing_instances = [] - matching_groups = [] - - def __init__(self, *args, **kwargs): - self.existing_instances.append(self) - self.crawled_files = [] - - def add(self, crawled_file): - self.crawled_files.append(crawled_file) +class RowCFood(AbstractCFood): + def __init__(self, row, unique_cols, recordtype): + """ + table : pandas table + """ + super().__init__() + self.row = row + self.unique_cols = unique_cols + self.recordtype = recordtype - @classmethod - def get_suitable_cfood(cls, match): - for cfood in cls.existing_instances: - suitable = True + def create_identifiables(self): + rec = db.Record() + rec.add_parent(self.recordtype) - for group in cls.matching_groups: - if (group not in match.groupdict() or - group not in cfood.match.groupdict() or - match.group(group) != cfood.match.group(group)): - suitable = False + for col in self.unique_cols: + rec.add_property(col, self.row.loc[col]) + self.identifiables.append(rec) - if suitable: - return cfood + def update_identifiables(self): + rec = self.identifiables[0] - return None + for key, value in self.row.iteritems(): + if key in self.unique_cols: + continue + rec.add_property(key, value) diff --git a/src/caosadvancedtools/converter/labfolder_api.py b/src/caosadvancedtools/converter/labfolder_api.py index 82b2f4f4a042ba3b9350c3f9a87121914f27e0e0..567ee5a8aa7fdb1176fcbcc9bff96dfb6a19b821 100644 --- a/src/caosadvancedtools/converter/labfolder_api.py +++ b/src/caosadvancedtools/converter/labfolder_api.py @@ -28,7 +28,6 @@ import time import html2text import caosdb as db -import labfolder.connection from labfolder.connection import configure_connection diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index d581d7599907e3aac044c49dd65f401b1d9ffcc7..db89551d703c604e70cf0d7f0e656094a24629a3 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -6,6 +6,7 @@ # # Copyright (C) 2018 Research Group Biomedical Physics, # Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# Copyright (C) 2020 Henrik tom Wörden # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as @@ -45,7 +46,8 @@ import caosdb as db from caosdb.exceptions import TransactionError from .cache import Cache -from .guard import INSERT, RETRIEVE, UPDATE +from .cfood import RowCFood +from .guard import RETRIEVE from .guard import global_guard as guard logger = logging.getLogger(__name__) @@ -80,11 +82,13 @@ class UnknownCache(object): class Crawler(object): - def __init__(self, food, access=lambda x: x, use_cache=False, - abort_on_exception=True): + def __init__(self, food=None, access=lambda x: x, use_cache=False, + abort_on_exception=True, interactive=True): """ Parameters ---------- + food : list of CFood classes, optional + The Crawler will use those CFoods when crawling. pattern : str The regex pattern for matching against file names. @@ -94,8 +98,17 @@ class Crawler(object): access : callable, optional A function that takes a CaosDB path and returns a local path + interactive : boolean, optional + If true, questions will be posed during execution of the + crawl function. + """ - self.food = food + + if food is None: + self.food = [] + else: + self.food = food + self.interactive = interactive self.access = access self.report = db.Container() self.use_cache = use_cache @@ -104,98 +117,54 @@ class Crawler(object): if self.use_cache: self.cache = Cache() - def match(self, files, interactive, hideKnown=False): - errors_occured = False - tbs = [] + def collect_cfoods(self): + """ + to be overwritten by subclasses. + + should return cfoods, tbs and errors_occured. + # TODO do this via logging? + tbs text returned from traceback + errors_occured True if at least one error occured + """ cfoods = [] - matches = {f: [] for f in files} + tbs = [] + errors_occured = False - logger.info(separated("Matching files against CFoods")) + for food in self.food: + cfoods.append(food()) for Cfood in self.food: - logger.debug("Matching against {}...".format(Cfood.__name__)) - - for crawled_file in files: - if Cfood.match(crawled_file) is not None: - matches[crawled_file].append(Cfood.__name__) - - logger.debug("{} matched\n{}.".format( - Cfood.__class__.__name__, - crawled_file)) - try: - cfood = Cfood.cook(crawled_file, access=self.access) - - if cfood is not None: - cfoods.append(cfood) - except Exception as e: - traceback.print_exc() - print(e) - - if self.abort_on_exception: - raise e - errors_occured = True - tbs.append(e) - - logger.info(separated("CFoods are collecting information...")) - - for cfood in cfoods: - cfood.collect_information() - - logger.info(separated("Trying to attach files to created CFoods")) - - for cfood in cfoods: - logger.debug("Matching against {}...".format(Cfood.__name__)) - - for crawled_file in files: - if cfood.looking_for(crawled_file): - logger.debug("{} matched\n{}.".format( - Cfood.__class__.__name__, - crawled_file)) - cfood.attach(crawled_file) - matches[crawled_file].append(Cfood.__name__) - - # possibly load previously encountered "Missing matches" and - # "Multiple matches" - ucache = UnknownCache(interactive=interactive, load=hideKnown) + try: + cfood = Cfood() - for crawled_file in files: - if len(matches[crawled_file]) == 0: - msg = ("ATTENTION: No matching cfood!\n" - "Tried to match {}\n".format(crawled_file)) + if cfood is not None: + cfoods.append(cfood) + except Exception as e: + traceback.print_exc() + print(e) - if crawled_file in ucache.filenames: - logger.debug(msg) - else: - logger.warn(msg) - ucache.add(crawled_file) + if self.abort_on_exception: + raise e + errors_occured = True + tbs.append(e) - if len(matches[crawled_file]) > 1: - msg = ("Attention: More than one matching cfood!\n" - + "Tried to match {}\n".format(crawled_file) - + "\tRecordTypes:\t" + ", ".join( - matches[crawled_file])+"\n") + return cfoods, tbs, errors_occured - if crawled_file in ucache.filenames: - logger.debug(msg) - else: - logger.warn(msg) - ucache.add(crawled_file) + def cached_find_identifiables(self, identifiables): + if self.use_cache: + hashes = self.cache.update_ids_from_cache(identifiables) - # Save the encountered prblem matches - ucache.save() + self.find_or_insert_identifiables(identifiables) - return cfoods, matches, tbs, errors_occured + if self.use_cache: + self.cache.insert_list(hashes, identifiables) - def crawl(self, files, interactive=True, hideKnown=False, - security_level=RETRIEVE): + def crawl(self, security_level=RETRIEVE): guard.set_level(level=security_level) - files = sorted([f.path for f in files]) - - cfoods, matches, tbs, errors_occured = self.match(files, interactive, - hideKnown=hideKnown) + cfoods, tbs, errors_occured = self.collect_cfoods() - if interactive and "y" != input("Do you want to continue? (y)"): + if self.interactive and "y" != input("Do you want to continue? (y)"): return logger.info(separated("Creating and updating Identifiables")) @@ -204,14 +173,7 @@ class Crawler(object): try: cfood.create_identifiables() - if self.use_cache: - hashes = self.cache.update_ids_from_cache( - cfood.identifiables) - - self.find_or_insert_identifiables(cfood.identifiables) - - if self.use_cache: - self.cache.insert_list(hashes, cfood.identifiables) + self.cached_find_identifiables(cfood.identifiables) cfood.update_identifiables() cfood.push_identifiables_to_CaosDB() @@ -225,8 +187,8 @@ class Crawler(object): tbs.append(e) if errors_occured: - logger.warn("Crawler terminated with failures!") - logger.warn(tbs) + logger.warning("Crawler terminated with failures!") + logger.warning(tbs) else: logger.info("Crawler terminated successfully!") @@ -322,6 +284,152 @@ class Crawler(object): return files +class FileCrawler(Crawler): + def __init__(self, files, access=lambda x: x, hideKnown=False, **kwargs): + """ + Parameters + ---------- + files : files to be crawled + + access : callable, optional + A function that takes a CaosDB path and returns a local path + + """ + super().__init__(**kwargs) + self.files = files + self.access = access + self.hideKnown = hideKnown + + def match(self): + + files = sorted([f.path for f in self.files]) + errors_occured = False + tbs = [] + cfoods = [] + matches = {f: [] for f in files} + + logger.info(separated("Matching files against CFoods")) + + for Cfood in self.food: + logger.debug("Matching against {}...".format(Cfood.__name__)) + + for crawled_file in files: + if Cfood.match_file(crawled_file) is not None: + matches[crawled_file].append(Cfood.__name__) + + logger.debug("{} matched\n{}.".format( + Cfood.__name__, + crawled_file)) + try: + cfood = Cfood.cook(crawled_file, access=self.access) + + if cfood is not None: + cfoods.append(cfood) + except Exception as e: + traceback.print_exc() + print(e) + + if self.abort_on_exception: + raise e + errors_occured = True + tbs.append(e) + + logger.info(separated("CFoods are collecting information...")) + + for cfood in cfoods: + cfood.collect_information() + + logger.info(separated("Trying to attach files to created CFoods")) + + for cfood in cfoods: + logger.debug("Matching against {}...".format(Cfood.__name__)) + + for crawled_file in files: + if cfood.looking_for(crawled_file): + logger.debug("{} matched\n{}.".format( + cfood.__class__.__name__, + crawled_file)) + cfood.attach(crawled_file) + matches[crawled_file].append(Cfood.__name__) + + # possibly load previously encountered "Missing matches" and + # "Multiple matches" + ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown) + + for crawled_file in files: + if len(matches[crawled_file]) == 0: + msg = ("ATTENTION: No matching cfood!\n" + "Tried to match {}\n".format(crawled_file)) + + if crawled_file in ucache.filenames: + logger.debug(msg) + else: + logger.warning(msg) + ucache.add(crawled_file) + + if len(matches[crawled_file]) > 1: + msg = ("Attention: More than one matching cfood!\n" + + "Tried to match {}\n".format(crawled_file) + + "\tRecordTypes:\t" + ", ".join( + matches[crawled_file])+"\n") + + if crawled_file in ucache.filenames: + logger.debug(msg) + else: + logger.warning(msg) + ucache.add(crawled_file) + + # Save the encountered prblem matches + ucache.save() + + return cfoods, tbs, errors_occured + + def collect_cfoods(self): + + cfoods, tbs, errors_occured = self.match() + + return cfoods, tbs, errors_occured + + +class TableCrawler(Crawler): + + def __init__(self, table, unique_cols, recordtype, **kwargs): + """ + Parameters + ---------- + table : pandas DataFrame + unique_cols : the columns that provide the properties for the + identifiable + recordtype : Record Type of the Records to be created + """ + super().__init__(**kwargs) + self.table = table + self.unique_cols = unique_cols + self.recordtype = recordtype + + def collect_cfoods(self): + cfoods = [] + tbs = [] + errors_occured = False + + for _, row in self.table.iterrows(): + try: + cfood = RowCFood(row, self.unique_cols, self.recordtype) + + if cfood is not None: + cfoods.append(cfood) + except Exception as e: + traceback.print_exc() + print(e) + + if self.abort_on_exception: + raise e + errors_occured = True + tbs.append(e) + + return cfoods, tbs, errors_occured + + def get_value(prop): """ Returns the value of a Property diff --git a/src/caosadvancedtools/example_cfood.py b/src/caosadvancedtools/example_cfood.py index 07456df068263d65206f39072c6d5b330da756bf..6111d95defc37bbb6d836feec3fa3d2e4e3d91ab 100644 --- a/src/caosadvancedtools/example_cfood.py +++ b/src/caosadvancedtools/example_cfood.py @@ -22,10 +22,10 @@ import caosdb as db -from .cfood import AbstractCFood, assure_has_property +from .cfood import AbstractFileCFood, assure_has_property -class ExampleCFood(AbstractCFood): +class ExampleCFood(AbstractFileCFood): @staticmethod def get_re(): return (r".*/(?P<species>[^/]+)/" diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py index 9122856dcdee795c4ec5cd14ee1591df3dbff3af..dcea2cbd7ab02850a1e1bbc59b854eafd4bdd62f 100644 --- a/unittests/test_cfood.py +++ b/unittests/test_cfood.py @@ -24,14 +24,15 @@ import unittest import caosdb as db -from caosadvancedtools.cfood import (AbstractCFood, assure_has_parent, + +from caosadvancedtools.cfood import (AbstractFileCFood, assure_has_parent, assure_object_is_in_list) from caosadvancedtools.example_cfood import ExampleCFood PATTERN = "h.*" -class TestCFood(AbstractCFood): +class TestCFood(AbstractFileCFood): @staticmethod def get_re(): @@ -42,11 +43,11 @@ class CFoodReTest(unittest.TestCase): def test(self): self.assertEquals(TestCFood.get_re(), PATTERN) self.assertEqual(TestCFood._pattern, None) - self.assertIsNotNone(TestCFood.match("hallo")) + self.assertIsNotNone(TestCFood.match_file("hallo")) # TODO the caching is of compiled re is disabled currently # self.assertIsNotNone(TestCFood._pattern) - self.assertIsNotNone(TestCFood.match("hallo")) - self.assertIsNone(TestCFood.match("allo")) + self.assertIsNotNone(TestCFood.match_file("hallo")) + self.assertIsNone(TestCFood.match_file("allo")) class InsertionTest(unittest.TestCase): @@ -77,7 +78,7 @@ class InsertionTest(unittest.TestCase): class ExampleTest(unittest.TestCase): def test(self): path = "/data/rabbit/2019-03-03/README.md" - cf = ExampleCFood(crawled_file=path) - self.assertIsNotNone(ExampleCFood.match(path)) + cf = ExampleCFood(crawled_path=path) + self.assertIsNotNone(ExampleCFood.match_file(path)) self.assertEqual(cf.match.group('species'), 'rabbit') self.assertEqual(cf.match.group('date'), '2019-03-03')