diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index 82ada2fa02cd3b87415493e1c1d5499fc3dc8a82..5c7ff4f954fa939afd359b9d4c3fe5db58396206 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -66,6 +66,66 @@ def separated(text):
     return "-"*60 + "\n" + text
 
 
+def apply_list_of_updates(to_be_updated, update_flags,
+                          update_cache=None, run_id=None):
+    """
+    Updates the to_be_updated Container, i.e. pushes the changes to CaosDB
+    """
+
+    if len(to_be_updated) == 0:
+        return
+
+    get_ids_for_entities_with_names(to_be_updated)
+
+    # remove duplicates
+    tmp = db.Container()
+
+    for el in to_be_updated:
+        if el not in tmp:
+            tmp.append(el)
+
+    to_be_updated = tmp
+
+    info = "UPDATE: updating the following entities\n"
+
+    baseurl = db.configuration.get_config()["Connection"]["url"]
+
+    for el in to_be_updated:
+        def make_clickable(txt, id):
+            return "<a href='{}/Entity/{}'>{}</a>".format(baseurl, id, txt)
+        info += str("\t" + make_clickable(el.name, el.id)
+                    if el.name is not None
+                    else "\t" + make_clickable(str(el.id), el.id))
+        info += "\n"
+    logger.info(info)
+
+    logger.debug(to_be_updated)
+    try:
+        if len(to_be_updated) > 0:
+            logger.info(
+                "Updating {} Records...".format(
+                    len(to_be_updated)))
+        guard.safe_update(to_be_updated, unique=False,
+                          flags=update_flags)
+    except FileNotFoundError as e:
+        logger.info("Cannot access {}. However, it might be needed for"
+                    " the correct execution".format(e.filename))
+    except ProhibitedException:
+        try:
+            update_cache.insert(to_be_updated, run_id)
+        except IntegrityError as e:
+            logger.warning(
+                "There were problems with the update of {}.".format(
+                    to_be_updated),
+                extra={"identifier": str(to_be_updated),
+                       "category": "update-cache"}
+            )
+            logger.debug(traceback.format_exc())
+            logger.debug(e)
+    except Exception as e:
+        DataModelProblems.evaluate_exception(e)
+
+
 class Crawler(object):
     def __init__(self, cfood_types, use_cache=False,
                  abort_on_exception=True, interactive=True, hideKnown=False,
@@ -318,7 +378,11 @@ class Crawler(object):
                 self._cached_find_or_insert_identifiables(cfood.identifiables)
 
                 cfood.update_identifiables()
-                self.push_identifiables_to_CaosDB(cfood)
+                apply_list_of_updates(
+                    cfood.to_be_updated,
+                    cfood.update_flags,
+                    update_cache=self.update_cache,
+                    run_id=self.run_id)
             except FileNotFoundError as e:
                 logger.info("Cannot access {}. However, it might be needed for"
                             " the correct execution".format(e.filename))
@@ -516,64 +580,8 @@ carefully and if the changes are ok, click on the following link:
                 subject="Crawler Update",
                 body=text)
 
-    def push_identifiables_to_CaosDB(self, cfood):
-        """
-        Updates the to_be_updated Container, i.e. pushes the changes to CaosDB
-        """
-
-        if len(cfood.to_be_updated) == 0:
-            return
-
-        get_ids_for_entities_with_names(cfood.to_be_updated)
-
-        # remove duplicates
-        tmp = db.Container()
-
-        for el in cfood.to_be_updated:
-            if el not in tmp:
-                tmp.append(el)
-
-        cfood.to_be_updated = tmp
-
-        info = "UPDATE: updating the following entities\n"
-
-        baseurl = db.configuration.get_config()["Connection"]["url"]
-        for el in cfood.to_be_updated:
-            def make_clickable(txt, id):
-                return "<a href='{}/Entity/{}'>{}</a>".format(baseurl, id, txt)
-            info += str("\t" + make_clickable(el.name, el.id)
-                        if el.name is not None
-                        else "\t" + make_clickable(str(el.id), el.id))
-            info += "\n"
-        logger.info(info)
-
-        logger.debug(cfood.to_be_updated)
-        try:
-            if len(cfood.to_be_updated) > 0:
-                logger.info(
-                    "Updating {} Records...".format(
-                        len(cfood.to_be_updated)))
-            guard.safe_update(cfood.to_be_updated, unique=False,
-                              flags=cfood.update_flags)
-        except FileNotFoundError as e:
-            logger.info("Cannot access {}. However, it might be needed for"
-                        " the correct execution".format(e.filename))
-        except ProhibitedException:
-            try:
-                self.update_cache.insert(cfood.to_be_updated, self.run_id)
-            except IntegrityError as e:
-                logger.warning(
-                    "There were problems with the update of {}.".format(
-                        cfood.to_be_updated),
-                    extra={"identifier": str(cfood.to_be_updated),
-                           "category": "update-cache"}
-                )
-                logger.debug(traceback.format_exc())
-                logger.debug(e)
-        except Exception as e:
-            DataModelProblems.evaluate_exception(e)
-
-    # TODO remove static?
+    @staticmethod
     def find_or_insert_identifiables(identifiables):
         """
         Sets the ids of identifiables (that do not have already an id from the
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 7b4414b7bc24e93e5035bb9159b5709351360313..fee39ced2ed47da9d85d9430cce2d18e9a4023ca 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -51,7 +51,7 @@ def assure_name_format(name):
     name = str(name)
 
     if len(name.split(",")) != 2:
-        raise ValueError("Name field should be 'LastName, FirstName'."
+        raise ValueError("The field value should be 'LastName, FirstName'. "
                          "The supplied value was '{}'.".format(name))
 
     return name
@@ -303,14 +303,20 @@ class TableImporter(object):
         """
 
         for key, datatype in self.datatypes.items():
-            for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].iteritems():
+            for idx, val in df.loc[
+                    pd.notnull(df.loc[:, key]), key].iteritems():
+
                 if not isinstance(val, datatype):
-                    raise DataInconsistencyError(
-                        "In row no. {rn} and column {c} of file '{fi}' the "
+                    msg = (
+                        "In row no. {rn} and column '{c}' of file '{fi}' the "
                         "datatype was {was} but it should be "
                         "{expected}".format(rn=idx, c=key, fi=filename,
-                                            was=type(val), expected=datatype)
+                                            was=str(type(val)).strip("<>"),
+                                            expected=str(datatype).strip("<>"))
                     )
+                    logger.warning(msg, extra={'identifier': filename,
+                                               'category': "inconsistency"})
+                    raise DataInconsistencyError(msg)
 
     def check_missing(self, df, filename=None):
         """
@@ -394,7 +400,7 @@ class XLSImporter(TableImporter):
             df = xls_file.parse(converters=self.converters, **kwargs)
         except Exception as e:
             logger.warning(
-                "Cannot parse {}.".format(filename),
+                "Cannot parse {}.\n{}".format(filename, e),
                 extra={'identifier': str(filename),
                        'category': "inconsistency"})
             raise DataInconsistencyError(*e.args)
@@ -411,7 +417,7 @@ class CSVImporter(TableImporter):
                              **kwargs)
         except ValueError as ve:
             logger.warning(
-                "Cannot parse {}.".format(filename),
+                "Cannot parse {}.\n{}".format(filename, ve),
                 extra={'identifier': str(filename),
                        'category': "inconsistency"})
             raise DataInconsistencyError(*ve.args)
@@ -428,7 +434,7 @@ class TSVImporter(TableImporter):
                              **kwargs)
         except ValueError as ve:
             logger.warning(
-                "Cannot parse {}.".format(filename),
+                "Cannot parse {}.\n{}".format(filename, ve),
                 extra={'identifier': str(filename),
                        'category': "inconsistency"})
             raise DataInconsistencyError(*ve.args)
diff --git a/src/doc/crawler.rst b/src/doc/crawler.rst
index c52bbf2fe9b9f5fd77805e45ec85d195f5aa95f3..4b99c97e6db16e5691f373fa5fb4903e4d078155 100644
--- a/src/doc/crawler.rst
+++ b/src/doc/crawler.rst
@@ -131,6 +131,10 @@ The behavior and rules of the crawler are defined in logical units
 called CFoods. In order to extend the crawler you need to extend an
 existing CFood or create new one.
 
+.. Note:: A crawler always needs a corresponding data model to exist in the
+          server. The following sections do not cover this aspect. Please
+          refer, for example, to the documentation of the YAML Interface.
+
 .. _c-food-introduction:
 
 CFood -- Introduction
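
A minimal sketch of what the extraction enables: calling apply_list_of_updates
directly, without a Crawler instance. The UpdateCache instance, the uuid-based
run id, the empty flags dict, and the entity id are illustrative assumptions,
not something this diff prescribes.

    # Sketch: pushing a list of updates without going through Crawler.
    # UpdateCache, the run id, and the entity id are assumptions.
    import uuid

    import caosdb as db

    from caosadvancedtools.cache import UpdateCache
    from caosadvancedtools.crawler import apply_list_of_updates

    to_be_updated = db.Container()
    to_be_updated.append(db.Record(id=1234))  # hypothetical existing entity

    apply_list_of_updates(to_be_updated,
                          update_flags={},
                          update_cache=UpdateCache(),
                          run_id=uuid.uuid1())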
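
The reworked message in check_datatype can be exercised with a toy table. A
sketch, assuming the TableImporter constructor arguments and the
check_datatype signature shown here (the column name and file name are made
up):

    # Sketch: triggering the improved datatype error message.
    import pandas as pd

    from caosadvancedtools.table_importer import (DataInconsistencyError,
                                                  TableImporter)

    importer = TableImporter(converters={},
                             obligatory_columns=["name"],
                             datatypes={"name": str})
    df = pd.DataFrame({"name": [42]})  # wrong type on purpose

    try:
        importer.check_datatype(df, filename="example.xls")
    except DataInconsistencyError as e:
        # e.g. "In row no. 0 and column 'name' of file 'example.xls' the
        # datatype was class 'numpy.int64' but it should be class 'str'"
        print(e)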