crawl_sample_data_async.py

#!/usr/bin/env python3
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 GEOMAR
# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public
# License along with this program. If not, see
# <https://www.gnu.org/licenses/>.
#

import logging
import os
import pandas as pd
import re
import sys

from dateutil import parser as dateparser
from pickle import UnpicklingError

import linkahead as db

from caosadvancedtools.datainconsistency import DataInconsistencyError
from caosadvancedtools.serverside import helper
from caoscrawler import Crawler, SecurityMode
from caoscrawler.crawl import ForbiddenTransaction, _notify_about_inserts_and_updates
from caoscrawler.exceptions import ImpossibleMergeError
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.logging import configure_server_side_logging
from linkahead.cached import cached_get_entity_by
from linkahead.common.datatype import get_id_of_datatype

from bis_utils import (get_do_not_insert_type_names,
                       IGNORED_COLUMN_NAMES_SAMPLE,
                       return_value_if_not_none,
                       send_mail_with_defaults,
                       SPECIAL_TREATMENT_SAMPLE)

# suppress warning of diff function
apilogger = logging.getLogger("linkahead.apiutils")
apilogger.setLevel(logging.ERROR)

logger = logging.getLogger("caosadvancedtools")


def _notify_about_error(text, subject):

    logger.error(text)
    send_mail_with_defaults(subject=subject, body=text)


def _is_ignored_column_name(name, parent_suffix="_parent"):

    return name in IGNORED_COLUMN_NAMES_SAMPLE or name.endswith(parent_suffix)


def _update_property(entity: db.Record, property_id: int, value, property_name="", datatype=None):
    """
    Set the property of an entity.

    If the entity already has the property, just the value is set.
    Else, the property is added to the entity
    """
    # TODO: Replace by assure_property_is etc.

    # If the value in the spreadsheet is empty (nan)
    if ((isinstance(value, list) and len(value) == 0)
            or (not isinstance(value, list) and pd.isna(value))):
        # Remove the property from te Entity if it has it
        try:
            entity.get_properties().get_by_name(property_name)
            entity.remove_property(property_name)
        except KeyError:
            pass
        return entity
    if entity.get_property(property_id) is None:
        if datatype:
            entity.add_property(id=property_id, value=value, name=property_name, datatype=datatype)
        else:
            entity.add_property(id=property_id, value=value, name=property_name)
        logger.debug("{}: Adding {} = {}".format(entity.id, property_id, value.id if
                                                 isinstance(value, db.Entity) else value))
    else:
        if isinstance(value, list) and not entity.get_property(property_id).datatype.startswith("LIST"):
            entity.get_property(property_id).datatype = db.LIST(
                entity.get_property(property_id).datatype)
        entity.get_property(property_id).value = value
        logger.debug("{}: Setting {} = {}".format(entity.id, property_id, value.id if
                                                  isinstance(value, db.Entity) else value))
    return entity


def _treat_date(date_val: str):

    date_pattern = r"^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}$"
    # Check general pattern since dateutil.parse is unreliable with incomplete
    # dates (e.g., 2024-01) or wrong formats (e.g., 01.12.2024 is parsed as
    # 2024-01-12).
    if re.match(date_pattern, date_val) is None:
        # ParserError for simplified error handling down the line.
        raise dateparser.ParserError(f"{date_val} is not of the format YYYY-MM-DD.")
    # Use dateutils.parser despite having checked the pattern to exclude
    # nonsense dates like 2024-13-54.
    return str(dateparser.parse(date_val).date())


def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_prefix="Time collected", colname_date_prefix="Date collected"):
    propname_start = f"{propname_prefix} start"
    propname_stop = f"{propname_prefix} stop"
    prop_start = cached_get_entity_by(name=propname_start)
    prop_stop = cached_get_entity_by(name=propname_stop)
    colname_time_start = f"{colname_time_prefix} start"
    colname_time_stop = f"{colname_time_prefix} stop"
    colname_date_start = f"{colname_date_prefix} start"
    colname_date_stop = f"{colname_date_prefix} stop"

    date_start = None
    date_stop = None
    time_start = None
    time_stop = None
    timezone = None
    # Time start
    if colname_date_start in data and return_value_if_not_none(data[colname_date_start]) is not None:
        date_start = return_value_if_not_none(data[colname_date_start])
        try:
            date_start = _treat_date(date_start)
        except dateparser.ParserError as perr:
            logger.error(
                f"There is a problem in '{colname_date_start}': {date_start}"
                f" of sample {data['BIS ID']}: {perr}"
            )
            raise DataInconsistencyError
    if colname_time_start in data and return_value_if_not_none(data[colname_time_start]) is not None:
        if not "Timezone" in data or return_value_if_not_none(data["Timezone"]) is None:
            logger.error(f"{colname_time_start} but no timezone given for sample "
                         f"{data['BIS ID']}.")
            raise DataInconsistencyError
        time_start = return_value_if_not_none(data[colname_time_start])
        timezone = return_value_if_not_none(data["Timezone"])
        if date_start is None:
            logger.error(
                f"{colname_time_start} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
            raise DataInconsistencyError
        try:
            _val = str(dateparser.parse(f"{date_start}T{time_start}{timezone}"))
            ent = _update_property(ent, prop_start.id, property_name=prop_start.name, value=_val)
        except dateparser.ParserError as perr:
            logger.error(
                f"Couldn't parse {colname_time_start}: {time_start} with timezone {timezone} "
                f"of sample {data['BIS ID']}: {perr}"
            )
            raise DataInconsistencyError
    elif date_start is not None:
        ent = _update_property(ent, prop_start.id, value=date_start, property_name=prop_start.name)

    # Time stop; raise error in case of stop without start
    if colname_date_stop in data and return_value_if_not_none(data[colname_date_stop]) is not None:
        date_stop = return_value_if_not_none(data[colname_date_stop])
    if date_stop is not None and date_start is None:
        logger.error(
            f"{colname_date_stop} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
        raise DataInconsistencyError
    if date_stop is None:
        _date_stop = date_start
    else:
        try:
            _date_stop = _treat_date(date_stop)
        except dateparser.ParserError as perr:
            logger.error(
                f"There is a problem in '{colname_date_stop}': {date_stop}"
                f" of sample {data['BIS ID']}: {perr}"
            )
            raise DataInconsistencyError

    if colname_time_stop in data and return_value_if_not_none(data[colname_time_stop]) is not None:
        time_stop = return_value_if_not_none(data[colname_time_stop])
        if time_start is None:
            logger.error(
                f"{colname_time_stop} is given but {colname_time_start} is missing for sample {data['BIS ID']}.")
            raise DataInconsistencyError
        # timezone is set by time start; if it hadn't been there, we would already have an error.
        try:
            _val = str(dateparser.parse(f"{_date_stop}T{time_stop}{timezone}"))
        except dateparser.ParserError as perr:
            logger.error(
                f"Couldn't parse {colname_time_stop}: {time_stop} with timezone {timezone} "
                f"of sample {data['BIS ID']}: {perr}"
            )
            raise DataInconsistencyError
        ent = _update_property(ent, prop_stop.id, property_name=prop_stop.name, value=_val)
    elif date_stop is not None:
        # We check date_stop but we used the cleaned-up _date_stop as value
        ent = _update_property(ent, prop_stop.id, property_name=prop_stop.name, value=_date_stop)

    return ent


def get_container(data):
    """
    Retrun the BIS ID of the Container Record that is identified by 'Storage contianer' in data.
    A Container can either be identified via a BIS ID or via a BIS Label.

    If no Container can be identified, an Error is raised, since creating/registering new
    Containers has to be done before registering samples.
    """
    identified_by_label = False
    container_identifier = data["Storage ID"]
    # If the ID is not spcified, try to get the label
    if "Storage Container Label" in data and pd.isnull(container_identifier):
        container_identifier = data["Storage Container Label"]
        identified_by_label = True

    if identified_by_label:
        container = _get_container_by_label(container_identifier)
    else:
        container = _get_container_by_id(container_identifier)

    if container is not None:
        return container
    else:
        msg = "Container: '{}' could not be identified.".format(container_identifier)
        raise DataInconsistencyError(msg)


def _get_container_by_id(id):
    res = db.execute_query("FIND RECORD Container WITH id = '{}'".format(id))
    if len(res) > 0:
        return res[0]
    else:
        return None


def _get_container_by_label(label):
    res = db.execute_query("FIND RECORD Container WITH 'BIS Label' = '{}'".format(label))
    if len(res) > 0:
        return res[0]
    else:
        return None


def get_event(data, gear_id):
    # Events only have names if they have the subevent property.
    if "Subevent" in data and return_value_if_not_none(data["Subevent"]) is not None:
        event_name = f"{data['Subevent']}"
        return _create_new_source_event(event_name, data, gear_id)

    return _create_new_source_event(name=None, data=data, gear_id=gear_id)


def _create_new_source_event(name, data, gear_id) -> db.Record:
    event = db.Record(name)
    event.add_parent("SourceEvent")
    event = _append_times_to_entity(event, data)

    event.add_property(name="Gear", value=gear_id)
    event.add_property(name="Position", value=_get_positions(
        data), datatype=db.common.datatype.LIST("Position"))  # type: ignore
    if "Station ID" in data and return_value_if_not_none(data["Station ID"]) is not None:
        event.add_property(name="Station ID", value=str(data["Station ID"]))
    if "Station number" in data and return_value_if_not_none(data["Station number"]) is not None:
        event.add_property(name="Station number", value=str(data["Station number"]))
    if "Hol" in data and return_value_if_not_none(data["Hol"]) is not None:
        event.add_property(name="Hol", value=str(data["Hol"]))
    return event


def _get_positions(data):
    # TODO: Refactor
    if "Latitude start" in data:
        latitude_start = return_value_if_not_none(data["Latitude start"])
    else:
        latitude_start = None
    if "Latitude stop" in data:
        latitude_stop = return_value_if_not_none(data["Latitude stop"])
    else:
        latitude_stop = None
    if "Longitude start" in data:
        longitude_start = return_value_if_not_none(data["Longitude start"])
    else:
        longitude_start = None
    if "Longitude stop" in data:
        longitude_stop = return_value_if_not_none(data["Longitude stop"])
    else:
        longitude_stop = None
    if "Sampling depth start" in data:
        sampling_depth_start = return_value_if_not_none(data["Sampling depth start"])
    else:
        sampling_depth_start = None
    if "Sampling depth stop" in data:
        sampling_depth_stop = return_value_if_not_none(data["Sampling depth stop"])
    else:
        sampling_depth_stop = None
    if "Water depth start" in data:
        water_depth_start = return_value_if_not_none(data["Water depth start"])
    else:
        water_depth_start = None
    if "Water depth stop" in data:
        water_depth_stop = return_value_if_not_none(data["Water depth stop"])
    else:
        water_depth_stop = None
    # insert start position
    position_start = db.Record()
    position_start.add_parent("StartPosition")
    position_start.add_property(name="Latitude", value=latitude_start)
    position_start.add_property(name="Longitude", value=longitude_start)
    if not pd.isna(sampling_depth_start):
        if sampling_depth_start < 0.0:
            sampling_depth_start *= -1.0
    # identifiable, so add even if it is None
    position_start.add_property(name="Sampling depth", value=sampling_depth_start)
    if not pd.isna(water_depth_start):
        if water_depth_start < 0:
            water_depth_start *= -1
    # identifiable, so add even if it is None
    position_start.add_property(name="Water depth", value=water_depth_start)

    # A stop position may be specified by depth stop alone:
    if not (pd.isna(sampling_depth_stop) and pd.isna(water_depth_stop)):
        # Copy all empty info from start position
        if pd.isna(latitude_stop) and pd.isna(longitude_stop):
            latitude_stop = latitude_start
            longitude_stop = longitude_start
        if pd.isna(sampling_depth_stop):
            sampling_depth_stop = sampling_depth_start
        if pd.isna(water_depth_stop):
            water_depth_stop = water_depth_start
    # If there is an endposition: insert endposition
    if not (pd.isna(latitude_stop) or pd.isna(longitude_stop)):

        position_end = db.Record()
        # position_end = db.Record("({}, {})".format(latitude_stop, longitude_stop))
        position_end.add_parent("StopPosition")
        position_end.add_property(name="Latitude", value=latitude_stop)
        position_end.add_property(name="Longitude", value=longitude_stop)
        if not pd.isna(sampling_depth_stop):
            if sampling_depth_stop < 0:
                sampling_depth_stop *= -1
            # position_end.name = position_end.name + " at -{}m".format(sampling_depth_stop)
        # identifiable, so add even if it is None
        position_end.add_property(name="Sampling depth", value=sampling_depth_stop)
        if not pd.isna(water_depth_stop):
            if water_depth_stop < 0:
                water_depth_stop *= -1
        # identifiable, so add even if it is None
        position_end.add_property(name="Water depth", value=water_depth_stop)
        # position_end.insert(unique=False)
        return [position_start, position_end]
    else:
        return [position_start]


def get_gear(data):
    """
    Return the BIS ID of the Gear that is specified by 'Gear' and 'Gear configuration' in data.

    If no Such Gear Record exists, a new Gear Record is created.
    """

    qtext = f"FIND RECORD '{data['Gear']}'"
    if "Gear configuration" in data and pd.notnull(data["Gear configuration"]):
        qtext += f" WITH 'Configuration'='{data['Gear configuration']}'"
    try:
        res = db.execute_query(qtext, unique=True)
    except db.exceptions.EmptyUniqueQueryError:
        raise DataInconsistencyError(f"The query\n{qtext}\nreturned no results.")
    except db.exceptions.QueryNotUniqueError:
        raise DataInconsistencyError(f"The query\n{qtext}\nreturned  more than one result.")
    return res


def get_nagoya_case(data):
    """Create and retrun a NagoyaCase Record."""
    nagoya_case_number = return_value_if_not_none(data["Nagoya case number"])

    nagoya_case = db.Record(nagoya_case_number)
    nagoya_case.add_parent(name="NagoyaCase")
    nagoya_case.add_property(name="Nagoya Case Number", value=nagoya_case_number)

    return nagoya_case


def get_person(text) -> db.Record:
    """
    Return the BIS ID of the person that is specifed as 'Main User' or 'Sampling Person' in data.

    If the Person is not present in the database, an Exception is raised. Creating new Person Reconrds can only be done by a priviledged user.
    """
    # Check in which format the person is identified:
    person_identifier = text.split(", ")
    if len(person_identifier) == 1:
        person = _get_person_by_abbreviation(person_identifier[0])
    else:
        person = _get_person_by_fullname(person_identifier[1], person_identifier[0])

    return person


def _get_person_by_fullname(first_name, last_name):
    # seach for person in db
    res = db.execute_query(
        "FIND RECORD Person WITH 'First name' = '{}' AND 'Last name' = '{}'".format(first_name, last_name))
    # if person doesn't exist in database...
    if len(res) == 0:
        # There is not enought data in the template to create a new Person record. Hence, we have to raise an Exception
        error_msg = "There is no person Record with 'First name' = '{}' AND 'Last name' = '{}' in the database. ".format(
            first_name, last_name)
        raise DataInconsistencyError(error_msg)
    else:
        return res[0]


def _get_person_by_abbreviation(abbreviation):
    # seach for person in db
    res = db.execute_query("FIND RECORD Person WITH 'Abbreviation' = '{}'".format(abbreviation))
    # if person doesn't exist in database...
    if len(res) == 0:
        # There is not enought data in the template to create a new Person record. Hence, we have to raise an Exception
        error_msg = "There is no Person Record with Abbreviation = '{}'".format(abbreviation)
        raise DataInconsistencyError(error_msg)
    else:
        return res[0]


def synchroize(records, additional_property_ents, htmluserlog_public):
    crawler = Crawler(securityMode=SecurityMode.UPDATE)
    identifiables_definition_file = os.path.expanduser("~/identifiables.yml")
    ident = CaosDBIdentifiableAdapter()
    ident.load_from_yaml_definition(identifiables_definition_file)
    for property_name, entity in additional_property_ents.items():
        if entity.role != "RecordType":
            continue
        if len(entity.properties) == 0:
            ident.register_identifiable(
                name=entity.name,
                definition=db.RecordType().add_parent(entity.name).add_property(name="name"))
        else:
            ident.register_identifiable(
                name=entity.name,
                definition=db.RecordType().add_parent(entity.name).add_property(
                    name=entity.properties[0].name))
    crawler.identifiableAdapter = ident

    inserts, updates = crawler.synchronize(commit_changes=True, unique_names=False,
                                           crawled_data=records,
                                           no_insert_RTs=get_do_not_insert_type_names(),
                                           no_update_RTs=None,
                                           )
    if "SHARED_DIR" in os.environ:
        _notify_about_inserts_and_updates(len(inserts), len(updates), htmluserlog_public,
                                          crawler.run_id)


def update_sample_records(data, htmluserlog_public):
    logger.info("Starting sample updates...")

    # TODO Check data first and if there are Errors in the data: Provide the user with a download
    # link to a template with Error descriptions.

    # Get property ids:
    person_property_id = db.get_entity_by_name("Main User").id
    sampling_person_property_id = db.get_entity_by_name("Sampling Person").id
    nagoya_case_property_id = get_id_of_datatype("NagoyaCase")
    container_property_id = get_id_of_datatype("Container")
    event_property_id = get_id_of_datatype("SourceEvent")
    pdfreport_property_id = get_id_of_datatype("PDFReport")
    parent_sample_property_id = db.get_entity_by_name("Parent Sample").id

    additional_properties = data.keys().to_list()
    additional_property_ids = {}  # name-> id
    additional_property_ents = {}  # name-> Entity
    for property_name in additional_properties:
        if property_name in SPECIAL_TREATMENT_SAMPLE or _is_ignored_column_name(property_name):
            continue
        try:
            try:
                res = cached_get_entity_by(query=f"FIND PROPERTY WITH name='{property_name}'")
            except db.EmptyUniqueQueryError:
                res = cached_get_entity_by(query=f"FIND RECORDTYPE WITH name='{property_name}'")
            additional_property_ids[property_name] = res.id
            additional_property_ents[property_name] = res
        except db.exceptions.EmptyUniqueQueryError:
            logger.info(f"Couldn't find (unique) Property or RecordType: '{property_name}'."
                        f"\nThe column '{property_name}' is not being used.")
        except db.QueryNotUniqueError:
            logger.info(f"Property or RecordType {property_name} was not unique. "
                        "Skipping this column.")

    # Create everything needed to update the samples
    samples = []

    for index, row in data.iterrows():

        sample_id_exists = not pd.isnull(row["BIS ID"])
        if not sample_id_exists:
            raise DataInconsistencyError(f"Missing sample ID in row {index}")
        try:
            sample = db.execute_query(
                "FIND RECORD Sample WITH id = {}".format(row["BIS ID"]), unique=True)
        except db.exceptions.EmptyUniqueQueryError:
            msg = "There is no Sample with ID = {} in the system.".format(row["BIS ID"])
            raise DataInconsistencyError(msg)

        sample = _update_property(entity=sample, property_id=person_property_id,
                                  property_name="Main User",
                                  value=get_person(row["Main User"]))
        if "Parent BIS ID" in row and return_value_if_not_none(row["Parent BIS ID"]) is not None:
            sample = _update_property(entity=sample, property_id=parent_sample_property_id,
                                      value=row["Parent BIS ID"])
        if ("Sampling Person" in row
                and return_value_if_not_none(row["Sampling Person"]) is not None):
            sample = _update_property(entity=sample, property_id=sampling_person_property_id,
                                      property_name="Sampling Person",
                                      value=get_person(row["Sampling Person"]))
        if "PI" in row and return_value_if_not_none(row["PI"]) is not None:
            sample = _update_property(entity=sample, property_id=db.get_entity_by_name("PI").id,
                                      property_name="PI",
                                      value=get_person(row["PI"]))
        if "Nagoya case number" in row and return_value_if_not_none(row["Nagoya case number"]) is not None:
            sample = _update_property(entity=sample, property_id=nagoya_case_property_id,
                                      property_name="NagoyaCase",
                                      value=get_nagoya_case(row))
        if "Storage ID" in row and return_value_if_not_none(row["Storage ID"]) is not None:
            sample = _update_property(entity=sample, property_id=container_property_id,
                                      property_name="Container",
                                      value=get_container(row))
        if "Collection" in row and return_value_if_not_none(row["Collection"]) is not None:
            sample = _update_property(entity=sample,
                                      property_id=db.get_entity_by_name("Collection").id,
                                      property_name="Collection",
                                      datatype=db.LIST("Collection"),
                                      value=[db.Record(name=el)
                                             .add_parent(name="Collection")
                                             for el in row["Collection"]])
        if "Sphere" in row and return_value_if_not_none(row["Sphere"]) is not None:
            sample = _update_property(entity=sample,
                                      property_id=db.get_entity_by_name("Sphere").id,
                                      property_name="Sphere",
                                      datatype=db.LIST("Sphere"),
                                      value=[db.Record(name=el)
                                             .add_parent(name="Sphere")
                                             for el in row["Sphere"]])

        if "Date collected start" in row and return_value_if_not_none(row["Date collected start"]) is not None:
            sample = _update_property(entity=sample, property_id=event_property_id, property_name='SourceEvent', value=get_event(
                row, get_gear(row)))
        if "PDFReport" in data.columns:
            sample = _update_property(
                entity=sample, property_id=pdfreport_property_id, property_name="PDFReport", value=row["PDFReport"])
        if "Date sampled start" in data.columns:
            sample = _append_times_to_entity(ent=sample, data=row, propname_prefix="Time sampled",
                                             colname_time_prefix="Time sampled", colname_date_prefix="Date sampled")

        # Add additional properties
        for property_name in additional_property_ids.keys():
            if return_value_if_not_none(row[property_name]) is None or (isinstance(row[property_name], list) and
                                                                        len(row[property_name]) == 0):
                continue
            ent = additional_property_ents[property_name]
            if ent.role == "RecordType":
                value = db.Record().add_parent(ent.name)
                if len(ent.properties) > 1:
                    raise DataInconsistencyError(
                        f"Trying to add a {ent.name} to a sample. Cannot identify property to set "
                        f"because RecordType with ID={ent.id} has more than one Property.")
                if len(ent.properties) == 0:
                    value.name = return_value_if_not_none(row[property_name])
                else:
                    value.add_property(
                        name=ent.properties[0].name, value=return_value_if_not_none(row[property_name]))
            else:
                value = return_value_if_not_none(row[property_name])
            sample = _update_property(
                entity=sample, property_id=additional_property_ids[property_name],
                value=value, property_name=property_name)

        samples.append(sample)

    synchroize(samples, additional_property_ents, htmluserlog_public)


def main():

    parser = helper.get_argument_parser()
    parser.add_argument(
        "pickled_sample_data",
        help="Dump of the cleaned and checked sample data for crawling."
    )
    parser.add_argument(
        "old_filename",
        help="Name of the file that was uploaded originally for logging purposes."
    )
    args = parser.parse_args()
    if hasattr(args, "auth_token") and args.auth_token:
        db.configure_connection(
            auth_token=args.auth_token,
            timeout=(30, 60*60*24*7)  # Rather short connection timeout, one week for read.
        )
        userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
    else:
        rootlogger = logging.getLogger()
        rootlogger.setLevel(logging.INFO)
        logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(stream=sys.stdout)
        handler.setLevel(logging.DEBUG)
        rootlogger.addHandler(handler)
        userlog_public = "/tmp/upload_sample_userlog.log"
        htmluserlog_public = "/tmp/upload_sample_userlog.html"
        debuglog_public = "/tmp/upload_sample_debuglog.html"

    try:
        sample_data = pd.read_pickle(args.pickled_sample_data)
    except (FileNotFoundError, UnpicklingError) as err:
        email_body = f"""
Dear curator,

There were problems transferring the read-in CSV data from
{args.old_filename} to the asynchronous crawl script:

{str(err)}

        """
        _notify_about_error(
            subject=f"Errors when loading {args.old_filename}",
            text=email_body
        )
        return 2

    try:
        update_sample_records(sample_data, htmluserlog_public)
    except db.TransactionError as te:
        email_body = f"""
Dear curator,

There were problems synchronizing the sample entities from {args.old_filename} to the LinkAhead server:

{te}
        """
        _notify_about_error(
            subject=f"Errors when synchronoizing {args.old_filename}",
            text=email_body
        )
        return 3

    except DataInconsistencyError as die:

        email_body = f"""
Dear Curator,

There were problems with the data in {args.old_filename}:

{die}

Please check for mistakes like typos in names or ids, wrong data
types, or missing information.
        """
        _notify_about_error(
            subject=f"Parsing errors in {args.old_filename}",
            text=email_body
        )
        return 4

    except ForbiddenTransaction as fte:
        email_body = f"""
Dear Curator,

Crawling {args.old_filename} resulted in forbidden transactions:

{fte}
        """
        _notify_about_error(
            subject=f"Forbidden transactions in {args.old_filename}",
            text=email_body
        )
        return 5

    except ImpossibleMergeError as ime:
        email_body = f"""
Dear Curator,

There was a conflict when merging sample or event information in {args.old_filename}:

{ime}

Please verify that all information that there is no contradictory
information belonging to a single entity.
        """
        _notify_about_error(
            subject=f"Merge conflict in {args.old_filename}",
            text=email_body
        )
        return 6


if __name__ == "__main__":

    sys.exit(main())