Skip to content
Snippets Groups Projects
Commit c0d0740d authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

ENH: Add sample postprocessing

parent dc1aebc5
No related branches found
No related tags found
1 merge request!1F awi sams
......@@ -47,6 +47,8 @@ from bis_utils import (get_do_not_insert_type_names,
return_value_if_not_none,
send_mail_with_defaults,
SPECIAL_TREATMENT_SAMPLE)
from sample_helpers.sample_upload_post_processing import post_process_samples
from sample_helpers.utils import get_column_header_name
# suppress warning of diff function
apilogger = logging.getLogger("linkahead.apiutils")
......@@ -139,19 +141,20 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"There is a problem in '{colname_date_start}': {date_start}"
f" of sample {data['BIS ID']}: {perr}"
f" of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
if colname_time_start in data and return_value_if_not_none(data[colname_time_start]) is not None:
if not "Timezone" in data or return_value_if_not_none(data["Timezone"]) is None:
if not get_column_header_name("Timezone") in data or return_value_if_not_none(data[get_column_header_name("Timezone")]) is None:
logger.error(f"{colname_time_start} but no timezone given for sample "
f"{data['BIS ID']}.")
f"{data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
time_start = return_value_if_not_none(data[colname_time_start])
timezone = return_value_if_not_none(data["Timezone"])
timezone = return_value_if_not_none(data[get_column_header_name("Timezone")])
if date_start is None:
logger.error(
f"{colname_time_start} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
f"{colname_time_start} is given but {colname_date_start} is missing for "
f"sample {data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
try:
_val = str(dateparser.parse(f"{date_start}T{time_start}{timezone}"))
......@@ -159,7 +162,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"Couldn't parse {colname_time_start}: {time_start} with timezone {timezone} "
f"of sample {data['BIS ID']}: {perr}"
f"of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
elif date_start is not None:
......@@ -170,7 +173,8 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
date_stop = return_value_if_not_none(data[colname_date_stop])
if date_stop is not None and date_start is None:
logger.error(
f"{colname_date_stop} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
f"{colname_date_stop} is given but {colname_date_start} is missing for "
f"sample {data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
if date_stop is None:
_date_stop = date_start
......@@ -180,7 +184,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"There is a problem in '{colname_date_stop}': {date_stop}"
f" of sample {data['BIS ID']}: {perr}"
f" of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
......@@ -188,7 +192,8 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
time_stop = return_value_if_not_none(data[colname_time_stop])
if time_start is None:
logger.error(
f"{colname_time_stop} is given but {colname_time_start} is missing for sample {data['BIS ID']}.")
f"{colname_time_stop} is given but {colname_time_start} is missing for "
f"sample {data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
# timezone is set by time start; if it hadn't been there, we would already have an error.
try:
......@@ -196,7 +201,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"Couldn't parse {colname_time_stop}: {time_stop} with timezone {timezone} "
f"of sample {data['BIS ID']}: {perr}"
f"of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
ent = _update_property(ent, prop_stop.id, property_name=prop_stop.name, value=_val)
......@@ -587,6 +592,7 @@ def update_sample_records(data, htmluserlog_public):
samples.append(sample)
samples = post_process_samples(samples, data)
synchroize(samples, additional_property_ents, htmluserlog_public)
......
#
# Copyright (C) 2025 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2025 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import re
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from typing import Union
from .utils import get_column_header_name
def semicolon_separated_list(text):
    """Split *text* at semicolons and return the stripped, non-empty entries.

    Fix: the previous version filtered on the *un-stripped* element, so an
    entry consisting only of whitespace (e.g. ``"a; ;b"``) slipped through
    the filter and ended up as an empty string in the result.  Now the
    stripped value itself must be non-empty.
    """
    return [el.strip() for el in text.split(";") if el.strip()]
def _embargo_converter(text: str):
datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
matches = re.match(datepattern, str(text))
if matches:
return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
if f"{text}".lower() in ["true", "yes"]:
# yes means embargo until today in one year
return date.today() + relativedelta(years=1)
if f"{text}".lower() in ["false", "no"]:
return ""
raise ValueError(
f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
def _use_custom_names(definition: Union[list, dict]):
"""Replace names in list or dict keys by custom names with
`utils.get_column_header_name`.
"""
if isinstance(definition, list):
return [get_column_header_name(name) for name in definition]
elif isinstance(definition, dict):
return {get_column_header_name(key): value for key, value in definition.items()}
raise ValueError(f"Expected dict or list, but got {type(definition)}.")
# Mapping from column header name to the Python type each cell of that
# column is read as.  Keys are passed through `_use_custom_names`, so the
# effective header names come from `utils.get_column_header_name`.
DATATYPE_DEFINITIONS = _use_custom_names({
    "AphiaID": int,
    "entity_id": str,
    "Campaign": str,
    "Date collected start": str,
    "Date collected stop": str,
    "Date sampled start": str,
    "Date sampled stop": str,
    "Fixation": str,
    "Gear configuration": str,
    "Gear": str,
    "Hol": int,
    "Latitude start": float,
    "Latitude stop": float,
    "Longitude start": float,
    "Longitude stop": float,
    "Main User": str,
    "Nagoya case number": str,
    "PI": str,
    "Parent LinkAhead ID": str,
    "Platform": str,
    "Sample Context": str,
    "Sample container": str,
    "SampleType": str,
    "SampleTypeSpecific": str,
    "Sampling Person": str,
    "Sampling depth start": float,
    "Sampling depth stop": float,
    "Sampling method": str,
    "Station ID": str,
    "Station number": str,
    "Storage Container Label": str,
    "Storage ID": str,
    "StorageTemperature": str,
    "Subevent": str,
    "Time collected start": str,
    "Time collected stop": str,
    "Time sampled start": str,
    "Time sampled stop": str,
    "Timezone": str,
    "Water depth start": float,
    "Water depth stop": float,
})
# Obligatory columns for top-level (parent) samples: these headers must be
# present in the uploaded table and their cells must not be empty.
# NOTE(review): "Device" and "Sphere" have no entry in DATATYPE_DEFINITIONS
# above — confirm they are intentionally read with a default type.
OBLIGATORY_COLUMNS = _use_custom_names([
    "entity_id",
    "Date collected start",
    "Device",
    "Latitude start",
    "Longitude start",
    "Main User",
    "PI",
    "Sample container",
    "SampleType",
    "Sphere",
    "Storage ID",
    "StorageTemperature",
])
# Obligatory columns for child samples (samples that reference a parent via
# "Parent LinkAhead ID"); same "must exist and be non-empty" semantics as
# OBLIGATORY_COLUMNS.
OBLIGATORY_COLUMNS_CHILD = _use_custom_names([
    "entity_id",
    "Date sampled start",
    "Main User",
    "Parent LinkAhead ID",
    "Sample container",
    "SampleType",
    "SampleTypeSpecific",
    "Sphere",
    "Storage ID",
    "StorageTemperature",
])
# Per-column converter functions applied to the raw cell value when the
# table is read.  Most columns hold semicolon-separated multi-values; the
# "Embargo" column accepts a date or a yes/no keyword (see
# `_embargo_converter`).
COLUMN_CONVERTER = _use_custom_names({
    "Collection": semicolon_separated_list,
    "Ecotaxa URL": semicolon_separated_list,
    "NCBI Accession": semicolon_separated_list,
    "NCBI BioProject": semicolon_separated_list,
    "NCBI BioSample": semicolon_separated_list,
    "OSIS URL": semicolon_separated_list,
    "Embargo": _embargo_converter,
    "Publications": semicolon_separated_list,
    "Sphere": semicolon_separated_list,
})
# Columns that are not mapped one-to-one onto sample properties but get
# dedicated handling in the upload code (e.g. date/time assembly,
# geolocation, person resolution).
# NOTE(review): "Parent Linkahead ID" here differs in casing from
# "Parent LinkAhead ID" in DATATYPE_DEFINITIONS — confirm which spelling
# `get_column_header_name` expects.
SPECIAL_TREATMENT_SAMPLE = _use_custom_names([
    "entity_id",
    "Collection",
    "Date collected start",
    "Date collected stop",
    "Date sampled start",
    "Date sampled stop",
    "Gear configuration",
    "Gear",
    "Hol",
    "Latitude start",
    "Latitude stop",
    "Longitude start",
    "Longitude stop",
    "Main User",
    "Nagoya case number",
    "PDFReport",
    "PI",
    "Parent Linkahead ID",
    "Person",
    "Sampling Person",
    "Sampling depth start",
    "Sampling depth stop",
    "Sphere",
    "Station ID",
    "Station number",
    "Storage Container Label",
    "Storage ID",
    "Subevent",
    "Time collected start",
    "Time collected stop",
    "Time sampled start",
    "Time sampled stop",
    "Timezone",
    "Water depth start",
    "Water depth stop",
])
# Columns that are deliberately skipped during the upload (display-only or
# derived values).
IGNORED_COLUMN_NAMES_SAMPLE = _use_custom_names([
    "LinkAhead URL",
    "Date",
    "IGSN URL",
    "IGSN",  # TODO This will be relevant for external IGSNs in the future.
    "Parent Sample",
    "Sampling depth",
    "Storage chain",
    "Water depth",
])
#
# Copyright (C) 2025 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2025 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import linkahead as db
import pandas as pd
def post_process_samples(samples: db.Container, data: pd.DataFrame) -> db.Container:
    """Hook for adjusting sample entities after they have been built from
    the uploaded table and before they are synchronized.

    This default implementation is a no-op returning ``samples``
    unchanged; ``data`` (the uploaded table) is accepted so that custom
    deployments can override this function with table-dependent logic.
    """
    return samples
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment