
f-awi-sams

Merged Florian Spreckelsen requested to merge f-awi-sams into main
1 file changed
@@ -27,12 +27,9 @@ import json
import logging
import os
import pandas as pd
-import re
import subprocess
import sys
from datetime import date, datetime
-from dateutil.relativedelta import relativedelta
from pathlib import Path
from tempfile import NamedTemporaryFile
@@ -45,128 +42,21 @@ from caoscrawler.logging import configure_server_side_logging
from bis_utils import (replace_entity_urls_by_ids,
                       SPECIAL_TREATMENT_SAMPLE, whitespace_cleanup_in_df)
+from sample_helpers.sample_upload_column_definitions import (
+    COLUMN_CONVERTER, DATATYPE_DEFINITIONS,
+    OBLIGATORY_COLUMNS, OBLIGATORY_COLUMNS_CHILD, SPECIAL_TREATMENT_SAMPLE)
+from sample_helpers.utils import CONSTANTS, get_column_header_name
# suppress warnings of the diff function
apilogger = logging.getLogger("linkahead.apiutils")
apilogger.setLevel(logging.ERROR)
-def semicolon_separated_list(text):
-    return [el.strip() for el in text.split(";") if el != ""]
-
-
-def _embargo_converter(text: str):
-    datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
-    matches = re.match(datepattern, str(text))
-    if matches:
-        return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
-    if f"{text}".lower() in ["true", "yes"]:
-        # 'yes' means an embargo until one year from today
-        return date.today() + relativedelta(years=1)
-    if f"{text}".lower() in ["false", "no"]:
-        return ""
-    raise ValueError(
-        f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
-ERROR_PREFIX = 'Something went wrong: '
-ERROR_SUFFIX = ' Please contact <a href="mailto:biosamples@geomar.de">biosamples@geomar.de</a> if you encounter this issue.'
+ERROR_PREFIX = CONSTANTS["error_prefix"]
+ERROR_SUFFIX = CONSTANTS["error_suffix"]
-# Column datatypes
-DATATYPE_DEFINITIONS = {
-    "AphiaID": int,
-    "BIS ID": str,
-    "Campaign": str,
-    "Date collected start": str,
-    "Date collected stop": str,
-    "Date sampled start": str,
-    "Date sampled stop": str,
-    "Fixation": str,
-    "Gear configuration": str,
-    "Gear": str,
-    "Hol": int,
-    "Latitude start": float,
-    "Latitude stop": float,
-    "Longitude start": float,
-    "Longitude stop": float,
-    "Main User": str,
-    "Nagoya case number": str,
-    "PI": str,
-    "Parent BIS ID": str,
-    "Platform": str,
-    "Sample Context": str,
-    "Sample container": str,
-    "SampleType": str,
-    "SampleTypeSpecific": str,
-    "Sampling Person": str,
-    "Sampling depth start": float,
-    "Sampling depth stop": float,
-    "Sampling method": str,
-    "Station ID": str,
-    "Station number": str,
-    "Storage Container Label": str,
-    "Storage ID": str,
-    "StorageTemperature": str,
-    "Subevent": str,
-    "Time collected start": str,
-    "Time collected stop": str,
-    "Time sampled start": str,
-    "Time sampled stop": str,
-    "Timezone": str,
-    "Water depth start": float,
-    "Water depth stop": float,
-}
-
-# Obligatory columns: Must exist and must not be empty
-# Must exist
-OBLIGATORY_COLUMNS = [
-    "BIS ID",
-    "Collection",
-    "Date collected start",
-    "Fixation",
-    "Gear",
-    "Latitude start",
-    "Longitude start",
-    "Main User",
-    "Nagoya case number",
-    "PI",
-    "Sample Context",
-    "Sample container",
-    "SampleType",
-    "SampleTypeSpecific",
-    "Sphere",
-    "Storage ID",
-    "StorageTemperature",
-]
-
-OBLIGATORY_COLUMNS_CHILD = [
-    "BIS ID",
-    "Date sampled start",
-    "Fixation",
-    "Main User",
-    "Parent BIS ID",
-    "Sample Context",
-    "Sample container",
-    "SampleType",
-    "SampleTypeSpecific",
-    "Sphere",
-    "Storage ID",
-    "StorageTemperature",
-]
-
-COLUMN_CONVERTER = {
-    "Collection": semicolon_separated_list,
-    "Ecotaxa URL": semicolon_separated_list,
-    "NCBI Accession": semicolon_separated_list,
-    "NCBI BioProject": semicolon_separated_list,
-    "NCBI BioSample": semicolon_separated_list,
-    "OSIS URL": semicolon_separated_list,
-    "Embargo": _embargo_converter,
-    "Publications": semicolon_separated_list,
-    "Sphere": semicolon_separated_list,
-}
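
Reviewer note: `DATATYPE_DEFINITIONS` and `COLUMN_CONVERTER` (now imported from `sample_helpers.sample_upload_column_definitions`) have exactly the shape that `pandas.read_csv` accepts via its `dtype=` and `converters=` parameters. A sketch of how a loader could apply them, assuming the two mappings are in scope; the real `read_data_from_file` lives elsewhere in this script and may differ:

```python
import pandas as pd

def load_sample_table(path):
    # pandas prefers converters over dtype for the columns they cover, so
    # exclude converted columns from the dtype mapping to avoid a ParserWarning.
    dtypes = {col: t for col, t in DATATYPE_DEFINITIONS.items()
              if col not in COLUMN_CONVERTER}
    return pd.read_csv(path, dtype=dtypes, converters=COLUMN_CONVERTER)
```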
logger = logging.getLogger("caosadvancedtools")
@@ -179,7 +69,7 @@ def get_parser():
def _is_child_sample_table(filename):
    tmp_data = pd.read_csv(filename, sep=',')
    if 'Parent BIS ID' in tmp_data.columns:
-        return not tmp_data["Parent BIS ID"].isnull().all()
+        return not tmp_data[get_column_header_name("Parent LinkAhead ID")].isnull().all()
    return False
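
Reviewer note: the helper treats a table as a child-sample table only if the parent column exists and contains at least one non-empty cell, via the `isnull().all()` idiom:

```python
import pandas as pd

df = pd.DataFrame({"Parent BIS ID": [None, "BIS-0042", None]})
# isnull().all() is True only when every cell is null; negated, it means
# "at least one sample in this table references a parent".
print(not df["Parent BIS ID"].isnull().all())  # True
```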
@@ -266,25 +156,27 @@ def main():
        property_name = eprop.name
        if property_name in SPECIAL_TREATMENT_SAMPLE:
            continue
-        if db.apiutils.is_reference(eprop):
-            rt = db.get_entity_by_id(eprop.id)
-            if len(rt.properties) == 1:
-                converter = _get_converter_from_property_datatype(rt.properties[0].datatype)
-            elif len(rt.properties) < 1:
-                converter = str
-            else:
-                converter = None
-        else:
-            converter = _get_converter_from_property_datatype(eprop.datatype)
-        if converter is None:
-            continue
-        DATATYPE_DEFINITIONS[property_name] = converter
+        if property_name not in DATATYPE_DEFINITIONS:
+            if db.apiutils.is_reference(eprop):
+                rt = db.get_entity_by_id(eprop.id)
+                if len(rt.properties) == 1:
+                    converter = _get_converter_from_property_datatype(rt.properties[0].datatype)
+                elif len(rt.properties) < 1:
+                    converter = str
+                else:
+                    converter = None
+            else:
+                converter = _get_converter_from_property_datatype(eprop.datatype)
+            if converter is None:
+                continue
+            DATATYPE_DEFINITIONS[property_name] = converter
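
Reviewer note: the new `if property_name not in DATATYPE_DEFINITIONS` guard makes the predefined column definitions from `sample_helpers` take precedence over converters derived from the data model at runtime. `_get_converter_from_property_datatype` is defined elsewhere in this script; a hypothetical sketch of such a datatype-to-converter dispatch (the mapping below is an assumption, not the actual implementation):

```python
import linkahead as db

def _get_converter_from_property_datatype(datatype):
    # Hypothetical mapping from LinkAhead datatypes to Python callables;
    # returning None makes the calling loop skip the column.
    mapping = {db.INTEGER: int, db.DOUBLE: float, db.TEXT: str}
    return mapping.get(datatype)
```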
        if sample.get_importance(property_name) == db.OBLIGATORY:
            # This is only needed if the sample is not a child sample
            OBLIGATORY_COLUMNS.append(property_name)

    try:
        data = read_data_from_file(path)
-        data = replace_entity_urls_by_ids(data, ["BIS ID", "Storage ID", "Parent BIS ID"])
+        data = replace_entity_urls_by_ids(data, [get_column_header_name(name) for name in [
+            "entity_id", "Storage ID", "Parent LinkAhead ID"]])
        pickle_out = NamedTemporaryFile(delete=False, suffix=".pkl")
        data.to_pickle(pickle_out.name)
    except DataInconsistencyError as err:
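
Reviewer note: the pickle handoff above relies on `delete=False`, which keeps the temporary file on disk after the handle goes away so a later processing step can re-read the DataFrame by file name. A self-contained example of the round-trip:

```python
import pandas as pd
from tempfile import NamedTemporaryFile

df = pd.DataFrame({"BIS ID": ["A1"], "Fixation": ["ethanol"]})
pickle_out = NamedTemporaryFile(delete=False, suffix=".pkl")
df.to_pickle(pickle_out.name)

# A separate step can now load the table by path.
restored = pd.read_pickle(pickle_out.name)
assert restored.equals(df)
```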