Commit 600603ba authored by Florian Spreckelsen

ENH: Automatically replace all column keys by custom definitions

parent e145e16c
1 merge request: !1 F awi sams
@@ -32,57 +32,10 @@ from caosadvancedtools.serverside.helper import send_mail
from caoscrawler.config import get_config_setting
from linkahead import get_entity_by_name
from sample_helpers.sample_upload_column_definitions import (
    IGNORED_COLUMN_NAMES_SAMPLE, SPECIAL_TREATMENT_SAMPLE)
from sample_helpers.utils import CONSTANTS
SPECIAL_TREATMENT_SAMPLE = [
"BIS ID",
"Collection",
"Date collected start",
"Date collected stop",
"Date sampled start",
"Date sampled stop",
"Gear configuration",
"Gear",
"Hol",
"Latitude start",
"Latitude stop",
"Longitude start",
"Longitude stop",
"Main User",
"Nagoya case number",
"PDFReport",
"PI",
"Parent BIS ID",
"Person",
"Sampling Person",
"Sampling depth start",
"Sampling depth stop",
"Sphere",
"Station ID",
"Station number",
"Storage Container Label",
"Storage ID",
"Subevent",
"Time collected start",
"Time collected stop",
"Time sampled start",
"Time sampled stop",
"Timezone",
"Water depth start",
"Water depth stop",
]
IGNORED_COLUMN_NAMES_SAMPLE = [
"BIS URL",
"Date",
"IGSN URL",
"IGSN", # TODO This will be relevant for external IGSNs in the future.
"Parent Sample",
"Sampling depth",
"Storage chain",
"Water depth",
]
COLUMN_DESCRIPTIONS = CONSTANTS["csv_column_descriptions"]
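The commit title says that hard-coded column keys are now replaced automatically by these imported custom definitions. A minimal sketch of what such a replacement could look like, assuming COLUMN_DESCRIPTIONS maps internal keys to the human-readable CSV headers (the helper name and the direction of the mapping are assumptions, not taken from this diff):

import pandas as pd

def replace_column_keys(df: pd.DataFrame, descriptions: dict) -> pd.DataFrame:
    """Hypothetical helper: rename CSV headers to the internal column keys."""
    # Invert the mapping so that the human-readable header found in the CSV
    # is replaced by the key used in the rest of the upload pipeline.
    renames = {header: key for key, header in descriptions.items()}
    return df.rename(columns=renames)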
@@ -27,12 +27,9 @@ import json
import logging
import os
import pandas as pd
import re
import subprocess
import sys
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from pathlib import Path
from tempfile import NamedTemporaryFile
@@ -45,128 +42,18 @@ from caoscrawler.logging import configure_server_side_logging
from bis_utils import (replace_entity_urls_by_ids,
                        SPECIAL_TREATMENT_SAMPLE, whitespace_cleanup_in_df)
from sample_helpers.utils import CONSTANTS
# suppress warning of diff function
apilogger = logging.getLogger("linkahead.apiutils")
apilogger.setLevel(logging.ERROR)
def semicolon_separated_list(text):
    return [el.strip() for el in text.split(";") if el != ""]
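For illustration, the helper above splits a semicolon-separated cell into trimmed, non-empty entries (example values are hypothetical):

semicolon_separated_list("North Sea; Baltic Sea;; Skagerrak")
# -> ["North Sea", "Baltic Sea", "Skagerrak"]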
def _embargo_converter(text: str):
    datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
    matches = re.match(datepattern, str(text))
    if matches:
        return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
    if f"{text}".lower() in ["true", "yes"]:
        # yes means embargo until today in one year
        return date.today() + relativedelta(years=1)
    if f"{text}".lower() in ["false", "no"]:
        return ""
    raise ValueError(
        f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
ERROR_PREFIX = 'Something went wrong: '
ERROR_SUFFIX = ' Please contact <a href="mailto:biosamples@geomar.de">biosamples@geomar.de</a> if you encounter this issue.'
ERROR_PREFIX = CONSTANTS["error_prefix"]
ERROR_SUFFIX = CONSTANTS["error_suffix"]
# Column datatypes
DATATYPE_DEFINITIONS = {
"AphiaID": int,
"BIS ID": str,
"Campaign": str,
"Date collected start": str,
"Date collected stop": str,
"Date sampled start": str,
"Date sampled stop": str,
"Fixation": str,
"Gear configuration": str,
"Gear": str,
"Hol": int,
"Latitude start": float,
"Latitude stop": float,
"Longitude start": float,
"Longitude stop": float,
"Main User": str,
"Nagoya case number": str,
"PI": str,
"Parent BIS ID": str,
"Platform": str,
"Sample Context": str,
"Sample container": str,
"SampleType": str,
"SampleTypeSpecific": str,
"Sampling Person": str,
"Sampling depth start": float,
"Sampling depth stop": float,
"Sampling method": str,
"Station ID": str,
"Station number": str,
"Storage Container Label": str,
"Storage ID": str,
"StorageTemperature": str,
"Subevent": str,
"Time collected start": str,
"Time collected stop": str,
"Time sampled start": str,
"Time sampled stop": str,
"Timezone": str,
"Water depth start": float,
"Water depth stop": float,
}
# Obligatory columns: Must exist and must not be empty
# Must exist
OBLIGATORY_COLUMNS = [
"BIS ID",
"Collection",
"Date collected start",
"Fixation",
"Gear",
"Latitude start",
"Longitude start",
"Main User",
"Nagoya case number",
"PI",
"Sample Context",
"Sample container",
"SampleType",
"SampleTypeSpecific",
"Sphere",
"Storage ID",
"StorageTemperature",
]
OBLIGATORY_COLUMNS_CHILD = [
"BIS ID",
"Date sampled start",
"Fixation",
"Main User",
"Parent BIS ID",
"Sample Context",
"Sample container",
"SampleType",
"SampleTypeSpecific",
"Sphere",
"Storage ID",
"StorageTemperature",
]
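Per the comment above, obligatory columns must exist and must not be empty. A minimal sketch of how such a check could be applied to an uploaded table (the function name and the error handling are assumptions, not part of this diff):

def check_obligatory_columns(df, obligatory_columns):
    """Hypothetical check: every obligatory column must exist and contain no empty cells."""
    missing = [col for col in obligatory_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Missing obligatory column(s): {', '.join(missing)}")
    empty = [col for col in obligatory_columns if df[col].isna().any()]
    if empty:
        raise ValueError(f"Obligatory column(s) with empty cells: {', '.join(empty)}")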
COLUMN_CONVERTER = {
"Collection": semicolon_separated_list,
"Ecotaxa URL": semicolon_separated_list,
"NCBI Accession": semicolon_separated_list,
"NCBI BioProject": semicolon_separated_list,
"NCBI BioSample": semicolon_separated_list,
"OSIS URL": semicolon_separated_list,
"Embargo": _embargo_converter,
"Publications": semicolon_separated_list,
"Sphere": semicolon_separated_list,
}
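The datatype and converter dictionaries above are the kind of arguments typically handed to pandas when parsing the uploaded CSV. A sketch of how they might be combined (the read call and file name are assumptions; dtype and converters do not overlap for any column here, so pandas will not complain):

df = pd.read_csv(
    "samples.csv",               # hypothetical upload path
    dtype=DATATYPE_DEFINITIONS,  # enforce per-column datatypes
    converters=COLUMN_CONVERTER, # parse semicolon lists and the embargo field
)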
logger = logging.getLogger("caosadvancedtools")