From 600603bad7d9cf64e286591c6ce0c868a828315c Mon Sep 17 00:00:00 2001 From: Florian Spreckelsen <f.spreckelsen@indiscale.com> Date: Wed, 29 Jan 2025 10:25:05 +0100 Subject: [PATCH] ENH: Automatically replace all column keys by custom definitions --- .../caosdb-server/scripting/bin/bis_utils.py | 51 +------- .../scripting/bin/upload_sample_template.py | 119 +----------------- 2 files changed, 5 insertions(+), 165 deletions(-) diff --git a/sample-management-custom/caosdb-server/scripting/bin/bis_utils.py b/sample-management-custom/caosdb-server/scripting/bin/bis_utils.py index 80165fc..1bcc322 100644 --- a/sample-management-custom/caosdb-server/scripting/bin/bis_utils.py +++ b/sample-management-custom/caosdb-server/scripting/bin/bis_utils.py @@ -32,57 +32,10 @@ from caosadvancedtools.serverside.helper import send_mail from caoscrawler.config import get_config_setting from linkahead import get_entity_by_name +from sample_helpers.sample_upload_column_definitions import ( + IGNORED_COLUMN_NAMES_SAMPLE, SPECIAL_TREATMENT_SAMPLE) from sample_helpers.utils import CONSTANTS -SPECIAL_TREATMENT_SAMPLE = [ - "BIS ID", - "Collection", - "Date collected start", - "Date collected stop", - "Date sampled start", - "Date sampled stop", - "Gear configuration", - "Gear", - "Hol", - "Latitude start", - "Latitude stop", - "Longitude start", - "Longitude stop", - "Main User", - "Nagoya case number", - "PDFReport", - "PI", - "Parent BIS ID", - "Person", - "Sampling Person", - "Sampling depth start", - "Sampling depth stop", - "Sphere", - "Station ID", - "Station number", - "Storage Container Label", - "Storage ID", - "Subevent", - "Time collected start", - "Time collected stop", - "Time sampled start", - "Time sampled stop", - "Timezone", - "Water depth start", - "Water depth stop", -] - -IGNORED_COLUMN_NAMES_SAMPLE = [ - "BIS URL", - "Date", - "IGSN URL", - "IGSN", # TODO This will be relevant for external IGSNs in the future. - "Parent Sample", - "Sampling depth", - "Storage chain", - "Water depth", -] - COLUMN_DESCRIPTIONS = CONSTANTS["csv_column_descriptions"] diff --git a/sample-management-custom/caosdb-server/scripting/bin/upload_sample_template.py b/sample-management-custom/caosdb-server/scripting/bin/upload_sample_template.py index f66744a..7d71282 100755 --- a/sample-management-custom/caosdb-server/scripting/bin/upload_sample_template.py +++ b/sample-management-custom/caosdb-server/scripting/bin/upload_sample_template.py @@ -27,12 +27,9 @@ import json import logging import os import pandas as pd -import re import subprocess import sys -from datetime import date, datetime -from dateutil.relativedelta import relativedelta from pathlib import Path from tempfile import NamedTemporaryFile @@ -45,128 +42,18 @@ from caoscrawler.logging import configure_server_side_logging from bis_utils import (replace_entity_urls_by_ids, SPECIAL_TREATMENT_SAMPLE, whitespace_cleanup_in_df) +from sample_helpers.utils import CONSTANTS # suppress warning of diff function apilogger = logging.getLogger("linkahead.apiutils") apilogger.setLevel(logging.ERROR) -def semicolon_separated_list(text): - return [el.strip() for el in text.split(";") if el != ""] - - -def _embargo_converter(text: str): - - datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})" - matches = re.match(datepattern, str(text)) - if matches: - return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"])) - if f"{text}".lower() in ["true", "yes"]: - # yes means embargo until today in one year - return date.today() + relativedelta(years=1) - if f"{text}".lower() in ["false", "no"]: - return "" - raise ValueError( - f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.") - - -ERROR_PREFIX = 'Something went wrong: ' -ERROR_SUFFIX = ' Please conatct <a href="mailto:biosamples@geomar.de">biosamples@geomar.de</a> if you encounter this issue.' +ERROR_PREFIX = CONSTANTS["error_prefix"] +ERROR_SUFFIX = CONSTANTS["error_suffix"] # Column datatypes -DATATYPE_DEFINITIONS = { - "AphiaID": int, - "BIS ID": str, - "Campaign": str, - "Date collected start": str, - "Date collected stop": str, - "Date sampled start": str, - "Date sampled stop": str, - "Fixation": str, - "Gear configuration": str, - "Gear": str, - "Hol": int, - "Latitude start": float, - "Latitude stop": float, - "Longitude start": float, - "Longitude stop": float, - "Main User": str, - "Nagoya case number": str, - "PI": str, - "Parent BIS ID": str, - "Platform": str, - "Sample Context": str, - "Sample container": str, - "SampleType": str, - "SampleTypeSpecific": str, - "Sampling Person": str, - "Sampling depth start": float, - "Sampling depth stop": float, - "Sampling method": str, - "Station ID": str, - "Station number": str, - "Storage Container Label": str, - "Storage ID": str, - "StorageTemperature": str, - "Subevent": str, - "Time collected start": str, - "Time collected stop": str, - "Time sampled start": str, - "Time sampled stop": str, - "Timezone": str, - "Water depth start": float, - "Water depth stop": float, -} - -# Obligatory columns: Must exist and must not be empty -# Must exist -OBLIGATORY_COLUMNS = [ - "BIS ID", - "Collection", - "Date collected start", - "Fixation", - "Gear", - "Latitude start", - "Longitude start", - "Main User", - "Nagoya case number", - "PI", - "Sample Context", - "Sample container", - "SampleType", - "SampleTypeSpecific", - "Sphere", - "Storage ID", - "StorageTemperature", -] - -OBLIGATORY_COLUMNS_CHILD = [ - "BIS ID", - "Date sampled start", - "Fixation", - "Main User", - "Parent BIS ID", - "Sample Context", - "Sample container", - "SampleType", - "SampleTypeSpecific", - "Sphere", - "Storage ID", - "StorageTemperature", -] - -COLUMN_CONVERTER = { - "Collection": semicolon_separated_list, - "Ecotaxa URL": semicolon_separated_list, - "NCBI Accession": semicolon_separated_list, - "NCBI BioProject": semicolon_separated_list, - "NCBI BioSample": semicolon_separated_list, - "OSIS URL": semicolon_separated_list, - "Embargo": _embargo_converter, - "Publications": semicolon_separated_list, - "Sphere": semicolon_separated_list, -} logger = logging.getLogger("caosadvancedtools") -- GitLab