Commit 600603ba authored by Florian Spreckelsen

ENH: Automatically replace all column keys by custom definitions

parent e145e16c
1 merge request: !1 F awi sams
@@ -32,57 +32,10 @@ from caosadvancedtools.serverside.helper import send_mail
from caoscrawler.config import get_config_setting
from linkahead import get_entity_by_name
from sample_helpers.sample_upload_column_definitions import (
    IGNORED_COLUMN_NAMES_SAMPLE, SPECIAL_TREATMENT_SAMPLE)
from sample_helpers.utils import CONSTANTS
SPECIAL_TREATMENT_SAMPLE = [
"BIS ID",
"Collection",
"Date collected start",
"Date collected stop",
"Date sampled start",
"Date sampled stop",
"Gear configuration",
"Gear",
"Hol",
"Latitude start",
"Latitude stop",
"Longitude start",
"Longitude stop",
"Main User",
"Nagoya case number",
"PDFReport",
"PI",
"Parent BIS ID",
"Person",
"Sampling Person",
"Sampling depth start",
"Sampling depth stop",
"Sphere",
"Station ID",
"Station number",
"Storage Container Label",
"Storage ID",
"Subevent",
"Time collected start",
"Time collected stop",
"Time sampled start",
"Time sampled stop",
"Timezone",
"Water depth start",
"Water depth stop",
]
IGNORED_COLUMN_NAMES_SAMPLE = [
"BIS URL",
"Date",
"IGSN URL",
"IGSN", # TODO This will be relevant for external IGSNs in the future.
"Parent Sample",
"Sampling depth",
"Storage chain",
"Water depth",
]
COLUMN_DESCRIPTIONS = CONSTANTS["csv_column_descriptions"]
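The commit title says that hard-coded column keys are now replaced automatically by these imported custom definitions. A minimal sketch of what such a replacement could look like, assuming COLUMN_DESCRIPTIONS maps internal keys to the human-readable CSV headers (the helper name and the direction of the mapping are assumptions, not taken from this diff):

import pandas as pd

def replace_column_keys(df: pd.DataFrame, descriptions: dict) -> pd.DataFrame:
    """Hypothetical helper: rename CSV headers to the internal column keys."""
    # Invert the mapping so that the human-readable header found in the CSV
    # is replaced by the key used in the rest of the upload pipeline.
    renames = {header: key for key, header in descriptions.items()}
    return df.rename(columns=renames)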
@@ -27,12 +27,9 @@ import json
import logging
import os
import pandas as pd
import re
import subprocess
import sys
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from pathlib import Path
from tempfile import NamedTemporaryFile
@@ -45,128 +42,18 @@ from caoscrawler.logging import configure_server_side_logging
from bis_utils import (replace_entity_urls_by_ids,
                        SPECIAL_TREATMENT_SAMPLE, whitespace_cleanup_in_df)
from sample_helpers.utils import CONSTANTS
# suppress warning of diff function
apilogger = logging.getLogger("linkahead.apiutils")
apilogger.setLevel(logging.ERROR)
def semicolon_separated_list(text):
    return [el.strip() for el in text.split(";") if el != ""]
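For illustration, the helper above splits a semicolon-separated cell into trimmed, non-empty entries (example values are hypothetical):

semicolon_separated_list("North Sea; Baltic Sea;; Skagerrak")
# -> ["North Sea", "Baltic Sea", "Skagerrak"]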
def _embargo_converter(text: str):
    datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
    matches = re.match(datepattern, str(text))
    if matches:
        return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
    if f"{text}".lower() in ["true", "yes"]:
        # yes means embargo until today in one year
        return date.today() + relativedelta(years=1)
    if f"{text}".lower() in ["false", "no"]:
        return ""
    raise ValueError(
        f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
ERROR_PREFIX = 'Something went wrong: '
ERROR_SUFFIX = ' Please contact <a href="mailto:biosamples@geomar.de">biosamples@geomar.de</a> if you encounter this issue.'
ERROR_PREFIX = CONSTANTS["error_prefix"]
ERROR_SUFFIX = CONSTANTS["error_suffix"]
# Column datatypes
DATATYPE_DEFINITIONS = {
"AphiaID": int,
"BIS ID": str,
"Campaign": str,
"Date collected start": str,
"Date collected stop": str,
"Date sampled start": str,
"Date sampled stop": str,
"Fixation": str,
"Gear configuration": str,
"Gear": str,
"Hol": int,
"Latitude start": float,
"Latitude stop": float,
"Longitude start": float,
"Longitude stop": float,
"Main User": str,
"Nagoya case number": str,
"PI": str,
"Parent BIS ID": str,
"Platform": str,
"Sample Context": str,
"Sample container": str,
"SampleType": str,
"SampleTypeSpecific": str,
"Sampling Person": str,
"Sampling depth start": float,
"Sampling depth stop": float,
"Sampling method": str,
"Station ID": str,
"Station number": str,
"Storage Container Label": str,
"Storage ID": str,
"StorageTemperature": str,
"Subevent": str,
"Time collected start": str,
"Time collected stop": str,
"Time sampled start": str,
"Time sampled stop": str,
"Timezone": str,
"Water depth start": float,
"Water depth stop": float,
}
# Obligatory columns: Must exist and must not be empty
# Must exist
OBLIGATORY_COLUMNS = [
"BIS ID",
"Collection",
"Date collected start",
"Fixation",
"Gear",
"Latitude start",
"Longitude start",
"Main User",
"Nagoya case number",
"PI",
"Sample Context",
"Sample container",
"SampleType",
"SampleTypeSpecific",
"Sphere",
"Storage ID",
"StorageTemperature",
]
OBLIGATORY_COLUMNS_CHILD = [
"BIS ID",
"Date sampled start",
"Fixation",
"Main User",
"Parent BIS ID",
"Sample Context",
"Sample container",
"SampleType",
"SampleTypeSpecific",
"Sphere",
"Storage ID",
"StorageTemperature",
]
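Per the comment above, obligatory columns must exist and must not be empty. A minimal sketch of how such a check could be applied to an uploaded table (the function name and the error handling are assumptions, not part of this diff):

def check_obligatory_columns(df, obligatory_columns):
    """Hypothetical check: every obligatory column must exist and contain no empty cells."""
    missing = [col for col in obligatory_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Missing obligatory column(s): {', '.join(missing)}")
    empty = [col for col in obligatory_columns if df[col].isna().any()]
    if empty:
        raise ValueError(f"Obligatory column(s) with empty cells: {', '.join(empty)}")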
COLUMN_CONVERTER = {
"Collection": semicolon_separated_list,
"Ecotaxa URL": semicolon_separated_list,
"NCBI Accession": semicolon_separated_list,
"NCBI BioProject": semicolon_separated_list,
"NCBI BioSample": semicolon_separated_list,
"OSIS URL": semicolon_separated_list,
"Embargo": _embargo_converter,
"Publications": semicolon_separated_list,
"Sphere": semicolon_separated_list,
}
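The datatype and converter dictionaries above are the kind of arguments typically handed to pandas when parsing the uploaded CSV. A sketch of how they might be combined (the read call and file name are assumptions; dtype and converters do not overlap for any column here, so pandas will not complain):

df = pd.read_csv(
    "samples.csv",               # hypothetical upload path
    dtype=DATATYPE_DEFINITIONS,  # enforce per-column datatypes
    converters=COLUMN_CONVERTER, # parse semicolon lists and the embargo field
)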
logger = logging.getLogger("caosadvancedtools")