
f-awi-sams

Merged Florian Spreckelsen requested to merge f-awi-sams into main
1 file changed
@@ -27,12 +27,9 @@ import json
import logging
import os
import pandas as pd
-import re
import subprocess
import sys
from datetime import date, datetime
-from dateutil.relativedelta import relativedelta
from pathlib import Path
from tempfile import NamedTemporaryFile
@@ -45,128 +42,21 @@ from caoscrawler.logging import configure_server_side_logging
from bis_utils import (replace_entity_urls_by_ids,
                       SPECIAL_TREATMENT_SAMPLE, whitespace_cleanup_in_df)
+from sample_helpers.sample_upload_column_definitions import (
+    COLUMN_CONVERTER, DATATYPE_DEFINITIONS,
+    OBLIGATORY_COLUMNS, OBLIGATORY_COLUMNS_CHILD, SPECIAL_TREATMENT_SAMPLE)
+from sample_helpers.utils import CONSTANTS, get_column_header_name
# suppress warnings of the diff function
apilogger = logging.getLogger("linkahead.apiutils")
apilogger.setLevel(logging.ERROR)
-def semicolon_separated_list(text):
-    return [el.strip() for el in text.split(";") if el != ""]
-
-
-def _embargo_converter(text: str):
-    datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
-    matches = re.match(datepattern, str(text))
-    if matches:
-        return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
-    if f"{text}".lower() in ["true", "yes"]:
-        # 'yes' means an embargo until one year from today
-        return date.today() + relativedelta(years=1)
-    if f"{text}".lower() in ["false", "no"]:
-        return ""
-    raise ValueError(
-        f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
-ERROR_PREFIX = 'Something went wrong: '
-ERROR_SUFFIX = ' Please contact <a href="mailto:biosamples@geomar.de">biosamples@geomar.de</a> if you encounter this issue.'
+ERROR_PREFIX = CONSTANTS["error_prefix"]
+ERROR_SUFFIX = CONSTANTS["error_suffix"]
-# Column datatypes
-DATATYPE_DEFINITIONS = {
-    "AphiaID": int,
-    "BIS ID": str,
-    "Campaign": str,
-    "Date collected start": str,
-    "Date collected stop": str,
-    "Date sampled start": str,
-    "Date sampled stop": str,
-    "Fixation": str,
-    "Gear configuration": str,
-    "Gear": str,
-    "Hol": int,
-    "Latitude start": float,
-    "Latitude stop": float,
-    "Longitude start": float,
-    "Longitude stop": float,
-    "Main User": str,
-    "Nagoya case number": str,
-    "PI": str,
-    "Parent BIS ID": str,
-    "Platform": str,
-    "Sample Context": str,
-    "Sample container": str,
-    "SampleType": str,
-    "SampleTypeSpecific": str,
-    "Sampling Person": str,
-    "Sampling depth start": float,
-    "Sampling depth stop": float,
-    "Sampling method": str,
-    "Station ID": str,
-    "Station number": str,
-    "Storage Container Label": str,
-    "Storage ID": str,
-    "StorageTemperature": str,
-    "Subevent": str,
-    "Time collected start": str,
-    "Time collected stop": str,
-    "Time sampled start": str,
-    "Time sampled stop": str,
-    "Timezone": str,
-    "Water depth start": float,
-    "Water depth stop": float,
-}
-
-# Obligatory columns: Must exist and must not be empty
-# Must exist
-OBLIGATORY_COLUMNS = [
-    "BIS ID",
-    "Collection",
-    "Date collected start",
-    "Fixation",
-    "Gear",
-    "Latitude start",
-    "Longitude start",
-    "Main User",
-    "Nagoya case number",
-    "PI",
-    "Sample Context",
-    "Sample container",
-    "SampleType",
-    "SampleTypeSpecific",
-    "Sphere",
-    "Storage ID",
-    "StorageTemperature",
-]
-
-OBLIGATORY_COLUMNS_CHILD = [
-    "BIS ID",
-    "Date sampled start",
-    "Fixation",
-    "Main User",
-    "Parent BIS ID",
-    "Sample Context",
-    "Sample container",
-    "SampleType",
-    "SampleTypeSpecific",
-    "Sphere",
-    "Storage ID",
-    "StorageTemperature",
-]
-
-COLUMN_CONVERTER = {
-    "Collection": semicolon_separated_list,
-    "Ecotaxa URL": semicolon_separated_list,
-    "NCBI Accession": semicolon_separated_list,
-    "NCBI BioProject": semicolon_separated_list,
-    "NCBI BioSample": semicolon_separated_list,
-    "OSIS URL": semicolon_separated_list,
-    "Embargo": _embargo_converter,
-    "Publications": semicolon_separated_list,
-    "Sphere": semicolon_separated_list,
-}
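
Reviewer note: `DATATYPE_DEFINITIONS` and `COLUMN_CONVERTER` (now imported from `sample_helpers.sample_upload_column_definitions`) have exactly the shape that `pandas.read_csv` accepts via its `dtype=` and `converters=` parameters. A sketch of how a loader could apply them, assuming the two mappings are in scope; the real `read_data_from_file` lives elsewhere in this script and may differ:

```python
import pandas as pd

def load_sample_table(path):
    # pandas prefers converters over dtype for the columns they cover, so
    # exclude converted columns from the dtype mapping to avoid a ParserWarning.
    dtypes = {col: t for col, t in DATATYPE_DEFINITIONS.items()
              if col not in COLUMN_CONVERTER}
    return pd.read_csv(path, dtype=dtypes, converters=COLUMN_CONVERTER)
```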
logger = logging.getLogger("caosadvancedtools")
@@ -179,7 +69,7 @@ def get_parser():
def _is_child_sample_table(filename):
    tmp_data = pd.read_csv(filename, sep=',')
    if 'Parent BIS ID' in tmp_data.columns:
-        return not tmp_data["Parent BIS ID"].isnull().all()
+        return not tmp_data[get_column_header_name("Parent LinkAhead ID")].isnull().all()
    return False
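
Reviewer note: the helper treats a table as a child-sample table only if the parent column exists and contains at least one non-empty cell, via the `isnull().all()` idiom:

```python
import pandas as pd

df = pd.DataFrame({"Parent BIS ID": [None, "BIS-0042", None]})
# isnull().all() is True only when every cell is null; negated, it means
# "at least one sample in this table references a parent".
print(not df["Parent BIS ID"].isnull().all())  # True
```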
@@ -266,25 +156,27 @@ def main():
        property_name = eprop.name
        if property_name in SPECIAL_TREATMENT_SAMPLE:
            continue
-        if db.apiutils.is_reference(eprop):
-            rt = db.get_entity_by_id(eprop.id)
-            if len(rt.properties) == 1:
-                converter = _get_converter_from_property_datatype(rt.properties[0].datatype)
-            elif len(rt.properties) < 1:
-                converter = str
-            else:
-                converter = None
-        else:
-            converter = _get_converter_from_property_datatype(eprop.datatype)
-        if converter is None:
-            continue
-        DATATYPE_DEFINITIONS[property_name] = converter
+        if property_name not in DATATYPE_DEFINITIONS:
+            if db.apiutils.is_reference(eprop):
+                rt = db.get_entity_by_id(eprop.id)
+                if len(rt.properties) == 1:
+                    converter = _get_converter_from_property_datatype(rt.properties[0].datatype)
+                elif len(rt.properties) < 1:
+                    converter = str
+                else:
+                    converter = None
+            else:
+                converter = _get_converter_from_property_datatype(eprop.datatype)
+            if converter is None:
+                continue
+            DATATYPE_DEFINITIONS[property_name] = converter
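
Reviewer note: the new `if property_name not in DATATYPE_DEFINITIONS` guard makes the predefined column definitions from `sample_helpers` take precedence over converters derived from the data model at runtime. `_get_converter_from_property_datatype` is defined elsewhere in this script; a hypothetical sketch of such a datatype-to-converter dispatch (the mapping below is an assumption, not the actual implementation):

```python
import linkahead as db

def _get_converter_from_property_datatype(datatype):
    # Hypothetical mapping from LinkAhead datatypes to Python callables;
    # returning None makes the calling loop skip the column.
    mapping = {db.INTEGER: int, db.DOUBLE: float, db.TEXT: str}
    return mapping.get(datatype)
```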
        if sample.get_importance(property_name) == db.OBLIGATORY:
            # This is only needed if the sample is not a child sample
            OBLIGATORY_COLUMNS.append(property_name)

    try:
        data = read_data_from_file(path)
-        data = replace_entity_urls_by_ids(data, ["BIS ID", "Storage ID", "Parent BIS ID"])
+        data = replace_entity_urls_by_ids(data, [get_column_header_name(name) for name in [
+            "entity_id", "Storage ID", "Parent LinkAhead ID"]])
        pickle_out = NamedTemporaryFile(delete=False, suffix=".pkl")
        data.to_pickle(pickle_out.name)
    except DataInconsistencyError as err:
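
Reviewer note: the pickle handoff above relies on `delete=False`, which keeps the temporary file on disk after the handle goes away so a later processing step can re-read the DataFrame by file name. A self-contained example of the round-trip:

```python
import pandas as pd
from tempfile import NamedTemporaryFile

df = pd.DataFrame({"BIS ID": ["A1"], "Fixation": ["ethanol"]})
pickle_out = NamedTemporaryFile(delete=False, suffix=".pkl")
df.to_pickle(pickle_out.name)

# A separate step can now load the table by path.
restored = pd.read_pickle(pickle_out.name)
assert restored.equals(df)
```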