Skip to content
Snippets Groups Projects
Commit c0d0740d authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

ENH: Add sample postprocessing

parent dc1aebc5
No related branches found
No related tags found
1 merge request!1F awi sams
......@@ -47,6 +47,8 @@ from bis_utils import (get_do_not_insert_type_names,
return_value_if_not_none,
send_mail_with_defaults,
SPECIAL_TREATMENT_SAMPLE)
from sample_helpers.sample_upload_post_processing import post_process_samples
from sample_helpers.utils import get_column_header_name
# suppress warning of diff function
apilogger = logging.getLogger("linkahead.apiutils")
......@@ -139,19 +141,20 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"There is a problem in '{colname_date_start}': {date_start}"
f" of sample {data['BIS ID']}: {perr}"
f" of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
if colname_time_start in data and return_value_if_not_none(data[colname_time_start]) is not None:
if not "Timezone" in data or return_value_if_not_none(data["Timezone"]) is None:
if not get_column_header_name("Timezone") in data or return_value_if_not_none(data[get_column_header_name("Timezone")]) is None:
logger.error(f"{colname_time_start} but no timezone given for sample "
f"{data['BIS ID']}.")
f"{data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
time_start = return_value_if_not_none(data[colname_time_start])
timezone = return_value_if_not_none(data["Timezone"])
timezone = return_value_if_not_none(data[get_column_header_name("Timezone")])
if date_start is None:
logger.error(
f"{colname_time_start} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
f"{colname_time_start} is given but {colname_date_start} is missing for "
f"sample {data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
try:
_val = str(dateparser.parse(f"{date_start}T{time_start}{timezone}"))
......@@ -159,7 +162,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"Couldn't parse {colname_time_start}: {time_start} with timezone {timezone} "
f"of sample {data['BIS ID']}: {perr}"
f"of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
elif date_start is not None:
......@@ -170,7 +173,8 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
date_stop = return_value_if_not_none(data[colname_date_stop])
if date_stop is not None and date_start is None:
logger.error(
f"{colname_date_stop} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
f"{colname_date_stop} is given but {colname_date_start} is missing for "
f"sample {data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
if date_stop is None:
_date_stop = date_start
......@@ -180,7 +184,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"There is a problem in '{colname_date_stop}': {date_stop}"
f" of sample {data['BIS ID']}: {perr}"
f" of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
......@@ -188,7 +192,8 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
time_stop = return_value_if_not_none(data[colname_time_stop])
if time_start is None:
logger.error(
f"{colname_time_stop} is given but {colname_time_start} is missing for sample {data['BIS ID']}.")
f"{colname_time_stop} is given but {colname_time_start} is missing for "
f"sample {data[get_column_header_name('entity_id')]}.")
raise DataInconsistencyError
# timezone is set by time start; if it hadn't been there, we would already have an error.
try:
......@@ -196,7 +201,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
except dateparser.ParserError as perr:
logger.error(
f"Couldn't parse {colname_time_stop}: {time_stop} with timezone {timezone} "
f"of sample {data['BIS ID']}: {perr}"
f"of sample {data[get_column_header_name('entity_id')]}: {perr}"
)
raise DataInconsistencyError
ent = _update_property(ent, prop_stop.id, property_name=prop_stop.name, value=_val)
......@@ -587,6 +592,7 @@ def update_sample_records(data, htmluserlog_public):
samples.append(sample)
samples = post_process_samples(samples, data)
synchroize(samples, additional_property_ents, htmluserlog_public)
......
#
# Copyright (C) 2025 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2025 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import re
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
from typing import Union
from .utils import get_column_header_name
def semicolon_separated_list(text):
    """Split *text* at semicolons and return the stripped, non-empty entries.

    Fix: the previous version filtered on the *un-stripped* element, so an
    entry consisting only of whitespace (e.g. ``"a; ;b"``) slipped through
    the filter and ended up as an empty string in the result.  Now the
    stripped value itself must be non-empty.
    """
    return [el.strip() for el in text.split(";") if el.strip()]
def _embargo_converter(text: str):
datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
matches = re.match(datepattern, str(text))
if matches:
return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
if f"{text}".lower() in ["true", "yes"]:
# yes means embargo until today in one year
return date.today() + relativedelta(years=1)
if f"{text}".lower() in ["false", "no"]:
return ""
raise ValueError(
f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
def _use_custom_names(definition: Union[list, dict]):
"""Replace names in list or dict keys by custom names with
`utils.get_column_header_name`.
"""
if isinstance(definition, list):
return [get_column_header_name(name) for name in definition]
elif isinstance(definition, dict):
return {get_column_header_name(key): value for key, value in definition.items()}
raise ValueError(f"Expected dict or list, but got {type(definition)}.")
# Mapping from column header name to the Python type each cell of that
# column is read as.  Keys are passed through `_use_custom_names`, so the
# effective header names come from `utils.get_column_header_name`.
DATATYPE_DEFINITIONS = _use_custom_names({
    "AphiaID": int,
    "entity_id": str,
    "Campaign": str,
    "Date collected start": str,
    "Date collected stop": str,
    "Date sampled start": str,
    "Date sampled stop": str,
    "Fixation": str,
    "Gear configuration": str,
    "Gear": str,
    "Hol": int,
    "Latitude start": float,
    "Latitude stop": float,
    "Longitude start": float,
    "Longitude stop": float,
    "Main User": str,
    "Nagoya case number": str,
    "PI": str,
    "Parent LinkAhead ID": str,
    "Platform": str,
    "Sample Context": str,
    "Sample container": str,
    "SampleType": str,
    "SampleTypeSpecific": str,
    "Sampling Person": str,
    "Sampling depth start": float,
    "Sampling depth stop": float,
    "Sampling method": str,
    "Station ID": str,
    "Station number": str,
    "Storage Container Label": str,
    "Storage ID": str,
    "StorageTemperature": str,
    "Subevent": str,
    "Time collected start": str,
    "Time collected stop": str,
    "Time sampled start": str,
    "Time sampled stop": str,
    "Timezone": str,
    "Water depth start": float,
    "Water depth stop": float,
})
# Obligatory columns for top-level (parent) samples: these headers must be
# present in the uploaded table and their cells must not be empty.
# NOTE(review): "Device" and "Sphere" have no entry in DATATYPE_DEFINITIONS
# above — confirm they are intentionally read with a default type.
OBLIGATORY_COLUMNS = _use_custom_names([
    "entity_id",
    "Date collected start",
    "Device",
    "Latitude start",
    "Longitude start",
    "Main User",
    "PI",
    "Sample container",
    "SampleType",
    "Sphere",
    "Storage ID",
    "StorageTemperature",
])
# Obligatory columns for child samples (samples that reference a parent via
# "Parent LinkAhead ID"); same "must exist and be non-empty" semantics as
# OBLIGATORY_COLUMNS.
OBLIGATORY_COLUMNS_CHILD = _use_custom_names([
    "entity_id",
    "Date sampled start",
    "Main User",
    "Parent LinkAhead ID",
    "Sample container",
    "SampleType",
    "SampleTypeSpecific",
    "Sphere",
    "Storage ID",
    "StorageTemperature",
])
# Per-column converter functions applied to the raw cell value when the
# table is read.  Most columns hold semicolon-separated multi-values; the
# "Embargo" column accepts a date or a yes/no keyword (see
# `_embargo_converter`).
COLUMN_CONVERTER = _use_custom_names({
    "Collection": semicolon_separated_list,
    "Ecotaxa URL": semicolon_separated_list,
    "NCBI Accession": semicolon_separated_list,
    "NCBI BioProject": semicolon_separated_list,
    "NCBI BioSample": semicolon_separated_list,
    "OSIS URL": semicolon_separated_list,
    "Embargo": _embargo_converter,
    "Publications": semicolon_separated_list,
    "Sphere": semicolon_separated_list,
})
# Columns that are not mapped one-to-one onto sample properties but get
# dedicated handling in the upload code (e.g. date/time assembly,
# geolocation, person resolution).
# NOTE(review): "Parent Linkahead ID" here differs in casing from
# "Parent LinkAhead ID" in DATATYPE_DEFINITIONS — confirm which spelling
# `get_column_header_name` expects.
SPECIAL_TREATMENT_SAMPLE = _use_custom_names([
    "entity_id",
    "Collection",
    "Date collected start",
    "Date collected stop",
    "Date sampled start",
    "Date sampled stop",
    "Gear configuration",
    "Gear",
    "Hol",
    "Latitude start",
    "Latitude stop",
    "Longitude start",
    "Longitude stop",
    "Main User",
    "Nagoya case number",
    "PDFReport",
    "PI",
    "Parent Linkahead ID",
    "Person",
    "Sampling Person",
    "Sampling depth start",
    "Sampling depth stop",
    "Sphere",
    "Station ID",
    "Station number",
    "Storage Container Label",
    "Storage ID",
    "Subevent",
    "Time collected start",
    "Time collected stop",
    "Time sampled start",
    "Time sampled stop",
    "Timezone",
    "Water depth start",
    "Water depth stop",
])
# Columns that are deliberately skipped during the upload (display-only or
# derived values).
IGNORED_COLUMN_NAMES_SAMPLE = _use_custom_names([
    "LinkAhead URL",
    "Date",
    "IGSN URL",
    "IGSN",  # TODO This will be relevant for external IGSNs in the future.
    "Parent Sample",
    "Sampling depth",
    "Storage chain",
    "Water depth",
])
#
# Copyright (C) 2025 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2025 Florian Spreckelsen <f.spreckelsen@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import linkahead as db
import pandas as pd
def post_process_samples(samples: db.Container, data: pd.DataFrame) -> db.Container:
    """Hook for adjusting sample entities after they have been built from
    the uploaded table and before they are synchronized.

    This default implementation is a no-op returning ``samples``
    unchanged; ``data`` (the uploaded table) is accepted so that custom
    deployments can override this function with table-dependent logic.
    """
    return samples
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment