diff --git a/sample-management-custom/caosdb-server/scripting/bin/crawl_sample_data_async.py b/sample-management-custom/caosdb-server/scripting/bin/crawl_sample_data_async.py
index 1c578f9e6e6b56c53805857996b13b90f4199e4b..1e17a3340673e91dd3fc72e959a9b68b541fbc34 100755
--- a/sample-management-custom/caosdb-server/scripting/bin/crawl_sample_data_async.py
+++ b/sample-management-custom/caosdb-server/scripting/bin/crawl_sample_data_async.py
@@ -47,6 +47,8 @@ from bis_utils import (get_do_not_insert_type_names,
                        return_value_if_not_none,
                        send_mail_with_defaults,
                        SPECIAL_TREATMENT_SAMPLE)
+from sample_helpers.sample_upload_post_processing import post_process_samples
+from sample_helpers.utils import get_column_header_name
 
 # suppress warning of diff function
 apilogger = logging.getLogger("linkahead.apiutils")
@@ -139,19 +141,20 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
         except dateparser.ParserError as perr:
             logger.error(
                 f"There is a problem in '{colname_date_start}': {date_start}"
-                f" of sample {data['BIS ID']}: {perr}"
+                f" of sample {data[get_column_header_name('entity_id')]}: {perr}"
             )
             raise DataInconsistencyError
     if colname_time_start in data and return_value_if_not_none(data[colname_time_start]) is not None:
-        if not "Timezone" in data or return_value_if_not_none(data["Timezone"]) is None:
+        if get_column_header_name("Timezone") not in data or return_value_if_not_none(data[get_column_header_name("Timezone")]) is None:
             logger.error(f"{colname_time_start} but no timezone given for sample "
-                         f"{data['BIS ID']}.")
+                         f"{data[get_column_header_name('entity_id')]}.")
             raise DataInconsistencyError
         time_start = return_value_if_not_none(data[colname_time_start])
-        timezone = return_value_if_not_none(data["Timezone"])
+        timezone = return_value_if_not_none(data[get_column_header_name("Timezone")])
         if date_start is None:
             logger.error(
-                f"{colname_time_start} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
+                f"{colname_time_start} is given but {colname_date_start} is missing for "
+                f"sample {data[get_column_header_name('entity_id')]}.")
             raise DataInconsistencyError
         try:
             _val = str(dateparser.parse(f"{date_start}T{time_start}{timezone}"))
@@ -159,7 +162,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
         except dateparser.ParserError as perr:
             logger.error(
                 f"Couldn't parse {colname_time_start}: {time_start} with timezone {timezone} "
-                f"of sample {data['BIS ID']}: {perr}"
+                f"of sample {data[get_column_header_name('entity_id')]}: {perr}"
             )
             raise DataInconsistencyError
     elif date_start is not None:
@@ -170,7 +173,8 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
         date_stop = return_value_if_not_none(data[colname_date_stop])
     if date_stop is not None and date_start is None:
         logger.error(
-            f"{colname_date_stop} is given but {colname_date_start} is missing for sample {data['BIS ID']}.")
+            f"{colname_date_stop} is given but {colname_date_start} is missing for "
+            f"sample {data[get_column_header_name('entity_id')]}.")
         raise DataInconsistencyError
     if date_stop is None:
         _date_stop = date_start
@@ -180,7 +184,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
         except dateparser.ParserError as perr:
             logger.error(
                 f"There is a problem in '{colname_date_stop}': {date_stop}"
-                f" of sample {data['BIS ID']}: {perr}"
+                f" of sample {data[get_column_header_name('entity_id')]}: {perr}"
            )
             raise DataInconsistencyError
 
@@ -188,7 +192,8 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
         time_stop = return_value_if_not_none(data[colname_time_stop])
         if time_start is None:
             logger.error(
-                f"{colname_time_stop} is given but {colname_time_start} is missing for sample {data['BIS ID']}.")
+                f"{colname_time_stop} is given but {colname_time_start} is missing for "
+                f"sample {data[get_column_header_name('entity_id')]}.")
             raise DataInconsistencyError
         # timezone is set by time start; if it hadn't been there, we would already have an error.
         try:
@@ -196,7 +201,7 @@ def _append_times_to_entity(ent, data, propname_prefix="Time", colname_time_pref
         except dateparser.ParserError as perr:
             logger.error(
                 f"Couldn't parse {colname_time_stop}: {time_stop} with timezone {timezone} "
-                f"of sample {data['BIS ID']}: {perr}"
+                f"of sample {data[get_column_header_name('entity_id')]}: {perr}"
             )
             raise DataInconsistencyError
         ent = _update_property(ent, prop_stop.id, property_name=prop_stop.name, value=_val)
@@ -587,6 +592,7 @@ def update_sample_records(data, htmluserlog_public):
 
         samples.append(sample)
 
+    samples = post_process_samples(samples, data)
     synchroize(samples, additional_property_ents, htmluserlog_public)
 
 
diff --git a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1155d775140e282cbdde581f510b6b07afea892
--- /dev/null
+++ b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py
@@ -0,0 +1,192 @@
+#
+# Copyright (C) 2025 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2025 Florian Spreckelsen <f.spreckelsen@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+import re
+
+from datetime import date, datetime
+from dateutil.relativedelta import relativedelta
+from typing import Union
+
+from .utils import get_column_header_name
+
+
+def semicolon_separated_list(text):
+    return [el.strip() for el in text.split(";") if el != ""]
+
+
+def _embargo_converter(text: str):
+
+    datepattern = r"^(?P<year>\d{4,4})-(?P<month>\d{2,2})-(?P<day>\d{2,2})"
+    matches = re.match(datepattern, str(text))
+    if matches:
+        return date(int(matches.groupdict()["year"]), int(matches.groupdict()["month"]), int(matches.groupdict()["day"]))
+    if f"{text}".lower() in ["true", "yes"]:
+        # yes means embargo until today in one year
+        return date.today() + relativedelta(years=1)
+    if f"{text}".lower() in ["false", "no"]:
+        return ""
+    raise ValueError(
+        f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
+
+
+def _use_custom_names(definition: Union[list, dict]):
+    """Replace names in list or dict keys by custom names with
+    `utils.get_column_header_name`.
+
+    """
+    if isinstance(definition, list):
+        return [get_column_header_name(name) for name in definition]
+    elif isinstance(definition, dict):
+        return {get_column_header_name(key): value for key, value in definition.items()}
+
+    raise ValueError(f"Expected dict or list, but got {type(definition)}.")
+
+
+DATATYPE_DEFINITIONS = _use_custom_names({
+    "AphiaID": int,
+    "entity_id": str,
+    "Campaign": str,
+    "Date collected start": str,
+    "Date collected stop": str,
+    "Date sampled start": str,
+    "Date sampled stop": str,
+    "Fixation": str,
+    "Gear configuration": str,
+    "Gear": str,
+    "Hol": int,
+    "Latitude start": float,
+    "Latitude stop": float,
+    "Longitude start": float,
+    "Longitude stop": float,
+    "Main User": str,
+    "Nagoya case number": str,
+    "PI": str,
+    "Parent LinkAhead ID": str,
+    "Platform": str,
+    "Sample Context": str,
+    "Sample container": str,
+    "SampleType": str,
+    "SampleTypeSpecific": str,
+    "Sampling Person": str,
+    "Sampling depth start": float,
+    "Sampling depth stop": float,
+    "Sampling method": str,
+    "Station ID": str,
+    "Station number": str,
+    "Storage Container Label": str,
+    "Storage ID": str,
+    "StorageTemperature": str,
+    "Subevent": str,
+    "Time collected start": str,
+    "Time collected stop": str,
+    "Time sampled start": str,
+    "Time sampled stop": str,
+    "Timezone": str,
+    "Water depth start": float,
+    "Water depth stop": float,
+})
+
+# Obligatory columns: Must exist and must not be empty
+# Must exist
+OBLIGATORY_COLUMNS = _use_custom_names([
+    "entity_id",
+    "Date collected start",
+    "Device",
+    "Latitude start",
+    "Longitude start",
+    "Main User",
+    "PI",
+    "Sample container",
+    "SampleType",
+    "Sphere",
+    "Storage ID",
+    "StorageTemperature",
+])
+
+OBLIGATORY_COLUMNS_CHILD = _use_custom_names([
+    "entity_id",
+    "Date sampled start",
+    "Main User",
+    "Parent LinkAhead ID",
+    "Sample container",
+    "SampleType",
+    "SampleTypeSpecific",
+    "Sphere",
+    "Storage ID",
+    "StorageTemperature",
+])
+
+COLUMN_CONVERTER = _use_custom_names({
+    "Collection": semicolon_separated_list,
+    "Ecotaxa URL": semicolon_separated_list,
+    "NCBI Accession": semicolon_separated_list,
+    "NCBI BioProject": semicolon_separated_list,
+    "NCBI BioSample": semicolon_separated_list,
+    "OSIS URL": semicolon_separated_list,
+    "Embargo": _embargo_converter,
+    "Publications": semicolon_separated_list,
+    "Sphere": semicolon_separated_list,
+})
+
+SPECIAL_TREATMENT_SAMPLE = _use_custom_names([
+    "entity_id",
+    "Collection",
+    "Date collected start",
+    "Date collected stop",
+    "Date sampled start",
+    "Date sampled stop",
+    "Gear configuration",
+    "Gear",
+    "Hol",
+    "Latitude start",
+    "Latitude stop",
+    "Longitude start",
+    "Longitude stop",
+    "Main User",
+    "Nagoya case number",
+    "PDFReport",
+    "PI",
+    "Parent LinkAhead ID",
+    "Person",
+    "Sampling Person",
+    "Sampling depth start",
+    "Sampling depth stop",
+    "Sphere",
+    "Station ID",
+    "Station number",
+    "Storage Container Label",
+    "Storage ID",
+    "Subevent",
+    "Time collected start",
+    "Time collected stop",
+    "Time sampled start",
+    "Time sampled stop",
+    "Timezone",
+    "Water depth start",
+    "Water depth stop",
+])
+
+IGNORED_COLUMN_NAMES_SAMPLE = _use_custom_names([
+    "LinkAhead URL",
+    "Date",
+    "IGSN URL",
+    "IGSN",  # TODO This will be relevant for external IGSNs in the future.
+    "Parent Sample",
+    "Sampling depth",
+    "Storage chain",
+    "Water depth",
+])
diff --git a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_post_processing.py b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_post_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..54a034775389a16d5dc0d02d57195b37dcd94ca8
--- /dev/null
+++ b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_post_processing.py
@@ -0,0 +1,28 @@
+#
+# Copyright (C) 2025 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2025 Florian Spreckelsen <f.spreckelsen@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+import linkahead as db
+import pandas as pd
+
+
+def post_process_samples(samples: db.Container, data: pd.DataFrame) -> db.Container:
+    """Hook for custom post processing of the uploaded samples.
+
+    Called by update_sample_records() right before synchronization.
+    This default implementation returns the samples unchanged.
+    """
+    return samples
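
Note on the new hook: post_process_samples() is currently a pure pass-through;
update_sample_records() hands it the finished sample Records together with the
uploaded table, so it is presumably the place for site-specific adjustments. A
minimal sketch of an override, assuming a hypothetical "Curation status"
property (only standard linkahead-pylib calls, Entity.get_property() and
Entity.add_property(), are used):

    import linkahead as db
    import pandas as pd


    def post_process_samples(samples: db.Container, data: pd.DataFrame) -> db.Container:
        """Attach a default curation state to every uploaded sample."""
        for sample in samples:
            # get_property() returns None if the Record does not carry
            # the property yet.
            if sample.get_property("Curation status") is None:
                sample.add_property(name="Curation status", value="uploaded")
        return samples

Since synchroize() receives whatever this function returns, an override can
also filter or extend the container rather than only mutating records in place.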