From 8f7beb3fd5b053b474d6a33c10d92f63ecbcfff1 Mon Sep 17 00:00:00 2001
From: Florian Spreckelsen <f.spreckelsen@indiscale.com>
Date: Mon, 17 Feb 2025 18:13:53 +0100
Subject: [PATCH] ENH: Adapt sample export to new data model

---
 .../scripting/bin/export_sample_csv.py        | 278 ++++++++----------
 .../bin/sample_helpers/default_constants.yml  |   1 +
 ...mple_registration_get_person_identifier.py |   5 +
 .../sample_upload_column_definitions.py       |  16 +-
 4 files changed, 143 insertions(+), 157 deletions(-)

diff --git a/sample-management-custom/caosdb-server/scripting/bin/export_sample_csv.py b/sample-management-custom/caosdb-server/scripting/bin/export_sample_csv.py
index 41ca25f..169575a 100755
--- a/sample-management-custom/caosdb-server/scripting/bin/export_sample_csv.py
+++ b/sample-management-custom/caosdb-server/scripting/bin/export_sample_csv.py
@@ -42,8 +42,10 @@ from bis_utils import (create_email_with_link_text,
                        get_options_row, send_mail_with_defaults)
 from export_container_csv import (generate_label_text,
                                   extract_storage_chain as container_storage_chain)
+from sample_helpers.sample_registration_get_person_identifier import get_person_identifier_from_rec
 from sample_helpers.sample_upload_column_definitions import (
-    DATATYPE_DEFINITIONS, SPECIAL_TREATMENT_SAMPLE as SPECIAL_TREATMENT)
+    DATATYPE_DEFINITIONS, SPECIAL_TREATMENT_SAMPLE as
+    SPECIAL_TREATMENT, use_custom_names)
 from sample_helpers.utils import (CONSTANTS, get_column_header_name,
                                   get_entity_name)
 
@@ -126,7 +128,7 @@ def extract_value_as_list(record, key):
 
 
 def extract_storage_id(record, key):
-    return extract_value_as_list(record, "Container")
+    return extract_value_as_list(record, get_entity_name("container_rt"))
 
 
 def extract_pdf_id(record, key):
@@ -134,19 +136,20 @@ def extract_pdf_id(record, key):
     return prop.value if prop is not None else None
 
 
-def extract_storage_container_label(record, key):
-    ids = extract_value_as_list(record, "Container")
-    return retrieve_values(ids, 'BIS Label')
-
-
-def extract_nagoya_case_number(record, key):
-    ids = extract_value_as_list(record, "NagoyaCase")
-    return retrieve_values(ids, key)
-
-
 def extract_person(record, key):
     ids = extract_value_as_list(record, key)
-    return retrieve_values(ids, 'Abbreviation')
+    person_recs = [cached_record(i) for i in ids]
+    return [get_person_identifier_from_rec(r) for r in person_recs]
+
+
+def extract_event_responsible(record, key):
+    """Return the person identifiers responsible for the sample's single event, else None."""
+    evt = retrieve_event(record)
+    if len(evt) > 1:
+        logger.debug(f"Sample {record.id} references more than one event.")
+    if len(evt) != 1:
+        return None
+    return extract_person(evt[0], get_entity_name("responsible_person_event"))
 
 
 def extract_parent_sample(record, key):
@@ -161,51 +164,14 @@ def extract_reference_name(record, key):
             for i in ids if i is not None]
 
 
-def retrieve_source_event(record):
-    ids = extract_value_as_list(record, 'SourceEvent')
-    if record.get_property("SourceEvent") is None:
+def retrieve_event(record):
+    ids = extract_value_as_list(record, get_entity_name("event_rt"))
+    if record.get_property(get_entity_name("event_rt")) is None:
         # there are cases where this property is named "Event"
         ids = extract_value_as_list(record, 'Event')
     return [cached_record(i) for i in ids]
 
 
-def retrieve_gear(record):
-    ids = [e.get_property("Gear").value for e in retrieve_source_event(record)
-           if e.get_property("Gear") is not None]
-    return [cached_query(f"SELECT 'parent', 'Configuration' FROM ENTITY WITH id = '{i}'", unique=True) for i in ids]
-
-
-def extract_gear(record, key):
-    return [e.get_parents()[0].name for e in retrieve_gear(record)]
-
-
-def extract_gear_configuration(record, key):
-    return [e.get_property("Configuration").value for e in
-            retrieve_gear(record)
-            if e.get_property("Configuration") is not None]
-
-
-def extract_date_time(record, p):
-    if p.lower() == "time start" or p.lower() == "time stop":
-        # these are attached to the source event directly
-        return [e.get_property(p).value for e in retrieve_source_event(record) if
-                e.get_property(p) is not None and e.get_property(p).value is not None]
-    else:
-        return extract_value_as_list(record, p)
-
-
-def extract_station_number(record, key):
-    source_ev = retrieve_source_event(record)
-    return [e.get_property(key).value for e in source_ev if
-            e.get_property(key) is not None]
-
-
-def extract_station_id(record, key):
-    source_ev = retrieve_source_event(record)
-    return [e.get_property(key).value for e in source_ev if
-            e.get_property(key) is not None]
-
-
 def retrieve_positions(source_ev):
     pos_ids = extract_value_as_list(source_ev, "Position")
     return [cached_record(i) for i in pos_ids]
@@ -217,7 +183,7 @@ def has_parent(r, par):
 
 
 def extract_position(record, position, component):
-    source_evs = retrieve_source_event(record)
+    source_evs = retrieve_event(record)
     result = []
     for ev in source_evs:
         _pos = [pos for pos in retrieve_positions(ev)]
@@ -234,83 +200,108 @@ def extract_position(record, position, component):
     return [pos.get_property(component).value for pos in result if pos.get_property(component) is not None]
 
 
+def extract_ele_start(record, key):
+    return extract_position(record, get_entity_name("StartPosition"), get_entity_name("elevation"))
+
+
+def extract_ele_stop(record, key):  # same name lookup as the other start/stop extractors
+    return extract_position(record, get_entity_name("StopPosition"), get_entity_name("elevation"))
+
+
 def extract_lat_start(record, key):
-    return extract_position(record, "StartPosition", "Latitude")
+    return extract_position(record, get_entity_name("StartPosition"), get_entity_name("latitude"))
 
 
 def extract_lat_stop(record, key):
-    return extract_position(record, "StopPosition", "Latitude")
+    return extract_position(record, get_entity_name("StopPosition"), get_entity_name("latitude"))
 
 
 def extract_lng_start(record, key):
-    return extract_position(record, "StartPosition", "Longitude")
+    return extract_position(record, get_entity_name("StartPosition"), get_entity_name("longitude"))
 
 
 def extract_lng_stop(record, key):
-    return extract_position(record, "StopPosition", "Longitude")
+    return extract_position(record, get_entity_name("StopPosition"), get_entity_name("longitude"))
 
 
-def extract_sampling_depth_start(record, key):
-    return extract_position(record, "StartPosition", "Sampling depth")
+def extract_linkahead_url(record, key):
+    # base_uri = db.get_config().get("Connection", "url")
+    base_uri = get_config_setting("public_host_url")
+    return urllib.parse.urljoin(base_uri, f"Entity/{record.id}")
 
 
-def extract_sampling_depth_stop(record, key):
-    return extract_position(record, "StopPosition", "Sampling depth")
+def extract_doi(record, key):
+    """Return the DOI of the sample's single event; None if the event is not unique or has no DOI."""
+    source_evs = retrieve_event(record)
+    if len(source_evs) > 1:
+        logger.error(
+            f"Sample {record.id} references more than one event so no unique DOI can be exported.")
+    if len(source_evs) != 1:
+        return None
+    doi_prop = source_evs[0].get_property(get_entity_name("igsn_doi_prop"))
+    return doi_prop.value if doi_prop is not None else None
 
 
-def extract_water_depth_start(record, key):
-    return extract_position(record, "StartPosition", "Water depth")
+def _extract_event_prop(record, key, ref=False):
 
+    evt = retrieve_event(record)
+    if len(evt) == 0:
+        return None
+    elif len(evt) > 1:
+        logger.debug(f"Sample {record.id} references more than one event.")
+        return None
 
-def extract_water_depth_stop(record, key):
-    return extract_position(record, "StopPosition", "Water depth")
+    if ref:
+        return extract_reference_name(evt[0], key)
 
+    return extract_value_as_list(evt[0], key)
 
-def extract_source_event_name(record, key):
-    return [e.name for e in retrieve_source_event(record)]
 
+def extract_biome(record, key):
 
-def extract_hol(record, key):
-    source_ev = retrieve_source_event(record)
-    return [e.get_property(key).value for e in source_ev if
-            e.get_property(key) is not None]
+    return _extract_event_prop(record, get_entity_name("Biome"), ref=True)
 
 
-def extract_bis_url(record, key):
-    # base_uri = db.get_config().get("Connection", "url")
-    base_uri = get_config_setting("public_host_url")
-    return urllib.parse.urljoin(base_uri, f"Entity/{record.id}")
+def extract_campaign(record, key):
 
+    return _extract_event_prop(record, get_entity_name("Campaign"), ref=True)
 
-def extract_igsn(record, key):
-    source_evs = retrieve_source_event(record)
-    if len(source_evs) > 1:
-        logger.error(
-            f"Sample {record.id} references more than one SourceEvent so no unique IGSN can be exported.")
-        return None
-    elif len(source_evs) == 0:
-        return None
-    ev = source_evs[0]
-    return ev.get_property(key).value if ev.get_property(key) is not None else None
 
+def extract_device(record, key):
 
-def extract_doi(record, key):
-    source_evs = retrieve_source_event(record)
-    if len(source_evs) > 1:
-        logger.error(
-            f"Sample {record.id} references more than one SourceEvent so no unique DOI can be exported.")
-        return None
-    elif len(source_evs) == 0:
-        return None
-    ev = source_evs[0]
-    return ev.get_property("DOI").value if ev.get_property("DOI") is not None else None
+    return _extract_event_prop(record, get_entity_name("Device"), ref=True)
+
+
+def extract_end_date(record, key):
+    """Look up the "end_date_prop" property on the sample's event via _extract_event_prop."""
+    return _extract_event_prop(record, get_entity_name("end_date_prop"))
+
+
+def extract_level(record, key):
+    """Look up the "level" property on the sample's event via _extract_event_prop."""
+    return _extract_event_prop(record, get_entity_name("level"))
+
+
+def extract_sphere(record, key):
+    """Resolve the event's "Sphere" references via _extract_event_prop(..., ref=True)."""
+    return _extract_event_prop(record, get_entity_name("Sphere"), ref=True)
+
+
+def extract_locality_descr(record, key):
+    """Look up the "locality_description_prop" property on the sample's event."""
+    return _extract_event_prop(record, get_entity_name("locality_description_prop"))
+
+
+def extract_locality_name(record, key):
+    """Look up the "locality_name_prop" property on the sample's event."""
+    return _extract_event_prop(record, get_entity_name("locality_name_prop"))
 
 
 def extract_storage_chain(record, key):
 
-    if record.get_property("Container") is not None and record.get_property("Container").value:
+    if record.get_property(get_entity_name("container_rt")) is not None and record.get_property(get_entity_name("container_rt")).value:
 
-        cont_id = record.get_property("Container").value
+        cont_id = record.get_property(get_entity_name("container_rt")).value
         if isinstance(cont_id, list):
             if len(cont_id) > 1:
                 logger.debug(f"Sample {record.id} has multiple containers.")
@@ -327,7 +318,7 @@ def extract_storage_chain(record, key):
 
 def extract_event_url(record, key):
 
-    events = retrieve_source_event(record)
+    events = retrieve_event(record)
     if not events:
         return None
     if len(events) == 1:
@@ -337,72 +328,61 @@ def extract_event_url(record, key):
 
 
 # must include all keys from SPECIAL_TREATMENT
-EXTRACTORS = {
-    "BIS ID": lambda record, key: record.id,
-    "Parent BIS ID": extract_parent_sample,
-    "AphiaID": default_find,
-    "Collection": extract_reference_name,
+EXTRACTORS = use_custom_names({
+    "entity_id": lambda record, key: record.id,
     "Main User": extract_person,
-    "Sampling Person": extract_person,
-    "PI": extract_person,
-    "Person": extract_person,
-    "Gear": extract_gear,
-    "Gear configuration": extract_gear_configuration,
+    "Biome": extract_biome,
+    "Campaign": extract_campaign,
+    "Collector": extract_person,
+    "Curator": extract_person,
+    "Device": extract_device,
+    "Elevation start": extract_ele_start,
+    "Elevation stop": extract_ele_stop,
+    "Embargo": default_find,
+    "End date": extract_end_date,
     "Latitude start": extract_lat_start,
-    "Longitude start": extract_lng_start,
-    "Storage ID": extract_storage_id,
-    "Nagoya case number": extract_nagoya_case_number,
-    "PDFReport": extract_pdf_id,
-    "Subevent": extract_source_event_name,
-    "Station ID": extract_station_id,
-    "Station number": extract_station_number,
-    "Sampling depth start": extract_sampling_depth_start,
-    "Sampling depth stop": extract_sampling_depth_stop,
-    "Water depth start": extract_water_depth_start,
-    "Water depth stop": extract_water_depth_stop,
     "Latitude stop": extract_lat_stop,
+    "Level": extract_level,
+    "LinkAhead URL": extract_linkahead_url,
+    "Longitude start": extract_lng_start,
     "Longitude stop": extract_lng_stop,
-    "Storage chain": extract_storage_chain,
-    "Storage Container Label": extract_storage_container_label,
-    "Hol": extract_hol,
+    "PDFReport": extract_pdf_id,
     "Sampling method": default_find,
-    # "Publications": TODO never used
-    # "NCBI BioProject": TODO never used
-    # "NCBI BioSample": TODO never used
-    # "NCBI Accession": TODO never used
-    "BIS URL": extract_bis_url,
-    "IGSN": extract_igsn,
-    "IGSN URL": extract_doi,
-    "Sphere": default_find,
-    "URL SourceEvent": extract_event_url,
-}
-
-REVERSE_COLUMN_CONVERTER = {
-    "Collection": collection_value,
-    "PI": person_value,
-    "Person": person_value,
-}
+    "Sphere": extract_sphere,
+    "Start date": extract_end_date,  # NOTE(review): reuses the end-date extractor; no start-date extractor is defined in this patch - confirm
+    "Storage ID": extract_storage_id,
+    "Storage chain": extract_storage_chain,
+    "URL Event": extract_event_url,
+    "igsn_doi_prop": extract_doi,
+    "locality_description_prop": extract_locality_descr,
+    "locality_name_prop": extract_locality_name,
+    "parent_sample_prop": extract_parent_sample,
+    "responsible_person_event": extract_event_responsible
+})
+
+REVERSE_COLUMN_CONVERTER = use_custom_names({
+})
 
 # List of sample properties to be ignored because they are treated
 # otherwise. Similar, but not identical to SPECIAL TREATMENT.
-IGNORE_KEYS = [
-    "Parent Sample",
-    "Container",
-    "Event",
-]
+IGNORE_KEYS = use_custom_names([
+    "parent_sample_prop",
+    "container_rt",
+    "event_rt",
+])
 
 # Additional list of keys to be ignored when extracting parent sample information
-IGNORE_KEYS_PARENT = IGNORE_KEYS + [
-    "LinkAhead ID",
-]
+IGNORE_KEYS_PARENT = IGNORE_KEYS + use_custom_names([
+    "entity_id",
+])
 
 # List of columns to be exported although they are not known to or ignored by
 # the import.
-ADDITIONAL_EXPORTS = [
+ADDITIONAL_EXPORTS = use_custom_names([
     "LinkAhead URL",
     "Parent LinkAhead ID",
     "Storage chain",
-]
+])
 
 
 def extract_value(r, e):
diff --git a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/default_constants.yml b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/default_constants.yml
index c7f1baa..f752180 100644
--- a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/default_constants.yml
+++ b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/default_constants.yml
@@ -24,6 +24,7 @@ csv_column_names:
   locality_description_prop: "Locality description"
   locality_name_prop: "Locality name"
   responsible_person_event: "Event responsible"
+  parent_sample_prop: "Parent LinkAhead ID"
 
 csv_column_descriptions:
   LinkAhead ID: "An ID generated by LinkAhead (either integer or URL to this entity). Do not change this column!"
diff --git a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_registration_get_person_identifier.py b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_registration_get_person_identifier.py
index 329dc3d..396bf0e 100644
--- a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_registration_get_person_identifier.py
+++ b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_registration_get_person_identifier.py
@@ -26,6 +26,11 @@ def get_person_identifier(form_data: dict) -> str:
 
     """
     person_rec = db.cached.cached_get_entity_by(eid=form_data["responsible_person"])
+
+    return get_person_identifier_from_rec(person_rec)
+
+
+def get_person_identifier_from_rec(person_rec: db.Record) -> str:
     # Use abbreviation if present
     if (person_rec.get_property(get_entity_name("abbreviation_prop")) is not None and
             person_rec.get_property(get_entity_name("abbreviation_prop")).value):
diff --git a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py
index d6a62cb..3912595 100644
--- a/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py
+++ b/sample-management-custom/caosdb-server/scripting/bin/sample_helpers/sample_upload_column_definitions.py
@@ -43,7 +43,7 @@ def _embargo_converter(text: str):
         f"The embargo should be either a date in YYYY-MM-DD format, or 'true'/'yes' or 'false'/'no', but is {text}.")
 
 
-def _use_custom_names(definition: Union[list, dict]):
+def use_custom_names(definition: Union[list, dict]):
     """Replace names in list or dict keys by custom names with
     `utils.get_column_header_name`.
 
@@ -56,7 +56,7 @@ def _use_custom_names(definition: Union[list, dict]):
     raise ValueError(f"Expected dict or list, but got {type(definition)}.")
 
 
-DATATYPE_DEFINITIONS = _use_custom_names({
+DATATYPE_DEFINITIONS = use_custom_names({
     "Campaign": str,
     "Elevation start": float,
     "Elevation stop": float,
@@ -75,16 +75,16 @@ DATATYPE_DEFINITIONS = _use_custom_names({
 
 # Obligatory columns: Must exist and must not be empty
 # Must exist
-OBLIGATORY_COLUMNS = _use_custom_names([
+OBLIGATORY_COLUMNS = use_custom_names([
     "entity_id",
 ])
 
-OBLIGATORY_COLUMNS_CHILD = _use_custom_names([
+OBLIGATORY_COLUMNS_CHILD = use_custom_names([
     "entity_id",
     "Parent LinkAhead ID",
 ])
 
-COLUMN_CONVERTER = _use_custom_names({
+COLUMN_CONVERTER = use_custom_names({
     "Collector": semicolon_separated_list,
     "Curator": semicolon_separated_list,
     "Embargo": _embargo_converter,
@@ -92,7 +92,7 @@ COLUMN_CONVERTER = _use_custom_names({
     "Sphere": semicolon_separated_list,
 })
 
-SPECIAL_TREATMENT_SAMPLE = _use_custom_names([
+SPECIAL_TREATMENT_SAMPLE = use_custom_names([
     "Biome",
     "Campaign",
     "Collector",
@@ -102,7 +102,7 @@ SPECIAL_TREATMENT_SAMPLE = _use_custom_names([
     "Elevation stop",
     "Embargo",
     "End date",
-    "Event responsible",
+    "responsible_person_event",
     "igsn_doi_prop",
     "Latitude start",
     "Latitude stop",
@@ -120,7 +120,7 @@ SPECIAL_TREATMENT_SAMPLE = _use_custom_names([
     "entity_id",
 ])
 
-IGNORED_COLUMN_NAMES_SAMPLE = _use_custom_names([
+IGNORED_COLUMN_NAMES_SAMPLE = use_custom_names([
     "LinkAhead URL",
     "Parent Sample",
     "Storage chain",
-- 
GitLab