update_containers.py

    #!/usr/bin/env python3
    # encoding: utf-8
    #
    # This file is a part of the LinkAhead Project.
    #
    # Copyright (C) 2022 - 2024 GEOMAR
    # Copyright (C) 2022 Jakob Eckstein
    # Copyright (C) 2023 - 2024 Indiscale GmbH <info@indiscale.com>
    # Copyright (C) 2023 - 2024 Florian Spreckelsen <f.spreckelsen@indiscale.com>
    #
    # This program is free software: you can redistribute it and/or modify
    # it under the terms of the GNU Affero General Public License as
    # published by the Free Software Foundation, either version 3 of the
    # License, or (at your option) any later version.
    #
    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU Affero General Public License for more details.
    #
    # You should have received a copy of the GNU Affero General Public License
    # along with this program. If not, see <https://www.gnu.org/licenses/>.
    #
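    """Update Container records from an uploaded metadata spreadsheet.

    This server-side script reads the submitted form data (form.json), loads
    the referenced container metadata CSV file, and updates the corresponding
    Container records: parent/child relations, custom labels, PI, collections,
    container contents, and attached PDF reports.  All changes are
    synchronized through the LinkAhead crawler.
    """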
    
    import json
    import logging
    import os
    
    import linkahead as db
    import pandas as pd
    from caosadvancedtools.serverside import helper
    from caoscrawler import Crawler, SecurityMode
    from caoscrawler.crawl import _notify_about_inserts_and_updates
    from caoscrawler.logging import configure_server_side_logging
    from linkahead.cached import cached_query, cached_get_entity_by
    
    from bis_utils import (get_do_not_insert_type_names,
                           replace_entity_urls_by_ids, whitespace_cleanup_in_df)
    
    # Suppress warnings from the diff function in linkahead.apiutils
    apilogger = logging.getLogger("linkahead.apiutils")
    apilogger.setLevel(logging.ERROR)
    
    ERROR_PREFIX = 'Something went wrong: '
    ERROR_SUFFIX = ' Please contact <a href="mailto:biosamples@geomar.de">biosamples@geomar.de</a> if you encounter this issue.'
    logger = logging.getLogger("caosadvancedtools")
    
    
    def _value_in_row(key, row):
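        """Return True if ``key`` is present in ``row`` with a non-empty, non-null value."""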
    
        if key not in row:
            return False
        if pd.isnull(row[key]) or row[key] is None or f"{row[key]}" == "":
            return False
        return True
    
    
    def _get_parent_by_identifier(parent_identifier):
        """Get parent specified either by BIS ID, name, or BIS label."""
        try:
            parent_identifier = int(parent_identifier)
            query = f"FIND Container WITH ID={parent_identifier}"
        except ValueError:
            query = (f"FIND Container WITH name='{parent_identifier}' "
                     f"OR WITH 'BIS label'='{parent_identifier}'")
        return cached_query(query)
    
    
    def get_parser():
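        """Return the argument parser provided by the server-side scripting helper."""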
        par = helper.get_argument_parser()
        return par
    
    
    def main():
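        """Read the uploaded container metadata and update the Container records.

        Returns a non-zero value and logs an error if the input data are
        invalid or incomplete; otherwise the collected changes are
        synchronized via the LinkAhead crawler.
        """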
        userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
        logger = logging.getLogger("caosadvancedtools")
        parser = get_parser()
        args = parser.parse_args()
        # Check whether executed locally or as an SSS depending on
        # auth_token argument.
        if hasattr(args, "auth_token") and args.auth_token:
            db.configure_connection(auth_token=args.auth_token)
    
        if hasattr(args, "filename") and args.filename:
            upload_dir = os.path.dirname(args.filename)
            # Read the input from the form (form.json)
            with open(args.filename) as form_json:
                form_data = json.load(form_json)
            # Read the content of the uploaded file
            path = os.path.join(upload_dir, form_data["container_metadata_file"])
            data = whitespace_cleanup_in_df(pd.read_csv(path, comment='#'))
        else:
            raise ValueError("This script was called without the mandatory form data JSON file.")
        data = replace_entity_urls_by_ids(data)
    
        # Get referenced container entities
        child_containers = db.Container()
        parent_containers = db.Container()
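        # Validate the BIS ID of every row and collect the (unique) parent
        # containers that are referenced in the uploaded table.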
        for index, row in data.iterrows():
            if not _value_in_row("BIS ID", row):
                logger.error(f"BIS ID is missing in row {index+1}. Nothing was updated.")
                return 1
            try:
                child = db.Record(id=int(row["BIS ID"]))
            except ValueError:
                logger.error(
                    f"Invalid BIS ID {row['BIS ID']} in row {index + 1}. Nothing was updated.")
                return 1
            child.add_parent("Container")
            child_containers.append(child)
    
            if _value_in_row("Parent container", row):
                parent_identifier = row["Parent container"]
                parent = _get_parent_by_identifier(parent_identifier)
                if len(parent) == 0:
                    logger.error(
                        f"Couldn't find parent with identifier '{parent_identifier}' in row {index+1}.")
                    return 1
                elif len(parent) > 1:
                    logger.error(f"Parent with identifier '{parent_identifier}' in row {index+1} was not unique. "
                                 "Please specify with BIS ID instead.")
                    return 1
                parent = parent[0]
                try:
                    parent_containers.get_entity_by_id(parent.id)
                except KeyError:
                    parent_containers.append(parent)
    
        if not child_containers:
            # Nothing to update
            logger.error("There are no containers to be updated")
            return 1
    
        # Get IDs of properties
        child_container_prop = cached_get_entity_by(query="FIND Property WITH name = 'Child container'")
        custom_label_prop = cached_get_entity_by(query="FIND Property WITH name = 'Custom label'")
        pdf_rt = cached_get_entity_by(query="FIND RECORDTYPE WITH name=PDFReport")
    
        # Update (/create) container entities
        for index, row in data.iterrows():
            # Add child to parent
            parent = None
            if _value_in_row("Parent container", row):
                parent_identifier = row["Parent container"]
                # This has already been checked above for uniqueness
                candidate = _get_parent_by_identifier(parent_identifier)[0]
                # A bit redundant, but we need the exact Python object here that is in the parent_containers list.
                parent = parent_containers.get_entity_by_id(candidate.id)
    
                if parent.get_property(child_container_prop.id) is None:
                    parent.add_property(id=child_container_prop.id,
                                        name=child_container_prop.name, value=[int(row["BIS ID"])])
                else:
                    if parent.get_property(child_container_prop.id).value is None:
                        parent.get_property(child_container_prop.id).value = [int(row["BIS ID"])]
                    else:
                        if int(row["BIS ID"]) not in parent.get_property(child_container_prop.id).value:
                            parent.get_property(child_container_prop.id).value.append(
                                int(row["BIS ID"]))
    
                # remove the current child from all other parents (don't do anything if the parent didn't change)
                old_parents = cached_query(f"FIND Container WHICH REFERENCES {int(row['BIS ID'])}")
                for old_parent in old_parents:
                    if parent is not None and old_parent.id == parent.id:
                        # old parent also is new parent
                        continue
                    try:
                        # Has already been registered for updates
                        old_parent = parent_containers.get_entity_by_id(old_parent.id)
                    except KeyError:
                        parent_containers.append(old_parent)
                    old_parent.remove_value_from_property("Child container", int(
                        row["BIS ID"]), remove_if_empty_afterwards=False)
                    if old_parent.get_property("Child container").value is None:
                        old_parent.get_property("Child container").value = []
    
            # Add custom label to child
            child = child_containers.get_entity_by_id(id=int(row["BIS ID"]))
            if _value_in_row("Custom label", row):
                child.name = row["Custom label"]
                if child.get_property(custom_label_prop.id) is None:
                    child.add_property(id=custom_label_prop.id,
                                       name=custom_label_prop.name, value=row["Custom label"])
                else:
                    child.get_property(custom_label_prop.id).value = row["Custom label"]
    
            # Treat PI
            if _value_in_row("PI", row):
                pi = row["PI"]
                pi_prop = cached_get_entity_by(query="FIND PROPERTY Pi")
                try:
                    query = f"FIND RECORD Person WITH ID={int(pi)}"
                except ValueError:
                    query = f"FIND RECORD Person WITH AN Abbreviation='{pi}'"
                try:
                    pi_rec = cached_get_entity_by(query=query)
                    if child.get_property(pi_prop.name) is not None:
                        child.get_property(pi_prop.name).value = pi_rec.id
                    else:
                        child.add_property(id=pi_prop.id, name=pi_prop.name, value=pi_rec.id)
                except db.EmptyUniqueQueryError:
                    logger.warning(f"There is no PI with BIS ID or abbreviation {pi}. Skipping.")
    
            # Collection(s)
            if _value_in_row("Collection", row):
                collection_rt = cached_get_entity_by(query="FIND RECORDTYPE Collection")
                if ";" not in str(row["Collection"]):
                    collections = [row["Collection"]]
                else:
                    collections = [coll.strip() for coll in str(row["Collection"]).split(';')]
                prop_val = []
                for coll in collections:
                    try:
                        query = f"FIND RECORD Collection WITH ID={int(coll)}"
                    except ValueError:
                        query = f"FIND RECORD Collection WITH name='{coll}'"
                    try:
                        coll_rec = cached_get_entity_by(query=query)
                        prop_val.append(coll_rec.id)
                    except db.EmptyUniqueQueryError:
                        logger.warning(f"There is no collection with name or BIS ID {coll}. Skipping.")
                        continue
                if prop_val:
                    if child.get_property("Collection") is not None:
                        child.get_property("Collection").datatype = db.LIST("Collection")
                        child.get_property("Collection").value = prop_val
                    else:
                        child.add_property(id=collection_rt.id, name=collection_rt.name, datatype=db.LIST(
                            "Collection"), value=prop_val)
    
            # Treat Container Contents
            if _value_in_row("Container Contents", row):
                if not (_value_in_row("PI", row) and _value_in_row("Collection", row)):
                    logger.error(
                        f"Container Contents are given for container {child.id} but it "
                        "is missing PI and/or Collection info. No updates have been performed."
                    )
                    return 1
                contents_prop = cached_get_entity_by(query="FIND PROPERTY 'Container Contents'")
                if child.get_property(contents_prop.name) is not None:
                    child.get_property(contents_prop.name).value = row["Container Contents"]
                else:
                    child.add_property(id=contents_prop.id, name=contents_prop.name,
                                       value=row["Container Contents"])
    
            # Treat PDF Report
            if _value_in_row("PDFReport", row):
                pdf_id = row["PDFReport"]
                try:
                    pdf_id = int(pdf_id)
                    pdf_rec = cached_query(f"FIND FILE PDFReport WITH ID={pdf_id}")
                    if not pdf_rec:
                        logger.warning(
                            f"There is no PDFReport with BIS ID {pdf_id}, so no PDF is attached to container {child.id}.")
                    else:
                        if child.get_property("PDFReport") is not None:
                            child.get_property("PDFReport").value = pdf_id
                        else:
                            child.add_property(id=pdf_rt.id, name=pdf_rt.name, value=pdf_id)
                except ValueError:
                    logger.warning(
                        f"There is no valid BIS ID provided for container {child.id}. "
                        f"Provided was {pdf_id}. Skipping.")
    
        # This is a workaround for weird merging errors in the
        # crawler. TODO(fspreck): Remove after merge of sync_node and sync_graph and
        # following release.
        merged = []
        for par in parent_containers:
            if (data['BIS ID'] == par.id).any():
                # A container to be updated is itself used as another container's parent:
                child = child_containers.get_entity_by_id(par.id)
                # All parents have a 'Child container' property with a value
                # (which may be empty). No child container has this property
                # yet, so the following is safe without further checks:
                prop = par.get_property("Child container")
                child.add_property(name=prop.name, id=prop.id, value=prop.value)
                merged.append(par)
        for par in merged:
            # All relevant information, i.e., the new children, has been merged
            # into the corresponding child entity, so drop this parent.
            parent_containers.remove(par)
        # TODO Add notes as CommentAnnotation
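        # Hand all child and parent containers to the crawler, which performs
        # the actual inserts and updates on the server.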
        crawler = Crawler(securityMode=SecurityMode.UPDATE)
        to_be_synchronized = child_containers + parent_containers
    
        inserts, updates = crawler.synchronize(
            commit_changes=True, unique_names=False, crawled_data=to_be_synchronized,
            no_insert_RTs=get_do_not_insert_type_names()
        )
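        # SHARED_DIR is set when this runs as a server-side script; only then
        # notify the user about the performed inserts and updates.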
        if "SHARED_DIR" in os.environ:
            _notify_about_inserts_and_updates(len(inserts), len(updates), htmluserlog_public,
                                              crawler.run_id)
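        # Allow the 'Stock Manager' role to edit the ACLs of all entities that
        # were just inserted or updated.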
        for ent in inserts + updates:
            ent.retrieve_acl()
            ent.grant(role='Stock Manager', priority=False, permission="EDIT:ACL")
            ent.update_acl()
        logger.info(f"Successfully processed {len(child_containers)} containers and "
                    f"{len(parent_containers)} parent containers.")
    
        # TODO Create new Spreadsheet for download
    
    
    if __name__ == "__main__":
        main()