From 65a3a403d838c54aaebe2b3c73550eaf114a043a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org> Date: Fri, 17 Dec 2021 17:04:07 +0100 Subject: [PATCH] ENH: add cache --- src/newcrawler/identified_cache.py | 72 ++++++++++++++++++++++++++++++ unittests/test_identified_cache.py | 65 +++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 src/newcrawler/identified_cache.py create mode 100644 unittests/test_identified_cache.py diff --git a/src/newcrawler/identified_cache.py b/src/newcrawler/identified_cache.py new file mode 100644 index 00000000..e02e19d8 --- /dev/null +++ b/src/newcrawler/identified_cache.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+#
+# ** end header
+#
+
+"""
+stores identified records and is able to detect duplicates
+"""
+
+import caosdb as db
+
+from hashlib import sha256
+
+
+def _value_to_string(value) -> str:
+    """
+    converts the value of a property to the string used in the hashable string
+
+    An entity is represented by its id. In lists, entities are replaced by
+    their ids and all other members are kept as they are. Any other value is
+    passed through ``str``.
+    """
+    if isinstance(value, db.Entity):
+        return str(value.id)
+    if isinstance(value, list):
+        return str([val.id if isinstance(val, db.Entity) else val for val in value])
+    return str(value)
+
+
+def _create_hashable_string(identifiable: db.Record) -> str:
+    """
+    creates a string from the attributes of an identifiable that can be hashed
+
+    The string starts with ``P<parent>N<name>`` followed by one ``name:value``
+    entry per property. Only the first parent is used; ``parents[0]`` is
+    accessed, so an identifiable without a parent is not supported.
+
+    The entries are sorted by property name (and by value string for equal
+    names), such that the order in which properties were added is irrelevant.
+    """
+    header = "P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name)
+    # Every property contributes its own value. Looking the value up by name
+    # via get_property yields a single property per name, which would drop the
+    # values of further properties that share that name.
+    entries = sorted(
+        (prop.name, _value_to_string(prop.value)) for prop in identifiable.properties
+    )
+    return header + "".join("{}:".format(pname) + value for pname, value in entries)
+
+
+def _create_hash(identifiable: db.Record) -> str:
+    """
+    returns the sha256 hex digest of the hashable string of the identifiable
+    """
+    return sha256(_create_hashable_string(identifiable).encode('utf-8')).hexdigest()
+
+
+class IdentifiedCache(object):
+    """
+    stores records under the hash of their identifiable
+
+    Identifiables that result in the same hashable string refer to the same
+    entry of the cache.
+    """
+
+    def __init__(self):
+        # maps the hash of an identifiable to the record stored for it
+        self._cache = {}
+
+    def __contains__(self, identifiable: db.Record) -> bool:
+        """
+        returns True if a record was added for the given identifiable
+        """
+        return _create_hash(identifiable) in self._cache
+
+    def __getitem__(self, identifiable: db.Record) -> db.Record:
+        """
+        returns the record that was added for the given identifiable
+
+        A KeyError is raised if there is no such record.
+        """
+        return self._cache[_create_hash(identifiable)]
+
+    def add(self, record: db.Record, identifiable: db.Record) -> None:
+        """
+        stores the record under the hash of the identifiable
+
+        A record that was stored for the same identifiable before is replaced.
+        """
+        self._cache[_create_hash(identifiable)] = record
diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py
new file mode 100644
index 00000000..9a103463
--- /dev/null
+++ b/unittests/test_identified_cache.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+test identified_cache module
+"""
+
+from newcrawler.identified_cache import _create_hashable_string, IdentifiedCache
+import caosdb as db
+
+
+def test_create_hash():
+    """
+    checks the strings that are created for identifiables with various properties
+    """
+    def base():
+        return db.Record("A").add_parent("B")
+
+    # name and parent only
+    assert _create_hashable_string(base()) == "P<B>N<A>"
+
+    # a single scalar property
+    assert _create_hashable_string(base().add_property('a', 5)) == "P<B>N<A>a:5"
+
+    # the order in which the properties were added does not matter
+    forward = base().add_property('a', 4).add_property('b', 5)
+    backward = base().add_property('b', 5).add_property('a', 4)
+    assert _create_hashable_string(forward) == _create_hashable_string(backward)
+
+    # a referenced record is represented by its id
+    reference = base().add_property('a', db.Record(id=12))
+    assert _create_hashable_string(reference) == "P<B>N<A>a:12"
+
+    # lists of references, of plain values and of both
+    list_cases = [
+        ([db.Record(id=12)], "P<B>N<A>a:[12]"),
+        ([12], "P<B>N<A>a:[12]"),
+        ([db.Record(id=12), 11], "P<B>N<A>a:[12, 11]"),
+    ]
+    for value, expected in list_cases:
+        assert _create_hashable_string(base().add_property('a', value)) == expected
+
+
+def test_IdentifiedCache():
+    """
+    checks that a record can be stored and retrieved via its identifiable
+    """
+    cache = IdentifiedCache()
+    ident = db.Record("A").add_parent("B")
+    record = db.Record("A").add_parent("B").add_property('b', 5)
+
+    # a fresh cache does not know the identifiable
+    assert ident not in cache
+
+    cache.add(identifiable=ident, record=record)
+
+    # the identifiable is the key; the stored record itself is not
+    assert ident in cache
+    assert record not in cache
+    assert cache[ident] is record
-- 
GitLab