From 65a3a403d838c54aaebe2b3c73550eaf114a043a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <henrik@trineo.org>
Date: Fri, 17 Dec 2021 17:04:07 +0100
Subject: [PATCH] ENH: add cache

---
 src/newcrawler/identified_cache.py | 72 ++++++++++++++++++++++++++++++
 unittests/test_identified_cache.py | 65 +++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 src/newcrawler/identified_cache.py
 create mode 100644 unittests/test_identified_cache.py

diff --git a/src/newcrawler/identified_cache.py b/src/newcrawler/identified_cache.py
new file mode 100644
index 00000000..e02e19d8
--- /dev/null
+++ b/src/newcrawler/identified_cache.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+Provides a cache that stores identified records and is able to detect duplicates.
+"""
+
+import caosdb as db
+
+from hashlib import sha256
+
+
+def _create_hashable_string(identifiable: db.Record) -> str:
+    """Build the string representation of an identifiable that is fed to the hash.
+
+    The string starts with "P<parent>N<name>", using the first parent only
+    (``parents[0]`` is accessed unconditionally), followed by one
+    "<property name>:<value>" chunk per property, ordered by property name.
+    Entity values, also as list elements, are represented by their ``id``.
+    """
+    chunks = ["P<{}>N<{}>".format(identifiable.parents[0].name, identifiable.name)]
+    for pname in sorted(p.name for p in identifiable.properties):
+        # Look the property up once instead of once per type check.
+        value = identifiable.get_property(pname).value
+        if isinstance(value, db.Entity):
+            text = str(value.id)
+        elif isinstance(value, list):
+            text = str([el.id if isinstance(el, db.Entity) else el for el in value])
+        else:
+            text = str(value)
+        chunks.append("{}:".format(pname) + text)
+    return "".join(chunks)
+
+
+def _create_hash(identifiable: db.Record) -> str:  # hex SHA-256 of the hashable string
+    return sha256(bytes(_create_hashable_string(identifiable), 'utf-8')).hexdigest()
+
+
+class IdentifiedCache(object):  # maps the hash of an identifiable to the record added for it
+    def __init__(self) -> None:
+        self._cache = {}  # hex digest of the identifiable -> record
+
+    def __contains__(self, identifiable: db.Record) -> bool:  # was a record added for it?
+        return _create_hash(identifiable) in self._cache
+
+    def __getitem__(self, identifiable: db.Record) -> db.Record:  # KeyError if never added
+        return self._cache[_create_hash(identifiable)]
+
+    def add(self, record: db.Record, identifiable: db.Record) -> None:  # overwrites on equal hash
+        self._cache[_create_hash(identifiable)] = record
diff --git a/unittests/test_identified_cache.py b/unittests/test_identified_cache.py
new file mode 100644
index 00000000..9a103463
--- /dev/null
+++ b/unittests/test_identified_cache.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# ** header v3.0
+# This file is a part of the CaosDB Project.
+#
+# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# ** end header
+#
+
+"""
+test identified_cache module
+"""
+
+from newcrawler.identified_cache import _create_hashable_string, IdentifiedCache
+import caosdb as db
+
+
+def test_create_hash():
+    """Check the exact strings produced by _create_hashable_string."""
+    def base():
+        # fresh record named "A" with parent "B" for every case
+        return db.Record("A").add_parent("B")
+
+    assert _create_hashable_string(base()) == "P<B>N<A>"
+    assert _create_hashable_string(base().add_property('a', 5)) == "P<B>N<A>a:5"
+    # the order in which the properties were added must not matter
+    a_then_b = base().add_property('a', 4).add_property('b', 5)
+    b_then_a = base().add_property('b', 5).add_property('a', 4)
+    assert _create_hashable_string(a_then_b) == _create_hashable_string(b_then_a)
+    # referenced entities are represented by their id, also inside lists
+    single_ref = base().add_property('a', db.Record(id=12))
+    assert _create_hashable_string(single_ref) == "P<B>N<A>a:12"
+    list_ref = base().add_property('a', [db.Record(id=12)])
+    assert _create_hashable_string(list_ref) == "P<B>N<A>a:[12]"
+    plain_list = base().add_property('a', [12])
+    assert _create_hashable_string(plain_list) == "P<B>N<A>a:[12]"
+    mixed_list = base().add_property('a', [db.Record(id=12), 11])
+    assert _create_hashable_string(mixed_list) == "P<B>N<A>a:[12, 11]"
+
+
+def test_IdentifiedCache():
+    """A stored record is found via its identifiable, not via the record itself."""
+    identifiable = db.Record("A").add_parent("B")
+    stored = db.Record("A").add_parent("B").add_property('b', 5)
+    cache = IdentifiedCache()
+    assert identifiable not in cache  # nothing has been added yet
+    cache.add(identifiable=identifiable, record=stored)
+    assert identifiable in cache and stored not in cache  # keyed by the identifiable only
+    assert cache[identifiable] is stored
-- 
GitLab