Skip to content
Snippets Groups Projects
Commit 419e518f authored by Florian Spreckelsen's avatar Florian Spreckelsen
Browse files

Merge branch 'dev' into f-hdf5-converter

parents 3d7ff171 735ca8bc
No related branches found
No related tags found
2 merge requests!160STY: styling,!143ENH: HDF5 Converter
Pipeline #47451 passed
......@@ -8,26 +8,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased] ##
### Added ###
* 'transform' sections can be added to a CFood to apply functions to values stored in variables.
* `transform` sections can be added to a CFood to apply functions to values stored in variables.
* default transform functions: submatch, split and replace.
* `*` can now be used as a wildcard in the identifiables parameter file to denote
that any Record may reference the identified one.
* `crawl.TreatedRecordLookUp` class replacing the old (and slow)
`identified_cache` module. The new class now handles all records identified by
id, path, or identifiable simultaneously. See API docs for more info on how to
add to and get from the new lookup class.
* `identifiable_adapters.IdentifiableAdapter.get_identifying_referencing_entities`
and
`identifiable_adapters.IdentifiableAdapter.get_identifying_referenced_entities`
static methods to return the referencing or referenced entities belonging to a
registered identifiable, respectively.
### Changed ###
- If the `parents` key is used in a cfood at a lower level for a Record that
* If the `parents` key is used in a cfood at a lower level for a Record that
already has a Parent (because it was explicitly given or the default Parent),
the old Parent(s) are now overwritten with the value belonging to the
`parents` key.
- If a registered identifiable states, that a reference by a Record with parent
* If a registered identifiable states that a reference by a Record with parent
RT1 is needed, then references from Records that have a child of RT1 as
parent are now accepted as well.
- More aggressive caching.
* More aggressive caching.
* The `identifiable_adapters.IdentifiableAdapter` now creates (possibly empty)
reference lists for all records in `create_reference_mapping`. This allows
functions like `get_identifiable` to be called only with the subset of the
referencing entities belonging to a specific Record.
* The `identifiable_adapters.IdentifiableAdapter` uses entity ids (negative for
entities that don't exist remotely) instead of entity objects for keeping
track of references.
### Deprecated ###
- `IdentifiableAdapter.get_file`
* `IdentifiableAdapter.get_file`
### Removed ###
* `identified_cache` module which was replaced by the `crawl.TreatedRecordLookUp` class.
### Fixed ###
* Empty Records can now be created (https://gitlab.com/caosdb/caosdb-crawler/-/issues/27)
......@@ -40,6 +61,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
handles cases correctly in which entities retrieved from the server have to be
merged with local entities that both reference another, already existing
entity
* A corner case in `split_into_inserts_and_updates` whereby two records created
in different places in the cfood definition would not be merged if both were
identified by the same LinkAhead id
### Security ###
......
This diff is collapsed.
......@@ -186,6 +186,49 @@ identifiabel, identifiable and identified record) for a Record.
"""
pass
@staticmethod
def get_identifying_referencing_entities(referencing_entities, registered_identifiable):
    """Collect the back-references required by a registered identifiable.

    Only properties of ``registered_identifiable`` that are named
    ``is_referenced_by`` (case-insensitive) are considered. Each entry of such
    a property's value is either the wildcard ``"*"`` or a RecordType name.

    Args:
        referencing_entities: mapping that is looked up by RecordType name and
            yields the list of entities referencing the record in question.
            ``None`` is treated like an empty mapping.
        registered_identifiable: object whose ``properties`` describe what
            identifies the record.

    Returns:
        A flat list with all matching entries of ``referencing_entities``.

    Raises:
        NotImplementedError: if, for one entry of an ``is_referenced_by``
            property, no referencing entity could be found.
    """
    # ``get_identifiable`` forwards its ``referencing_entities`` argument, which
    # defaults to None; treat that as "no back-references known" so that the
    # dedicated error below is raised instead of an AttributeError/TypeError.
    if referencing_entities is None:
        referencing_entities = {}

    refs = []
    for prop in registered_identifiable.properties:
        if prop.name.lower() != "is_referenced_by":
            continue
        for givenrt in prop.value:
            found = False
            if givenrt == "*":
                # Wildcard: any non-empty list of referencing entities counts.
                for val in referencing_entities.values():
                    if len(val) > 0:
                        found = True
                        refs.extend(val)
            else:
                # presumably the RecordType itself plus its children -- the
                # helper is defined elsewhere in this module; TODO confirm
                rt_and_children = get_children_of_rt(givenrt)
                for rtname in rt_and_children:
                    if rtname in referencing_entities:
                        refs.extend(referencing_entities[rtname])
                        found = True
            if not found:
                raise NotImplementedError(
                    "No referencing entity was found for an identifying property:\n"
                    f"\nIdentifying PROPERTY\n{prop.name}"
                )
    return refs
@staticmethod
def get_identifying_referenced_entities(record, registered_identifiable):
    """Collect the entities that ``record`` references via identifying properties.

    Every property of ``registered_identifiable`` except ``name`` and
    ``is_referenced_by`` (both compared case-insensitively) is looked up on
    ``record``; all of its values that are ``db.Entity`` instances are returned.

    Args:
        record: the record whose identifying property values are inspected.
        registered_identifiable: object whose ``properties`` name the
            identifying properties.

    Returns:
        A list of the ``db.Entity`` objects found in the identifying
        properties of ``record``.

    Raises:
        RuntimeError: if ``record`` lacks one of the identifying properties.
    """
    refs = []
    for prop in registered_identifiable.properties:
        if prop.name.lower() in ("name", "is_referenced_by"):
            continue
        record_prop = record.get_property(prop.name)
        if record_prop is None:
            raise RuntimeError("Missing identifying Property")
        # Normalise the value of the *record's* property to a list. The value
        # stored on the registered identifiable must not be used here, it says
        # nothing about what this particular record references.
        pval = record_prop.value
        if not isinstance(pval, list):
            pval = [pval]
        refs.extend(val for val in pval if isinstance(val, db.Entity))
    return refs
def get_identifiable(self, record: db.Record, referencing_entities=None):
"""
retrieve the registered identifiable and fill the property values to create an
......@@ -193,7 +236,7 @@ identifiabel, identifiable and identified record) for a Record.
Args:
record: the record for which the Identifiable shall be created.
referencing_entities: a dictionary (Type: dict[int, dict[str, list[db.Entity]]]), that
referencing_entities: a dictionary (Type: dict[str, list[db.Entity]]), that
allows to look up entities with a certain RecordType, that reference ``record``
Returns:
......@@ -212,6 +255,8 @@ identifiabel, identifiable and identified record) for a Record.
name_is_identifying_property = False
if registered_identifiable is not None:
identifiable_backrefs = self.get_identifying_referencing_entities(
referencing_entities, registered_identifiable)
# fill the values:
for prop in registered_identifiable.properties:
if prop.name == "name":
......@@ -222,31 +267,8 @@ identifiabel, identifiable and identified record) for a Record.
# case A: in the registered identifiable
# case B: in the identifiable
# TODO: similar to the Identifiable class, Registered Identifiable should be a
# separate class too
# treated above
if prop.name.lower() == "is_referenced_by":
for givenrt in prop.value:
found = False
if givenrt == "*":
if id(record) not in referencing_entities:
continue
for rt, rec in referencing_entities[id(record)].items():
identifiable_backrefs.extend(rec)
found = True
else:
rt_and_children = get_children_of_rt(givenrt)
for rtname in rt_and_children:
if (id(record) in referencing_entities
and (rtname in referencing_entities[id(record)])):
identifiable_backrefs.extend(
referencing_entities[id(record)][rtname])
found = True
if not found:
# TODO: is this the appropriate error?
raise NotImplementedError(
f"The following record is missing an identifying property:\n"
f"RECORD\n{record}\nIdentifying PROPERTY\n{prop.name}"
)
continue
record_prop = record.get_property(prop.name)
......
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
see class docstring
"""
from .identifiable import Identifiable
import caosdb as db
class IdentifiedCache:
    """
    This class is like a dictionary where the keys are Identifiables. When you check whether an
    Identifiable exists as key this class returns True not only if that exact Python object is
    used as a key, but if an Identifiable is used as key that is **equal** to the one being
    considered (see __eq__ function of Identifiable). Similarly, if you do `cache[identifiable]`
    you get the Record where the key is an Identifiable that is equal to the one in the square
    brackets.

    This class is used for Records where we checked the existence in a remote server using
    identifiables. If the Record was found, this means that we identified the corresponding Record
    in the remote server and the ID of the local object can be set.
    To prevent querying the server again and again for the same objects, this cache allows storing
    Records that were found on a remote server and those that were not (typically in separate
    caches).
    """

    def __init__(self):
        # Maps id(<stored identifiable object>) to the Record added with it.
        self._cache = {}
        # All identifiables added so far, in insertion order; membership tests
        # and lookups scan this list using equality.
        self._identifiables = []

    def __contains__(self, identifiable: Identifiable) -> bool:
        """Return True if an equal identifiable was added before."""
        return identifiable in self._identifiables

    def __getitem__(self, identifiable: Identifiable) -> db.Record:
        """Return the Record stored for the first identifiable equal to ``identifiable``.

        Raises:
            ValueError: if no equal identifiable was added (raised by ``list.index``).
        """
        index = self._identifiables.index(identifiable)
        # The record is keyed by the identity of the *stored* identifiable,
        # not of the (merely equal) one passed in.
        return self._cache[id(self._identifiables[index])]

    def add(self, record: db.Record, identifiable: Identifiable) -> None:
        """Store ``record`` under ``identifiable``."""
        self._cache[id(identifiable)] = record
        self._identifiables.append(identifiable)
......@@ -38,8 +38,9 @@ import caosdb as db
import caosdb.common.models as dbmodels
import pytest
import yaml
from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix,
crawler_main, split_restricted_path)
from caoscrawler.crawl import (Crawler, SecurityMode, TreatedRecordLookUp,
_treat_deprecated_prefix, crawler_main,
split_restricted_path)
from caoscrawler.debug_tree import DebugTree
from caoscrawler.identifiable import Identifiable
from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter,
......@@ -247,8 +248,8 @@ def test_split_into_inserts_and_updates_single(crawler_mocked_identifiable_retri
entlist = [db.Record(name="A").add_parent(
"C"), db.Record(name="B").add_parent("C")]
assert crawler.get_from_any_cache(identlist[0]) is None
assert crawler.get_from_any_cache(identlist[1]) is None
assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None
assert crawler.treated_records_lookup.get_any(entlist[0], identlist[0]) is None
assert not crawler._has_reference_value_without_id(identlist[0])
assert not crawler._has_reference_value_without_id(identlist[1])
assert crawler.identifiableAdapter.retrieve_identified_record_for_record(
......@@ -368,30 +369,34 @@ def test_has_missing_object_in_references():
# one reference with id -> check
assert not crawler._has_missing_object_in_references(
Identifiable(name="C", record_type="RTC", properties={'d': 123}), [])
Identifiable(name="C", record_type="RTC", properties={'d': 123}), {})
# one ref with Entity with id -> check
rec = db.Record(id=123).add_parent("C")
assert not crawler._has_missing_object_in_references(
Identifiable(name="C", record_type="RTC", properties={'d': db.Record(id=123)
.add_parent("C")}), [])
Identifiable(name="C", record_type="RTC", properties={'d': rec}), {id(rec): {'C': [None]}})
# one ref with id one with Entity with id (mixed) -> check
rec = db.Record(id=123).add_parent("RTC")
assert not crawler._has_missing_object_in_references(
Identifiable(name="C", record_type="RTD",
properties={'d': 123, 'b': db.Record(id=123).add_parent("RTC")}), [])
properties={'d': 123, 'b': rec}), {id(rec): {'C': [None]}})
# entity to be referenced in the following
a = db.Record(name="C").add_parent("C").add_property("d", 12311)
# one ref with id one with Entity without id (but not identifying) -> fail
assert not crawler._has_missing_object_in_references(
Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}), [])
Identifiable(name="C", record_type="RTC", properties={'d': 123, 'e': a}),
{id(a): {'C': [None]}})
# one ref with id one with Entity without id (mixed) -> fail
assert not crawler._has_missing_object_in_references(
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), [])
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}),
{id(a): {'C': [None]}})
crawler.add_to_remote_missing_cache(a, Identifiable(name="C", record_type="RTC",
crawler.treated_records_lookup.add(a, Identifiable(name="C", record_type="RTC",
properties={'d': 12311}))
# one ref with id one with Entity without id but in cache -> check
assert crawler._has_missing_object_in_references(
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}), [])
Identifiable(name="D", record_type="RTD", properties={'d': 123, 'e': a}),
{id(a): {'C': [None]}})
# if this ever fails, the mock up may be removed
crawler.identifiableAdapter.get_registered_identifiable.assert_called()
......@@ -580,12 +585,13 @@ def test_security_mode(updateCacheMock, upmock, insmock):
def test_create_reference_mapping():
    """Check the mapping returned by ``Crawler.create_reference_mapping``.

    ``b`` (parent "B", id 132) references ``a`` (parent "A"). The assertions
    below expect an entry for every record, keyed by ``id(record)``; the entry
    for ``a`` lists the id of ``b`` under "B", the entry for ``b`` is empty.
    """
    a = db.Record().add_parent("A")
    b = db.Record(id=132).add_parent("B").add_property('a', a)
    ref = Crawler.create_reference_mapping([a, b])
    assert id(a) in ref
    # b is not referenced by anything but still gets an (empty) entry
    assert id(b) in ref
    assert "B" in ref[id(a)]
    assert {} == ref[id(b)]
    # references are tracked by entity id, not by entity object
    assert ref[id(a)]["B"] == [132]
def test_create_flat_list():
......@@ -667,8 +673,8 @@ def test_split_into_inserts_and_updates_backref(crawler_mocked_for_backref_test)
crawler.split_into_inserts_and_updates([db.Record(name="B").add_parent("C")])
# identifiables were not yet checked
assert crawler.get_from_any_cache(identlist[0]) is None
assert crawler.get_from_any_cache(identlist[1]) is None
assert crawler.treated_records_lookup.get_any(entlist[1], identlist[0]) is None
assert crawler.treated_records_lookup.get_any(entlist[0], identlist[1]) is None
# one with reference, one without
assert not crawler._has_reference_value_without_id(identlist[0])
assert crawler._has_reference_value_without_id(identlist[1])
......@@ -701,7 +707,9 @@ def test_split_into_inserts_and_updates_mult_backref(crawler_mocked_for_backref_
# test whether both entities are listed in the backref attribute of the identifiable
referencing_entities = crawler.create_reference_mapping(entlist)
identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities)
identifiable = crawler.identifiableAdapter.get_identifiable(
referenced,
referencing_entities[id(referenced)])
assert len(identifiable.backrefs) == 2
# check the split...
......@@ -723,7 +731,10 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
# test whether both entities are listed in the backref attribute of the identifiable
referencing_entities = crawler.create_reference_mapping(entlist)
identifiable = crawler.identifiableAdapter.get_identifiable(referenced, referencing_entities)
identifiable = crawler.identifiableAdapter.get_identifiable(
referenced,
referencing_entities[id(referenced)])
assert len(identifiable.backrefs) == 2
# check the split...
......@@ -964,3 +975,55 @@ def test_replace_name_with_referenced_entity():
assert isinstance(prop.value[2], int)
assert prop.value[2] == test_id
assert caoscrawler.crawl.cached_get_entity_by.call_count == 3
def test_treated_record_lookup():
    """Exercise adding to and reading from a ``TreatedRecordLookUp``.

    Covers a Record with an ID, a Record that needs an identifiable, Files
    with and without ID, the list getters and ``get_any``.
    """
    lookup = TreatedRecordLookUp()

    # --- Record that carries an ID ---
    rec_with_id = db.Record(id=1)
    lookup.add(rec_with_id)
    assert len(lookup._existing) == 1
    # stored under the identity of the object ...
    assert lookup._existing[id(rec_with_id)] is rec_with_id
    # ... and under its ID
    assert lookup._id_look_up[rec_with_id.id] is rec_with_id
    # a different object with the same ID resolves to the stored one
    assert lookup.get_existing(db.Record(id=1)) is rec_with_id

    # --- Record without ID ---
    rec_without_id = db.Record()
    # adding it without an identifiable is refused
    with raises(RuntimeError):
        lookup.add(rec_without_id)
    identifiable = Identifiable(name='a')
    lookup.add(rec_without_id, identifiable)
    # stored under the identity of the object ...
    assert lookup._missing[id(rec_without_id)] is rec_without_id
    # ... and under the representation of the identifiable
    assert lookup._identifiable_look_up[identifiable.get_representation()] is rec_without_id
    # an equal identifiable resolves to the stored record
    assert lookup.get_missing(db.Record(), Identifiable(name='a')) is rec_without_id

    # --- File with path and ID ---
    file_with_id = db.File(path='a', id=2)
    lookup.add(file_with_id)
    assert len(lookup._existing) == 2
    assert lookup._existing[id(file_with_id)] is file_with_id
    assert lookup._id_look_up[file_with_id.id] is file_with_id
    # files are additionally registered by their path
    assert lookup._path_look_up[file_with_id.path] is file_with_id
    assert lookup.get_existing(file_with_id) is file_with_id

    # --- list getters ---
    existing_entries = lookup.get_existing_list()
    assert file_with_id in existing_entries
    assert rec_with_id in existing_entries
    missing_entries = lookup.get_missing_list()
    assert rec_without_id in missing_entries

    # If a Record was added using the ID, the ID must be used to identify it even though later an
    # identifiable may be passed as well
    assert lookup.get_any(rec_with_id, Identifiable(name='b')) is rec_with_id

    # --- File with path only ---
    file_path_only = db.File(path='b')
    lookup.add(file_path_only)
    assert lookup.get_any(db.File(path='b'), Identifiable(name='c')) is file_path_only
......@@ -24,10 +24,9 @@
test identifiable module
"""
import pytest
import caosdb as db
import pytest
from caoscrawler.identifiable import Identifiable
from caoscrawler.identified_cache import IdentifiedCache
def test_create_hashable_string():
......
......@@ -143,10 +143,9 @@ def test_wildcard_ref():
.add_property(name="last_name", value='Tom'))
identifiable = ident.get_identifiable(rec,
referencing_entities={
id(rec):
{'A': [db.Record(id=1).add_parent("A")]}}
'A': [1]}
)
assert identifiable.backrefs[0].id == 1
assert identifiable.backrefs[0] == 1
def test_convert_value():
......
#!/usr/bin/env python3
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2021 Indiscale GmbH <info@indiscale.com>
# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
"""
test identified_cache module
"""
import caosdb as db
from caoscrawler.identifiable import Identifiable
from caoscrawler.identified_cache import IdentifiedCache
def test_IdentifiedCache():
    """Check membership and item access of ``IdentifiedCache``."""
    key = Identifiable(name="A", record_type="B")
    stored = db.Record("A").add_parent("B").add_property('b', 5)
    cache = IdentifiedCache()

    # nothing was added yet
    assert key not in cache

    cache.add(record=stored, identifiable=key)

    # the key is known now and yields exactly the object that was added
    assert key in cache
    assert cache[key] is stored

    # identifiables that differ in their record type are unequal, so the
    # other one must not be found in the cache
    assert (Identifiable(name="A", record_type="C")
            != Identifiable(name="A", record_type="B"))
    assert Identifiable(name="A", record_type="C") not in cache
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment