Commit df3b43e6 authored by Henrik tom Wörden

wip

parent 9b75417a
2 merge requests: !178 FIX: #96 Better error output for crawl.py script., !167 Sync Graph
Pipeline #50005 failed
@@ -139,6 +139,7 @@ def test_single_insertion(clear_database, usemodel, crawler, ident):
     # xml.remove(xml.find(tag))
     # f.write(db.common.utils.xml2str(xml))
+    breakpoint()
     assert len(ins) == 18
     assert len(ups) == 0
...
@@ -387,10 +387,13 @@ class Crawler(object):
             # 3. check on the remote server
             else:
                 st.check_remote_server(se)
+                print("checked", se.id)
                 if se.id is None:
                     st.set_missing(se)
+                    print("missing")
                 else:
                     st.set_existing(se)
+                    print("existing")
                 entity_was_treated = True

         # TODO
@@ -737,6 +740,8 @@ class Crawler(object):
         for record in to_be_updated:
             if record.id is not None:
                 # TODO: use cache here?
+                print(record.id)
+                print(record)
                 identified_records.append(cached_get_entity_by(eid=record.id))
             else:
                 raise Exception("Please report a bug: At this stage all records to be updated"
...
@@ -96,7 +96,8 @@ class Identifiable():
             if value.id is not None:
                 return str(value.id)
             else:
-                return "PyID=" + str(id(value))
+                print(value)
+                raise RuntimeError("Python Entity without id not allowed")
         elif isinstance(value, list):
             return "[" + ", ".join([Identifiable._value_representation(el) for el in value]) + "]"
         elif (isinstance(value, str) or isinstance(value, int) or isinstance(value, float)
@@ -105,7 +106,7 @@ class Identifiable():
         else:
             raise ValueError(f"Unknown datatype of the value: {value}")

-    @staticmethod
+    @ staticmethod
     def _create_hashable_string(identifiable: Identifiable) -> str:
         """
         creates a string from the attributes of an identifiable that can be hashed
...
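The change above tightens Identifiable._value_representation: an entity without a database id is no longer represented by its Python object id ("PyID=..."), it is rejected instead. A minimal standalone sketch of the old and new branch, using a stand-in entity class rather than the real linkahead db.Entity:

    # Stand-in for db.Entity; only the id attribute matters for this sketch.
    class FakeEntity:
        def __init__(self, id=None):
            self.id = id

    def value_representation_old(value):
        # Old behaviour: fall back to the Python object id if no database id is set.
        if value.id is not None:
            return str(value.id)
        return "PyID=" + str(id(value))

    def value_representation_new(value):
        # New behaviour: an entity without a database id cannot be part of an identifiable.
        if value.id is not None:
            return str(value.id)
        raise RuntimeError("Python Entity without id not allowed")

    assert value_representation_new(FakeEntity(id=12)) == "12"
    assert value_representation_old(FakeEntity()).startswith("PyID=")
    # value_representation_new(FakeEntity())  # now raises RuntimeError

This is presumably also why the assertion about two id-less records hashing differently is dropped from test_create_hashable_string further down.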
@@ -284,18 +284,24 @@ startswith: bool, optional
                 )
                 continue
-            options = [f.get_property(prop.name) for f in se.fragments
+            options = [f.get_property(prop.name).value for f in se.fragments
                        if f.get_property(prop.name) is not None]
             if len(options) == 0:
                 raise NotImplementedError(
                     f"The following record is missing an identifying property:\n"
                     f"RECORD\n{se.fragments[0]}\nIdentifying PROPERTY\n{prop.name}"
                 )
-            if not all([f.value == options[0].value for f in options]):
+            for ii, el in enumerate(options):
+                if isinstance(el, db.Entity):
+                    options[ii] = el.id
+                    if el.id is None:
+                        raise RuntimeError("reference to unchecked in identifiable")
+                else:
+                    options[ii] = el
+            if not all([f == options[0] for f in options]):
                 raise RuntimeError("differing prop values in fragments")
-            record_prop = options[0]
-            identifiable_props[record_prop.name] = record_prop.value
+            identifiable_props[prop.name] = options[0]
             property_name_list_A.append(prop.name)

             # check for multi properties in the record:
...
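The rewritten block above normalises the collected identifying-property values before they are compared: values that are entity references are replaced by the referenced entity's id, and a reference to an entity whose existence has not been checked yet (id is None) is rejected. A small self-contained sketch of that normalisation, with a stand-in entity class instead of the real db.Entity:

    class FakeEntity:
        def __init__(self, id=None):
            self.id = id

    def resolve_identifying_values(options):
        # Entity references become ids, plain values stay, unchecked references are rejected.
        resolved = []
        for el in options:
            if isinstance(el, FakeEntity):
                if el.id is None:
                    raise RuntimeError("reference to unchecked in identifiable")
                resolved.append(el.id)
            else:
                resolved.append(el)
        # All fragments of one record must agree on the identifying value.
        if not all(v == resolved[0] for v in resolved):
            raise RuntimeError("differing prop values in fragments")
        return resolved[0]

    assert resolve_identifying_values([5, 5]) == 5
    assert resolve_identifying_values([FakeEntity(id=12), 12]) == 12
    # resolve_identifying_values([FakeEntity()])  # raises: unchecked reference
    # resolve_identifying_values([1, 2])          # raises: differing values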
@@ -87,6 +87,15 @@ class SemanticEntity():

 class SemanticTarget():
     """ models the target structure of Entities as it shall be created by the Crawler
+
+    The target entities are composed using the information of the entity fragments (db.Entity
+    objects) of SemanticEntities. This is information like name, parents and properties and,
+    importantly, references. Those references are typically given by a Python reference to some
+    other db.Entity object. These references are scanned initially and used to create multiple
+    reference maps that track how SemanticEntities reference each other. These maps are kept up to
+    date when SemanticEntities are merged because they are identified with each other. When
+    creating the final list of db.Entity objects (``create_record_lists``), the Python references
+    are updated according to the reference map.
+
     This model should only be manipulated via three functions:
     - make_identifiable: adds an identifiable to a SemanticEntity, which possibly allows merging it
       with another SemanticEntity
@@ -105,7 +114,8 @@ class SemanticTarget():
         self._identifiable_look_up: Dict[str, SemanticEntity] = {}
         self._missing: Dict[int, SemanticEntity] = {}
         self._existing: Dict[int, SemanticEntity] = {}
-        self._remote_missing_counter = -1  # TODO: I guess we can now get rid of this...
+        # entities that are missing get negative IDs to allow identifiable creation
+        self._remote_missing_counter = -1
         # create initial set of SemanticEntities from provided Entity list
         self.se: List[SemanticEntity] = []  # list of all SemanticEntities
@@ -113,7 +123,9 @@ class SemanticTarget():
         self.se_lookup: Dict[str, SemanticEntity] = {}  # lookup: UUID -> SemanticEntity
         entities = self._create_flat_list(entities)
         self._sanity_check(entities)
+        print("ids")
         for el in entities:
+            print(el.id)
             self.se.append(SemanticEntity(
                 el,
                 self.identifiableAdapter.get_registered_identifiable(el)))
@@ -166,6 +178,8 @@ class SemanticTarget():
         if se.path is None and se.identifiable is None:
             raise RuntimeError("no identifying information")
         se.id = self._remote_missing_counter
+        for f in se.fragments:
+            f.id = self._remote_missing_counter
         self._remote_missing_counter -= 1
         self._add_any(se, self._missing)
         self.unchecked.remove(se)
@@ -177,6 +191,7 @@ class SemanticTarget():
         on the remote server.
         """
         assert se.id is not None
+        assert se.id > 0
         self._add_any(se, self._existing)
         self.unchecked.remove(se)
@@ -191,6 +206,10 @@ class SemanticTarget():
             return self._id_look_up[entity.id]
         if entity.path is not None and entity.path in self._path_look_up:
             return self._path_look_up[entity.path]
+        for e in self.se:
+            if e.identifiable is not None:
+                print(e.identifiable._create_hashable_string(e.identifiable))
+        print(self._identifiable_look_up)
         if (entity.identifiable is not None and entity.identifiable.get_representation() in
                 self._identifiable_look_up):
             return self._identifiable_look_up[entity.identifiable.get_representation()]
@@ -217,12 +236,21 @@ class SemanticTarget():
         else:
             return None

+    def combine_fragments(self):
+        for se in self.se:
+            if len(se.fragments) < 2:
+                continue
+            for ent in se.fragments[1:]:
+                merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True)
+
     def create_record_lists(self):
         for se in self.se:
             for f in se.fragments:
                 f.id = se.id
                 f.path = se.path

+        self._update_reference_values()
         self.combine_fragments()
+        # TODO assure that there is only one fragment each
         missing = [el.fragments[0] for el in self._missing.values()]
         # remove negative IDs
@@ -235,6 +263,18 @@ class SemanticTarget():
         return (missing, [el.fragments[0] for el in self._existing.values()])

+    def _update_reference_values(self):
+        for se in self.se:
+            for f in se.fragments:
+                for p in f.properties:
+                    if isinstance(p.value, list):
+                        for index, val in enumerate(p.value):
+                            if id(val) in self.se_lookup:
+                                p.value[index] = self.se_lookup[id(val)].fragments[0]
+                    else:
+                        if id(p.value) in self.se_lookup:
+                            p.value = self.se_lookup[id(p.value)].fragments[0]
+
     def identity_relies_on_unchecked_entity(self, se: SemanticEntity):
         """
         If a record for which it could not yet be verified whether it exists in LA or not is part
@@ -262,14 +302,6 @@ class SemanticTarget():
             if ent.role == "Record" and len(ent.parents) == 0:
                 raise RuntimeError(f"Records must have a parent.\n{ent}")

-    def combine_fragments(self):
-        for se in self.se:
-            if len(se.fragments) < 2:
-                continue
-            for ent in se.fragments[1:]:
-                merge_entities(se.fragments[0], ent, merge_id_with_resolved_entity=True)
-            se.fragments = [se.fragments[0]]
-
     @ staticmethod
     def _create_flat_list(ent_list: List[db.Entity], flat: Optional[List[db.Entity]] = None):
         """
@@ -394,8 +426,9 @@ class SemanticTarget():
                             el.name == p.name]) > 0:
                         forward_id_references[se.uuid].add(vse)
                         backward_id_references[vse.uuid].add(se)
-                    if IdentifiableAdapter.referencing_entity_has_appropriate_type(
-                            ent.parents, vse.registered_identifiable):
+                    if (vse.registered_identifiable is not None and
+                            IdentifiableAdapter.referencing_entity_has_appropriate_type(
+                                ent.parents, vse.registered_identifiable)):
                         forward_id_referenced_by[se.uuid].add(vse)
                         backward_id_referenced_by[vse.uuid].add(se)
@@ -407,6 +440,7 @@ class SemanticTarget():
         """ A path or an ID is sufficiently identifying. Thus, those entities can be marked as
         checked """
         for semantic_entity in list(self.se[::-1]):
+            print(semantic_entity.uuid)
             assert len(semantic_entity.fragments) == 1
             entity = semantic_entity.fragments[0]
             if entity.id is None and entity.path is None:
@@ -419,9 +453,10 @@ class SemanticTarget():
             if existing is not None:
                 semantic_entity.identify_with(existing)

+            # at this point, semantic_entity has an ID if it is existing
             treated_before = self.get_checked_equivalent(semantic_entity)
             if treated_before is None:
-                if semantic_entity.id is None:
+                if semantic_entity.id is None or semantic_entity.id < 0:
                     self.set_missing(semantic_entity)
                 else:
                     self.set_existing(semantic_entity)
@@ -451,23 +486,6 @@ class SemanticTarget():
                 return None
         return circle

-    @ staticmethod
-    def _bend_references_to_new_object(old, new, entities):
-        # TODO still needed???
-        """ Bend references to the other object
-
-        Iterate over all entities in `entities` and check the values of all properties of
-        occurances of old Entity and replace them with new Entity
-        """
-        for el in entities:
-            for p in el.properties:
-                if isinstance(p.value, list):
-                    for index, val in enumerate(p.value):
-                        if val is old:
-                            p.value[index] = new
-                else:
-                    if p.value is old:
-                        p.value = new
-
     def _add_any(self, entity: SemanticEntity, lookup):
         if entity.id is not None:
             self._id_look_up[entity.id] = entity
...
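The new docstring and the _update_reference_values method above describe the central mechanism of the sync graph: property values that are plain Python references to other db.Entity objects are looked up via id() and redirected to the canonical first fragment of the SemanticEntity they were merged into, before combine_fragments merges the remaining fragments. The following sketch shows only that redirection step; Fragment, Property and the lookup are simplified stand-ins, not the real caoscrawler classes:

    class Fragment:
        def __init__(self, name, properties=None):
            self.name = name
            self.properties = properties or []

    class Property:
        def __init__(self, value):
            self.value = value

    def update_reference_values(fragments, se_lookup):
        # se_lookup maps id(obj) -> canonical fragment that should replace obj.
        for f in fragments:
            for p in f.properties:
                if isinstance(p.value, list):
                    for index, val in enumerate(p.value):
                        if id(val) in se_lookup:
                            p.value[index] = se_lookup[id(val)]
                elif id(p.value) in se_lookup:
                    p.value = se_lookup[id(p.value)]

    canonical = Fragment("B, checked")
    duplicate = Fragment("B, duplicate fragment")
    a = Fragment("A", properties=[Property(duplicate), Property([duplicate, 5])])
    update_reference_values([a], {id(duplicate): canonical})
    assert a.properties[0].value is canonical
    assert a.properties[1].value[0] is canonical

Doing this once, centrally, replaces the removed per-merge helper _bend_references_to_new_object.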
@@ -413,6 +413,8 @@ def test_split_into_inserts_and_updates_with_copy_attr(crawler_mocked_identifiab
     crawler.identifiableAdapter.retrieve_identified_record_for_identifiable.assert_called()


+@ patch("caoscrawler.crawl.cached_get_entity_by",
+        new=Mock(side_effect=mock_get_entity_by))
 @patch("caoscrawler.identifiable_adapters.cached_query",
        new=Mock(side_effect=mock_cached_only_rt))
 def test_split_iiau_with_unmergeable_list_items():
@@ -480,7 +482,7 @@ a: ([b1, b2])
     crawler = Crawler(identifiableAdapter=ident_adapter)

-    st = SemanticTarget([rec_a, *rec_b, *rec_c], crawler.identifiableAdapter)
+    st = SemanticTarget(deepcopy([rec_a, *rec_b, *rec_c]), crawler.identifiableAdapter)
     assert st.identity_relies_on_unchecked_entity(st.se[0]) is False
     assert st.identity_relies_on_unchecked_entity(st.se[1])
     assert st.identity_relies_on_unchecked_entity(st.se[2])
...
@@ -54,11 +54,6 @@ def test_create_hashable_string():
         Identifiable(name="A", record_type="B", properties={
             'a': [db.Record(id=12), 11]})
     ) == "P<B>N<A>R<[]>a:[12, 11]")
-    assert (
-        Identifiable._create_hashable_string(
-            Identifiable(record_type="B", properties={'a': [db.Record()]})
-        ) != Identifiable._create_hashable_string(
-            Identifiable(record_type="B", properties={'a': [db.Record()]})))
     assert Identifiable._create_hashable_string(
         Identifiable(name="A", record_type="B", backrefs=[123, db.Entity(id=124)],
                      properties={'a': 5})) == "P<B>N<A>R<['123', '124']>a:5"
...
@@ -314,11 +314,13 @@ def test_merging():
         db.File(name='101').add_parent("A").add_property('a', value=1),
         db.File(name='101').add_parent("A").add_property('a', value=1)]
     st = SemanticTarget(entlist, ident_adapter)
+    assert len(st.unchecked) == 2
     st.make_identifiable(st.se[0])
     st.check_remote_server(st.se[0])
     st.set_missing(st.se[0])
     assert len(st.unchecked) == 1
     st.make_identifiable(st.se[1])
+    assert st.se[1].id is None
     assert st.merge_with_equivalent(st.se[1])
     assert len(st.se) == 1
     assert len(st.unchecked) == 0
...
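The extra assertions in test_merging check the bookkeeping of unchecked entities; together with the changes to set_missing (fragments now also receive the placeholder id), the assert se.id > 0 in set_existing and the new id < 0 check, they rely on one convention: entities that do not yet exist remotely carry unique negative placeholder ids, while remotely existing entities keep their positive ids. A minimal sketch of that counter convention, with stand-in classes instead of the real SemanticTarget:

    class Ent:
        def __init__(self):
            self.id = None

    class MissingCounter:
        # Missing entities get unique negative placeholder ids; real remote ids are positive.
        def __init__(self):
            self._next = -1

        def assign(self, entity):
            entity.id = self._next
            self._next -= 1

    counter = MissingCounter()
    e1, e2 = Ent(), Ent()
    counter.assign(e1)
    counter.assign(e2)
    assert (e1.id, e2.id) == (-1, -2)
    assert all(e.id is not None and e.id < 0 for e in (e1, e2))  # the "missing" convention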