diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index e8788a640b4bfdffe283cd96de87f5849d9abee0..131747e340731153e432b61650994c6ad1115515 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -69,6 +69,7 @@ from .logging import configure_server_side_logging from .macros import defmacro_constructor, macro_constructor from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) +from .semantic_target import SemanticTarget from .stores import GeneralStore from .structure_elements import StructureElement from .treated_record_lookup import TreatedRecordLookUp @@ -380,36 +381,6 @@ class Crawler(object): return True return False - @staticmethod - def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): - """ - Recursively adds entities and all their properties contained in ent_list to - the output list flat. - - TODO: This function will be moved to pylib as it is also needed by the - high level API. - """ - # Note: A set would be useful here, but we do not want a random order. 
- if flat is None: - flat = list() - for el in ent_list: - if el not in flat: - flat.append(el) - for ent in ent_list: - for p in ent.properties: - # For lists append each element that is of type Entity to flat: - if isinstance(p.value, list): - for el in p.value: - if isinstance(el, db.Entity): - if el not in flat: - flat.append(el) - Crawler.create_flat_list([el], flat) - elif isinstance(p.value, db.Entity): - if p.value not in flat: - flat.append(p.value) - Crawler.create_flat_list([p.value], flat) - return flat - def _has_missing_object_in_references(self, ident: Identifiable, referencing_entities: dict): """ returns False if any value in the properties attribute is a db.Entity object that @@ -589,19 +560,14 @@ class Crawler(object): return references def split_into_inserts_and_updates(self, ent_list: list[db.Entity]): - flat = Crawler.create_flat_list(ent_list) - all_records = list(flat) - - # TODO: can the following be removed at some point - for ent in flat: - if ent.role == "Record" and len(ent.parents) == 0: - raise RuntimeError(f"Records must have a parent.\n{ent}") + st = SemanticTarget(ent_list) + all_records = list(st.entities) try_to_merge_later = [] # Check whether Records can be identified without identifiable - for i in reversed(range(len(flat))): - record = flat[i] + for i in reversed(range(len(st.entities))): + record = st.entities[i] # 1. Can it be identified via an ID? if record.id is not None: treated_record = self.treated_records_lookup.get_existing(record) @@ -612,7 +578,7 @@ class Crawler(object): else: self.treated_records_lookup.add(record, None) assert record.id - del flat[i] + del st.entities[i] # 2. Can it be identified via a path? 
elif record.path is not None: try: @@ -634,11 +600,11 @@ class Crawler(object): # TODO add identifiable if possible self.treated_records_lookup.add(record, None) assert record.id - del flat[i] + del st.entities[i] entity_was_treated = True - # flat contains Entities which could not yet be checked against the remote server - while entity_was_treated and len(flat) > 0: + # st.entities contains Entities which could not yet be checked against the remote server + while entity_was_treated and len(st.entities) > 0: entity_was_treated = False referencing_entities = self.create_reference_mapping(all_records) @@ -649,8 +615,8 @@ class Crawler(object): # 1. Is it in the cache of already checked Records? # 2. Can it be checked on the remote server? # 3. Does it have to be new since a needed reference is missing? - for i in reversed(range(len(flat))): - record = flat[i] + for i in reversed(range(len(st.entities))): + record = st.entities[i] if self._identity_relies_on_unchecked_entities(record, referencing_entities[id(record)]): @@ -670,7 +636,7 @@ class Crawler(object): all_records.remove(record) referencing_entities = self.create_reference_mapping(all_records) - del flat[i] + del st.entities[i] entity_was_treated = True # 2. Can it be checked on the remote server? @@ -687,7 +653,7 @@ class Crawler(object): record.path = identified_record.path self.treated_records_lookup.add(record, identifiable) assert record.id - del flat[i] + del st.entities[i] entity_was_treated = True # 3. Does it have to be new since a needed reference is missing? 
@@ -696,10 +662,10 @@ class Crawler(object): elif self._has_missing_object_in_references(identifiable, referencing_entities): self.treated_records_lookup.add(record, identifiable) assert record.id - del flat[i] + del st.entities[i] entity_was_treated = True - for record in flat: + for record in st.entities: self.replace_references_with_cached(record, referencing_entities) # We postponed the merge for records where it failed previously and try it again now. @@ -710,11 +676,11 @@ class Crawler(object): referencing_entities=referencing_entities[id(record)]) newrecord = self.treated_records_lookup.get_any(record, identifiable) merge_entities(newrecord, record, merge_id_with_resolved_entity=True) - if len(flat) > 0: - circle = self.detect_circular_dependency(flat) + if len(st.entities) > 0: + circle = self.detect_circular_dependency(st.entities) if circle is None: logger.error("Failed, but found NO circular dependency. The data is as follows:" - + str(self.compact_entity_list_representation(flat, + + str(self.compact_entity_list_representation(st.entities, referencing_entities))) else: logger.error("Found circular dependency (Note that this might include references " diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py index 36721aa4e879ba2450d2baa91735c3fc1433574c..7ce4a4d423ad5374ce4972a24bf776a60401a505 100644 --- a/src/caoscrawler/semantic_target.py +++ b/src/caoscrawler/semantic_target.py @@ -23,6 +23,48 @@ A data model class for the semantic data that shall be created by synchronization of the crawler. 
""" +from typing import Any, List, Optional, Union + +import linkahead as db + class SemanticTarget(): - def __init__(self, records): + def __init__(self, entities: list[db.Entity]): + self.entities = self.create_flat_list(entities) + self.sanity_check(self.entities) + + @staticmethod + def sanity_check(entities: list[db.Entity]): + for ent in entities: + if ent.role == "Record" and len(ent.parents) == 0: + raise RuntimeError(f"Records must have a parent.\n{ent}") + + @staticmethod + def create_flat_list(ent_list: list[db.Entity], flat: Optional[list[db.Entity]] = None): + """ + Recursively adds entities and all their properties contained in ent_list to + the output list flat. + + TODO: This function will be moved to pylib as it is also needed by the + high level API. + """ + # Note: A set would be useful here, but we do not want a random order. + if flat is None: + flat = list() + for el in ent_list: + if el not in flat: + flat.append(el) + for ent in ent_list: + for p in ent.properties: + # For lists append each element that is of type Entity to flat: + if isinstance(p.value, list): + for el in p.value: + if isinstance(el, db.Entity): + if el not in flat: + flat.append(el) + SemanticTarget.create_flat_list([el], flat) + elif isinstance(p.value, db.Entity): + if p.value not in flat: + flat.append(p.value) + SemanticTarget.create_flat_list([p.value], flat) + return flat diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index a48b5e16ad1a71beeb4a5bf1c2ac52f67bbd7afe..7a0904e08896eb188c302ead07cbf0373369191c 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -709,26 +709,6 @@ def test_create_reference_mapping(): assert ref[id(a)]["B"] == [132] -def test_create_flat_list(): - a = db.Record() - b = db.Record() - a.add_property(name="a", value=a) - a.add_property(name="b", value=b) - flat = Crawler.create_flat_list([a]) - assert len(flat) == 2 - assert a in flat - assert b in flat - c = db.Record() - c.add_property(name="a", 
value=a) - # This would caus recursion if it is not dealt with properly. - a.add_property(name="c", value=c) - flat = Crawler.create_flat_list([c]) - assert len(flat) == 3 - assert a in flat - assert b in flat - assert c in flat - - @ pytest.fixture def crawler_mocked_for_backref_test(): crawler = Crawler() diff --git a/unittests/test_semantic_target.py b/unittests/test_semantic_target.py new file mode 100644 index 0000000000000000000000000000000000000000..e13a62e17fcddae0aa3b1d5191c64d11a2972f8a --- /dev/null +++ b/unittests/test_semantic_target.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +import linkahead as db +from caoscrawler.semantic_target import SemanticTarget + + +def test_create_flat_list(): + a = db.Record() + b = db.Record() + a.add_property(name="a", value=a) + a.add_property(name="b", value=b) + flat = SemanticTarget.create_flat_list([a]) + assert len(flat) == 2 + assert a in flat + assert b in flat + c = db.Record() + c.add_property(name="a", value=a) + # This would cause recursion if it is not dealt with properly. 
+ a.add_property(name="c", value=c) + flat = SemanticTarget.create_flat_list([c]) + assert len(flat) == 3 + assert a in flat + assert b in flat + assert c in flat