diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index d21e6e2521578dc407e445d8220506677be84e26..e8788a640b4bfdffe283cd96de87f5849d9abee0 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -71,6 +71,7 @@ from .scanner import (create_converter_registry, initialize_converters, load_definition, scan_directory, scan_structure_elements) from .stores import GeneralStore from .structure_elements import StructureElement +from .treated_record_lookup import TreatedRecordLookUp logger = logging.getLogger(__name__) @@ -225,110 +226,6 @@ class SecurityMode(Enum): UPDATE = 2 -class TreatedRecordLookUp(): - """tracks Records and Identifiables for which it was checked whether they exist in the remote - server - - For a given Record it can be checked, whether it exists in the remote sever if - - it has a (valid) ID - - it has a (valid) path (FILEs only) - - an identifiable can be created for the Record. - - Records are added by calling the `add` function and they are then added to the internal - existing or missing list depending on whether the Record has a valid ID. - Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and - the representation of the identifiables. - - The extreme case, that one could imagine, would be that the same Record occurs three times as - different Python objects: one that only has an ID, one with only a path and one without ID and - path but with identifying properties. During `split_into_inserts_and_updates` all three - must be identified with each other (and must be merged). Since we require, that treated - entities have a valid ID if they exist in the remote server, all three objects would be - identified with each other simply using the IDs. - - In the case that the Record is not yet in the remote server, there cannot be a Python object - with an ID. Thus we might have one with a path and one with an identifiable. 
If that Record - does not yet exist, it is necessary that both Python objects have at least either the path or - the identifiable in common. - """ - - def __init__(self): - self._id_look_up: dict[int, db.Entity] = {} - self._path_look_up: dict[str, db.Entity] = {} - self._identifiable_look_up: dict[str, db.Entity] = {} - self.remote_missing_counter = -1 - self._missing: dict[int, db.Entity] = {} - self._existing: dict[int, db.Entity] = {} - - def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Add a Record that was treated, such that it is contained in the internal look up dicts - - This Record MUST have an ID if it was found in the remote server. - """ - if record.id is None: - if record.path is None and identifiable is None: - raise RuntimeError("Record must have ID or path or an identifiable must be given." - f"Record is\n{record}") - record.id = self.remote_missing_counter - self.remote_missing_counter -= 1 - self._add_any(record, self._missing, identifiable) - else: - self._add_any(record, self._existing, identifiable) - - def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ - Check whether this Record was already added. 
Identity is based on ID, path or Identifiable - represenation - """ - if record.id is not None and record.id in self._id_look_up: - return self._id_look_up[record.id] - if record.path is not None and record.path in self._path_look_up: - return self._path_look_up[record.path] - if (identifiable is not None and identifiable.get_representation() in - self._identifiable_look_up): - return self._identifiable_look_up[identifiable.get_representation()] - - def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record exists on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._existing: - return rec - else: - return None - - def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): - """ Check whether this Record is missing on the remote server - - Returns: The stored Record - """ - rec = self.get_any(record, identifiable) - if id(rec) in self._missing: - return rec - else: - return None - - def get_missing_list(self): - """ Return all Records that are missing in the remote server """ - return list(self._missing.values()) - - def get_existing_list(self): - """ Return all Records that exist in the remote server """ - return list(self._existing.values()) - - def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): - if record.id is not None: - self._id_look_up[record.id] = record - if record.path is not None: - self._path_look_up[record.path] = record - if identifiable is not None: - self._identifiable_look_up[identifiable.get_representation()] = record - lookup[id(record)] = record - - class Crawler(object): """ Crawler class that encapsulates crawling functions. 
diff --git a/src/caoscrawler/semantic_target.py b/src/caoscrawler/semantic_target.py new file mode 100644 index 0000000000000000000000000000000000000000..36721aa4e879ba2450d2baa91735c3fc1433574c --- /dev/null +++ b/src/caoscrawler/semantic_target.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +""" +A data model class for the semantic data that shall be created by synchronization of the crawler. +""" + + +class SemanticTarget(): + def __init__(self, records): diff --git a/src/caoscrawler/treated_record_lookup.py b/src/caoscrawler/treated_record_lookup.py new file mode 100644 index 0000000000000000000000000000000000000000..bf9e3456719cd1d1f44a513399ac11ad6980b141 --- /dev/null +++ b/src/caoscrawler/treated_record_lookup.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from typing import Any, List, Optional, Union + +import linkahead as db + +from .identifiable import Identifiable + + +class TreatedRecordLookUp(): + """tracks Records and Identifiables for which it was checked whether they exist in the remote + server + + For a given Record it can be checked, whether it exists in the remote server if + - it has a (valid) ID + - it has a (valid) path (FILEs only) + - an identifiable can be created for the Record. + + Records are added by calling the `add` function and they are then added to the internal + existing or missing list depending on whether the Record has a valid ID. + Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and + the representation of the identifiables. + + The extreme case, that one could imagine, would be that the same Record occurs three times as + different Python objects: one that only has an ID, one with only a path and one without ID and + path but with identifying properties. During `split_into_inserts_and_updates` all three + must be identified with each other (and must be merged). Since we require, that treated + entities have a valid ID if they exist in the remote server, all three objects would be + identified with each other simply using the IDs. + + In the case that the Record is not yet in the remote server, there cannot be a Python object + with an ID. Thus we might have one with a path and one with an identifiable. 
If that Record + does not yet exist, it is necessary that both Python objects have at least either the path or + the identifiable in common. + """ + + def __init__(self): + self._id_look_up: dict[int, db.Entity] = {} + self._path_look_up: dict[str, db.Entity] = {} + self._identifiable_look_up: dict[str, db.Entity] = {} + self.remote_missing_counter = -1 + self._missing: dict[int, db.Entity] = {} + self._existing: dict[int, db.Entity] = {} + + def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Add a Record that was treated, such that it is contained in the internal look up dicts + + This Record MUST have an ID if it was found in the remote server. + """ + if record.id is None: + if record.path is None and identifiable is None: + raise RuntimeError("Record must have ID or path or an identifiable must be given." + f"Record is\n{record}") + record.id = self.remote_missing_counter + self.remote_missing_counter -= 1 + self._add_any(record, self._missing, identifiable) + else: + self._add_any(record, self._existing, identifiable) + + def get_any(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ + Check whether this Record was already added. 
Identity is based on ID, path or Identifiable + representation + """ + if record.id is not None and record.id in self._id_look_up: + return self._id_look_up[record.id] + if record.path is not None and record.path in self._path_look_up: + return self._path_look_up[record.path] + if (identifiable is not None and identifiable.get_representation() in + self._identifiable_look_up): + return self._identifiable_look_up[identifiable.get_representation()] + + def get_existing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record exists on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._existing: + return rec + else: + return None + + def get_missing(self, record: db.Entity, identifiable: Optional[Identifiable] = None): + """ Check whether this Record is missing on the remote server + + Returns: The stored Record + """ + rec = self.get_any(record, identifiable) + if id(rec) in self._missing: + return rec + else: + return None + + def get_missing_list(self): + """ Return all Records that are missing in the remote server """ + return list(self._missing.values()) + + def get_existing_list(self): + """ Return all Records that exist in the remote server """ + return list(self._existing.values()) + + def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None): + if record.id is not None: + self._id_look_up[record.id] = record + if record.path is not None: + self._path_look_up[record.path] = record + if identifiable is not None: + self._identifiable_look_up[identifiable.get_representation()] = record + lookup[id(record)] = record