Skip to content
Snippets Groups Projects
Commit 595594a2 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

wip

parent 24686e48
No related branches found
No related tags found
2 merge requests: !178 "FIX: #96 Better error output for crawl.py script.", !167 "Sync Graph"
...@@ -71,6 +71,7 @@ from .scanner import (create_converter_registry, initialize_converters, ...@@ -71,6 +71,7 @@ from .scanner import (create_converter_registry, initialize_converters,
load_definition, scan_directory, scan_structure_elements) load_definition, scan_directory, scan_structure_elements)
from .stores import GeneralStore from .stores import GeneralStore
from .structure_elements import StructureElement from .structure_elements import StructureElement
from .treated_record_lookup import TreatedRecordLookUp
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -225,110 +226,6 @@ class SecurityMode(Enum): ...@@ -225,110 +226,6 @@ class SecurityMode(Enum):
UPDATE = 2 UPDATE = 2
class TreatedRecordLookUp():
    """Track Records and Identifiables for which it was checked whether they exist in the remote
    server.

    For a given Record it can be checked whether it exists in the remote server if

    - it has a (valid) ID
    - it has a (valid) path (FILEs only)
    - an identifiable can be created for the Record.

    Records are added by calling the `add` function and they are then added to the internal
    existing or missing list depending on whether the Record has a valid ID.
    Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and
    the representation of the identifiables.

    The extreme case, that one could imagine, would be that the same Record occurs three times as
    different Python objects: one that only has an ID, one with only a path and one without ID and
    path but with identifying properties. During `split_into_inserts_and_updates` all three
    must be identified with each other (and must be merged). Since we require, that treated
    entities have a valid ID if they exist in the remote server, all three objects would be
    identified with each other simply using the IDs.

    In the case that the Record is not yet in the remote server, there cannot be a Python object
    with an ID. Thus we might have one with a path and one with an identifiable. If that Record
    does not yet exist, it is necessary that both Python objects have at least either the path or
    the identifiable in common.
    """

    def __init__(self):
        # Look up tables from ID / path / identifiable representation to the stored Record.
        self._id_look_up: dict[int, db.Entity] = {}
        self._path_look_up: dict[str, db.Entity] = {}
        self._identifiable_look_up: dict[str, db.Entity] = {}
        # Next placeholder ID handed out to a Record that was added without an ID; it starts
        # at -1 and is decremented after every use.
        self.remote_missing_counter = -1
        # Both dicts are keyed by ``id(record)``. The Record itself is stored as the value, so
        # the object stays alive and its ``id()`` cannot be reused while it is tracked.
        self._missing: dict[int, db.Entity] = {}
        self._existing: dict[int, db.Entity] = {}

    def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None) -> None:
        """
        Add a Record that was treated, such that it is contained in the internal look up dicts.

        This Record MUST have an ID if it was found in the remote server.

        A Record without an ID is registered as missing and gets the next (negative) placeholder
        ID assigned to ``record.id``. A Record with an ID is registered as existing.

        Raises
        ------
        RuntimeError
            If the Record has neither an ID nor a path and no identifiable was given.
        """
        if record.id is not None:
            # NOTE(review): any non-None ID counts as "existing" here, including a placeholder
            # ID assigned by an earlier call - confirm that Records are never added twice.
            self._add_any(record, self._existing, identifiable)
            return

        if record.path is None and identifiable is None:
            # The separating space was missing between the two message parts.
            raise RuntimeError("Record must have ID or path or an identifiable must be given. "
                               f"Record is\n{record}")
        record.id = self.remote_missing_counter
        self.remote_missing_counter -= 1
        self._add_any(record, self._missing, identifiable)

    def get_any(self, record: db.Entity,
                identifiable: Optional[Identifiable] = None) -> Optional[db.Entity]:
        """
        Check whether this Record was already added. Identity is based on ID, path or Identifiable
        representation (checked in this order).

        Returns: The stored Record or None if no look up dict contains a match.
        """
        if record.id is not None and record.id in self._id_look_up:
            return self._id_look_up[record.id]
        if record.path is not None and record.path in self._path_look_up:
            return self._path_look_up[record.path]
        if identifiable is not None:
            # Compute the representation only once per look up.
            representation = identifiable.get_representation()
            if representation in self._identifiable_look_up:
                return self._identifiable_look_up[representation]
        return None

    def get_existing(self, record: db.Entity,
                     identifiable: Optional[Identifiable] = None) -> Optional[db.Entity]:
        """ Check whether this Record exists on the remote server

        Returns: The stored Record or None if it is not registered as existing.
        """
        rec = self.get_any(record, identifiable)
        if rec is not None and id(rec) in self._existing:
            return rec
        return None

    def get_missing(self, record: db.Entity,
                    identifiable: Optional[Identifiable] = None) -> Optional[db.Entity]:
        """ Check whether this Record is missing on the remote server

        Returns: The stored Record or None if it is not registered as missing.
        """
        rec = self.get_any(record, identifiable)
        if rec is not None and id(rec) in self._missing:
            return rec
        return None

    def get_missing_list(self):
        """ Return all Records that are missing in the remote server """
        return list(self._missing.values())

    def get_existing_list(self):
        """ Return all Records that exist in the remote server """
        return list(self._existing.values())

    def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None):
        """Register ``record`` under every key it offers and in ``lookup``.

        ``lookup`` is the dict (missing or existing) that shall track the Record; it is keyed by
        ``id(record)``.
        """
        if record.id is not None:
            self._id_look_up[record.id] = record
        if record.path is not None:
            self._path_look_up[record.path] = record
        if identifiable is not None:
            self._identifiable_look_up[identifiable.get_representation()] = record
        lookup[id(record)] = record
class Crawler(object): class Crawler(object):
""" """
Crawler class that encapsulates crawling functions. Crawler class that encapsulates crawling functions.
......
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
A data model class for the semantic data that shall be created by synchronization of the crawler.
"""
class SemanticTarget():
def __init__(self, records):
#!/usr/bin/env python3
# encoding: utf-8
#
# This file is a part of the LinkAhead Project.
#
# Copyright (C) 2024 Henrik tom Wörden
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
from typing import Any, List, Optional, Union
import linkahead as db
from .identifiable import Identifiable
class TreatedRecordLookUp():
    """Track Records and Identifiables for which it was checked whether they exist in the remote
    server.

    For a given Record it can be checked whether it exists in the remote server if

    - it has a (valid) ID
    - it has a (valid) path (FILEs only)
    - an identifiable can be created for the Record.

    Records are added by calling the `add` function and they are then added to the internal
    existing or missing list depending on whether the Record has a valid ID.
    Additionally, the Record is added to three look up dicts. The keys of those are paths, IDs and
    the representation of the identifiables.

    The extreme case, that one could imagine, would be that the same Record occurs three times as
    different Python objects: one that only has an ID, one with only a path and one without ID and
    path but with identifying properties. During `split_into_inserts_and_updates` all three
    must be identified with each other (and must be merged). Since we require, that treated
    entities have a valid ID if they exist in the remote server, all three objects would be
    identified with each other simply using the IDs.

    In the case that the Record is not yet in the remote server, there cannot be a Python object
    with an ID. Thus we might have one with a path and one with an identifiable. If that Record
    does not yet exist, it is necessary that both Python objects have at least either the path or
    the identifiable in common.
    """

    def __init__(self):
        # Look up tables from ID / path / identifiable representation to the stored Record.
        self._id_look_up: dict[int, db.Entity] = {}
        self._path_look_up: dict[str, db.Entity] = {}
        self._identifiable_look_up: dict[str, db.Entity] = {}
        # Next placeholder ID handed out to a Record that was added without an ID; it starts
        # at -1 and is decremented after every use.
        self.remote_missing_counter = -1
        # Both dicts are keyed by ``id(record)``. The Record itself is stored as the value, so
        # the object stays alive and its ``id()`` cannot be reused while it is tracked.
        self._missing: dict[int, db.Entity] = {}
        self._existing: dict[int, db.Entity] = {}

    def add(self, record: db.Entity, identifiable: Optional[Identifiable] = None) -> None:
        """
        Add a Record that was treated, such that it is contained in the internal look up dicts.

        This Record MUST have an ID if it was found in the remote server.

        A Record without an ID is registered as missing and gets the next (negative) placeholder
        ID assigned to ``record.id``. A Record with an ID is registered as existing.

        Raises
        ------
        RuntimeError
            If the Record has neither an ID nor a path and no identifiable was given.
        """
        if record.id is not None:
            # NOTE(review): any non-None ID counts as "existing" here, including a placeholder
            # ID assigned by an earlier call - confirm that Records are never added twice.
            self._add_any(record, self._existing, identifiable)
            return

        if record.path is None and identifiable is None:
            # The separating space was missing between the two message parts.
            raise RuntimeError("Record must have ID or path or an identifiable must be given. "
                               f"Record is\n{record}")
        record.id = self.remote_missing_counter
        self.remote_missing_counter -= 1
        self._add_any(record, self._missing, identifiable)

    def get_any(self, record: db.Entity,
                identifiable: Optional[Identifiable] = None) -> Optional[db.Entity]:
        """
        Check whether this Record was already added. Identity is based on ID, path or Identifiable
        representation (checked in this order).

        Returns: The stored Record or None if no look up dict contains a match.
        """
        if record.id is not None and record.id in self._id_look_up:
            return self._id_look_up[record.id]
        if record.path is not None and record.path in self._path_look_up:
            return self._path_look_up[record.path]
        if identifiable is not None:
            # Compute the representation only once per look up.
            representation = identifiable.get_representation()
            if representation in self._identifiable_look_up:
                return self._identifiable_look_up[representation]
        return None

    def get_existing(self, record: db.Entity,
                     identifiable: Optional[Identifiable] = None) -> Optional[db.Entity]:
        """ Check whether this Record exists on the remote server

        Returns: The stored Record or None if it is not registered as existing.
        """
        rec = self.get_any(record, identifiable)
        if rec is not None and id(rec) in self._existing:
            return rec
        return None

    def get_missing(self, record: db.Entity,
                    identifiable: Optional[Identifiable] = None) -> Optional[db.Entity]:
        """ Check whether this Record is missing on the remote server

        Returns: The stored Record or None if it is not registered as missing.
        """
        rec = self.get_any(record, identifiable)
        if rec is not None and id(rec) in self._missing:
            return rec
        return None

    def get_missing_list(self):
        """ Return all Records that are missing in the remote server """
        return list(self._missing.values())

    def get_existing_list(self):
        """ Return all Records that exist in the remote server """
        return list(self._existing.values())

    def _add_any(self, record: db.Entity, lookup, identifiable: Optional[Identifiable] = None):
        """Register ``record`` under every key it offers and in ``lookup``.

        ``lookup`` is the dict (missing or existing) that shall track the Record; it is keyed by
        ``id(record)``.
        """
        if record.id is not None:
            self._id_look_up[record.id] = record
        if record.path is not None:
            self._path_look_up[record.path] = record
        if identifiable is not None:
            self._identifiable_look_up[identifiable.get_representation()] = record
        lookup[id(record)] = record
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment