Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
dc95a9ab
Commit
dc95a9ab
authored
1 year ago
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
MAINT: refactor merge into new function
parent
b0a0791b
No related branches found
No related tags found
4 merge requests
!160
STY: styling
,
!153
MAINT: refactor merge into new function
,
!152
MAINT: move checks that are not related to identifiables
,
!151
New class to track treated Records during sync
Pipeline
#47350
passed
1 year ago
Stage: info
Stage: setup
Stage: cert
Stage: style
Stage: test
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/caoscrawler/crawl.py
+44
-28
44 additions, 28 deletions
src/caoscrawler/crawl.py
with
44 additions
and
28 deletions
src/caoscrawler/crawl.py
+
44
−
28
View file @
dc95a9ab
...
@@ -632,6 +632,32 @@ class Crawler(object):
...
@@ -632,6 +632,32 @@ class Crawler(object):
if
p
.
value
is
old
:
if
p
.
value
is
old
:
p
.
value
=
new
p
.
value
=
new
def _merge_identified(self, newrecord, record, try_to_merge_later, all_records):
    """Merge ``record`` into ``newrecord`` and redirect references to ``newrecord``.

    When ``merge_entities`` raises an ``EntityMergeConflictError``,
    ``_treat_merge_error_of`` is called first; if it returns, ``record`` is
    appended to ``try_to_merge_later`` and, when ``newrecord`` already has
    an ID, that ID is copied onto ``record``.

    A ``NotImplementedError`` from the merge is re-raised after both
    entities have been printed.

    In every case that does not raise, references to ``record`` in
    ``all_records`` are bent to the ``newrecord`` object.
    """
    try:
        merge_entities(
            newrecord,
            record,
            merge_references_with_empty_diffs=False,
            merge_id_with_resolved_entity=True,
        )
    except EntityMergeConflictError:
        _treat_merge_error_of(newrecord, record)
        # The merge failed, but this is not one of the clear cases in which
        # a merge is impossible, so the record is queued for a later attempt.
        try_to_merge_later.append(record)
        known_id = newrecord.id
        if known_id is not None:
            record.id = known_id
    except NotImplementedError:
        # Print both entities before propagating the error.
        print(newrecord)
        print(record)
        raise

    Crawler.bend_references_to_new_object(
        old=record,
        new=newrecord,
        entities=all_records,
    )
@staticmethod
@staticmethod
def
create_reference_mapping
(
flat
:
list
[
db
.
Entity
]):
def
create_reference_mapping
(
flat
:
list
[
db
.
Entity
]):
"""
"""
...
@@ -677,7 +703,12 @@ class Crawler(object):
...
@@ -677,7 +703,12 @@ class Crawler(object):
record
=
flat
[
i
]
record
=
flat
[
i
]
# 1. Can it be identified via an ID?
# 1. Can it be identified via an ID?
if
record
.
id
is
not
None
:
if
record
.
id
is
not
None
:
self
.
treated_records_lookup
.
add
(
record
,
None
)
treated_record
=
self
.
treated_records_lookup
.
get_existing
(
record
)
if
treated_record
is
not
None
:
self
.
_merge_identified
(
treated_record
,
record
,
try_to_merge_later
,
all_records
)
referencing_entities
=
self
.
create_reference_mapping
(
all_records
)
else
:
self
.
treated_records_lookup
.
add
(
record
,
None
)
del
flat
[
i
]
del
flat
[
i
]
# 2. Can it be identified via a path?
# 2. Can it be identified via a path?
elif
record
.
path
is
not
None
:
elif
record
.
path
is
not
None
:
...
@@ -685,18 +716,20 @@ class Crawler(object):
...
@@ -685,18 +716,20 @@ class Crawler(object):
existing
=
cached_get_entity_by
(
path
=
record
.
path
)
existing
=
cached_get_entity_by
(
path
=
record
.
path
)
except
EmptyUniqueQueryError
:
except
EmptyUniqueQueryError
:
existing
=
None
existing
=
None
if
existing
is
None
:
if
existing
is
not
None
:
# TODO add identifiable if possible
self
.
treated_records_lookup
.
add
(
record
,
None
)
del
flat
[
i
]
else
:
record
.
id
=
existing
.
id
record
.
id
=
existing
.
id
# TODO check the following copying of _size and _checksum
# TODO check the following copying of _size and _checksum
# Copy over checksum and size too if it is a file
# Copy over checksum and size too if it is a file
record
.
_size
=
existing
.
_size
record
.
_size
=
existing
.
_size
record
.
_checksum
=
existing
.
_checksum
record
.
_checksum
=
existing
.
_checksum
treated_record
=
self
.
treated_records_lookup
.
get_any
(
record
)
if
treated_record
is
not
None
:
self
.
_merge_identified
(
treated_record
,
record
,
try_to_merge_later
,
all_records
)
referencing_entities
=
self
.
create_reference_mapping
(
all_records
)
else
:
# TODO add identifiable if possible
self
.
treated_records_lookup
.
add
(
record
,
None
)
self
.
treated_records_lookup
.
add
(
record
,
None
)
del
flat
[
i
]
del
flat
[
i
]
resolved_references
=
True
resolved_references
=
True
# flat contains Entities which could not yet be checked against the remote server
# flat contains Entities which could not yet be checked against the remote server
...
@@ -721,28 +754,11 @@ class Crawler(object):
...
@@ -721,28 +754,11 @@ class Crawler(object):
# 1. Is it in the cache of already checked Records?
# 1. Is it in the cache of already checked Records?
if
self
.
treated_records_lookup
.
get_any
(
record
,
identifiable
)
is
not
None
:
if
self
.
treated_records_lookup
.
get_any
(
record
,
identifiable
)
is
not
None
:
new
record
=
self
.
treated_records_lookup
.
get_any
(
record
,
identifiable
)
treated_
record
=
self
.
treated_records_lookup
.
get_any
(
record
,
identifiable
)
# Since the identifiables are the same,
new
record and record actually describe
# Since the identifiables are the same,
treated_
record and record actually describe
# the same object.
# the same object.
# We merge the two in order to prevent loss of information
# We merge record into treated_record in order to prevent loss of information
try
:
self
.
_merge_identified
(
treated_record
,
record
,
try_to_merge_later
,
all_records
)
merge_entities
(
newrecord
,
record
,
merge_references_with_empty_diffs
=
False
,
merge_id_with_resolved_entity
=
True
)
except
EntityMergeConflictError
:
_treat_merge_error_of
(
newrecord
,
record
)
# We cannot merge, but this is not one of the clear cases where a merge is
# impossible. Thus we try again later.
try_to_merge_later
.
append
(
record
)
if
newrecord
.
id
is
not
None
:
record
.
id
=
newrecord
.
id
except
NotImplementedError
:
print
(
newrecord
)
print
(
record
)
raise
Crawler
.
bend_references_to_new_object
(
old
=
record
,
new
=
newrecord
,
entities
=
all_records
)
referencing_entities
=
self
.
create_reference_mapping
(
all_records
)
referencing_entities
=
self
.
create_reference_mapping
(
all_records
)
del
flat
[
i
]
del
flat
[
i
]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment