Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
a9e84f9e
Commit
a9e84f9e
authored
Dec 17, 2021
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
MAINT: refactoring
parent
83d48536
No related branches found
No related tags found
1 merge request
!53
Release 0.1
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/newcrawler/crawl.py
+56
-63
56 additions, 63 deletions
src/newcrawler/crawl.py
src/newcrawler/identifiable_adapters.py
+1
-0
1 addition, 0 deletions
src/newcrawler/identifiable_adapters.py
unittests/test_tool.py
+20
-5
20 additions, 5 deletions
unittests/test_tool.py
with
77 additions
and
68 deletions
src/newcrawler/crawl.py
+
56
−
63
View file @
a9e84f9e
...
...
@@ -73,6 +73,7 @@ from .identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiable
from
collections
import
defaultdict
from
typing
import
Union
,
Any
,
Optional
from
caosdb.apiutils
import
compare_entities
from
copy
import
deepcopy
class
Crawler
(
object
):
...
...
@@ -194,7 +195,7 @@ class Crawler(object):
def
can_be_checked_externally
(
self
,
record
:
db
.
Record
):
"""
Returns
Tru
e if there is at least one property in record which:
Returns
Fals
e if there is at least one property in record which:
a) is a reference property AND
b) where the value is set to a db.Entity (instead of an ID) AND
c) where the ID of the value is not set (to an integer)
...
...
@@ -204,7 +205,7 @@ class Crawler(object):
for
p
in
record
.
properties
:
# TODO: implement for lists?
if
(
is_reference
(
p
)
and
isinstance
(
p
.
value
,
db
.
Entity
)
and
not
isinstance
(
p
.
value
.
id
,
i
nt
)
):
and
p
.
value
.
id
i
s
None
):
return
False
return
True
...
...
@@ -221,7 +222,7 @@ class Crawler(object):
flat
.
append
(
p
.
value
)
self
.
create_flat_list
([
p
.
value
],
flat
)
def
cannot_be_checked_remotel
y
(
self
,
record
):
def
all_references_are_existing_alread
y
(
self
,
record
):
"""
returns true if all references either have IDs or were checked remotely and not found (i.e.
they exist in the local cache)
...
...
@@ -230,13 +231,14 @@ class Crawler(object):
if
(
is_reference
(
p
)
# Entity instead of ID and not cached locally
# TODO: implement for lists?
and
(
isinstance
(
p
.
value
,
db
.
Entity
)
and
self
.
get_identifiable_from_local_cache
(
p
.
value
)
is
None
)):
and
isinstance
(
p
.
value
,
db
.
Entity
)
and
p
.
value
.
id
is
None
and
self
.
get_identified_record_from_local_cache
(
p
.
value
)
is
None
):
# might be checked when reference is resolved
return
False
return
True
def
get_identifi
able
_from_local_cache
(
self
,
identifiable
:
db
.
Record
):
def
get_identifi
ed_record
_from_local_cache
(
self
,
identifiable
:
db
.
Record
):
"""
returns the identifiable if an identifiable with the same values already exists locally
(Each identifiable that is not found on the remote server, is
'
cached
'
locally to prevent
...
...
@@ -244,7 +246,7 @@ class Crawler(object):
"""
raise
NotImplementedError
()
def
add_identifi
able
_to_local_cache
(
self
,
identifiable
:
db
.
Record
):
def
add_identifi
ed_record
_to_local_cache
(
self
,
identifiable
:
db
.
Record
):
"""
adds the given identifiable to the local cache
...
...
@@ -265,6 +267,7 @@ class Crawler(object):
self
.
create_flat_list
(
ent_list
,
flat
)
resolved_references
=
True
# flat contains Entities which could not yet be checked against the remote server
while
resolved_references
and
len
(
flat
)
>
0
:
resolved_references
=
False
...
...
@@ -275,40 +278,38 @@ class Crawler(object):
if
(
record
.
id
is
not
None
or
record
in
to_be_inserted
):
raise
Exception
(
"
This should not be reached since treated elements are removed
"
"
from the list
"
)
# all references need to be IDs that exist on the remote server
elif
self
.
can_be_checked_externally
(
record
):
# Check the local cache first
if
self
.
get_identifiable_from_local_cache
(
record
)
is
not
None
:
print
(
record
,
"
is in cache
"
)
# Check the local cache first for duplicate
elif
self
.
get_identified_record_from_local_cache
(
record
)
is
not
None
:
# This record is a duplicate that can be removed. Make sure we do not lose
# information
# Update an (local) identified record that will be inserted
self
.
copy_attributes
(
fro
=
record
,
to
=
self
.
get_identifi
able
_from_local_cache
(
record
))
fro
=
record
,
to
=
self
.
get_identifi
ed_record
_from_local_cache
(
record
))
del
flat
[
i
]
continue
# Check remotely
identified
=
self
.
identifiableAdapter
.
retrieve_identifiable
(
record
)
print
(
record
)
if
identified
is
None
:
print
(
"
not identified
"
)
# all references need to be IDs that exist on the remote server
elif
self
.
can_be_checked_externally
(
record
):
# Check remotely
identified_record
=
self
.
identifiableAdapter
.
retrieve_identifiable
(
deepcopy
(
record
))
if
identified_record
is
None
:
# identifiable does not exist remotely
to_be_inserted
.
append
(
record
)
self
.
add_identifi
able
_to_local_cache
(
record
)
self
.
add_identifi
ed_record
_to_local_cache
(
record
)
del
flat
[
i
]
else
:
print
(
"
identified
"
)
record
.
id
=
identified
.
id
# side effect
record
.
id
=
identified
_record
.
id
to_be_updated
.
append
(
record
)
del
flat
[
i
]
resolved_references
=
True
# e.g. references an identifiable that does not exist remotely
elif
self
.
cannot_be_checked_remotely
(
record
):
print
(
"
cannot be checked
"
)
print
(
record
)
elif
self
.
all_references_are_existing_already
(
record
):
to_be_inserted
.
append
(
record
)
self
.
add_identifi
able
_to_local_cache
(
record
)
self
.
add_identifi
ed_record
_to_local_cache
(
record
)
del
flat
[
i
]
resolved_references
=
True
if
len
(
flat
)
>
0
:
...
...
@@ -316,47 +317,18 @@ class Crawler(object):
return
to_be_inserted
,
to_be_updated
def
_synchronize
(
self
,
updateList
:
list
[
db
.
Record
]):
def
remove_unnecessary_updates
(
self
,
updateList
:
list
[
db
.
Record
]):
"""
This function applies several stages:
1) Retrieve identifiables for all records in updateList.
2) Compare updateList with existing records.
3) Insert and update records based on the set of identified differences.
This function makes use of an IdentifiableAdapter which is used to
register and retrieve identifiables.
Return the final insertList and updateList as tuple.
checks whether all relevant attributes (especially Property values) are equal
"""
if
self
.
identifiableAdapter
is
None
:
raise
RuntimeError
(
"
Should not happen.
"
)
to_be_inserted
,
to_be_updated
=
self
.
split_into_inserts_and_updates
(
updateList
)
# TODO the following needs to be moved
for
i
in
reversed
(
range
(
len
(
updateList
))):
record
=
updateList
[
i
]
# if there is no identifiable, move record from update list to insert list:
if
identifiable
is
None
:
insertList
.
append
(
record
)
del
updateList
[
i
]
# also update all references to this entity to get the value -x
# where -x is the id of the new entity in insert list
# also add this entity directly to the list of known entities of the current
# identifiable adapter.
# any reference to this entity does not need to be compared anymore, as it
# definitely needs updating (the new value cannot have existed before)
continue
identifiable
=
self
.
get_identifiable
(
record
)
comp
=
compare_entities
(
record
,
identifiable
)
identical
=
True
for
j
in
range
(
2
):
# TODO: should be implemented elsewhere
# TODO: should be implemented elsewhere
(?)
for
label
in
(
"
parents
"
,
):
if
len
(
comp
[
j
][
label
])
>
0
:
identical
=
False
...
...
@@ -384,9 +356,30 @@ class Crawler(object):
else
:
pass
def
_synchronize
(
self
,
updateList
:
list
[
db
.
Record
]):
"""
This function applies several stages:
1) Retrieve identifiables for all records in updateList.
2) Compare updateList with existing records.
3) Insert and update records based on the set of identified differences.
This function makes use of an IdentifiableAdapter which is used to
register and retrieve identifiables.
Return the final insertList and updateList as tuple.
"""
if
self
.
identifiableAdapter
is
None
:
raise
RuntimeError
(
"
Should not happen.
"
)
to_be_inserted
,
to_be_updated
=
self
.
split_into_inserts_and_updates
(
updateList
)
# remove unnecessary updates from list
self
.
remove_unnecessary_updates
(
to_be_updated
)
# TODO
#
self.exeute_inserts_in_list(insertList)
#
self.exeute_updates_in_list(updateList)
self
.
exe
c
ute_inserts_in_list
(
insertList
)
self
.
exe
c
ute_updates_in_list
(
updateList
)
return
(
insertList
,
updateList
)
...
...
This diff is collapsed.
Click to expand it.
src/newcrawler/identifiable_adapters.py
+
1
−
0
View file @
a9e84f9e
...
...
@@ -150,6 +150,7 @@ class IdentifiableAdapter(object):
pass
# TODO: the name is confusing. it returns the identified record
# TODO: remove side effect
def
retrieve_identifiable
(
self
,
record
:
db
.
Record
):
"""
This function combines all functionality of the IdentifiableAdapter by
...
...
This diff is collapsed.
Click to expand it.
unittests/test_tool.py
+
20
−
5
View file @
a9e84f9e
...
...
@@ -376,8 +376,8 @@ def test_split_into_inserts_and_updates(crawler):
def
trivial_cache_add
(
stuff
):
cache
.
append
(
stuff
.
name
)
crawler
.
get_identifi
able
_from_local_cache
=
Mock
(
side_effect
=
trivial_cache_loockup
)
crawler
.
add_identifi
able
_to_local_cache
=
Mock
(
side_effect
=
trivial_cache_add
)
crawler
.
get_identifi
ed_record
_from_local_cache
=
Mock
(
side_effect
=
trivial_cache_loockup
)
crawler
.
add_identifi
ed_record
_to_local_cache
=
Mock
(
side_effect
=
trivial_cache_add
)
crawler
.
copy_attributes
=
Mock
()
# a record that is found remotely and should be added to the update list and one that is not
...
...
@@ -386,11 +386,11 @@ def test_split_into_inserts_and_updates(crawler):
entlist
=
[
db
.
Record
(
name
=
"
A
"
),
db
.
Record
(
name
=
"
B
"
)]
crawler
.
identifiableAdapter
.
retrieve_identifiable
=
Mock
(
side_effect
=
partial
(
base_mocked_lookup
,
known
=
remote_known
))
insert
,
update
=
crawler
.
split_into_inserts_and_updates
(
entlist
)
insert
,
update
=
crawler
.
split_into_inserts_and_updates
(
deepcopy
(
entlist
)
)
print
(
crawler
.
identifiableAdapter
.
retrieve_identifiable
.
call_args_list
)
print
(
entlist
)
crawler
.
identifiableAdapter
.
retrieve_identifiable
.
assert_any_call
(
entlist
[
0
])
crawler
.
identifiableAdapter
.
retrieve_identifiable
.
assert_any_call
(
entlist
[
1
])
#
crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[0])
#
crawler.identifiableAdapter.retrieve_identifiable.assert_any_call(entlist[1])
assert
len
(
insert
)
==
1
assert
insert
[
0
].
name
==
"
B
"
assert
len
(
update
)
==
1
...
...
@@ -423,3 +423,18 @@ def test_split_into_inserts_and_updates(crawler):
entlist
=
[
a
,
b
]
with
raises
(
RuntimeError
):
crawler
.
split_into_inserts_and_updates
(
entlist
)
# assume identifiable is only the name
a
=
db
.
Record
(
name
=
"
A
"
)
a
.
add_property
(
"
foo
"
,
1
)
b
=
db
.
Record
(
name
=
"
A
"
)
b
.
add_property
(
"
bar
"
,
2
)
# expected TODO
#assert result.has_property("foo").value == 1
#assert result.has_property("bar").value == 1
def
test_all_references_are_existing_already
(
crawler
):
pass
# crawler.all_references_are_existing_already(record)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment