Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
3bebbf91
Commit
3bebbf91
authored
Nov 13, 2022
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
WIP passing tests
parent
0adf4e16
No related branches found
No related tags found
2 merge requests
!91
Release 0.3
,
!67
MAINT: introduce an identifiable class
Pipeline
#30447
failed
Nov 13, 2022
Stage: info
Stage: setup
Stage: cert
Stage: style
Stage: test
Changes
3
Pipelines
1
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/caoscrawler/crawl.py
+17
-21
17 additions, 21 deletions
src/caoscrawler/crawl.py
unittests/records.xml
+133
-133
133 additions, 133 deletions
unittests/records.xml
unittests/test_tool.py
+7
-6
7 additions, 6 deletions
unittests/test_tool.py
with
157 additions
and
160 deletions
src/caoscrawler/crawl.py
+
17
−
21
View file @
3bebbf91
...
@@ -495,7 +495,7 @@ class Crawler(object):
...
@@ -495,7 +495,7 @@ class Crawler(object):
Returns False otherwise.
Returns False otherwise.
"""
"""
if
ident
is
None
:
if
ident
is
None
:
return
True
raise
ValueError
(
"
Identifiable has to be given as argument
"
)
for
pname
,
pvalue
in
ident
.
properties
.
items
():
for
pname
,
pvalue
in
ident
.
properties
.
items
():
if
isinstance
(
pvalue
,
list
):
if
isinstance
(
pvalue
,
list
):
for
el
in
pvalue
:
for
el
in
pvalue
:
...
@@ -536,17 +536,19 @@ class Crawler(object):
...
@@ -536,17 +536,19 @@ class Crawler(object):
whether it exists remotely and it was not found.
whether it exists remotely and it was not found.
"""
"""
if
ident
is
None
:
if
ident
is
None
:
return
True
raise
ValueError
(
"
Identifiable has to be given as argument
"
)
for
pname
,
pvalue
in
ident
.
properties
.
items
():
for
pname
,
pvalue
in
ident
.
properties
.
items
():
# if (is_reference(p)
# if (is_reference(p)
# Entity instead of ID and not cached locally
# Entity instead of ID and not cached locally
if
(
isinstance
(
pvalue
,
list
)):
if
(
isinstance
(
pvalue
,
list
)):
for
el
in
pvalue
:
for
el
in
pvalue
:
if
(
isinstance
(
el
,
db
.
Entity
)
if
(
isinstance
(
el
,
db
.
Entity
)
and
self
.
get_from_remote_missing_cache
(
el
)
is
not
None
):
and
self
.
get_from_remote_missing_cache
(
self
.
identifiableAdapter
.
get_identifiable
(
el
))
is
not
None
):
return
True
return
True
if
(
isinstance
(
pvalue
,
db
.
Entity
)
if
(
isinstance
(
pvalue
,
db
.
Entity
)
and
self
.
get_from_remote_missing_cache
(
pvalue
)
is
not
None
):
and
self
.
get_from_remote_missing_cache
(
self
.
identifiableAdapter
.
get_identifiable
(
pvalue
))
is
not
None
):
# might be checked when reference is resolved
# might be checked when reference is resolved
return
True
return
True
return
False
return
False
...
@@ -562,7 +564,8 @@ class Crawler(object):
...
@@ -562,7 +564,8 @@ class Crawler(object):
lst
=
[]
lst
=
[]
for
el
in
p
.
value
:
for
el
in
p
.
value
:
if
(
isinstance
(
el
,
db
.
Entity
)
and
el
.
id
is
None
):
if
(
isinstance
(
el
,
db
.
Entity
)
and
el
.
id
is
None
):
cached
=
self
.
get_from_any_cache
(
el
)
cached
=
self
.
get_from_any_cache
(
self
.
identifiableAdapter
.
get_identifiable
(
el
))
if
cached
is
None
:
if
cached
is
None
:
raise
RuntimeError
(
"
Not in cache.
"
)
raise
RuntimeError
(
"
Not in cache.
"
)
if
not
check_identical
(
cached
,
el
,
True
):
if
not
check_identical
(
cached
,
el
,
True
):
...
@@ -576,7 +579,8 @@ class Crawler(object):
...
@@ -576,7 +579,8 @@ class Crawler(object):
lst
.
append
(
el
)
lst
.
append
(
el
)
p
.
value
=
lst
p
.
value
=
lst
if
(
isinstance
(
p
.
value
,
db
.
Entity
)
and
p
.
value
.
id
is
None
):
if
(
isinstance
(
p
.
value
,
db
.
Entity
)
and
p
.
value
.
id
is
None
):
cached
=
self
.
get_from_any_cache
(
p
.
value
)
cached
=
self
.
get_from_any_cache
(
self
.
identifiableAdapter
.
get_identifiable
(
p
.
value
))
if
cached
is
None
:
if
cached
is
None
:
raise
RuntimeError
(
"
Not in cache.
"
)
raise
RuntimeError
(
"
Not in cache.
"
)
if
not
check_identical
(
cached
,
p
.
value
,
True
):
if
not
check_identical
(
cached
,
p
.
value
,
True
):
...
@@ -587,15 +591,12 @@ class Crawler(object):
...
@@ -587,15 +591,12 @@ class Crawler(object):
raise
RuntimeError
(
"
Not identical.
"
)
raise
RuntimeError
(
"
Not identical.
"
)
p
.
value
=
cached
p
.
value
=
cached
def
get_from_remote_missing_cache
(
self
,
record
:
db
.
Record
):
def
get_from_remote_missing_cache
(
self
,
identifiable
:
Identifiable
):
"""
"""
returns the identifiable if an identifiable with the same values already exists locally
returns the identifiable if an identifiable with the same values already exists locally
(Each identifiable that is not found on the remote server, is
'
cached
'
locally to prevent
(Each identifiable that is not found on the remote server, is
'
cached
'
locally to prevent
that the same identifiable exists twice)
that the same identifiable exists twice)
"""
"""
if
self
.
identifiableAdapter
is
None
:
raise
RuntimeError
(
"
Should not happen.
"
)
identifiable
=
self
.
identifiableAdapter
.
get_identifiable
(
record
)
if
identifiable
is
None
:
if
identifiable
is
None
:
# TODO: check whether the same idea as below works here
# TODO: check whether the same idea as below works here
identifiable
=
record
identifiable
=
record
...
@@ -606,15 +607,12 @@ class Crawler(object):
...
@@ -606,15 +607,12 @@ class Crawler(object):
else
:
else
:
return
None
return
None
def
get_from_any_cache
(
self
,
record
:
db
.
Record
):
def
get_from_any_cache
(
self
,
identifiable
:
Identifiable
):
"""
"""
returns the identifiable if an identifiable with the same values already exists locally
returns the identifiable if an identifiable with the same values already exists locally
(Each identifiable that is not found on the remote server, is
'
cached
'
locally to prevent
(Each identifiable that is not found on the remote server, is
'
cached
'
locally to prevent
that the same identifiable exists twice)
that the same identifiable exists twice)
"""
"""
if
self
.
identifiableAdapter
is
None
:
raise
RuntimeError
(
"
Should not happen.
"
)
identifiable
=
self
.
identifiableAdapter
.
get_identifiable
(
record
)
if
identifiable
is
None
:
if
identifiable
is
None
:
return
None
return
None
...
@@ -713,15 +711,16 @@ class Crawler(object):
...
@@ -713,15 +711,16 @@ class Crawler(object):
for
i
in
reversed
(
range
(
len
(
flat
))):
for
i
in
reversed
(
range
(
len
(
flat
))):
record
=
flat
[
i
]
record
=
flat
[
i
]
identifiable
=
self
.
identifiableAdapter
.
get_identifiable
(
record
)
# TODO remove if the exception is never raised
# TODO remove if the exception is never raised
if
(
record
.
id
is
not
None
or
record
in
to_be_inserted
):
if
(
record
.
id
is
not
None
or
record
in
to_be_inserted
):
raise
RuntimeError
(
"
This should not be reached since treated elements
"
raise
RuntimeError
(
"
This should not be reached since treated elements
"
"
are removed from the list
"
)
"
are removed from the list
"
)
# Check whether this record is a duplicate that can be removed
# Check whether this record is a duplicate that can be removed
elif
self
.
get_from_any_cache
(
record
)
is
not
None
:
elif
self
.
get_from_any_cache
(
identifiable
)
is
not
None
:
# We merge the two in order to prevent loss of information
# We merge the two in order to prevent loss of information
newrecord
=
self
.
get_from_any_cache
(
record
)
newrecord
=
self
.
get_from_any_cache
(
identifiable
)
merge_entities
(
newrecord
,
record
)
merge_entities
(
newrecord
,
record
)
Crawler
.
bend_references_to_new_object
(
Crawler
.
bend_references_to_new_object
(
old
=
record
,
new
=
newrecord
,
entities
=
flat
+
to_be_updated
+
to_be_inserted
)
old
=
record
,
new
=
newrecord
,
entities
=
flat
+
to_be_updated
+
to_be_inserted
)
...
@@ -730,9 +729,7 @@ class Crawler(object):
...
@@ -730,9 +729,7 @@ class Crawler(object):
resolved_references
=
True
resolved_references
=
True
# can we check whether the record(identifiable) exists on the remote server?
# can we check whether the record(identifiable) exists on the remote server?
elif
not
self
.
has_reference_value_without_id
(
elif
not
self
.
has_reference_value_without_id
(
identifiable
):
# TODO move get_identifiable above if else?
self
.
identifiableAdapter
.
get_identifiable
(
record
)):
# TODO: remove deepcopy?
# TODO: remove deepcopy?
identified_record
=
(
identified_record
=
(
self
.
identifiableAdapter
.
retrieve_identified_record_for_record
(
self
.
identifiableAdapter
.
retrieve_identified_record_for_record
(
...
@@ -757,8 +754,7 @@ class Crawler(object):
...
@@ -757,8 +754,7 @@ class Crawler(object):
# is it impossible to check this record because an identifiable references a
# is it impossible to check this record because an identifiable references a
# missing record?
# missing record?
elif
self
.
has_missing_object_in_references
(
elif
self
.
has_missing_object_in_references
(
identifiable
):
self
.
identifiableAdapter
.
get_identifiable
(
record
)):
to_be_inserted
.
append
(
record
)
to_be_inserted
.
append
(
record
)
self
.
add_to_remote_missing_cache
(
record
)
self
.
add_to_remote_missing_cache
(
record
)
del
flat
[
i
]
del
flat
[
i
]
...
...
This diff is collapsed.
Click to expand it.
unittests/records.xml
+
133
−
133
View file @
3bebbf91
This diff is collapsed.
Click to expand it.
unittests/test_tool.py
+
7
−
6
View file @
3bebbf91
...
@@ -362,17 +362,18 @@ def test_split_into_inserts_and_updates_trivial(crawler):
...
@@ -362,17 +362,18 @@ def test_split_into_inserts_and_updates_trivial(crawler):
def
test_split_into_inserts_and_updates_single
(
crawler_mocked_identifiable_retrieve
):
def
test_split_into_inserts_and_updates_single
(
crawler_mocked_identifiable_retrieve
):
crawler
=
crawler_mocked_identifiable_retrieve
crawler
=
crawler_mocked_identifiable_retrieve
identlist
=
[
Identifiable
(
name
=
"
A
"
,
record_type
=
"
C
"
),
Identifiable
(
name
=
"
B
"
,
record_type
=
"
C
"
)]
entlist
=
[
db
.
Record
(
name
=
"
A
"
).
add_parent
(
entlist
=
[
db
.
Record
(
name
=
"
A
"
).
add_parent
(
"
C
"
),
db
.
Record
(
name
=
"
B
"
).
add_parent
(
"
C
"
)]
"
C
"
),
db
.
Record
(
name
=
"
B
"
).
add_parent
(
"
C
"
)]
assert
crawler
.
get_from_any_cache
(
entlist
[
0
])
is
None
assert
crawler
.
get_from_any_cache
(
identlist
[
0
])
is
None
assert
crawler
.
get_from_any_cache
(
entlist
[
1
])
is
None
assert
crawler
.
get_from_any_cache
(
identlist
[
1
])
is
None
# assert not crawler.has_reference_value_without_id(entlist[0])
assert
not
crawler
.
has_reference_value_without_id
(
identlist
[
0
])
# assert not crawler.has_reference_value_without_id(entlist[1])
assert
not
crawler
.
has_reference_value_without_id
(
identlist
[
1
])
assert
crawler
.
identifiableAdapter
.
retrieve_identified_record_for_record
(
assert
crawler
.
identifiableAdapter
.
retrieve_identified_record_for_record
(
entlist
[
0
]).
id
==
1111
identlist
[
0
]).
id
==
1111
assert
crawler
.
identifiableAdapter
.
retrieve_identified_record_for_record
(
assert
crawler
.
identifiableAdapter
.
retrieve_identified_record_for_record
(
entlist
[
1
])
is
None
identlist
[
1
])
is
None
insert
,
update
=
crawler
.
split_into_inserts_and_updates
(
deepcopy
(
entlist
))
insert
,
update
=
crawler
.
split_into_inserts_and_updates
(
deepcopy
(
entlist
))
assert
len
(
insert
)
==
1
assert
len
(
insert
)
==
1
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment