Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
0adf4e16
Commit
0adf4e16
authored
Nov 13, 2022
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
WIP: tests pass
parent
b67cb9c1
No related branches found
No related tags found
2 merge requests
!91
Release 0.3
,
!67
MAINT: introduce an identifiable class
Pipeline
#30446
passed
Nov 13, 2022
Stage: info
Stage: setup
Stage: cert
Stage: style
Stage: test
Changes
3
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/caoscrawler/crawl.py
+15
-11
15 additions, 11 deletions
src/caoscrawler/crawl.py
src/caoscrawler/identifiable.py
+2
-1
2 additions, 1 deletion
src/caoscrawler/identifiable.py
src/caoscrawler/identifiable_adapters.py
+9
-2
9 additions, 2 deletions
src/caoscrawler/identifiable_adapters.py
with
26 additions
and
14 deletions
src/caoscrawler/crawl.py
+
15
−
11
View file @
0adf4e16
...
@@ -45,6 +45,7 @@ from caosadvancedtools.crawler import Crawler as OldCrawler
...
@@ -45,6 +45,7 @@ from caosadvancedtools.crawler import Crawler as OldCrawler
from
caosdb.common.datatype
import
is_reference
from
caosdb.common.datatype
import
is_reference
from
.stores
import
GeneralStore
,
RecordStore
from
.stores
import
GeneralStore
,
RecordStore
from
.identified_cache
import
IdentifiedCache
from
.identified_cache
import
IdentifiedCache
from
.identifiable
import
Identifiable
from
.structure_elements
import
StructureElement
,
Directory
from
.structure_elements
import
StructureElement
,
Directory
from
.converters
import
Converter
,
DirectoryConverter
from
.converters
import
Converter
,
DirectoryConverter
from
.identifiable_adapters
import
(
IdentifiableAdapter
,
from
.identifiable_adapters
import
(
IdentifiableAdapter
,
...
@@ -484,16 +485,18 @@ class Crawler(object):
...
@@ -484,16 +485,18 @@ class Crawler(object):
return
self
.
_synchronize
(
self
.
crawled_data
,
commit_changes
,
unique_names
=
unique_names
)
return
self
.
_synchronize
(
self
.
crawled_data
,
commit_changes
,
unique_names
=
unique_names
)
def
has_reference_value_without_id
(
self
,
record
:
Identifiable
):
def
has_reference_value_without_id
(
self
,
ident
:
Identifiable
):
"""
"""
Returns True if there is at least one property in `
record
` which:
Returns True if there is at least one property in `
ident
` which:
a) is a reference property AND
a) is a reference property AND
b) where the value is set to a db.Entity (instead of an ID) AND
b) where the value is set to a db.Entity (instead of an ID) AND
c) where the ID of the value (the db.Entity object in b)) is not set (to an integer)
c) where the ID of the value (the db.Entity object in b)) is not set (to an integer)
Returns False otherwise.
Returns False otherwise.
"""
"""
for
pname
,
pvalue
in
record
.
properties
.
items
():
if
ident
is
None
:
return
True
for
pname
,
pvalue
in
ident
.
properties
.
items
():
if
isinstance
(
pvalue
,
list
):
if
isinstance
(
pvalue
,
list
):
for
el
in
pvalue
:
for
el
in
pvalue
:
if
isinstance
(
el
,
db
.
Entity
)
and
el
.
id
is
None
:
if
isinstance
(
el
,
db
.
Entity
)
and
el
.
id
is
None
:
...
@@ -525,14 +528,16 @@ class Crawler(object):
...
@@ -525,14 +528,16 @@ class Crawler(object):
flat
.
append
(
p
.
value
)
flat
.
append
(
p
.
value
)
Crawler
.
create_flat_list
([
p
.
value
],
flat
)
Crawler
.
create_flat_list
([
p
.
value
],
flat
)
def
has_missing_object_in_references
(
self
,
record
:
Identifiable
):
def
has_missing_object_in_references
(
self
,
ident
:
Identifiable
):
"""
"""
returns False if any property value is a db.Entity object that
returns False if any property value is a db.Entity object that
is contained in the `remote_missing_cache`. If
the record
has such an object in the
is contained in the `remote_missing_cache`. If
ident
has such an object in the
reference properties, it means that it references another Entity, where we checked
reference properties, it means that it references another Entity, where we checked
whether it exists remotely and it was not found.
whether it exists remotely and it was not found.
"""
"""
for
pname
,
pvalue
in
record
.
properties
.
items
():
if
ident
is
None
:
return
True
for
pname
,
pvalue
in
ident
.
properties
.
items
():
# if (is_reference(p)
# if (is_reference(p)
# Entity instead of ID and not cached locally
# Entity instead of ID and not cached locally
if
(
isinstance
(
pvalue
,
list
)):
if
(
isinstance
(
pvalue
,
list
)):
...
@@ -540,7 +545,7 @@ class Crawler(object):
...
@@ -540,7 +545,7 @@ class Crawler(object):
if
(
isinstance
(
el
,
db
.
Entity
)
if
(
isinstance
(
el
,
db
.
Entity
)
and
self
.
get_from_remote_missing_cache
(
el
)
is
not
None
):
and
self
.
get_from_remote_missing_cache
(
el
)
is
not
None
):
return
True
return
True
if
(
isinstance
(
p
.
value
,
db
.
Entity
)
if
(
isinstance
(
pvalue
,
db
.
Entity
)
and
self
.
get_from_remote_missing_cache
(
pvalue
)
is
not
None
):
and
self
.
get_from_remote_missing_cache
(
pvalue
)
is
not
None
):
# might be checked when reference is resolved
# might be checked when reference is resolved
return
True
return
True
...
@@ -611,9 +616,7 @@ class Crawler(object):
...
@@ -611,9 +616,7 @@ class Crawler(object):
raise
RuntimeError
(
"
Should not happen.
"
)
raise
RuntimeError
(
"
Should not happen.
"
)
identifiable
=
self
.
identifiableAdapter
.
get_identifiable
(
record
)
identifiable
=
self
.
identifiableAdapter
.
get_identifiable
(
record
)
if
identifiable
is
None
:
if
identifiable
is
None
:
# TODO: check whether the same idea as below works here
return
None
identifiable
=
record
# return None
if
identifiable
in
self
.
remote_existing_cache
:
if
identifiable
in
self
.
remote_existing_cache
:
return
self
.
remote_existing_cache
[
identifiable
]
return
self
.
remote_existing_cache
[
identifiable
]
...
@@ -670,7 +673,7 @@ class Crawler(object):
...
@@ -670,7 +673,7 @@ class Crawler(object):
# TODO: check whether that holds:
# TODO: check whether that holds:
# if there is no identifiable, for the cache that is the same
# if there is no identifiable, for the cache that is the same
# as if the complete entity is the identifiable:
# as if the complete entity is the identifiable:
identifiable
=
record
return
cache
.
add
(
identifiable
=
identifiable
,
record
=
record
)
cache
.
add
(
identifiable
=
identifiable
,
record
=
record
)
@staticmethod
@staticmethod
...
@@ -728,6 +731,7 @@ class Crawler(object):
...
@@ -728,6 +731,7 @@ class Crawler(object):
# can we check whether the record(identifiable) exists on the remote server?
# can we check whether the record(identifiable) exists on the remote server?
elif
not
self
.
has_reference_value_without_id
(
elif
not
self
.
has_reference_value_without_id
(
# TODO move get_identifiable above if else?
self
.
identifiableAdapter
.
get_identifiable
(
record
)):
self
.
identifiableAdapter
.
get_identifiable
(
record
)):
# TODO: remove deepcopy?
# TODO: remove deepcopy?
identified_record
=
(
identified_record
=
(
...
...
This diff is collapsed.
Click to expand it.
src/caoscrawler/identifiable.py
+
2
−
1
View file @
0adf4e16
...
@@ -20,6 +20,7 @@
...
@@ -20,6 +20,7 @@
#
#
from
__future__
import
annotations
from
__future__
import
annotations
from
typing
import
Union
class
Identifiable
():
class
Identifiable
():
...
@@ -42,7 +43,7 @@ class Identifiable():
...
@@ -42,7 +43,7 @@ class Identifiable():
"""
"""
def
__init__
(
self
,
record_type
:
str
=
None
,
name
:
str
=
None
,
properties
:
dict
=
None
,
def
__init__
(
self
,
record_type
:
str
=
None
,
name
:
str
=
None
,
properties
:
dict
=
None
,
path
:
str
=
None
,
backrefs
:
list
[
int
,
str
]
=
None
):
path
:
str
=
None
,
backrefs
:
list
[
Union
[
int
,
str
]
]
=
None
):
self
.
record_type
=
record_type
self
.
record_type
=
record_type
self
.
name
=
name
self
.
name
=
name
self
.
properties
:
dict
=
{}
self
.
properties
:
dict
=
{}
...
...
This diff is collapsed.
Click to expand it.
src/caoscrawler/identifiable_adapters.py
+
9
−
2
View file @
0adf4e16
...
@@ -95,7 +95,9 @@ class IdentifiableAdapter(metaclass=ABCMeta):
...
@@ -95,7 +95,9 @@ class IdentifiableAdapter(metaclass=ABCMeta):
uses the properties of ident to create a query that can determine
uses the properties of ident to create a query that can determine
whether the required record already exists.
whether the required record already exists.
"""
"""
query_string
=
"
FIND Record
"
+
ident
.
record_type
query_string
=
"
FIND Record
"
if
ident
.
record_type
is
not
None
:
query_string
+=
ident
.
record_type
query_string
+=
"
WITH
"
query_string
+=
"
WITH
"
...
@@ -166,6 +168,8 @@ class IdentifiableAdapter(metaclass=ABCMeta):
...
@@ -166,6 +168,8 @@ class IdentifiableAdapter(metaclass=ABCMeta):
registered_identifiable
=
self
.
get_registered_identifiable
(
record
)
registered_identifiable
=
self
.
get_registered_identifiable
(
record
)
if
registered_identifiable
is
None
:
if
registered_identifiable
is
None
:
if
path
is
not
None
:
return
Identifiable
(
path
=
path
)
return
None
return
None
if
len
(
registered_identifiable
.
parents
)
!=
1
:
if
len
(
registered_identifiable
.
parents
)
!=
1
:
raise
RuntimeError
(
"
Multiple parents for identifiables
"
raise
RuntimeError
(
"
Multiple parents for identifiables
"
...
@@ -326,7 +330,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter):
...
@@ -326,7 +330,8 @@ class LocalStorageIdentifiableAdapter(IdentifiableAdapter):
record is the record from the local database to check against.
record is the record from the local database to check against.
identifiable is the record that was created during the crawler run.
identifiable is the record that was created during the crawler run.
"""
"""
if
not
has_parent
(
record
,
identifiable
.
record_type
):
if
(
identifiable
.
record_type
is
not
None
and
not
has_parent
(
record
,
identifiable
.
record_type
)):
return
False
return
False
for
propname
,
propvalue
in
identifiable
.
properties
.
items
():
for
propname
,
propvalue
in
identifiable
.
properties
.
items
():
prop_record
=
record
.
get_property
(
propname
)
prop_record
=
record
.
get_property
(
propname
)
...
@@ -419,6 +424,8 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter):
...
@@ -419,6 +424,8 @@ class CaosDBIdentifiableAdapter(IdentifiableAdapter):
It is assumed, that there is exactly one identifiable for each RecordType. Only the first
It is assumed, that there is exactly one identifiable for each RecordType. Only the first
parent of the given Record is considered; others are ignored
parent of the given Record is considered; others are ignored
"""
"""
if
len
(
record
.
parents
)
==
0
:
return
None
rt_name
=
record
.
parents
[
0
].
name
rt_name
=
record
.
parents
[
0
].
name
for
name
,
definition
in
self
.
_registered_identifiables
.
items
():
for
name
,
definition
in
self
.
_registered_identifiables
.
items
():
if
definition
.
parents
[
0
].
name
.
lower
()
==
rt_name
.
lower
():
if
definition
.
parents
[
0
].
name
.
lower
()
==
rt_name
.
lower
():
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment