Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
caosdb-advanced-user-tools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
caosdb-advanced-user-tools
Commits
782db460
Commit
782db460
authored
5 years ago
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
ENH: verbose mode and workaround for empty string props
parent
9dc5ab5f
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/caosadvancedtools/cfood.py
+23
-6
23 additions, 6 deletions
src/caosadvancedtools/cfood.py
src/caosadvancedtools/crawler.py
+58
-11
58 additions, 11 deletions
src/caosadvancedtools/crawler.py
with
81 additions
and
17 deletions
src/caosadvancedtools/cfood.py
+
23
−
6
View file @
782db460
...
@@ -60,7 +60,7 @@ class AbstractCFood(object):
...
@@ -60,7 +60,7 @@ class AbstractCFood(object):
# function match()
# function match()
_pattern
=
None
_pattern
=
None
def
__init__
(
self
,
crawled_file
,
access
=
lambda
x
:
x
):
def
__init__
(
self
,
crawled_file
,
access
=
lambda
x
:
x
,
verbose
=
True
):
"""
Abstract base class for Crawler food (CFood).
"""
Abstract base class for Crawler food (CFood).
Parameters
Parameters
...
@@ -77,6 +77,7 @@ class AbstractCFood(object):
...
@@ -77,6 +77,7 @@ class AbstractCFood(object):
self
.
match
=
type
(
self
).
match
(
crawled_file
.
path
)
self
.
match
=
type
(
self
).
match
(
crawled_file
.
path
)
self
.
to_be_updated
=
db
.
Container
()
self
.
to_be_updated
=
db
.
Container
()
self
.
identifiables
=
db
.
Container
()
self
.
identifiables
=
db
.
Container
()
self
.
verbose
=
verbose
@staticmethod
@staticmethod
def
get_re
():
def
get_re
():
...
@@ -97,10 +98,14 @@ class AbstractCFood(object):
...
@@ -97,10 +98,14 @@ class AbstractCFood(object):
The path of the file that shall be matched.
The path of the file that shall be matched.
"""
"""
if
cls
.
_pattern
is
None
:
# TODO this does not quite work. Sometimes the wrong expression is in
cls
.
_pattern
=
re
.
compile
(
cls
.
get_re
())
# _pattern; FIX
# if cls._pattern is None:
# cls._pattern = re.compile(cls.get_re())
return
cls
.
_pattern
.
match
(
string
)
# return cls._pattern.match(string)
return
re
.
match
(
cls
.
get_re
(),
string
)
def
create_identifiables
(
self
):
def
create_identifiables
(
self
):
"""
"""
...
@@ -157,7 +162,7 @@ class AbstractCFood(object):
...
@@ -157,7 +162,7 @@ class AbstractCFood(object):
def
assure_object_is_in_list
(
obj
,
containing_object
,
property_name
,
def
assure_object_is_in_list
(
obj
,
containing_object
,
property_name
,
to_be_updated
):
to_be_updated
,
verbose
=
True
):
"""
"""
Checks whether `obj` is one of the values in the list property
Checks whether `obj` is one of the values in the list property
`property_name` of the supplied entity containing_object`.
`property_name` of the supplied entity containing_object`.
...
@@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name,
...
@@ -183,13 +188,20 @@ def assure_object_is_in_list(obj, containing_object, property_name,
break
break
if
contained
:
if
contained
:
if
verbose
:
print
(
"
{} is in {} of entity {}
"
.
format
(
obj
,
property_name
,
containing_object
.
id
))
return
return
if
verbose
:
print
(
"
Appending {} to {} of entity {}
"
.
format
(
obj
,
property_name
,
containing_object
.
id
))
current_list
.
append
(
obj
)
current_list
.
append
(
obj
)
to_be_updated
.
append
(
containing_object
)
to_be_updated
.
append
(
containing_object
)
def
assure_has_parent
(
entity
,
parent
,
to_be_updated
):
def
assure_has_parent
(
entity
,
parent
,
to_be_updated
,
verbose
=
True
):
"""
"""
Checks whether `entity` has a parent with name `parent`.
Checks whether `entity` has a parent with name `parent`.
...
@@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated):
...
@@ -207,8 +219,13 @@ def assure_has_parent(entity, parent, to_be_updated):
break
break
if
contained
:
if
contained
:
if
verbose
:
print
(
"
entity {} has parent {}
"
.
format
(
entity
.
id
,
parent
))
return
return
if
verbose
:
print
(
"
Adding parent {} to entity {}
"
.
format
(
parent
,
entity
.
id
))
entity
.
add_parent
(
parent
)
entity
.
add_parent
(
parent
)
to_be_updated
.
append
(
entity
)
to_be_updated
.
append
(
entity
)
...
...
This diff is collapsed.
Click to expand it.
src/caosadvancedtools/crawler.py
+
58
−
11
View file @
782db460
...
@@ -45,7 +45,7 @@ from .cache import Cache
...
@@ -45,7 +45,7 @@ from .cache import Cache
class
Crawler
(
object
):
class
Crawler
(
object
):
def
__init__
(
self
,
food
,
access
=
lambda
x
:
x
,
use_cache
=
False
):
def
__init__
(
self
,
food
,
access
=
lambda
x
:
x
,
use_cache
=
False
,
verbose
=
True
):
"""
"""
Parameters
Parameters
----------
----------
...
@@ -63,12 +63,13 @@ class Crawler(object):
...
@@ -63,12 +63,13 @@ class Crawler(object):
self
.
access
=
access
self
.
access
=
access
self
.
report
=
db
.
Container
()
self
.
report
=
db
.
Container
()
self
.
use_cache
=
use_cache
self
.
use_cache
=
use_cache
self
.
verbose
=
verbose
if
self
.
use_cache
:
if
self
.
use_cache
:
self
.
cache
=
Cache
()
self
.
cache
=
Cache
()
def
crawl
(
self
,
files
):
def
crawl
(
self
,
files
):
for
crawled_file
in
files
:
for
crawled_file
in
sorted
(
files
,
key
=
lambda
x
:
x
.
path
)
:
# if crawled_file.size == 0:
# if crawled_file.size == 0:
# crawled_file.add_message(
# crawled_file.add_message(
# type="Warning", description="This file is empty. Shouldn't we delete it?")
# type="Warning", description="This file is empty. Shouldn't we delete it?")
...
@@ -76,17 +77,31 @@ class Crawler(object):
...
@@ -76,17 +77,31 @@ class Crawler(object):
# continue
# continue
if
self
.
verbose
:
msg
=
"
Matching {}
"
.
format
(
crawled_file
.
path
)
print
(
"
=
"
*
len
(
msg
))
print
(
msg
)
print
(
"
=
"
*
len
(
msg
))
matches
=
0
for
Cfood
in
self
.
food
:
for
Cfood
in
self
.
food
:
if
Cfood
.
match
(
crawled_file
.
path
)
is
not
None
:
if
Cfood
.
match
(
crawled_file
.
path
)
is
not
None
:
matches
+=
1
if
self
.
verbose
:
print
(
"
{} matched.
"
.
format
(
Cfood
.
__name__
))
try
:
try
:
cfood
=
Cfood
(
crawled_file
,
access
=
self
.
access
)
cfood
=
Cfood
(
crawled_file
,
access
=
self
.
access
,
verbose
=
self
.
verbose
)
cfood
.
create_identifiables
()
cfood
.
create_identifiables
()
if
self
.
use_cache
:
if
self
.
use_cache
:
hashes
=
self
.
cache
.
update_ids_from_cache
(
hashes
=
self
.
cache
.
update_ids_from_cache
(
cfood
.
identifiables
)
cfood
.
identifiables
)
self
.
find_or_insert_identifiables
(
cfood
.
identifiables
)
self
.
find_or_insert_identifiables
(
cfood
.
identifiables
,
self
.
verbose
)
if
self
.
use_cache
:
if
self
.
use_cache
:
self
.
cache
.
insert_list
(
hashes
,
cfood
.
identifiables
)
self
.
cache
.
insert_list
(
hashes
,
cfood
.
identifiables
)
...
@@ -97,8 +112,14 @@ class Crawler(object):
...
@@ -97,8 +112,14 @@ class Crawler(object):
traceback
.
print_exc
()
traceback
.
print_exc
()
print
(
e
)
print
(
e
)
if
self
.
verbose
and
matches
==
0
:
print
(
"
ATTENTION: No matching cfood!
"
)
if
self
.
verbose
and
matches
>
1
:
print
(
"
Attention: More than one matching cfood!
"
)
@staticmethod
@staticmethod
def
find_or_insert_identifiables
(
identifiables
):
def
find_or_insert_identifiables
(
identifiables
,
verbose
=
True
):
"""
Sets the ids of identifiables (that do not have already an id from the
"""
Sets the ids of identifiables (that do not have already an id from the
cache) based on searching CaosDB and retrieves those entities.
cache) based on searching CaosDB and retrieves those entities.
The remaining entities (those which can not be retrieved) have no
The remaining entities (those which can not be retrieved) have no
...
@@ -107,15 +128,22 @@ class Crawler(object):
...
@@ -107,15 +128,22 @@ class Crawler(object):
# looking for matching entities in CaosDB when there is no valid id
# looking for matching entities in CaosDB when there is no valid id
# i.e. there was none set from a cache
# i.e. there was none set from a cache
if
verbose
:
print
(
"
-----------------------------------------------------
"
)
for
ent
in
identifiables
:
for
ent
in
identifiables
:
if
ent
.
id
is
None
or
ent
.
id
<
0
:
if
ent
.
id
is
None
or
ent
.
id
<
0
:
if
verbose
:
print
(
"
Looking for:
"
)
print
(
ent
)
existing
=
Crawler
.
find_existing
(
ent
)
existing
=
Crawler
.
find_existing
(
ent
)
if
existing
is
not
None
:
if
existing
is
not
None
:
ent
.
id
=
existing
.
id
ent
.
id
=
existing
.
id
else
:
# this makes entities with existing ids valid
if
verbose
:
# identifiables.retrieve(unique=True, raise_exception_on_error=False)
print
(
"
Id is known of:
"
)
print
(
ent
)
# insert missing, i.e. those which are not valid
# insert missing, i.e. those which are not valid
missing_identifiables
=
db
.
Container
()
missing_identifiables
=
db
.
Container
()
...
@@ -126,9 +154,20 @@ class Crawler(object):
...
@@ -126,9 +154,20 @@ class Crawler(object):
for
ent
in
missing_identifiables
:
for
ent
in
missing_identifiables
:
ent
.
id
=
None
ent
.
id
=
None
if
verbose
:
print
(
"
Going to insert the following entities:
"
)
for
ent
in
missing_identifiables
:
print
(
ent
)
missing_identifiables
.
insert
()
missing_identifiables
.
insert
()
if
verbose
:
print
(
"
Updating entities from CaosDB...
"
)
identifiables
.
retrieve
(
unique
=
True
,
raise_exception_on_error
=
False
)
identifiables
.
retrieve
(
unique
=
True
,
raise_exception_on_error
=
False
)
if
verbose
:
print
(
"
-----------------------------------------------------
"
)
@staticmethod
@staticmethod
def
find_existing
(
entity
):
def
find_existing
(
entity
):
"""
searches for an entity that matches the identifiable in CaosDB
"""
searches for an entity that matches the identifiable in CaosDB
...
@@ -140,12 +179,20 @@ class Crawler(object):
...
@@ -140,12 +179,20 @@ class Crawler(object):
if
entity
.
name
is
None
:
if
entity
.
name
is
None
:
# TODO multiple parents are ignored! Sufficient?
# TODO multiple parents are ignored! Sufficient?
query_string
=
"
FIND Record
"
+
entity
.
get_parents
()[
0
].
name
query_string
=
"
FIND Record
"
+
entity
.
get_parents
()[
0
].
name
query_string
+=
"
WITH
"
+
"
AND
"
.
join
(
query_string
+=
"
WITH
"
[
"'"
+
p
.
name
+
"'
=
'"
+
str
(
get_value
(
p
))
+
"'"
for
p
in
entity
.
get_properties
()])
for
p
in
entity
.
get_properties
():
if
p
.
value
==
""
:
query_string
+=
(
"
NOT
'"
+
p
.
name
+
"'
AND
"
)
else
:
query_string
+=
(
"'"
+
p
.
name
+
"'
=
'"
+
str
(
get_value
(
p
))
+
"'
AND
"
)
# remove the last AND
query_string
=
query_string
[:
-
4
]
else
:
else
:
query_string
=
"
FIND
'
{}
'"
.
format
(
entity
.
name
)
query_string
=
"
FIND
'
{}
'"
.
format
(
entity
.
name
)
print
(
query_string
)
q
=
db
.
Query
(
query_string
)
q
=
db
.
Query
(
query_string
)
# the identifiable should identify an object uniquely. Thus the query
# the identifiable should identify an object uniquely. Thus the query
# is using the unique keyword
# is using the unique keyword
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment