Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
C
caosdb-advanced-user-tools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
caosdb-advanced-user-tools
Commits
23445e6e
Commit
23445e6e
authored
5 years ago
by
Henrik tom Wörden
Browse files
Options
Downloads
Patches
Plain Diff
ENH: output can now be controlled by verbosity more specifically
parent
8f96c9cb
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/caosadvancedtools/cfood.py
+36
-17
36 additions, 17 deletions
src/caosadvancedtools/cfood.py
src/caosadvancedtools/crawler.py
+61
-31
61 additions, 31 deletions
src/caosadvancedtools/crawler.py
with
97 additions
and
48 deletions
src/caosadvancedtools/cfood.py
+
36
−
17
View file @
23445e6e
...
...
@@ -38,6 +38,8 @@ import re
import
caosdb
as
db
from
.verbosity
import
INFO
,
VERBOSE
ENTITIES
=
{}
...
...
@@ -60,7 +62,7 @@ class AbstractCFood(object):
# function match()
_pattern
=
None
def
__init__
(
self
,
crawled_file
,
access
=
lambda
x
:
x
,
verbos
e
=
True
):
def
__init__
(
self
,
crawled_file
,
access
=
lambda
x
:
x
,
verbos
ity
=
INFO
):
"""
Abstract base class for Crawler food (CFood).
Parameters
...
...
@@ -77,7 +79,7 @@ class AbstractCFood(object):
self
.
match
=
type
(
self
).
match
(
crawled_file
.
path
)
self
.
to_be_updated
=
db
.
Container
()
self
.
identifiables
=
db
.
Container
()
self
.
verbos
e
=
verbos
e
self
.
verbos
ity
=
verbos
ity
self
.
attached_ones
=
[]
@staticmethod
...
...
@@ -127,7 +129,16 @@ class AbstractCFood(object):
if
len
(
self
.
to_be_updated
)
==
0
:
return
get_ids_for_entities_with_names
(
self
.
to_be_updated
)
if
self
.
verbosity
>=
INFO
:
print
(
"
/
"
*
60
)
print
(
"
UPDATE: updating the following entities
"
)
for
el
in
self
.
to_be_updated
:
print
(
el
.
name
if
el
.
name
is
not
None
else
el
.
id
)
print
(
"
/
"
*
60
)
self
.
to_be_updated
.
update
()
def
attach
(
self
,
crawled_file
):
...
...
@@ -173,7 +184,7 @@ class AbstractCFood(object):
def
assure_object_is_in_list
(
obj
,
containing_object
,
property_name
,
to_be_updated
,
verbos
e
=
True
):
to_be_updated
,
verbos
ity
=
INFO
):
"""
Checks whether `obj` is one of the values in the list property
`property_name` of the supplied entity `containing_object`.
...
...
@@ -212,15 +223,17 @@ def assure_object_is_in_list(obj, containing_object, property_name,
break
if
verbose
:
if
contained
:
if
contained
:
if
verbosity
>=
VERBOSE
:
print
(
"
{} is in {} of entity {}
"
.
format
(
o
,
property_name
,
containing_object
.
id
))
else
:
print
(
"
Appending {} to {} of entity {}
"
.
format
(
o
,
property_name
,
containing_object
.
id
))
if
not
contained
:
else
:
if
verbosity
>=
INFO
:
print
(
"
/
"
*
60
)
print
(
"
UPDATE: Appending {} to {} of entity {}
"
.
format
(
o
,
property_name
,
containing_object
.
id
))
print
(
"
/
"
*
60
)
current_list
.
append
(
o
)
update
=
True
...
...
@@ -228,7 +241,7 @@ def assure_object_is_in_list(obj, containing_object, property_name,
to_be_updated
.
append
(
containing_object
)
def
assure_has_parent
(
entity
,
parent
,
to_be_updated
=
None
,
verbos
e
=
True
):
def
assure_has_parent
(
entity
,
parent
,
to_be_updated
=
None
,
verbos
ity
=
INFO
):
"""
Checks whether `entity` has a parent with name `parent`.
...
...
@@ -247,13 +260,16 @@ def assure_has_parent(entity, parent, to_be_updated=None, verbose=True):
break
if
contained
:
if
verbos
e
:
if
verbos
ity
>=
VERBOSE
:
print
(
"
entity {} has parent {}
"
.
format
(
entity
.
id
,
parent
))
return
if
verbose
:
print
(
"
Adding parent {} to entity {}
"
.
format
(
parent
,
entity
.
id
))
if
verbosity
>=
INFO
:
print
(
"
/
"
*
60
)
print
(
"
UPDATE: Adding parent {} to entity {}
"
.
format
(
parent
,
entity
.
id
))
print
(
"
/
"
*
60
)
entity
.
add_parent
(
parent
)
if
to_be_updated
is
None
:
...
...
@@ -262,7 +278,8 @@ def assure_has_parent(entity, parent, to_be_updated=None, verbose=True):
to_be_updated
.
append
(
entity
)
def
assure_has_property
(
entity
,
name
,
value
,
to_be_updated
=
None
,
verbose
=
True
):
def
assure_has_property
(
entity
,
name
,
value
,
to_be_updated
=
None
,
verbosity
=
INFO
):
"""
Checks whether `entity` has a property `name` with the value `value`.
...
...
@@ -282,17 +299,19 @@ def assure_has_property(entity, name, value, to_be_updated=None, verbose=True):
break
if
contained
:
if
verbos
e
:
if
verbos
ity
>=
VERBOSE
:
print
(
"
entity {} has property {} with value {}
"
.
format
(
entity
.
id
,
name
,
value
))
return
if
verbose
:
print
(
"
Adding property {} with value {} to entity {}
"
.
format
(
if
verbosity
>=
INFO
:
print
(
"
/
"
*
60
)
print
(
"
UPDATE: Adding property {} with value {} to entity {}
"
.
format
(
name
,
value
,
entity
.
id
))
print
(
"
/
"
*
60
)
entity
.
add_property
(
name
=
name
,
value
=
value
)
if
to_be_updated
is
None
:
...
...
This diff is collapsed.
Click to expand it.
src/caosadvancedtools/crawler.py
+
61
−
31
View file @
23445e6e
...
...
@@ -42,10 +42,12 @@ import caosdb as db
from
caosdb.exceptions
import
TransactionError
from
.cache
import
Cache
from
.verbosity
import
DEBUG
,
INFO
,
VERBOSE
class
Crawler
(
object
):
def
__init__
(
self
,
food
,
access
=
lambda
x
:
x
,
use_cache
=
False
,
verbose
=
True
):
def
__init__
(
self
,
food
,
access
=
lambda
x
:
x
,
use_cache
=
False
,
verbosity
=
INFO
):
"""
Parameters
----------
...
...
@@ -63,15 +65,23 @@ class Crawler(object):
self
.
access
=
access
self
.
report
=
db
.
Container
()
self
.
use_cache
=
use_cache
self
.
verbos
e
=
verbos
e
self
.
verbos
ity
=
verbos
ity
if
self
.
use_cache
:
self
.
cache
=
Cache
()
def
crawl
(
self
,
files
):
files
=
sorted
(
files
,
key
=
lambda
x
:
x
.
path
)
cfoods
=
[]
matches
=
{
f
.
path
:
0
for
f
in
files
}
for
crawled_file
in
sorted
(
files
,
key
=
lambda
x
:
x
.
path
):
if
self
.
verbosity
>=
INFO
:
print
(
"
-
"
*
60
)
print
(
"
Matching files against CFoods
"
)
for
crawled_file
in
files
:
if
self
.
verbosity
>=
VERBOSE
:
print
(
"
Matching {}...
"
.
format
(
crawled_file
.
path
))
# if crawled_file.size == 0:
# crawled_file.add_message(
# type="Warning", description="This file is empty. Shouldn't we delete it?")
...
...
@@ -79,37 +89,47 @@ class Crawler(object):
# continue
if
self
.
verbose
:
msg
=
"
Matching {}
"
.
format
(
crawled_file
.
path
)
print
(
"
=
"
*
len
(
msg
))
print
(
msg
)
print
(
"
=
"
*
len
(
msg
))
matches
=
0
for
Cfood
in
self
.
food
:
if
Cfood
.
match
(
crawled_file
.
path
)
is
not
None
:
matches
+=
1
matches
[
crawled_file
.
path
]
+=
1
if
self
.
verbos
e
:
if
self
.
verbos
ity
>=
VERBOSE
:
print
(
"
{} matched.
"
.
format
(
Cfood
.
__name__
))
try
:
cfoods
.
append
(
Cfood
(
crawled_file
,
access
=
self
.
access
,
verbos
e
=
self
.
verbos
e
))
verbos
ity
=
self
.
verbos
ity
))
except
Exception
as
e
:
traceback
.
print_exc
()
print
(
e
)
if
self
.
verbose
and
matches
==
0
:
print
(
"
ATTENTION: No matching cfood!
"
)
if
self
.
verbosity
>=
INFO
:
print
(
"
-
"
*
60
)
print
(
"
Trying to attach files to created CFoods
"
)
if
self
.
verbose
and
matches
>
1
:
print
(
"
Attention: More than one matching cfood!
"
)
for
crawled_file
in
files
:
if
self
.
verbosity
>=
VERBOSE
:
print
(
"
Matching {}...
"
.
format
(
crawled_file
.
path
))
for
crawled_file
in
sorted
(
files
,
key
=
lambda
x
:
x
.
path
):
for
cfood
in
cfoods
:
if
cfood
.
looking_for
(
crawled_file
):
if
self
.
verbosity
>=
VERBOSE
:
print
(
"
{} matched.
"
.
format
(
cfood
.
__class__
.
__name__
))
cfood
.
attach
(
crawled_file
)
matches
[
crawled_file
.
path
]
+=
1
if
self
.
verbosity
>=
INFO
:
for
crawled_file
in
files
:
if
matches
[
crawled_file
.
path
]
==
0
:
print
(
"
ATTENTION: No matching cfood!
"
)
print
(
"
Tried to match {}
"
.
format
(
crawled_file
.
path
))
if
matches
[
crawled_file
.
path
]
>
1
:
print
(
"
Attention: More than one matching cfood!
"
)
print
(
"
Tried to match {}
"
.
format
(
crawled_file
.
path
))
if
self
.
verbosity
>=
INFO
:
print
(
"
-
"
*
60
)
print
(
"
Creating and updating Identifiables
"
)
for
cfood
in
cfoods
:
try
:
...
...
@@ -120,7 +140,7 @@ class Crawler(object):
cfood
.
identifiables
)
self
.
find_or_insert_identifiables
(
cfood
.
identifiables
,
self
.
verbos
e
)
self
.
verbos
ity
)
if
self
.
use_cache
:
self
.
cache
.
insert_list
(
hashes
,
cfood
.
identifiables
)
...
...
@@ -131,8 +151,13 @@ class Crawler(object):
traceback
.
print_exc
()
print
(
e
)
if
self
.
verbosity
>=
INFO
:
print
(
"
-
"
*
60
)
print
(
"
Crawler terminated successfully!
"
)
print
(
"
-
"
*
60
)
@staticmethod
def
find_or_insert_identifiables
(
identifiables
,
verbos
e
=
True
):
def
find_or_insert_identifiables
(
identifiables
,
verbos
ity
=
INFO
):
"""
Sets the ids of identifiables (that do not already have an id from the
cache) based on searching CaosDB and retrieves those entities.
The remaining entities (those which cannot be retrieved) have no
...
...
@@ -141,20 +166,20 @@ class Crawler(object):
# looking for matching entities in CaosDB when there is no valid id
# i.e. there was none set from a cache
if
verbos
e
:
if
verbos
ity
>=
VERBOSE
:
print
(
"
-----------------------------------------------------
"
)
for
ent
in
identifiables
:
if
ent
.
id
is
None
or
ent
.
id
<
0
:
if
verbos
e
:
if
verbos
ity
>=
VERBOSE
:
print
(
"
Looking for:
"
)
print
(
ent
)
existing
=
Crawler
.
find_existing
(
ent
)
existing
=
Crawler
.
find_existing
(
ent
,
verbosity
=
verbosity
)
if
existing
is
not
None
:
ent
.
id
=
existing
.
id
else
:
if
verbos
e
:
if
verbos
ity
>=
DEBUG
:
print
(
"
Id is known of:
"
)
print
(
ent
)
...
...
@@ -167,22 +192,26 @@ class Crawler(object):
for
ent
in
missing_identifiables
:
ent
.
id
=
None
if
verbos
e
:
if
verbos
ity
>=
INFO
and
len
(
missing_identifiables
)
>
0
:
print
(
"
Going to insert the following entities:
"
)
for
ent
in
missing_identifiables
:
print
(
ent
)
if
verbosity
>=
VERBOSE
and
len
(
missing_identifiables
)
==
0
:
print
(
"
No new entities to be inserted.
"
)
missing_identifiables
.
insert
()
if
verbos
e
:
print
(
"
Updat
ing entities from CaosDB...
"
)
if
verbos
ity
>=
VERBOSE
:
print
(
"
Retriev
ing entities from CaosDB...
"
)
identifiables
.
retrieve
(
unique
=
True
,
raise_exception_on_error
=
False
)
if
verbos
e
:
if
verbos
ity
>=
VERBOSE
:
print
(
"
-----------------------------------------------------
"
)
@staticmethod
def
find_existing
(
entity
):
def
find_existing
(
entity
,
verbosity
=
INFO
):
"""
searches for an entity that matches the identifiable in CaosDB
Characteristics of the identifiable, like properties, name or id, are
...
...
@@ -205,7 +234,8 @@ class Crawler(object):
else
:
query_string
=
"
FIND
'
{}
'"
.
format
(
entity
.
name
)
print
(
query_string
)
if
verbosity
>=
VERBOSE
:
print
(
query_string
)
q
=
db
.
Query
(
query_string
)
# the identifiable should identify an object uniquely. Thus the query
# is using the unique keyword
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment