Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
b709bd88
Commit
b709bd88
authored
Nov 22, 2021
by
Alexander Schlemmer
Browse files
Options
Downloads
Patches
Plain Diff
ENH: provenance output works
parent
f5426e39
No related branches found
No related tags found
1 merge request
!53
Release 0.1
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
src/newcrawler/converters.py
+14
-8
14 additions, 8 deletions
src/newcrawler/converters.py
src/newcrawler/crawl.py
+27
-6
27 additions, 6 deletions
src/newcrawler/crawl.py
src/newcrawler/stores.py
+11
-14
11 additions, 14 deletions
src/newcrawler/stores.py
tests/test_tool.py
+13
-3
13 additions, 3 deletions
tests/test_tool.py
with
65 additions
and
31 deletions
src/newcrawler/converters.py
+
14
−
8
View file @
b709bd88
...
@@ -182,8 +182,7 @@ class Converter(object):
...
@@ -182,8 +182,7 @@ class Converter(object):
def
create_values
(
self
,
def
create_values
(
self
,
values
:
GeneralStore
,
values
:
GeneralStore
,
element
:
StructureElement
,
element
:
StructureElement
):
converters_path
:
list
,
structure_elements_path
:
list
):
"""
"""
Extract information from the structure element and store them as values in the
Extract information from the structure element and store them as values in the
general store.
general store.
...
@@ -195,7 +194,7 @@ class Converter(object):
...
@@ -195,7 +194,7 @@ class Converter(object):
if
m
is
None
:
if
m
is
None
:
# this should never happen as the condition was checked before already
# this should never happen as the condition was checked before already
raise
RuntimeError
(
"
Condition does not match.
"
)
raise
RuntimeError
(
"
Condition does not match.
"
)
values
.
update
(
m
,
converters_path
,
structure_elements_path
)
values
.
update
(
m
)
@abstractmethod
@abstractmethod
def
create_children
(
self
,
values
:
GeneralStore
,
def
create_children
(
self
,
values
:
GeneralStore
,
...
@@ -204,11 +203,16 @@ class Converter(object):
...
@@ -204,11 +203,16 @@ class Converter(object):
def
create_records
(
self
,
values
:
GeneralStore
,
def
create_records
(
self
,
values
:
GeneralStore
,
records
:
RecordStore
,
records
:
RecordStore
,
element
:
StructureElement
,
element
:
StructureElement
):
converters_path
:
list
,
structure_elements_path
:
list
):
if
"
records
"
not
in
self
.
definition
:
if
"
records
"
not
in
self
.
definition
:
return
return
[]
# list of keys to identify, which variables have been set by which paths:
# these are tuples:
# 0: record name
# 1: property name
keys_modified
=
[]
for
name
,
record
in
self
.
definition
[
"
records
"
].
items
():
for
name
,
record
in
self
.
definition
[
"
records
"
].
items
():
# whether the record already exists in the store or not are actually really
# whether the record already exists in the store or not are actually really
...
@@ -216,15 +220,16 @@ class Converter(object):
...
@@ -216,15 +220,16 @@ class Converter(object):
if
name
not
in
records
:
if
name
not
in
records
:
c_record
=
db
.
Record
()
c_record
=
db
.
Record
()
# add the new record to the record store:
# add the new record to the record store:
records
.
set_value
(
name
,
c_record
,
converters_path
,
structure_elements_path
)
records
[
name
]
=
c_record
# additionally add the new record to the general store:
# additionally add the new record to the general store:
values
.
set_value
(
name
,
c_record
,
converters_path
,
structure_elements_path
)
values
[
name
]
=
c_record
c_record
=
records
[
name
]
c_record
=
records
[
name
]
for
key
,
value
in
record
.
items
():
for
key
,
value
in
record
.
items
():
if
key
==
"
parents
"
:
if
key
==
"
parents
"
:
continue
continue
keys_modified
.
append
((
name
,
key
))
propvalue
,
collection_mode
=
handle_value
(
value
,
values
)
propvalue
,
collection_mode
=
handle_value
(
value
,
values
)
if
c_record
.
get_property
(
key
)
is
None
:
if
c_record
.
get_property
(
key
)
is
None
:
...
@@ -253,6 +258,7 @@ class Converter(object):
...
@@ -253,6 +258,7 @@ class Converter(object):
else
:
else
:
if
not
has_parent
(
c_record
,
name
):
if
not
has_parent
(
c_record
,
name
):
c_record
.
add_parent
(
name
)
c_record
.
add_parent
(
name
)
return
keys_modified
...
...
This diff is collapsed.
Click to expand it.
src/newcrawler/crawl.py
+
27
−
6
View file @
b709bd88
...
@@ -69,6 +69,7 @@ from .stores import GeneralStore, RecordStore
...
@@ -69,6 +69,7 @@ from .stores import GeneralStore, RecordStore
from
.structure_elements
import
StructureElement
,
Directory
,
File
from
.structure_elements
import
StructureElement
,
Directory
,
File
from
.converters
import
Converter
,
DirectoryConverter
from
.converters
import
Converter
,
DirectoryConverter
from
.identifiable_adapters
import
LocalStorageIdentifiableAdapter
from
.identifiable_adapters
import
LocalStorageIdentifiableAdapter
from
collections
import
defaultdict
class
Crawler
(
object
):
class
Crawler
(
object
):
...
@@ -107,7 +108,9 @@ class Crawler(object):
...
@@ -107,7 +108,9 @@ class Crawler(object):
# 0: generalStore
# 0: generalStore
# 1: recordStore
# 1: recordStore
self
.
debug_tree
:
dict
[
str
,
tuple
]
=
dict
()
self
.
debug_tree
:
dict
[
str
,
tuple
]
=
dict
()
self
.
debug_metadata
:
dict
[
str
,
dict
[
str
,
tuple
]]
=
dict
()
self
.
debug_metadata
:
dict
[
str
,
dict
]
=
dict
()
self
.
debug_metadata
[
"
copied
"
]
=
dict
()
self
.
debug_metadata
[
"
modified
"
]
=
defaultdict
(
lambda
:
dict
())
def
crawl_directory
(
self
,
dirname
:
str
,
def
crawl_directory
(
self
,
dirname
:
str
,
cfood
:
str
):
cfood
:
str
):
...
@@ -169,6 +172,20 @@ class Crawler(object):
...
@@ -169,6 +172,20 @@ class Crawler(object):
pass
pass
def
save_debug_data
(
self
,
filename
:
str
):
paths
:
dict
[
str
,
dict
]
=
{
"
provenance
"
:
dict
()}
mod_info
=
self
.
debug_metadata
[
"
modified
"
]
for
record_name
in
mod_info
:
paths
[
"
provenance
"
][
record_name
]
=
dict
()
for
prop_name
in
mod_info
[
record_name
]:
paths
[
"
provenance
"
][
record_name
][
prop_name
]
=
{
"
structure_elements_path
"
:
"
/
"
.
join
(
mod_info
[
record_name
][
prop_name
][
0
]),
"
converters_path
"
:
"
/
"
.
join
(
mod_info
[
record_name
][
prop_name
][
1
])}
with
open
(
filename
,
"
w
"
)
as
f
:
f
.
write
(
yaml
.
dump
(
paths
))
def
_crawl
(
self
,
items
:
list
[
StructureElement
],
def
_crawl
(
self
,
items
:
list
[
StructureElement
],
global_converters
:
list
[
Converter
],
global_converters
:
list
[
Converter
],
...
@@ -202,15 +219,22 @@ class Crawler(object):
...
@@ -202,15 +219,22 @@ class Crawler(object):
# -> rather store it in the variable storage than in the converter?
# -> rather store it in the variable storage than in the converter?
converter
.
create_values
(
generalStore_copy
,
element
)
converter
.
create_values
(
generalStore_copy
,
element
)
converter
.
create_records
(
generalStore_copy
,
recordStore_copy
,
element
)
keys_modified
=
converter
.
create_records
(
generalStore_copy
,
recordStore_copy
,
element
)
children
=
converter
.
create_children
(
generalStore_copy
,
element
)
children
=
converter
.
create_children
(
generalStore_copy
,
element
)
if
self
.
debug
:
if
self
.
debug
:
# add provenance information for each variable
# add provenance information for each variable
self
.
debug_tree
[
str
(
element
)]
=
(
self
.
debug_tree
[
str
(
element
)]
=
(
generalStore_copy
.
get_storage
(),
recordStore_copy
.
get_storage
())
generalStore_copy
.
get_storage
(),
recordStore_copy
.
get_storage
())
self
.
debug_metadata
[
str
(
element
)]
[
"
copied
"
]
=
(
self
.
debug_metadata
[
"
copied
"
][
str
(
element
)]
=
(
generalStore_copy
.
get_dict_copied
(),
recordStore_copy
.
get_dict_copied
())
generalStore_copy
.
get_dict_copied
(),
recordStore_copy
.
get_dict_copied
())
mod_info
=
self
.
debug_metadata
[
"
modified
"
]
for
record_name
,
prop_name
in
keys_modified
:
internal_id
=
recordStore_copy
.
get_internal_id
(
record_name
)
mod_info
[
record_name
+
"
_
"
+
str
(
internal_id
)][
prop_name
]
=
(
structure_elements_path
+
[
element
.
get_name
()],
converters_path
+
[
converter
.
name
])
self
.
_crawl
(
children
,
global_converters
,
converter
.
converters
,
self
.
_crawl
(
children
,
global_converters
,
converter
.
converters
,
generalStore_copy
,
recordStore_copy
,
generalStore_copy
,
recordStore_copy
,
...
@@ -220,9 +244,6 @@ class Crawler(object):
...
@@ -220,9 +244,6 @@ class Crawler(object):
# to the general update container.
# to the general update container.
scoped_records
=
recordStore
.
get_records_current_scope
()
scoped_records
=
recordStore
.
get_records_current_scope
()
for
record
in
scoped_records
:
for
record
in
scoped_records
:
print
(
"
/
"
.
join
(
structure_elements_path
))
print
(
"
/
"
.
join
(
converters_path
))
print
(
record
)
self
.
updateList
.
append
(
record
)
self
.
updateList
.
append
(
record
)
return
self
.
updateList
return
self
.
updateList
...
...
This diff is collapsed.
Click to expand it.
src/newcrawler/stores.py
+
11
−
14
View file @
b709bd88
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
#
#
import
caosdb
as
db
import
caosdb
as
db
from
collections
import
defaultdict
class
Store
(
object
):
class
Store
(
object
):
...
@@ -37,8 +38,9 @@ class Store(object):
...
@@ -37,8 +38,9 @@ class Store(object):
# This dict stores whether the corresponding dict item in _storage
# This dict stores whether the corresponding dict item in _storage
# (same key) has been copied from another Store, or was created newly in this store.
# (same key) has been copied from another Store, or was created newly in this store.
self
.
_copied
=
dict
()
self
.
_copied
=
dict
()
self
.
_provenance_structure_elements
=
dict
()
# This attribute stores an internal id for being able to distinguish multiple
self
.
_provenance_converters
=
dict
()
# occurrences of the same thing in the store:
self
.
_ids
=
defaultdict
(
lambda
:
0
)
def
__getitem__
(
self
,
key
:
str
):
def
__getitem__
(
self
,
key
:
str
):
return
self
.
_storage
[
key
]
return
self
.
_storage
[
key
]
...
@@ -46,14 +48,16 @@ class Store(object):
...
@@ -46,14 +48,16 @@ class Store(object):
def
__contains__
(
self
,
key
:
str
):
def
__contains__
(
self
,
key
:
str
):
return
key
in
self
.
_storage
return
key
in
self
.
_storage
def
update
(
self
,
other
:
dict
,
converters_path
:
list
,
structure_elements_path
:
list
):
def
update
(
self
,
other
:
dict
):
self
.
_storage
.
update
(
other
)
self
.
_storage
.
update
(
other
)
for
key
in
other
:
for
key
in
other
:
self
.
_copied
[
key
]
=
False
self
.
_copied
[
key
]
=
False
self
.
_ids
[
key
]
+=
1
def
set
_value
(
self
,
key
:
str
,
value
,
converters_path
:
list
,
structure_elements_path
:
list
):
def
__
set
item__
(
self
,
key
:
str
,
value
):
self
.
_storage
[
key
]
=
value
self
.
_storage
[
key
]
=
value
self
.
_copied
[
key
]
=
False
self
.
_copied
[
key
]
=
False
self
.
_ids
[
key
]
+=
1
def
get_storage
(
self
):
def
get_storage
(
self
):
return
self
.
_storage
return
self
.
_storage
...
@@ -62,8 +66,7 @@ class Store(object):
...
@@ -62,8 +66,7 @@ class Store(object):
s_copy
=
self
.
__class__
()
s_copy
=
self
.
__class__
()
s_copy
.
_storage
=
dict
(
self
.
_storage
)
s_copy
.
_storage
=
dict
(
self
.
_storage
)
s_copy
.
_copied
=
{
key
:
True
for
key
in
self
.
_copied
}
s_copy
.
_copied
=
{
key
:
True
for
key
in
self
.
_copied
}
s_copy
.
_provenance_structure_elements
=
dict
(
self
.
_provenance_structure_elements
)
s_copy
.
_ids
=
self
.
_ids
s_copy
.
_provenance_converters
=
dict
(
self
.
_provenance_converters
)
return
s_copy
return
s_copy
def
get_dict_copied
(
self
):
def
get_dict_copied
(
self
):
...
@@ -72,17 +75,11 @@ class Store(object):
...
@@ -72,17 +75,11 @@ class Store(object):
"""
"""
return
self
.
_copied
return
self
.
_copied
def
get_
provenance_structure_elements
(
self
):
def
get_
internal_id
(
self
,
key
):
"""
"""
Only for debugging.
Only for debugging.
"""
"""
return
self
.
_provenance_structure_elements
return
self
.
_ids
[
key
]
def
get_provenance_converters
(
self
):
"""
Only for debugging.
"""
return
self
.
_provenance_converters
class
GeneralStore
(
Store
):
class
GeneralStore
(
Store
):
pass
pass
...
...
This diff is collapsed.
Click to expand it.
tests/test_tool.py
+
13
−
3
View file @
b709bd88
...
@@ -8,8 +8,15 @@ from newcrawler.converters import MarkdownFileConverter
...
@@ -8,8 +8,15 @@ from newcrawler.converters import MarkdownFileConverter
from
newcrawler.structure_elements
import
File
,
DictTextElement
,
DictListElement
from
newcrawler.structure_elements
import
File
,
DictTextElement
,
DictListElement
from
newcrawler.identifiable_adapters
import
LocalStorageIdentifiableAdapter
from
newcrawler.identifiable_adapters
import
LocalStorageIdentifiableAdapter
from
os.path
import
join
,
dirname
,
basename
from
os.path
import
join
,
dirname
,
basename
from
collections
import
defaultdict
import
yaml
import
caosdb
as
db
import
caosdb
as
db
# Some notes:
# Track provenance information in two ways:
# - DONE: provenance in structure elements and converters for properties of records
# - TODO: list whether information from structure elements and converters was used
def
rfp
(
*
pathcomponents
):
def
rfp
(
*
pathcomponents
):
"""
"""
Return full path.
Return full path.
...
@@ -29,7 +36,7 @@ def test_crawler():
...
@@ -29,7 +36,7 @@ def test_crawler():
rfp
(
"
scifolder_cfood.yml
"
))
rfp
(
"
scifolder_cfood.yml
"
))
subd
=
crawler
.
debug_tree
[
dircheckstr
(
"
DataAnalysis
"
)]
subd
=
crawler
.
debug_tree
[
dircheckstr
(
"
DataAnalysis
"
)]
subc
=
crawler
.
debug_copied
[
dircheckstr
(
"
DataAnalysis
"
)]
subc
=
crawler
.
debug_
metadata
[
"
copied
"
]
[
dircheckstr
(
"
DataAnalysis
"
)]
assert
len
(
subd
)
==
2
assert
len
(
subd
)
==
2
assert
len
(
subd
[
0
])
==
0
assert
len
(
subd
[
0
])
==
0
assert
len
(
subd
[
1
])
==
0
assert
len
(
subd
[
1
])
==
0
...
@@ -38,7 +45,7 @@ def test_crawler():
...
@@ -38,7 +45,7 @@ def test_crawler():
assert
len
(
subc
[
1
])
==
0
assert
len
(
subc
[
1
])
==
0
subd
=
crawler
.
debug_tree
[
dircheckstr
(
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
)]
subd
=
crawler
.
debug_tree
[
dircheckstr
(
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
)]
subc
=
crawler
.
debug_copied
[
dircheckstr
(
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
)]
subc
=
crawler
.
debug_
metadata
[
"
copied
"
]
[
dircheckstr
(
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
)]
assert
len
(
subd
[
1
])
==
1
assert
len
(
subd
[
1
])
==
1
assert
len
(
subd
[
1
][
"
Project
"
].
get_parents
())
==
1
assert
len
(
subd
[
1
][
"
Project
"
].
get_parents
())
==
1
...
@@ -64,7 +71,7 @@ def test_crawler():
...
@@ -64,7 +71,7 @@ def test_crawler():
subd
=
crawler
.
debug_tree
[
dircheckstr
(
"
DataAnalysis
"
,
subd
=
crawler
.
debug_tree
[
dircheckstr
(
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
,
"
2020_climate-model-predict
"
,
"
2020-02-08_prediction-errors
"
)]
"
2020-02-08_prediction-errors
"
)]
subc
=
crawler
.
debug_copied
[
dircheckstr
(
"
DataAnalysis
"
,
subc
=
crawler
.
debug_
metadata
[
"
copied
"
]
[
dircheckstr
(
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
,
"
2020_climate-model-predict
"
,
"
2020-02-08_prediction-errors
"
)]
"
2020-02-08_prediction-errors
"
)]
assert
len
(
subd
[
0
])
==
4
assert
len
(
subd
[
0
])
==
4
...
@@ -160,3 +167,6 @@ def test_crawler_update_list():
...
@@ -160,3 +167,6 @@ def test_crawler_update_list():
ident
.
store_state
(
rfp
(
"
records.xml
"
))
ident
.
store_state
(
rfp
(
"
records.xml
"
))
# ident.restore_state(rfp("records.xml"))
# ident.restore_state(rfp("records.xml"))
assert
len
(
ident
.
get_records
())
==
len
(
crawler
.
updateList
)
assert
len
(
ident
.
get_records
())
==
len
(
crawler
.
updateList
)
crawler
.
save_debug_data
(
rfp
(
"
provenance.yml
"
))
assert
False
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment