CaosDB Crawler · Commit a4c593ea

Merge branch 'f-subtree' into 'dev'

F subtree

See merge request !93

Authored 2 years ago by Henrik tom Wörden
Parents: ac3bc48d, 24dcf639
Contained in 2 merge requests: !105 (REL: v0.4.0) and !93 (F subtree)
Pipeline #33261 passed 2 years ago (stages: info, setup, cert, style, test)

Showing 3 changed files with 181 additions and 18 deletions:
- CHANGELOG.md: 2 additions, 3 deletions
- src/caoscrawler/crawl.py: 81 additions, 13 deletions
- unittests/test_tool.py: 98 additions, 2 deletions
CHANGELOG.md  (+2 −3)

@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added ###

 - DateElementConverter: allows to interpret text as a date object
+- the restricted_path argument allows to crawl only a subtree

 ### Changed ###

@@ -19,9 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed ###

 - an empty string as name is treated as no name (as does the server). This fixes
   queries for identifiables since it would contain "WITH name=''" otherwise
-  which is an impossible condition. If your cfoods contained this case, it is
-  possible that Records are now matched that were not before. You need to adjust
-  your identifiable definition if this is not wanted.
+  which is an impossible condition. If your cfoods contained this case, they are ill-defined.

 ### Security ###
src/caoscrawler/crawl.py  (+81 −13)

@@ -420,11 +420,16 @@ class Crawler(object):
                 value["class"] = getattr(module, value["converter"])

         return converter_registry

-    def crawl_directory(self, dirname: str, crawler_definition_path: str):
+    def crawl_directory(self, dirname: str, crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """
         Crawl a single directory.

         Convenience function that starts the crawler (calls start_crawling)
         with a single directory as the StructureElement.
+
+        restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.
         """
         crawler_definition = self.load_definition(crawler_definition_path)
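For orientation, a minimal usage sketch of the extended method (the directory, cfood file name and path elements are made up; it assumes the root structure element is named after the crawled directory):

```python
from caoscrawler.crawl import Crawler

crawler = Crawler(debug=True)
# Only descend into the "2020" branch of the crawled directory; below "2020"
# the full tree is traversed as usual.
crawler.crawl_directory("/data/ExperimentalData", "cfood.yml",
                        restricted_path=["ExperimentalData", "2020"])
```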
@@ -447,7 +452,9 @@ class Crawler(object):
         self.start_crawling(Directory(dir_structure_name, dirname),
                             crawler_definition,
-                            converter_registry)
+                            converter_registry,
+                            restricted_path=restricted_path)

     @staticmethod
     def initialize_converters(crawler_definition: dict, converter_registry: dict):
@@ -475,7 +482,8 @@ class Crawler(object):
     def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                        crawler_definition: dict,
-                       converter_registry: dict):
+                       converter_registry: dict,
+                       restricted_path: Optional[list[str]] = None):
         """
         Start point of the crawler recursion.
@@ -487,6 +495,9 @@ class Crawler(object):
         crawler_definition : dict
             A dictionary representing the crawler definition, possibly from a yaml
             file.
+        restricted_path: optional, list of strings
+            Traverse the data tree only along the given path. When the end of the given path
+            is reached, traverse the full tree as normal.

         Returns
         -------
@@ -507,8 +518,14 @@ class Crawler(object):
         # This recursive crawling procedure generates the update list:
         self.crawled_data: list[db.Record] = []
-        self._crawl(items, local_converters, self.generalStore, self.recordStore, [], [])
+        self._crawl(items=items,
+                    local_converters=local_converters,
+                    generalStore=self.generalStore,
+                    recordStore=self.recordStore,
+                    structure_elements_path=[],
+                    converters_path=[],
+                    restricted_path=restricted_path)

         if self.debug:
             self.debug_converters = local_converters
@@ -1159,11 +1176,14 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))

     def _crawl(self, items: list[StructureElement],
                local_converters: list[Converter],
                generalStore: GeneralStore,
                recordStore: RecordStore,
-               structure_elements_path: list[str], converters_path: list[str]):
+               structure_elements_path: list[str],
+               converters_path: list[str],
+               restricted_path: Optional[list[str]] = None):
         """
         Crawl a list of StructureElements and apply any matching converters.
@@ -1172,16 +1192,31 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         treating structure elements. A locally defined converter could be
         one that is only valid for a specific subtree of the originally
         crawled StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on copies of the
-        global stores of the Crawler object.
+        generalStore and recordStore: This recursion of the crawl function should only operate on
+                                      copies of the global stores of the Crawler object.
+        restricted_path: optional, list of strings, traverse the data tree only along the given
+                         path. For example, when a directory contains files a, b and c and b is
+                         given in restricted_path, a and c will be ignored by the crawler.
+                         When the end of the given path is reached, traverse the full tree as
+                         normal. The first element of the list provided by restricted_path should
+                         be the name of the StructureElement at this level, i.e. denoting the
+                         respective element in the items argument.
         """
+        # This path_found variable stores whether the path given by restricted_path was found
+        # in the data tree
+        path_found = False
+        if restricted_path is not None and len(restricted_path) == 0:
+            restricted_path = None

         for element in items:
             for converter in local_converters:

                 # type is something like "matches files", replace isinstance with "type_matches"
                 # match function tests regexp for example
-                if (converter.typecheck(element) and converter.match(element) is not None):
+                if (converter.typecheck(element) and (
+                        restricted_path is None or element.name == restricted_path[0])
+                        and converter.match(element) is not None):
+                    path_found = True
                     generalStore_copy = generalStore.create_scoped_copy()
                     recordStore_copy = recordStore.create_scoped_copy()
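The docstring example (a directory containing files a, b and c, with b given in restricted_path) boils down to the per-level gate added to the condition above. The following is a minimal standalone sketch of that logic, not code from the module; the names are illustrative:

```python
def allowed(element_name, restricted_path):
    # No restriction (or an exhausted one) means: traverse everything from here on,
    # mirroring the check at the top of _crawl that turns an empty list into None.
    return not restricted_path or element_name == restricted_path[0]

names = ["a", "b", "c"]                          # e.g. files in one directory
print([n for n in names if allowed(n, ["b"])])   # ['b']  -- only the restricted branch
print([n for n in names if allowed(n, None)])    # ['a', 'b', 'c']  -- unrestricted
```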
@@ -1222,7 +1257,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
                     self._crawl(children, converter.converters,
                                 generalStore_copy, recordStore_copy,
                                 structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
+                                converters_path + [converter.name],
+                                restricted_path[1:] if restricted_path is not None else None)
+
+        if restricted_path and not path_found:
+            raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                               "the data tree")
         # if the crawler is running out of scope, copy all records in
         # the recordStore, that were created in this scope
         # to the general update container.
@@ -1253,6 +1293,7 @@ def crawler_main(crawled_directory_path: str,
                  prefix: str = "",
                  securityMode: SecurityMode = SecurityMode.UPDATE,
                  unique_names=True,
+                 restricted_path: Optional[list[str]] = None
                  ):
     """
@@ -1276,6 +1317,9 @@ def crawler_main(crawled_directory_path: str,
         securityMode of Crawler
     unique_names : bool
         whether or not to update or insert entities in spite of name conflicts
+    restricted_path: optional, list of strings
+        Traverse the data tree only along the given path. When the end of the given path
+        is reached, traverse the full tree as normal.

     Returns
     -------
@@ -1284,7 +1328,7 @@ def crawler_main(crawled_directory_path: str,
     """
     crawler = Crawler(debug=debug, securityMode=securityMode)
     try:
-        crawler.crawl_directory(crawled_directory_path, cfood_file_name)
+        crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path)
     except ConverterValidationError as err:
         print(err)
         return 1
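Scripts that drive the crawler through crawler_main can pass the new keyword straight through. A minimal, hedged sketch (directory, cfood file and path elements are made up; the keyword names match the call in main() further down):

```python
from caoscrawler.crawl import crawler_main

# crawler_main returns 1 if a ConverterValidationError occurs (see the except branch above).
ret = crawler_main(crawled_directory_path="/data/ExperimentalData",
                   cfood_file_name="cfood.yml",
                   restricted_path=["ExperimentalData", "2020"])
```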
@@ -1349,6 +1393,15 @@ def parse_args():
                                      formatter_class=RawTextHelpFormatter)
     parser.add_argument("cfood_file_name",
                         help="Path name of the cfood yaml file to be used.")
+    mg = parser.add_mutually_exclusive_group()
+    mg.add_argument("-r", "--restrict", nargs="*",
+                    help="Restrict the crawling to the subtree at the end of the given path. "
+                         "I.e. for each level that is given the crawler only treats the element "
+                         "with the given name.")
+    mg.add_argument("--restrict-path", help="same as restrict; instead of a list, this takes a "
+                    "single string that is interpreted as file system path. Note that a trailing "
+                    "separator (e.g. '/') will be ignored. Use --restrict if you need to have "
+                    "empty strings.")
     parser.add_argument("--provenance", required=False,
                         help="Path name of the provenance yaml file. "
                              "This file will only be generated if this option is set.")
@@ -1380,6 +1433,15 @@ def parse_args():
     return parser.parse_args()


+def split_restricted_path(path):
+    elements = []
+    while path != "/":
+        path, el = os.path.split(path)
+        if el != "":
+            elements.insert(0, el)
+    return elements
+
+
 def main():
     args = parse_args()
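split_restricted_path converts the single string accepted by --restrict-path into the list form used internally (and by --restrict), so --restrict-path /ExperimentalData/2020 and --restrict ExperimentalData 2020 are meant to select the same subtree. A quick illustration with made-up path elements; as stated in the help text, a trailing separator is ignored:

```python
from caoscrawler.crawl import split_restricted_path

print(split_restricted_path("/ExperimentalData/2020"))    # ['ExperimentalData', '2020']
print(split_restricted_path("/ExperimentalData/2020/"))   # ['ExperimentalData', '2020']
```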
@@ -1395,6 +1457,11 @@ def main():
     if args.add_cwd_to_path:
         sys.path.append(os.path.abspath("."))
+
+    if args.restrict_path:
+        restricted_path = split_restricted_path(args.restrict_path)
+    if args.restrict:
+        restricted_path = args.restrict

     sys.exit(crawler_main(
         crawled_directory_path=args.crawled_directory_path,
         cfood_file_name=args.cfood_file_name,
@@ -1407,6 +1474,7 @@ def main():
         "insert": SecurityMode.INSERT,
         "update": SecurityMode.UPDATE}[args.security_mode],
         unique_names=args.unique_names,
+        restricted_path=restricted_path
     ))
unittests/test_tool.py  (+98 −2)

@@ -26,10 +26,11 @@ Tests for the tool using pytest
 Adapted from check-sfs
 """

 from caoscrawler.stores import GeneralStore, RecordStore
 import os
-from caoscrawler.crawl import Crawler, SecurityMode
+from caoscrawler.crawl import Crawler, SecurityMode, split_restricted_path
 from caoscrawler.identifiable import Identifiable
-from caoscrawler.structure_elements import File, DictTextElement, DictListElement
+from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement
 from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
 from simulated_server_data import full_data
 from functools import partial
@@ -867,3 +868,98 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
     insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
     assert len(update) == 2
     assert len(insert) == 1
+
+
+def mock_create_values(values, element):
+    pass
+
+
+@patch("caoscrawler.converters.IntegerElementConverter.create_values")
+def test_restricted_path(create_mock):
+    """
+    The restricted_path argument allows to ignore part of the crawled data structure. Here, we
+    make sure that, if that argument is provided, indeed only the given path of the tree is
+    traversed. The check is done using the mock of the create_values function of the
+    IntegerElementConverter. This function is only called if elements are being treated.
+    """
+    crawler_definition = {
+        "DictTest": {
+            "type": "DictElement",
+            "match": "(.*)",
+            "subtree": {
+                "nextdict": {
+                    "type": "DictElement",
+                    "match": "(.*)",
+                    "subtree": {
+                        "int_element": {
+                            "type": "IntegerElement",
+                            "match_name": ".*",
+                            "match_value": "(?P<int_value>.*)",
+                            "records": {
+                                "Dataset": {
+                                    "Subject": "$int_value"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    crawler = Crawler(debug=True)
+    converter_registry = crawler.load_converters(crawler_definition)
+
+    # This structure is crawled
+    test_dict = {
+        "v1": {
+            "a": 1,
+            "b": 2,
+        },
+        "v2": {
+            "c": 3,
+            "d": 4,
+        }
+    }
+    # first test without a restricted_path
+    restricted_path = None
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+    # test with a restricted_path but one that has no effect (single root element)
+    # this also tests that the remainder of the tree is fully traversed
+    restricted_path = ["TestDict"]
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 4
+    create_mock.reset_mock()
+    # test with a restricted_path that restricts the tree (single root element)
+    restricted_path = ["TestDict", "v2"]
+    records = crawler.start_crawling(
+        DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+        restricted_path
+    )
+    assert create_mock.call_count == 2
+    create_mock.reset_mock()
+
+    # test with a restricted_path that contains a bad element
+    restricted_path = ["TestDict", "v3"]
+    with raises(RuntimeError):
+        records = crawler.start_crawling(
+            DictElement("TestDict", test_dict), crawler_definition, converter_registry,
+            restricted_path
+        )
+
+
+def test_split_restricted_path():
+    assert ["el"] == split_restricted_path("/el")
+    assert ["el"] == split_restricted_path("/el/")
+    assert ["el", "el"] == split_restricted_path("/el/el")