CaosDB Crawler
Commit 31a6b372, authored 2 years ago by Alexander Schlemmer
MAINT: moved main scanner function to scanner module
parent f547fa39
Part of 2 merge requests: !108 "Release 0.5.0", !104 "Create a new scanner module and move functions from crawl module there"
Showing 2 changed files with 120 additions and 114 deletions:

  src/caoscrawler/crawl.py    +0 −107
  src/caoscrawler/scanner.py  +120 −7
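In effect, this commit turns the scanner entry points into plain module-level functions: the diffs below drop the self parameter from crawl_directory and start_crawling, and call load_definition and load_converters as scanner functions instead of Crawler methods. A rough before/after sketch of a call site, assuming the functions are importable from caoscrawler.scanner (the import path, the example directory and the cfood file name are placeholders, not taken from this commit):

# Before this refactoring: crawling is driven through Crawler methods
# (constructor arguments omitted; these lines are only for contrast).
# crawler = Crawler(...)
# crawler.crawl_directory("/data/project", "cfood.yml")

# After: the same operation as a module-level scanner function; only the
# signature is taken from the diff below, everything else is illustrative.
from caoscrawler.scanner import crawl_directory  # import path assumed

crawl_directory("/data/project", "cfood.yml", restricted_path=None)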
src/caoscrawler/crawl.py  +0 −107
@@ -903,113 +903,6 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
         with open(filename, "w") as f:
             f.write(yaml.dump(paths, sort_keys=False))
 
-    def _crawl(self,
-               items: list[StructureElement],
-               local_converters: list[Converter],
-               generalStore: GeneralStore,
-               recordStore: RecordStore,
-               structure_elements_path: list[str],
-               converters_path: list[str],
-               restricted_path: Optional[list[str]] = None):
-        """
-        Crawl a list of StructureElements and apply any matching converters.
-
-        items: structure_elements (e.g. files and folders on one level of the hierarchy)
-        local_converters: locally defined converters for
-            treating structure elements. A locally defined converter could be
-            one that is only valid for a specific subtree of the originally
-            crawled StructureElement structure.
-        generalStore and recordStore: This recursion of the crawl function should only operate on
-            copies of the global stores of the Crawler object.
-        restricted_path: optional, list of strings, traverse the data tree only along the given
-            path. For example, when a directory contains files a, b and c, and b is
-            given in restricted_path, a and c will be ignored by the crawler.
-            When the end of the given path is reached, traverse the full tree as
-            normal. The first element of the list provided by restricted_path should
-            be the name of the StructureElement at this level, i.e. denoting the
-            respective element in the items argument.
-        """
-        # This path_found variable stores whether the path given by restricted_path was found in the
-        # data tree
-        path_found = False
-        if restricted_path is not None and len(restricted_path) == 0:
-            restricted_path = None
-
-        for element in items:
-            for converter in local_converters:
-
-                # type is something like "matches files", replace isinstance with "type_matches"
-                # match function tests regexp for example
-                if (converter.typecheck(element)
-                        and (restricted_path is None or element.name == restricted_path[0])
-                        and converter.match(element) is not None):
-                    path_found = True
-                    generalStore_copy = generalStore.create_scoped_copy()
-                    recordStore_copy = recordStore.create_scoped_copy()
-
-                    # Create an entry for this matched structure element that contains the path:
-                    generalStore_copy[converter.name] = (
-                        os.path.join(*(structure_elements_path + [element.get_name()])))
-
-                    # extracts values from structure element and stores them in the
-                    # variable store
-                    converter.create_values(generalStore_copy, element)
-
-                    keys_modified = converter.create_records(
-                        generalStore_copy, recordStore_copy, element)
-
-                    children = converter.create_children(generalStore_copy, element)
-
-                    if self.debug:
-                        # add provenance information for each variable
-                        self.debug_tree[str(element)] = (
-                            generalStore_copy.get_storage(), recordStore_copy.get_storage())
-                        self.debug_metadata["copied"][str(element)] = (
-                            generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
-                        self.debug_metadata["usage"][str(element)].add(
-                            "/".join(converters_path + [converter.name]))
-                        mod_info = self.debug_metadata["provenance"]
-                        for record_name, prop_name in keys_modified:
-                            # TODO: check
-                            internal_id = recordStore_copy.get_internal_id(record_name)
-                            record_identifier = record_name + \
-                                "_" + str(internal_id)
-                            converter.metadata["usage"].add(record_identifier)
-                            mod_info[record_identifier][prop_name] = (
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name])
-
-                    self._crawl(children, converter.converters, generalStore_copy, recordStore_copy,
-                                structure_elements_path + [element.get_name()],
-                                converters_path + [converter.name],
-                                restricted_path[1:] if restricted_path is not None else None)
-
-        if restricted_path and not path_found:
-            raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
-                               "the data tree")
-        # if the crawler is running out of scope, copy all records in
-        # the recordStore, that were created in this scope
-        # to the general update container.
-        scoped_records = recordStore.get_records_current_scope()
-        for record in scoped_records:
-            self.crawled_data.append(record)
-
-        # TODO: the scoped variables should be cleaned up as soon as the variables
-        # are no longer in the current scope. This can be implemented as follows,
-        # but this breaks the test "test_record_structure_generation", because
-        # some debug info is also deleted. This implementation can be used as soon
-        # as the remaining problems with the debug_tree are fixed.
-        # Delete the variables that are no longer needed:
-        # scoped_names = recordStore.get_names_current_scope()
-        # for name in scoped_names:
-        #     del recordStore[name]
-        #     del generalStore[name]
-
-        return self.crawled_data
-
 
 def crawler_main(crawled_directory_path: str,
                  cfood_file_name: str,
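The restricted_path behaviour described in the docstring of _crawl above can be illustrated with a small, self-contained sketch. The visit helper and the element names "a", "b" and "c" are made up for illustration; only the filter condition mirrors the check used in _crawl:

# Toy model of the restricted_path check in _crawl: an element is visited
# when no restriction is active or its name matches the first entry of
# restricted_path; an exhausted (empty) list lifts the restriction so the
# rest of the tree is traversed as normal.
def visit(names, restricted_path=None):
    if restricted_path is not None and len(restricted_path) == 0:
        restricted_path = None
    return [n for n in names if restricted_path is None or n == restricted_path[0]]

print(visit(["a", "b", "c"]))         # ['a', 'b', 'c']  -- no restriction
print(visit(["a", "b", "c"], ["b"]))  # ['b']            -- only the named element
print(visit(["a", "b", "c"], []))     # ['a', 'b', 'c']  -- end of path reached, traverse as normal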
src/caoscrawler/scanner.py  +120 −7
@@ -218,13 +218,126 @@ def initialize_converters(crawler_definition: dict, converter_registry: dict):
 
     return converters
 
+
+# --------------------------------------------------------------------------------
+# Main scanner function:
+# --------------------------------------------------------------------------------
+
+def _crawl(self,
+           items: list[StructureElement],
+           local_converters: list[Converter],
+           generalStore: GeneralStore,
+           recordStore: RecordStore,
+           structure_elements_path: list[str],
+           converters_path: list[str],
+           restricted_path: Optional[list[str]] = None):
+    """
+    Crawl a list of StructureElements and apply any matching converters.
+
+    items: structure_elements (e.g. files and folders on one level of the hierarchy)
+    local_converters: locally defined converters for
+        treating structure elements. A locally defined converter could be
+        one that is only valid for a specific subtree of the originally
+        crawled StructureElement structure.
+    generalStore and recordStore: This recursion of the crawl function should only operate on
+        copies of the global stores of the Crawler object.
+    restricted_path: optional, list of strings, traverse the data tree only along the given
+        path. For example, when a directory contains files a, b and c, and b is
+        given in restricted_path, a and c will be ignored by the crawler.
+        When the end of the given path is reached, traverse the full tree as
+        normal. The first element of the list provided by restricted_path should
+        be the name of the StructureElement at this level, i.e. denoting the
+        respective element in the items argument.
+    """
+    # This path_found variable stores whether the path given by restricted_path was found in the
+    # data tree
+    path_found = False
+    if restricted_path is not None and len(restricted_path) == 0:
+        restricted_path = None
+
+    for element in items:
+        for converter in local_converters:
+
+            # type is something like "matches files", replace isinstance with "type_matches"
+            # match function tests regexp for example
+            if (converter.typecheck(element)
+                    and (restricted_path is None or element.name == restricted_path[0])
+                    and converter.match(element) is not None):
+                path_found = True
+                generalStore_copy = generalStore.create_scoped_copy()
+                recordStore_copy = recordStore.create_scoped_copy()
+
+                # Create an entry for this matched structure element that contains the path:
+                generalStore_copy[converter.name] = (
+                    os.path.join(*(structure_elements_path + [element.get_name()])))
+
+                # extracts values from structure element and stores them in the
+                # variable store
+                converter.create_values(generalStore_copy, element)
+
+                keys_modified = converter.create_records(
+                    generalStore_copy, recordStore_copy, element)
+
+                children = converter.create_children(generalStore_copy, element)
+
+                if self.debug:
+                    # add provenance information for each variable
+                    self.debug_tree[str(element)] = (
+                        generalStore_copy.get_storage(), recordStore_copy.get_storage())
+                    self.debug_metadata["copied"][str(element)] = (
+                        generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
+                    self.debug_metadata["usage"][str(element)].add(
+                        "/".join(converters_path + [converter.name]))
+                    mod_info = self.debug_metadata["provenance"]
+                    for record_name, prop_name in keys_modified:
+                        # TODO: check
+                        internal_id = recordStore_copy.get_internal_id(record_name)
+                        record_identifier = record_name + \
+                            "_" + str(internal_id)
+                        converter.metadata["usage"].add(record_identifier)
+                        mod_info[record_identifier][prop_name] = (
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name])
+
+                self._crawl(children, converter.converters, generalStore_copy, recordStore_copy,
+                            structure_elements_path + [element.get_name()],
+                            converters_path + [converter.name],
+                            restricted_path[1:] if restricted_path is not None else None)
+
+    if restricted_path and not path_found:
+        raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
+                           "the data tree")
+    # if the crawler is running out of scope, copy all records in
+    # the recordStore, that were created in this scope
+    # to the general update container.
+    scoped_records = recordStore.get_records_current_scope()
+    for record in scoped_records:
+        self.crawled_data.append(record)
+
+    # TODO: the scoped variables should be cleaned up as soon as the variables
+    # are no longer in the current scope. This can be implemented as follows,
+    # but this breaks the test "test_record_structure_generation", because
+    # some debug info is also deleted. This implementation can be used as soon
+    # as the remaining problems with the debug_tree are fixed.
+    # Delete the variables that are no longer needed:
+    # scoped_names = recordStore.get_names_current_scope()
+    # for name in scoped_names:
+    #     del recordStore[name]
+    #     del generalStore[name]
+
+    return self.crawled_data
+
 # --------------------------------------------------------------------------------
-# Main scanning functions:
+# Main scanning interface functions:
 # --------------------------------------------------------------------------------
 
-def crawl_directory(self, dirname: str, crawler_definition_path: str,
+def crawl_directory(dirname: str, crawler_definition_path: str,
                     restricted_path: Optional[list[str]] = None):
     """
     Crawl a single directory.

@@ -236,15 +349,15 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
                      is reached, traverse the full tree as normal.
     """
-    crawler_definition = self.load_definition(crawler_definition_path)
+    crawler_definition = load_definition(crawler_definition_path)
     # Load and register converter packages:
-    converter_registry = self.load_converters(crawler_definition)
+    converter_registry = load_converters(crawler_definition)
 
     if not dirname:
         raise ValueError(
             "You have to provide a non-empty path for crawling.")
     dir_structure_name = os.path.basename(dirname)
-    self.crawled_directory = dirname
+    crawled_directory = dirname
     if not dir_structure_name and dirname.endswith('/'):
         if dirname == '/':
             # Crawling the entire file system

@@ -253,7 +366,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
             # dirname had a trailing '/'
             dir_structure_name = os.path.basename(dirname[:-1])
 
-    self.start_crawling(Directory(dir_structure_name,
+    start_crawling(Directory(dir_structure_name,
                                   dirname),
                         crawler_definition,
                         converter_registry,

@@ -261,7 +374,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
                         )
 
 
-def start_crawling(self, items: Union[list[StructureElement], StructureElement],
+def start_crawling(items: Union[list[StructureElement], StructureElement],
                    crawler_definition: dict,
                    converter_registry: dict,
                    restricted_path: Optional[list[str]] = None):
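As the body of crawl_directory above shows, the directory entry point simply wires together the module-level helpers: load the cfood definition, build the converter registry, then hand a Directory structure element to start_crawling. A hedged sketch of doing the same by hand; the cfood path, the data directory and the import locations are assumptions, only the call signatures come from the diff above:

# Sketch following the body of crawl_directory shown above.
from caoscrawler.scanner import load_definition, load_converters, start_crawling  # import locations assumed
from caoscrawler.structure_elements import Directory  # module path assumed

crawler_definition = load_definition("cfood.yml")          # placeholder cfood file
converter_registry = load_converters(crawler_definition)

start_crawling(Directory("project", "/data/project"),      # placeholder directory
               crawler_definition,
               converter_registry,
               restricted_path=None)

Per the Union type hint in its signature, start_crawling accepts either a single StructureElement or a list of them.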