Project: caosdb / Software / CaosDB Crawler

Commit 2fdacd93, authored 2 years ago by Alexander Schlemmer

MAINT: moved crawling functions which are actually part of the scanner to scanner module

Parent: dfe93653
No related branches found. No related tags found.
Part of 2 merge requests: !108 "Release 0.5.0" and !104 "Create a new scanner module and move functions from crawl module there".
Pipeline #34405 failed 2 years ago (stages: info, setup, cert, style, test).

Showing 2 changed files with 232 additions and 259 deletions:
- src/caoscrawler/crawl.py: 0 additions, 112 deletions
- src/caoscrawler/scanner.py: 232 additions, 147 deletions
src/caoscrawler/crawl.py (+0, -112)

@@ -233,118 +233,6 @@ class Crawler(object):
    def crawl_directory(self, dirname: str, crawler_definition_path: str,
                        restricted_path: Optional[list[str]] = None):
        """
        Crawl a single directory.

        Convenience function that starts the crawler (calls start_crawling)
        with a single directory as the StructureElement.

        restricted_path: optional, list of strings
            Traverse the data tree only along the given path. When the end of the given path
            is reached, traverse the full tree as normal.
        """
        crawler_definition = self.load_definition(crawler_definition_path)
        # Load and register converter packages:
        converter_registry = self.load_converters(crawler_definition)

        if not dirname:
            raise ValueError("You have to provide a non-empty path for crawling.")
        dir_structure_name = os.path.basename(dirname)
        self.crawled_directory = dirname
        if not dir_structure_name and dirname.endswith('/'):
            if dirname == '/':
                # Crawling the entire file system
                dir_structure_name = "root"
            else:
                # dirname had a trailing '/'
                dir_structure_name = os.path.basename(dirname[:-1])

        self.start_crawling(Directory(dir_structure_name, dirname),
                            crawler_definition,
                            converter_registry,
                            restricted_path=restricted_path)
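A note on the trailing-slash handling above: `os.path.basename` returns an empty string for paths that end in a separator, which is exactly the case this branch compensates for. A minimal, self-contained sketch of that behavior (paths are illustrative):

```python
import os

# os.path.basename() yields '' for paths with a trailing slash, which is why
# crawl_directory() strips the slash before taking the basename:
print(os.path.basename("/data/experiments"))        # -> "experiments"
print(os.path.basename("/data/experiments/"))       # -> ""
print(os.path.basename("/data/experiments/"[:-1]))  # -> "experiments"
print(os.path.basename("/"))                        # -> "" (special-cased as "root")
```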
    @staticmethod
    def initialize_converters(crawler_definition: dict, converter_registry: dict):
        """
        takes the cfood as dict (`crawler_definition`) and creates the converter objects that
        are defined on the highest level. Child Converters will in turn be created during the
        initialization of the Converters.
        """
        converters = []

        for key, value in crawler_definition.items():
            # Definitions and Converters are reserved keywords
            # on the top level of the yaml file.
            # TODO: there should also be a top level keyword for the actual
            #       CFood to avoid confusion between top level keywords
            #       and the CFood.
            if key == "Definitions":
                continue
            elif key == "Converters":
                continue
            converters.append(Converter.converter_factory(value, key, converter_registry))

        return converters
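The loop above treats every top-level key except the reserved `Definitions` and `Converters` keywords as a converter definition. A standalone sketch of that filtering, using a hypothetical cfood dict and a stub in place of `Converter.converter_factory`:

```python
# Hypothetical top level of a cfood; only "DataDir" would yield a converter.
crawler_definition = {
    "Definitions": {},                           # reserved keyword, skipped
    "Converters": {},                            # reserved keyword, skipped
    "DataDir": {"type": "Directory", "match": "data"},
}

# Stub standing in for Converter.converter_factory(value, key, registry):
def stub_factory(value, key, registry):
    return (key, value["type"])

converters = [stub_factory(value, key, {})
              for key, value in crawler_definition.items()
              if key not in ("Definitions", "Converters")]
print(converters)  # -> [('DataDir', 'Directory')]
```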
    def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                       crawler_definition: dict,
                       converter_registry: dict,
                       restricted_path: Optional[list[str]] = None):
        """
        Start point of the crawler recursion.

        Parameters
        ----------
        items: list
            A list of structure elements (or a single StructureElement) that is used for
            generating the initial items for the crawler. This could e.g. be a Directory.
        crawler_definition : dict
            A dictionary representing the crawler definition, possibly from a yaml file.
        restricted_path: optional, list of strings
            Traverse the data tree only along the given path. When the end of the given path
            is reached, traverse the full tree as normal.

        Returns
        -------
        crawled_data : list
            the final list with the target state of Records.
        """
        # This function builds the tree of converters out of the crawler definition.
        if self.generalStore is None:
            raise RuntimeError("Should not happen.")

        if not isinstance(items, list):
            items = [items]

        self.run_id = uuid.uuid1()
        local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)

        # This recursive crawling procedure generates the update list:
        self.crawled_data: list[db.Record] = []
        self._crawl(items=items,
                    local_converters=local_converters,
                    generalStore=self.generalStore,
                    recordStore=self.recordStore,
                    structure_elements_path=[],
                    converters_path=[],
                    restricted_path=restricted_path)

        if self.debug:
            self.debug_converters = local_converters

        return self.crawled_data
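For orientation, a hedged usage sketch of this pre-refactoring API; the paths are placeholders and constructor arguments are omitted since they are not shown in this diff:

```python
from caoscrawler.crawl import Crawler

crawler = Crawler()  # constructor options not shown in this diff

# crawl_directory() loads the cfood, builds the converter registry, and
# delegates to start_crawling() with a single Directory StructureElement:
crawler.crawl_directory("/data/experiments", "cfood.yml")  # placeholder paths
records = crawler.crawled_data  # the target state of Records
```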
    def synchronize(self, commit_changes: bool = True, unique_names=True):
        """
        Carry out the actual synchronization.
        ...
src/caoscrawler/scanner.py (+232, -147)

@@ -70,154 +70,239 @@ from .debug.debug_tree import (DebugTreeStructureElement,
logger = logging.getLogger(__name__)
def load_definition(self, crawler_definition_path: str):
    """
    Load a cfood from a crawler definition defined by
    crawler definition path and validate it using cfood-schema.yml.
    """
    # Load the cfood from a yaml file:
    with open(crawler_definition_path, "r") as f:
        crawler_definitions = list(yaml.safe_load_all(f))

    crawler_definition = self._load_definition_from_yaml_dict(crawler_definitions)

    return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
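`yaml.safe_load_all` yields one document per `---`-separated block, which is what lets a cfood optionally carry a leading metadata document. A minimal sketch of the two layouts that `_load_definition_from_yaml_dict` (below) accepts; the keys are illustrative:

```python
import yaml

single_doc = """
DataDir:
  type: Directory
  match: data
"""

two_docs = """
---
metadata:
  Converters: {}
---
DataDir:
  type: Directory
  match: data
"""

# list(yaml.safe_load_all(...)) is exactly what load_definition() builds:
print(len(list(yaml.safe_load_all(single_doc))))  # -> 1
print(len(list(yaml.safe_load_all(two_docs))))    # -> 2
```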
def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
    """
    Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
    contains either one or two documents.

    Doesn't resolve the validator paths in the cfood definition, so for
    internal and testing use only.
    """
    if len(crawler_definitions) == 1:
        # Simple case, just one document:
        crawler_definition = crawler_definitions[0]
        metadata = {}
    elif len(crawler_definitions) == 2:
        metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else {}
        crawler_definition = crawler_definitions[1]
    else:
        raise RuntimeError(
            "Crawler definition must not contain more than two documents.")

    check_cfood_version(metadata)

    # TODO: at this point this function can already load the cfood schema extensions
    #       from the crawler definition and add them to the yaml schema that will be
    #       tested in the next lines of code:

    # Load the cfood schema:
    with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
        schema = yaml.safe_load(f)

    # Add custom converters to converter enum in schema:
    if "Converters" in crawler_definition:
        for key in crawler_definition["Converters"]:
            schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(key)
    if len(crawler_definitions) == 2:
        if "Converters" in metadata:
            for key in metadata["Converters"]:
                schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(key)
    # Validate the cfood schema:
    validate(instance=crawler_definition, schema=schema["cfood"])

    return crawler_definition
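Appending custom converter names to the schema's `enum` is what lets them pass the subsequent `validate` call. A self-contained sketch of the same pattern with a toy schema (requires the `jsonschema` package, which provides the `validate` used above):

```python
from jsonschema import validate, ValidationError

# Toy schema mirroring the "type must be one of enum" constraint:
schema = {"properties": {"type": {"enum": ["Directory", "File"]}}}

instance = {"type": "MyCustomConverter"}
try:
    validate(instance=instance, schema=schema)
except ValidationError:
    print("rejected before extending the enum")

# Appending the custom name, as _load_definition_from_yaml_dict() does:
schema["properties"]["type"]["enum"].append("MyCustomConverter")
validate(instance=instance, schema=schema)  # now passes
```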
def _resolve_validator_paths(self, definition: dict, definition_path: str):
    """
    Resolve path to validation files with respect to the file in which
    the crawler was defined.
    """
    for key, value in definition.items():
        if key == "validate" and isinstance(value, str):
            # Validator is given by a path
            if not value.startswith('/'):
                # Not an absolute path
                definition[key] = os.path.join(os.path.dirname(definition_path), value)
                if not os.path.isfile(definition[key]):
                    # TODO(henrik) capture this in `crawler_main` similar to
                    # `ConverterValidationError`.
                    raise FileNotFoundError(
                        f"Couldn't find validation file {definition[key]}")
        elif isinstance(value, dict):
            # Recursively resolve all validators
            definition[key] = self._resolve_validator_paths(value, definition_path)
    return definition
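The path arithmetic above anchors relative `validate` entries at the directory containing the cfood file. A small illustrative sketch (paths are made up):

```python
import os

definition_path = "/home/user/cfoods/my_cfood.yml"
validator = "schemas/record.schema.json"   # relative "validate" entry (illustrative)

# Relative validator paths are resolved against the cfood's own directory:
resolved = os.path.join(os.path.dirname(definition_path), validator)
print(resolved)  # -> /home/user/cfoods/schemas/record.schema.json
```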
def load_converters(self, definition: dict):
    """
    Currently the converter registry is a dictionary containing for each converter:
    - key is the short code, abbreviation for the converter class name
    - module is the name of the module to be imported which must be installed
    - class is the converter class to load and associate with this converter entry

    all other info for the converter needs to be included in the converter plugin
    directory:
    schema.yml file
    README.md documentation

    TODO: this function does not make use of self, so it could become static.
    """
    # Defaults for the converter registry:
    with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
        converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)

    # More converters from definition file:
    if "Converters" in definition:
        for key, entry in definition["Converters"].items():
            if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
                       "DictDictElement", "DictListElement", "DictFloatElement"]:
                warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new "
                                                 "variant; without 'Dict' prefix or "
                                                 "'DictElement' in case of 'Dict'"))

            converter_registry[key] = {
                "converter": entry["converter"],
                "package": entry["package"]
            }

    # Load modules and associate classes:
    for key, value in converter_registry.items():
        module = importlib.import_module(value["package"])
        value["class"] = getattr(module, value["converter"])
    return converter_registry
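The last loop materializes each registry entry's class via a dynamic import. The same pattern, sketched with a standard-library module so it runs anywhere (the entry is illustrative):

```python
import importlib

# A registry entry maps a short code to a package and class name, e.g.:
entry = {"converter": "JSONDecoder", "package": "json"}

# load_converters() resolves the class the same way:
module = importlib.import_module(entry["package"])
entry["class"] = getattr(module, entry["converter"])
print(entry["class"])  # -> <class 'json.decoder.JSONDecoder'>
```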
def crawl_directory(self, dirname: str, crawler_definition_path: str,
                    restricted_path: Optional[list[str]] = None):
    """
    Crawl a single directory.

    Convenience function that starts the crawler (calls start_crawling)
    with a single directory as the StructureElement.

    restricted_path: optional, list of strings
        Traverse the data tree only along the given path. When the end of the given path
        is reached, traverse the full tree as normal.
    """
    crawler_definition = self.load_definition(crawler_definition_path)
    # Load and register converter packages:
    converter_registry = self.load_converters(crawler_definition)

    if not dirname:
        raise ValueError("You have to provide a non-empty path for crawling.")
    dir_structure_name = os.path.basename(dirname)
    self.crawled_directory = dirname
    if not dir_structure_name and dirname.endswith('/'):
        if dirname == '/':
            # Crawling the entire file system
            dir_structure_name = "root"
        else:
            # dirname had a trailing '/'
            dir_structure_name = os.path.basename(dirname[:-1])

    self.start_crawling(Directory(dir_structure_name, dirname),
                        crawler_definition,
                        converter_registry,
                        restricted_path=restricted_path)
def initialize_converters(crawler_definition: dict, converter_registry: dict):
    """
    takes the cfood as dict (`crawler_definition`) and creates the converter objects that
    are defined on the highest level. Child Converters will in turn be created during the
    initialization of the Converters.
    """
    converters = []

    for key, value in crawler_definition.items():
        # Definitions and Converters are reserved keywords
        # on the top level of the yaml file.
        # TODO: there should also be a top level keyword for the actual
        #       CFood to avoid confusion between top level keywords
        #       and the CFood.
        if key == "Definitions":
            continue
        elif key == "Converters":
            continue
        converters.append(Converter.converter_factory(value, key, converter_registry))

    return converters
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                   crawler_definition: dict,
                   converter_registry: dict,
                   restricted_path: Optional[list[str]] = None):
    """
    Start point of the crawler recursion.

    Parameters
    ----------
    items: list
        A list of structure elements (or a single StructureElement) that is used for
        generating the initial items for the crawler. This could e.g. be a Directory.
    crawler_definition : dict
        A dictionary representing the crawler definition, possibly from a yaml file.
    restricted_path: optional, list of strings
        Traverse the data tree only along the given path. When the end of the given path
        is reached, traverse the full tree as normal.

    Returns
    -------
    crawled_data : list
        the final list with the target state of Records.
    """
    # This function builds the tree of converters out of the crawler definition.
    if self.generalStore is None:
        raise RuntimeError("Should not happen.")

    if not isinstance(items, list):
        items = [items]

    self.run_id = uuid.uuid1()
    local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)

    # This recursive crawling procedure generates the update list:
    self.crawled_data: list[db.Record] = []
    self._crawl(items=items,
                local_converters=local_converters,
                generalStore=self.generalStore,
                recordStore=self.recordStore,
                structure_elements_path=[],
                converters_path=[],
                restricted_path=restricted_path)

    if self.debug:
        self.debug_converters = local_converters

    return self.crawled_data
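One detail worth noting: each crawl is stamped with a time-based id from `uuid.uuid1()`, so successive runs are distinguishable. A quick sketch:

```python
import uuid

# uuid1() derives the id from timestamp and host, so each crawler run
# gets a distinct run_id:
first_run = uuid.uuid1()
second_run = uuid.uuid1()
print(first_run)
print(first_run != second_run)  # -> True
```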