CaosDB Crawler
Commit f547fa39
Authored 2 years ago by Alexander Schlemmer
MAINT: made utility and converter registry functions top level functions without references to self
Parent: 8cc9c99a
No related branches or tags found.
Related merge requests: !108 "Release 0.5.0", !104 "Create a new scanner module and move functions from crawl module there"
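
In practice this means the helpers no longer live on a crawler instance but are plain functions of the caoscrawler.scanner module. A minimal before/after sketch of a call site (the instance name and the cfood filename are illustrative assumptions, not taken from the repository):

# Before: load_definition was a method and required an instance
# (hypothetical instance name `crawler`):
# crawler_definition = crawler.load_definition("cfood.yml")

# After this commit: a module-level function, no instance needed
from caoscrawler.scanner import load_definition

crawler_definition = load_definition("cfood.yml")  # placeholder path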
Showing 1 changed file: src/caoscrawler/scanner.py (+36 additions, −30 deletions)
@@ -70,7 +70,7 @@ from .debug.debug_tree import (DebugTreeStructureElement,

 logger = logging.getLogger(__name__)


-def load_definition(self, crawler_definition_path: str):
+def load_definition(crawler_definition_path: str):
     """
     Load a cfood from a crawler definition defined by
     crawler definition path and validate it using cfood-schema.yml.
@@ -80,12 +80,12 @@ def load_definition(self, crawler_definition_path: str):
     with open(crawler_definition_path, "r") as f:
         crawler_definitions = list(yaml.safe_load_all(f))

-    crawler_definition = self._load_definition_from_yaml_dict(
+    crawler_definition = _load_definition_from_yaml_dict(
         crawler_definitions)

-    return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
+    return _resolve_validator_paths(crawler_definition, crawler_definition_path)


-def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
+def _load_definition_from_yaml_dict(crawler_definitions: list[dict]):
     """
     Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
     contains either one or two documents.
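As the docstring notes, a cfood file may hold one or two YAML documents, which is why the file is read with yaml.safe_load_all and materialized into a list. A self-contained sketch of that parsing step; the document contents are illustrative assumptions, with only "Definitions" being a keyword the scanner actually reserves:

import yaml

# Two YAML documents separated by "---": an optional definitions document
# first, then the cfood itself. Node names and keys below are illustrative
# assumptions.
cfood_text = """\
Definitions: {}
---
DataDir:
  type: Directory
  match: data
"""

crawler_definitions = list(yaml.safe_load_all(cfood_text))
print(len(crawler_definitions))  # 2: one definitions document, one cfood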
@@ -131,7 +131,8 @@ def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
     return crawler_definition


-def _resolve_validator_paths(self, definition: dict, definition_path: str):
+def _resolve_validator_paths(definition: dict, definition_path: str):
     """
     Resolve path to validation files with respect to the file in which
     the crawler was defined.
@@ -151,7 +152,7 @@ def _resolve_validator_paths(self, definition: dict, definition_path: str):
                 f"Couldn't find validation file {definition[key]}")
         elif isinstance(value, dict):
             # Recursively resolve all validators
-            definition[key] = self._resolve_validator_paths(value, definition_path)
+            definition[key] = _resolve_validator_paths(value, definition_path)

     return definition
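The recursion above exists so that every validation-file path, however deeply nested in the definition, is interpreted relative to the directory of the cfood file rather than the current working directory. A sketch of the underlying idea, not the function's exact code; the paths are placeholders:

import os

definition_path = "/data/project/cfood.yml"   # placeholder cfood location
validator = "schemas/record.schema.json"      # placeholder relative path

# Resolve the validator path against the directory containing the cfood file
resolved = os.path.join(os.path.dirname(definition_path), validator)
print(resolved)  # /data/project/schemas/record.schema.json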
@@ -194,6 +195,35 @@ def create_converter_registry(definition: dict):
     return converter_registry


+def initialize_converters(crawler_definition: dict, converter_registry: dict):
+    """
+    takes the cfood as dict (`crawler_definition`) and creates the converter objects that
+    are defined on the highest level. Child Converters will in turn be created during the
+    initialization of the Converters.
+    """
+    converters = []
+
+    for key, value in crawler_definition.items():
+        # Definitions and Converters are reserved keywords
+        # on the top level of the yaml file.
+        # TODO: there should also be a top level keyword for the actual
+        #       CFood to avoid confusion between top level keywords
+        #       and the CFood.
+        if key == "Definitions":
+            continue
+        elif key == "Converters":
+            continue
+        converters.append(Converter.converter_factory(
+            value, key, converter_registry))
+
+    return converters
+
+
+# --------------------------------------------------------------------------------
+# Main scanning functions:
+# --------------------------------------------------------------------------------
+
 def crawl_directory(self, dirname: str, crawler_definition_path: str,
                     restricted_path: Optional[list[str]] = None):
     """
     Crawl a single directory.
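Together with create_converter_registry from the hunk header above, the relocated function can be used roughly as follows. A hedged usage sketch: the "DataDir" node and its "type"/"match" keys are illustrative assumptions, while "Definitions" and "Converters" are the reserved top-level keywords the loop skips:

from caoscrawler.scanner import create_converter_registry, initialize_converters

crawler_definition = {
    "Definitions": {},                                  # reserved keyword, skipped
    "Converters": {},                                   # reserved keyword, skipped
    "DataDir": {"type": "Directory", "match": "data"},  # assumed converter node
}

registry = create_converter_registry(crawler_definition)
converters = initialize_converters(crawler_definition, registry)
# -> a list with a single converter, built from the "DataDir" node via
#    Converter.converter_factory(value, key, registry)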
@@ -229,31 +259,7 @@ def crawl_directory(self, dirname: str, crawler_definition_path: str,
                                converter_registry,
                                restricted_path=restricted_path)

-
-def initialize_converters(crawler_definition: dict, converter_registry: dict):
-    """
-    takes the cfood as dict (`crawler_definition`) and creates the converter objects that
-    are defined on the highest level. Child Converters will in turn be created during the
-    initialization of the Converters.
-    """
-    converters = []
-
-    for key, value in crawler_definition.items():
-        # Definitions and Converters are reserved keywords
-        # on the top level of the yaml file.
-        # TODO: there should also be a top level keyword for the actual
-        #       CFood to avoid confusion between top level keywords
-        #       and the CFood.
-        if key == "Definitions":
-            continue
-        elif key == "Converters":
-            continue
-        converters.append(Converter.converter_factory(
-            value, key, converter_registry))
-
-    return converters

 def start_crawling(self, items: Union[list[StructureElement], StructureElement],
                    crawler_definition: dict,