Commit 2fdacd93 authored by Alexander Schlemmer

MAINT: moved crawling functions which are actually part of the scanner to scanner module

parent dfe93653
2 merge requests: !108 Release 0.5.0, !104 Create a new scanner module and move functions from crawl module there
Pipeline #34405 failed
@@ -233,118 +233,6 @@ class Crawler(object):
def crawl_directory(self, dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
Convenience function that starts the crawler (calls start_crawling)
with a single directory as the StructureElement.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
"""
crawler_definition = self.load_definition(crawler_definition_path)
# Load and register converter packages:
converter_registry = self.load_converters(crawler_definition)
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
self.crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
self.start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path
)
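# Usage sketch (editor's illustration, not part of this commit): given an
# already initialized Crawler instance `crawler` and an existing cfood file,
# a restricted crawl of a single directory could look like this; the path and
# the restricted_path entries are purely hypothetical.
#
#     crawler.crawl_directory("/data/experiments", "cfood.yml",
#                             restricted_path=["2023", "run_01"])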
@staticmethod
def initialize_converters(crawler_definition: dict, converter_registry: dict):
"""
Takes the cfood as a dict (`crawler_definition`) and creates the converter objects that
are defined on the highest level. Child converters will in turn be created during the
initialization of their parent converters.
"""
converters = []
for key, value in crawler_definition.items():
# Definitions and Converters are reserved keywords
# on the top level of the yaml file.
# TODO: there should also be a top level keyword for the actual
# CFood to avoid confusion between top level keywords
# and the CFood.
if key == "Definitions":
continue
elif key == "Converters":
continue
converters.append(Converter.converter_factory(
value, key, converter_registry))
return converters
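# Illustrative sketch (editor's addition; the "DataDir" entry and its keys are
# hypothetical): only top-level keys other than the reserved "Definitions" and
# "Converters" are turned into converter objects.
#
#     crawler_definition = {
#         "Definitions": {...},   # reserved, skipped
#         "Converters": {...},    # reserved, skipped
#         "DataDir": {"type": "Directory", "match": "data"},
#     }
#     converters = Crawler.initialize_converters(crawler_definition,
#                                                converter_registry)
#     # `converters` now holds a single converter built from the "DataDir" entry.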
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
"""
Start point of the crawler recursion.
Parameters
----------
items: list
A list of structure elements (or a single StructureElement) that is used for
generating the initial items for the crawler. This could e.g. be a Directory.
crawler_definition : dict
A dictionary representing the crawler definition, possibly from a yaml
file.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
Returns
-------
crawled_data : list
the final list with the target state of Records.
"""
# This function builds the tree of converters out of the crawler definition.
if self.generalStore is None:
raise RuntimeError("Should not happen.")
if not isinstance(items, list):
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.crawled_data: list[db.Record] = []
self._crawl(
items=items,
local_converters=local_converters,
generalStore=self.generalStore,
recordStore=self.recordStore,
structure_elements_path=[],
converters_path=[],
restricted_path=restricted_path)
if self.debug:
self.debug_converters = local_converters
return self.crawled_data
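# Usage sketch (editor's illustration): the typical call chain on an
# initialized Crawler instance `crawler`; file and directory names are
# hypothetical.
#
#     crawler_definition = crawler.load_definition("cfood.yml")
#     converter_registry = crawler.load_converters(crawler_definition)
#     records = crawler.start_crawling(Directory("data", "/path/to/data"),
#                                      crawler_definition,
#                                      converter_registry)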
def synchronize(self, commit_changes: bool = True, unique_names=True):
"""
Carry out the actual synchronization.
......
@@ -70,154 +70,239 @@ from .debug.debug_tree import (DebugTreeStructureElement,
logger = logging.getLogger(__name__)
def load_definition(self, crawler_definition_path: str):
"""
Load a cfood from the crawler definition file at `crawler_definition_path`
and validate it using cfood-schema.yml.
"""
# Load the cfood from a yaml file:
with open(crawler_definition_path, "r") as f:
crawler_definitions = list(yaml.safe_load_all(f))
crawler_definition = self._load_definition_from_yaml_dict(
crawler_definitions)
return self._resolve_validator_paths(crawler_definition, crawler_definition_path)
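# Brief sketch (editor's note; the instance `crawler` and the file name are
# hypothetical):
#
#     definition = crawler.load_definition("cfood.yml")
#
# parses all YAML documents in the file via the helper below, validates them
# against cfood-schema.yml and returns the definition with its "validate"
# paths resolved.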
def _load_definition_from_yaml_dict(self, crawler_definitions: list[dict]):
"""Load crawler definitions from a list of (yaml) dicts `crawler_definitions` which
contains either one or two documents.
Doesn't resolve the validator paths in the cfood definition, so for
internal and testing use only.
"""
if len(crawler_definitions) == 1:
# Simple case, just one document:
crawler_definition = crawler_definitions[0]
metadata = {}
elif len(crawler_definitions) == 2:
metadata = crawler_definitions[0]["metadata"] if "metadata" in crawler_definitions[0] else {
}
crawler_definition = crawler_definitions[1]
else:
raise RuntimeError(
"Crawler definition must not contain more than two documents.")
check_cfood_version(metadata)
# TODO: at this point this function can already load the cfood schema extensions
# from the crawler definition and add them to the yaml schema that will be
# tested in the next lines of code:
# Load the cfood schema:
with open(str(files('caoscrawler').joinpath('cfood-schema.yml')), "r") as f:
schema = yaml.safe_load(f)
# Add custom converters to converter enum in schema:
if "Converters" in crawler_definition:
for key in crawler_definition["Converters"]:
schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
key)
if len(crawler_definitions) == 2:
if "Converters" in metadata:
for key in metadata["Converters"]:
schema["cfood"]["$defs"]["converter"]["properties"]["type"]["enum"].append(
key)
# Validate the cfood schema:
validate(instance=crawler_definition, schema=schema["cfood"])
return crawler_definition
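# Illustrative inputs (editor's addition; dict contents and the instance
# `crawler` are hypothetical): the parsed YAML documents arrive as a list of
# one or two dicts.
#
#     # single document: the crawler definition only, metadata defaults to {}
#     crawler._load_definition_from_yaml_dict([{"DataDir": {...}}])
#
#     # two documents: metadata first, then the crawler definition
#     crawler._load_definition_from_yaml_dict([
#         {"metadata": {"Converters": {"MyConverter": {...}}}},
#         {"DataDir": {...}},
#     ])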
def _resolve_validator_paths(self, definition: dict, definition_path: str):
"""Resolve path to validation files with respect to the file in which
the crawler was defined.
"""
for key, value in definition.items():
if key == "validate" and isinstance(value, str):
# Validator is given by a path
if not value.startswith('/'):
# Not an absolute path
definition[key] = os.path.join(os.path.dirname(definition_path), value)
if not os.path.isfile(definition[key]):
# TODO(henrik) capture this in `crawler_main` similar to
# `ConverterValidationError`.
raise FileNotFoundError(
f"Couldn't find validation file {definition[key]}")
elif isinstance(value, dict):
# Recursively resolve all validators
definition[key] = self._resolve_validator_paths(value, definition_path)
return definition
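# Illustrative effect (editor's addition; file names are hypothetical): for a
# cfood located at /projects/cfood.yml, a relative entry
#
#     {"validate": "schemas/dataset.schema.json"}
#
# is rewritten to
#
#     {"validate": "/projects/schemas/dataset.schema.json"}
#
# and a FileNotFoundError is raised if the resolved file does not exist.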
def load_converters(self, definition: dict):
"""
Currently the converter registry is a dictionary containing for each converter:
- key is the short code, abbreviation for the converter class name
- module is the name of the module to be imported which must be installed
- class is the converter class to load and associate with this converter entry
All other information for the converter needs to be included in the converter plugin
directory:
- schema.yml file
- README.md documentation
TODO: this function does not make use of self, so it could become static.
"""
# Defaults for the converter registry:
with open(str(files('caoscrawler').joinpath('default_converters.yml')), "r") as f:
converter_registry: dict[str, dict[str, str]] = yaml.safe_load(f)
# More converters from definition file:
if "Converters" in definition:
for key, entry in definition["Converters"].items():
if key in ["Dict", "DictTextElement", "DictIntegerElement", "DictBooleanElement",
"DictDictElement", "DictListElement", "DictFloatElement"]:
warnings.warn(DeprecationWarning(f"{key} is deprecated. Please use the new"
" variant; without 'Dict' prefix or "
"'DictElement' in case of 'Dict'"))
converter_registry[key] = {
"converter": entry["converter"],
"package": entry["package"]
}
# Load modules and associate classes:
for key, value in converter_registry.items():
module = importlib.import_module(value["package"])
value["class"] = getattr(module, value["converter"])
return converter_registry
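# Illustrative sketch (editor's addition; the converter name, package and the
# instance `crawler` are hypothetical): a "Converters" section in the
# definition adds entries on top of default_converters.yml, and the returned
# registry also carries the imported class object.
#
#     definition = {
#         "Converters": {
#             "MyTableConverter": {
#                 "converter": "MyTableConverter",
#                 "package": "mypackage.converters",
#             }
#         }
#     }
#     registry = crawler.load_converters(definition)
#     # registry["MyTableConverter"]["class"] is mypackage.converters.MyTableConverter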
def crawl_directory(self, dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
Convenience function that starts the crawler (calls start_crawling)
with a single directory as the StructureElement.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
"""
crawler_definition = self.load_definition(crawler_definition_path)
# Load and register converter packages:
converter_registry = self.load_converters(crawler_definition)
if not dirname:
raise ValueError(
"You have to provide a non-empty path for crawling.")
dir_structure_name = os.path.basename(dirname)
self.crawled_directory = dirname
if not dir_structure_name and dirname.endswith('/'):
if dirname == '/':
# Crawling the entire file system
dir_structure_name = "root"
else:
# dirname had a trailing '/'
dir_structure_name = os.path.basename(dirname[:-1])
self.start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry,
restricted_path=restricted_path
)
def initialize_converters(crawler_definition: dict, converter_registry: dict):
"""
Takes the cfood as a dict (`crawler_definition`) and creates the converter objects that
are defined on the highest level. Child converters will in turn be created during the
initialization of their parent converters.
"""
converters = []
for key, value in crawler_definition.items():
# Definitions and Converters are reserved keywords
# on the top level of the yaml file.
# TODO: there should also be a top level keyword for the actual
# CFood to avoid confusion between top level keywords
# and the CFood.
if key == "Definitions":
continue
elif key == "Converters":
continue
converters.append(Converter.converter_factory(
value, key, converter_registry))
return converters
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
"""
Start point of the crawler recursion.
Parameters
----------
items: list
A list of structure elements (or a single StructureElement) that is used for
generating the initial items for the crawler. This could e.g. be a Directory.
crawler_definition : dict
A dictionary representing the crawler definition, possibly from a yaml
file.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
Returns
-------
crawled_data : list
the final list with the target state of Records.
"""
# This function builds the tree of converters out of the crawler definition.
if self.generalStore is None:
raise RuntimeError("Should not happen.")
if not isinstance(items, list):
items = [items]
self.run_id = uuid.uuid1()
local_converters = Crawler.initialize_converters(crawler_definition, converter_registry)
# This recursive crawling procedure generates the update list:
self.crawled_data: list[db.Record] = []
self._crawl(
items=items,
local_converters=local_converters,
generalStore=self.generalStore,
recordStore=self.recordStore,
structure_elements_path=[],
converters_path=[],
restricted_path=restricted_path)
if self.debug:
self.debug_converters = local_converters
return self.crawled_data