Commit a4c593ea authored by Henrik tom Wörden

Merge branch 'f-subtree' into 'dev'

F subtree

See merge request !93
parents ac3bc48d 24dcf639
Part of 2 merge requests: !105 REL: v0.4.0 and !93 F subtree
Pipeline #33261 passed
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ###
- DateElementConverter: allows text to be interpreted as a date object
- the restricted_path argument allows crawling only a subtree
### Changed ###
@@ -19,9 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed ###
- an empty string as name is treated as no name (as does the server). This fixes
queries for identifiables since they would otherwise contain "WITH name=''",
which is an impossible condition. If your cfoods contained this case, it is
possible that Records are now matched that were not before. You need to adjust
your identifiable definition if this is not wanted.
which is an impossible condition. If your cfoods contained this case, they are ill-defined.
### Security ###
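For orientation, here is a minimal sketch of how the new restricted_path argument might be used; it is adapted from the test added at the end of this merge request, and the definition and data are copies of the ones used there:

from caoscrawler.crawl import Crawler
from caoscrawler.structure_elements import DictElement

# cfood definition as in the new test: integer leaves of a dict of dicts
# become "Subject" properties of "Dataset" records.
crawler_definition = {
    "DictTest": {
        "type": "DictElement",
        "match": "(.*)",
        "subtree": {
            "nextdict": {
                "type": "DictElement",
                "match": "(.*)",
                "subtree": {
                    "int_element": {
                        "type": "IntegerElement",
                        "match_name": ".*",
                        "match_value": "(?P<int_value>.*)",
                        "records": {"Dataset": {"Subject": "$int_value"}},
                    }
                },
            }
        },
    }
}

crawler = Crawler(debug=True)
converter_registry = crawler.load_converters(crawler_definition)

test_dict = {"v1": {"a": 1, "b": 2}, "v2": {"c": 3, "d": 4}}

# Only the "v2" branch is traversed; "v1" is skipped.  The first element of
# restricted_path names the root StructureElement ("TestDict" here); below the
# end of the path the full tree is traversed as normal.
records = crawler.start_crawling(
    DictElement("TestDict", test_dict),
    crawler_definition,
    converter_registry,
    restricted_path=["TestDict", "v2"],
)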
@@ -420,11 +420,16 @@ class Crawler(object):
value["class"] = getattr(module, value["converter"])
return converter_registry
def crawl_directory(self, dirname: str, crawler_definition_path: str):
def crawl_directory(self, dirname: str, crawler_definition_path: str,
restricted_path: Optional[list[str]] = None):
""" Crawl a single directory.
Convenience function that starts the crawler (calls start_crawling)
with a single directory as the StructureElement.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
"""
crawler_definition = self.load_definition(crawler_definition_path)
@@ -447,7 +452,9 @@ class Crawler(object):
self.start_crawling(Directory(dir_structure_name,
dirname),
crawler_definition,
converter_registry)
converter_registry,
restricted_path=restricted_path
)
@staticmethod
def initialize_converters(crawler_definition: dict, converter_registry: dict):
@@ -475,7 +482,8 @@ class Crawler(object):
def start_crawling(self, items: Union[list[StructureElement], StructureElement],
crawler_definition: dict,
converter_registry: dict):
converter_registry: dict,
restricted_path: Optional[list[str]] = None):
"""
Start point of the crawler recursion.
@@ -487,6 +495,9 @@
crawler_definition : dict
A dictionary representing the crawler definition, possibly from a yaml
file.
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
Returns
-------
@@ -507,8 +518,14 @@
# This recursive crawling procedure generates the update list:
self.crawled_data: list[db.Record] = []
self._crawl(items, local_converters, self.generalStore, self.recordStore, [], [])
self._crawl(
items=items,
local_converters=local_converters,
generalStore=self.generalStore,
recordStore=self.recordStore,
structure_elements_path=[],
converters_path=[],
restricted_path=restricted_path)
if self.debug:
self.debug_converters = local_converters
@@ -1159,11 +1176,14 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
with open(filename, "w") as f:
f.write(yaml.dump(paths, sort_keys=False))
def _crawl(self, items: list[StructureElement],
def _crawl(self,
items: list[StructureElement],
local_converters: list[Converter],
generalStore: GeneralStore,
recordStore: RecordStore,
structure_elements_path: list[str], converters_path: list[str]):
structure_elements_path: list[str],
converters_path: list[str],
restricted_path: Optional[list[str]] = None):
"""
Crawl a list of StructureElements and apply any matching converters.
@@ -1172,16 +1192,31 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
treating structure elements. A locally defined converter could be
one that is only valid for a specific subtree of the originally
crawled StructureElement structure.
generalStore and recordStore: This recursion of the crawl function should only operate on copies of the
global stores of the Crawler object.
generalStore and recordStore: This recursion of the crawl function should only operate on
copies of the global stores of the Crawler object.
restricted_path: optional, list of strings, traverse the data tree only along the given
path. For example, when a directory contains files a, b and c and b is
given in restricted_path, a and c will be ignored by the crawler.
When the end of the given path is reached, traverse the full tree as
normal. The first element of the list provided by restricted_path should
be the name of the StructureElement at this level, i.e. denoting the
respective element in the items argument.
"""
# This path_found variable stores whether the path given by restricted_path was found in the
# data tree
path_found = False
if restricted_path is not None and len(restricted_path) == 0:
restricted_path = None
for element in items:
for converter in local_converters:
# type is something like "matches files", replace isinstance with "type_matches"
# match function tests regexp for example
if (converter.typecheck(element) and
converter.match(element) is not None):
if (converter.typecheck(element) and (
restricted_path is None or element.name == restricted_path[0])
and converter.match(element) is not None):
path_found = True
generalStore_copy = generalStore.create_scoped_copy()
recordStore_copy = recordStore.create_scoped_copy()
@@ -1222,7 +1257,12 @@ ____________________\n""".format(i + 1, len(pending_changes)) + str(el[3]))
self._crawl(children, converter.converters,
generalStore_copy, recordStore_copy,
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
converters_path + [converter.name],
restricted_path[1:] if restricted_path is not None else None)
if restricted_path and not path_found:
raise RuntimeError("A 'restricted_path' argument was given that is not contained in "
"the data tree")
# if the crawler is running out of scope, copy all records in
# the recordStore that were created in this scope
# to the general update container.
@@ -1253,6 +1293,7 @@ def crawler_main(crawled_directory_path: str,
prefix: str = "",
securityMode: SecurityMode = SecurityMode.UPDATE,
unique_names=True,
restricted_path: Optional[list[str]] = None
):
"""
@@ -1276,6 +1317,9 @@ def crawler_main(crawled_directory_path: str,
securityMode of Crawler
unique_names : bool
whether or not to update or insert entities in spite of name conflicts
restricted_path: optional, list of strings
Traverse the data tree only along the given path. When the end of the given path
is reached, traverse the full tree as normal.
Returns
-------
@@ -1284,7 +1328,7 @@ def crawler_main(crawled_directory_path: str,
"""
crawler = Crawler(debug=debug, securityMode=securityMode)
try:
crawler.crawl_directory(crawled_directory_path, cfood_file_name)
crawler.crawl_directory(crawled_directory_path, cfood_file_name, restricted_path)
except ConverterValidationError as err:
print(err)
return 1
@@ -1349,6 +1393,15 @@ def parse_args():
formatter_class=RawTextHelpFormatter)
parser.add_argument("cfood_file_name",
help="Path name of the cfood yaml file to be used.")
mg = parser.add_mutually_exclusive_group()
mg.add_argument("-r", "--restrict", nargs="*",
help="Restrict the crawling to the subtree at the end of the given path."
"I.e. for each level that is given the crawler only treats the element "
"with the given name.")
mg.add_argument("--restrict-path", help="same as restrict; instead of a list, this takes a "
"single string that is interpreded as file system path. Note that a trailing"
"separator (e.g. '/') will be ignored. Use --restrict if you need to have "
"empty strings.")
parser.add_argument("--provenance", required=False,
help="Path name of the provenance yaml file. "
"This file will only be generated if this option is set.")
@@ -1380,6 +1433,15 @@ def parse_args():
return parser.parse_args()
def split_restricted_path(path):
elements = []
# stop at the filesystem root or at an empty remainder so that relative paths cannot loop forever
while path not in ("", "/"):
path, el = os.path.split(path)
if el != "":
elements.insert(0, el)
return elements
def main():
args = parse_args()
@@ -1395,6 +1457,11 @@ def main():
if args.add_cwd_to_path:
sys.path.append(os.path.abspath("."))
restricted_path = None
if args.restrict_path:
restricted_path = split_restricted_path(args.restrict_path)
if args.restrict:
restricted_path = args.restrict
sys.exit(crawler_main(
crawled_directory_path=args.crawled_directory_path,
cfood_file_name=args.cfood_file_name,
@@ -1407,6 +1474,7 @@
"insert": SecurityMode.INSERT,
"update": SecurityMode.UPDATE}[args.security_mode],
unique_names=args.unique_names,
restricted_path=restricted_path
))
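As a quick illustration of how the two new command line options relate, the helper split_restricted_path added above converts the single-string form of --restrict-path into the list form that --restrict takes directly; the path elements used here are only an example:

from caoscrawler.crawl import split_restricted_path

# "--restrict-path /2022/experiment_1/" and "--restrict 2022 experiment_1"
# yield the same restriction; the trailing separator is ignored.
assert split_restricted_path("/2022/experiment_1/") == ["2022", "experiment_1"]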
@@ -26,10 +26,11 @@ Tests for the tool using pytest
Adapted from check-sfs
"""
from caoscrawler.stores import GeneralStore, RecordStore
import os
from caoscrawler.crawl import Crawler, SecurityMode
from caoscrawler.crawl import Crawler, SecurityMode, split_restricted_path
from caoscrawler.identifiable import Identifiable
from caoscrawler.structure_elements import File, DictTextElement, DictListElement
from caoscrawler.structure_elements import File, DictTextElement, DictListElement, DictElement
from caoscrawler.identifiable_adapters import IdentifiableAdapter, LocalStorageIdentifiableAdapter
from simulated_server_data import full_data
from functools import partial
@@ -867,3 +868,98 @@ def test_split_into_inserts_and_updates_diff_backref(crawler_mocked_for_backref_
insert, update = crawler.split_into_inserts_and_updates(deepcopy(entlist))
assert len(update) == 2
assert len(insert) == 1
def mock_create_values(values, element):
pass
@patch("caoscrawler.converters.IntegerElementConverter.create_values")
def test_restricted_path(create_mock):
"""
The restricted_path argument allows parts of the crawled data structure to be ignored. Here, we
make sure that, if that argument is provided, indeed only the given path of the tree is traversed.
The check is done using the mock of the create_values function of the IntegerElementConverter.
This function is only called if elements are being treated.
"""
crawler_definition = {
"DictTest": {
"type": "DictElement",
"match": "(.*)",
"subtree": {
"nextdict": {
"type": "DictElement",
"match": "(.*)",
"subtree": {
"int_element": {
"type": "IntegerElement",
"match_name": ".*",
"match_value": "(?P<int_value>.*)",
"records": {
"Dataset": {
"Subject": "$int_value"
}
}
}
}
}
}
}
}
crawler = Crawler(debug=True)
converter_registry = crawler.load_converters(crawler_definition)
# This structure is crawled
test_dict = {
"v1": {
"a": 1,
"b": 2,
},
"v2": {
"c": 3,
"d": 4,
}
}
# first test without a restricted_path
restricted_path = None
records = crawler.start_crawling(
DictElement("TestDict", test_dict), crawler_definition, converter_registry,
restricted_path
)
assert create_mock.call_count == 4
create_mock.reset_mock()
# test with a restricted_path but one that has no effect (single root element)
# this also tests that the remainder of the tree is fully traversed
restricted_path = ["TestDict"]
records = crawler.start_crawling(
DictElement("TestDict", test_dict), crawler_definition, converter_registry,
restricted_path
)
assert create_mock.call_count == 4
create_mock.reset_mock()
# test with a restricted_path that restricts the tree (single root element)
restricted_path = ["TestDict", "v2"]
records = crawler.start_crawling(
DictElement("TestDict", test_dict), crawler_definition, converter_registry,
restricted_path
)
assert create_mock.call_count == 2
create_mock.reset_mock()
# test with a restricted_path that contains a bad element
restricted_path = ["TestDict", "v3"]
with raises(RuntimeError):
records = crawler.start_crawling(
DictElement("TestDict", test_dict), crawler_definition, converter_registry,
restricted_path
)
def test_split_restricted_path():
assert ["el"] == split_restricted_path("/el")
assert ["el"] == split_restricted_path("/el/")
assert ["el", "el"] == split_restricted_path("/el/el")