Commit 65f44c12 authored by Florian Spreckelsen

Merge branch 'f-unique-names' into 'dev'

ENH: allow to crawl data with name conflicts

See merge request !43
parents 2151d699 e4a0ee54
2 merge requests: !53 Release 0.1, !43 ENH: allow to crawl data with name conflicts
Pipeline #28758 passed
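
For reference, a minimal usage sketch of the new option, assuming the `caoscrawler.crawl` module path and the crawler's usual `crawl_directory` workflow; the data directory and cfood file are placeholders:

```python
from caoscrawler.crawl import Crawler, SecurityMode

# Hypothetical setup: directory and cfood definition are placeholders.
crawler = Crawler(securityMode=SecurityMode.UPDATE)
crawler.crawl_directory("/path/to/data", "cfood.yml")

# unique_names=False lets the synchronization insert or update records
# even when entities with the same name already exist.
crawler.synchronize(commit_changes=True, unique_names=False)
```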
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Possibility to authorize updates as in the old crawler
 * Allow authorization of inserts
 * Converters can now filter the list of children
+* You can now crawl data with name conflicts: `synchronize(unique_names=False)`

 ### Changed
@@ -438,7 +438,7 @@ class Crawler(object):
         return self.target_data

-    def synchronize(self, commit_changes: bool = True):
+    def synchronize(self, commit_changes: bool = True, unique_names=True):
         """
         Carry out the actual synchronization.
         """
@@ -446,7 +446,7 @@ class Crawler(object):

         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
-        return self._synchronize(self.target_data, commit_changes)
+        return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)

     def can_be_checked_externally(self, record: db.Record):
         """
@@ -766,7 +766,8 @@ class Crawler(object):
             return db.Entity(name=name).retrieve()

     @staticmethod
-    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None):
+    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
+                                unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
                 entity = Crawler._get_entity_by_name(prop.name)
@@ -775,7 +776,7 @@ class Crawler(object):
         logger.debug(to_be_inserted)
         if len(to_be_inserted) > 0:
             if securityMode.value > SecurityMode.RETRIEVE.value:
-                db.Container().extend(to_be_inserted).insert()
+                db.Container().extend(to_be_inserted).insert(unique=unique_names)
             elif run_id is not None:
                 update_cache = UpdateCache()
                 update_cache.insert(to_be_inserted, run_id, insert=True)
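
Here `unique_names` is simply forwarded as the `unique` flag of caosdb-pylib's `Container.insert` (and, below, `Container.update`). A minimal sketch of that underlying call, with placeholder record names; with `unique=True` (pylib's default) a name conflict makes the transaction fail, which is exactly what this merge request makes optional:

```python
import caosdb as db

# Placeholder record: another entity named "Experiment" may already exist.
rec = db.Record(name="Experiment").add_parent(name="Measurement")

# unique=False permits the insert despite the name conflict;
# unique=True would make the transaction fail instead.
db.Container().extend([rec]).insert(unique=False)
```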
@@ -793,18 +794,20 @@ class Crawler(object):
                     _resolve_datatype(prop, entity)

     @staticmethod
-    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None):
+    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
+                                unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
         logger.debug(to_be_updated)
         if len(to_be_updated) > 0:
             if securityMode.value > SecurityMode.INSERT.value:
-                db.Container().extend(to_be_updated).update()
+                db.Container().extend(to_be_updated).update(unique=unique_names)
             elif run_id is not None:
                 update_cache = UpdateCache()
                 update_cache.insert(to_be_updated, run_id)

-    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True):
+    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
+                     unique_names=True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in target_data.
@@ -840,9 +843,9 @@ class Crawler(object):
         if commit_changes:
             self.execute_inserts_in_list(
-                to_be_inserted, self.securityMode, self.run_id)
+                to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
             self.execute_updates_in_list(
-                to_be_updated, self.securityMode, self.run_id)
+                to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
             update_cache = UpdateCache()
             pending_inserts = update_cache.get_inserts(self.run_id)
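
The `securityMode.value` comparisons above gate which operations are actually committed: inserts require more than RETRIEVE, updates more than INSERT; anything below that threshold is parked in the UpdateCache for later authorization. A sketch of that ordering (the concrete enum values are an assumption, only their order matters here):

```python
from enum import Enum

class SecurityMode(Enum):
    RETRIEVE = 0  # commit nothing
    INSERT = 1    # commit inserts only
    UPDATE = 2    # commit inserts and updates

mode = SecurityMode.INSERT
print(mode.value > SecurityMode.RETRIEVE.value)  # True  -> inserts are committed
print(mode.value > SecurityMode.INSERT.value)    # False -> updates go to the cache
```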
@@ -1014,7 +1017,9 @@ def crawler_main(crawled_directory_path: str,
                  provenance_file: str = None,
                  dry_run: bool = False,
                  prefix: str = "",
-                 securityMode: int = SecurityMode.UPDATE):
+                 securityMode: int = SecurityMode.UPDATE,
+                 unique_names=True,
+                 ):
     """

     Parameters
     ----------
@@ -1035,6 +1040,8 @@ def crawler_main(crawled_directory_path: str,
         remove the given prefix from file paths
     securityMode : int
         securityMode of Crawler
+    unique_names : bool
+        whether to insert or update entities despite name conflicts

     Returns
     -------
@@ -1092,7 +1099,7 @@ def crawler_main(crawled_directory_path: str,
         raise RuntimeError("Missing RecordTypes: {}".
                            format(", ".join(notfound)))

-    crawler.synchronize(commit_changes=True)
+    crawler.synchronize(commit_changes=True, unique_names=unique_names)
     return 0
@@ -1118,9 +1125,9 @@ def parse_args():
     # TODO: load identifiables is a dirty implementation currently
     parser.add_argument("-i", "--load-identifiables",
-                        help="Load identifiables from "
-                        "the given yaml file.")
+                        help="Load identifiables from the given yaml file.")
+    parser.add_argument("-u", "--unique-names",
+                        help="Insert or update entities even if name conflicts exist.")
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
                         "of all file objects.")
@@ -1142,16 +1149,17 @@ def main():
         logger.setLevel(logging.INFO)

     sys.exit(crawler_main(
-        args.crawled_directory_path,
-        args.cfood_file_name,
-        args.load_identifiables,
-        args.debug,
-        args.provenance,
-        args.dry_run,
-        args.prefix,
-        {"retrieve": SecurityMode.RETRIEVE,
-         "insert": SecurityMode.INSERT,
-         "update": SecurityMode.UPDATE}[args.security_mode]
+        crawled_directory_path=args.crawled_directory_path,
+        cfood_file_name=args.cfood_file_name,
+        identifiables_definition_file=args.load_identifiables,
+        debug=args.debug,
+        provenance_file=args.provenance,
+        dry_run=args.dry_run,
+        prefix=args.prefix,
+        securityMode={"retrieve": SecurityMode.RETRIEVE,
+                      "insert": SecurityMode.INSERT,
+                      "update": SecurityMode.UPDATE}[args.security_mode],
+        unique_names=args.unique_names,
     ))
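
An equivalent programmatic call in the new keyword-argument style, assuming the `caoscrawler.crawl` module path; all file paths are placeholders:

```python
from caoscrawler.crawl import crawler_main, SecurityMode

ret = crawler_main(
    crawled_directory_path="/path/to/data",
    cfood_file_name="cfood.yml",
    identifiables_definition_file="identifiables.yml",
    securityMode=SecurityMode.UPDATE,
    # Allow committing records whose names collide with existing entities:
    unique_names=False,
)
```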