Commit 65f44c12 authored by Florian Spreckelsen

Merge branch 'f-unique-names' into 'dev'

ENH: allow to crawl data with name conflicts

See merge request !43
parents 2151d699 e4a0ee54
2 merge requests: !53 Release 0.1, !43 ENH: allow to crawl data with name conflicts
Pipeline #28758 passed
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Possibility to authorize updates as in the old crawler
 * Allow authorization of inserts
 * Converters can now filter the list of children
+* You can now crawl data with name conflicts: `synchronize(unique_names=False)`

 ### Changed
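In practice this means a crawl no longer has to abort when several records end up sharing a name. A rough usage sketch (the crawler construction and the paths are illustrative assumptions; only the `synchronize(unique_names=...)` keyword is taken from this merge request):

    crawler = Crawler()
    # Hypothetical input; any crawl that produces same-named records applies.
    crawler.crawl_directory("/data/experiments", "cfood.yml")

    # Before this change, name conflicts made the commit fail.
    # With unique_names=False the records are inserted/updated anyway.
    crawler.synchronize(commit_changes=True, unique_names=False)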
@@ -438,7 +438,7 @@ class Crawler(object):
         return self.target_data

-    def synchronize(self, commit_changes: bool = True):
+    def synchronize(self, commit_changes: bool = True, unique_names=True):
         """
         Carry out the actual synchronization.
         """
@@ -446,7 +446,7 @@ class Crawler(object):
         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
-        return self._synchronize(self.target_data, commit_changes)
+        return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)

     def can_be_checked_externally(self, record: db.Record):
         """
@@ -766,7 +766,8 @@ class Crawler(object):
         return db.Entity(name=name).retrieve()

     @staticmethod
-    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None):
+    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
+                                unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
                 entity = Crawler._get_entity_by_name(prop.name)
@@ -775,7 +776,7 @@ class Crawler(object):
         logger.debug(to_be_inserted)
         if len(to_be_inserted) > 0:
             if securityMode.value > SecurityMode.RETRIEVE.value:
-                db.Container().extend(to_be_inserted).insert()
+                db.Container().extend(to_be_inserted).insert(unique=unique_names)
             elif run_id is not None:
                 update_cache = UpdateCache()
                 update_cache.insert(to_be_inserted, run_id, insert=True)
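The `unique` flag is handed straight to pylib's `Container.insert`. To my understanding of the CaosDB pylib API, `unique=True` (its default) makes the insert fail on name clashes, while `unique=False` tolerates them. A minimal sketch of the difference, assuming a configured CaosDB connection:

    import caosdb as db

    # Two records that deliberately share a name.
    recs = [db.Record(name="sample"), db.Record(name="sample")]

    # With unique=True this would be rejected; with unique=False, as the
    # crawler now does for unique_names=False, both records go through.
    db.Container().extend(recs).insert(unique=False)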
@@ -793,18 +794,20 @@ class Crawler(object):
                 _resolve_datatype(prop, entity)

     @staticmethod
-    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None):
+    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
+                                unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
         logger.debug(to_be_updated)
         if len(to_be_updated) > 0:
             if securityMode.value > SecurityMode.INSERT.value:
-                db.Container().extend(to_be_updated).update()
+                db.Container().extend(to_be_updated).update(unique=unique_names)
             elif run_id is not None:
                 update_cache = UpdateCache()
                 update_cache.insert(to_be_updated, run_id)

-    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True):
+    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
+                     unique_names=True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in target_data.
@@ -840,9 +843,9 @@ class Crawler(object):
         if commit_changes:
             self.execute_inserts_in_list(
-                to_be_inserted, self.securityMode, self.run_id)
+                to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
             self.execute_updates_in_list(
-                to_be_updated, self.securityMode, self.run_id)
+                to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)

         update_cache = UpdateCache()
         pending_inserts = update_cache.get_inserts(self.run_id)
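Note that `unique_names` only affects the direct write path; the cache branch is unchanged. Condensed, the gating in both helpers follows this pattern (a restatement of the code above, not new behavior):

    # SecurityMode orders RETRIEVE < INSERT < UPDATE, so comparing enum
    # values decides whether the crawler may write directly.
    if securityMode.value > SecurityMode.RETRIEVE.value:
        db.Container().extend(to_be_inserted).insert(unique=unique_names)
    elif run_id is not None:
        # Writes are merely staged under the run id for later authorization.
        UpdateCache().insert(to_be_inserted, run_id, insert=True)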
@@ -1014,7 +1017,9 @@ def crawler_main(crawled_directory_path: str,
                  provenance_file: str = None,
                  dry_run: bool = False,
                  prefix: str = "",
-                 securityMode: int = SecurityMode.UPDATE):
+                 securityMode: int = SecurityMode.UPDATE,
+                 unique_names=True,
+                 ):
     """
     Parameters
@@ -1035,6 +1040,8 @@ def crawler_main(crawled_directory_path: str,
         remove the given prefix from file paths
     securityMode : int
         securityMode of Crawler
+    unique_names : bool
+        whether to insert or update entities in spite of name conflicts

     Returns
     -------
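Put together, a library caller might use the new keyword like this (all file paths are placeholders; the parameter names follow the keyword call in `main()` further down):

    ret = crawler_main(
        crawled_directory_path="/data/experiments",
        cfood_file_name="cfood.yml",
        identifiables_definition_file="identifiables.yml",
        securityMode=SecurityMode.UPDATE,
        unique_names=False,  # tolerate name conflicts when synchronizing
    )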
@@ -1092,7 +1099,7 @@ def crawler_main(crawled_directory_path: str,
         raise RuntimeError("Missing RecordTypes: {}".
                            format(", ".join(notfound)))

-    crawler.synchronize(commit_changes=True)
+    crawler.synchronize(commit_changes=True, unique_names=unique_names)
     return 0
@@ -1118,9 +1125,9 @@ def parse_args():
     # TODO: load identifiables is a dirty implementation currently
     parser.add_argument("-i", "--load-identifiables",
-                        help="Load identifiables from "
-                        "the given yaml file.")
+                        help="Load identifiables from the given yaml file.")
+    parser.add_argument("-u", "--unique-names",
+                        help="Insert or update entities even if name conflicts exist.")
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
                         "of all file objects.")
...@@ -1142,16 +1149,17 @@ def main(): ...@@ -1142,16 +1149,17 @@ def main():
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
sys.exit(crawler_main( sys.exit(crawler_main(
args.crawled_directory_path, crawled_directory_path=args.crawled_directory_path,
args.cfood_file_name, cfood_file_name=args.cfood_file_name,
args.load_identifiables, identifiables_definition_file=args.load_identifiables,
args.debug, debug=args.debug,
args.provenance, provenance_file=args.provenance,
args.dry_run, dry_run=args.dry_run,
args.prefix, prefix=args.prefix,
{"retrieve": SecurityMode.RETRIEVE, securityMode={"retrieve": SecurityMode.RETRIEVE,
"insert": SecurityMode.INSERT, "insert": SecurityMode.INSERT,
"update": SecurityMode.UPDATE}[args.security_mode] "update": SecurityMode.UPDATE}[args.security_mode],
unique_names=args.unique_names,
)) ))