diff --git a/CHANGELOG.md b/CHANGELOG.md
index 81b6cd332416d35a8ef4c436e391890afb3a43f5..3db61909ec48b24231f011d1a2a4790febb793dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Possibility to authorize updates as in the old crawler
 * Allow authorization of inserts
 * Converters can now filter the list of children
+* You can now crawl data with name conflicts: `synchronize(unique_names=False)`
 
 ### Changed
 
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index cc8d69dd45eacac32c07f9eeb5247b89e1b017c4..d2758baaead7713aa9fedb8b9ad96163405cdf39 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -438,7 +438,7 @@ class Crawler(object):
 
         return self.target_data
 
-    def synchronize(self, commit_changes: bool = True):
+    def synchronize(self, commit_changes: bool = True, unique_names=True):
         """
         Carry out the actual synchronization.
         """
@@ -446,7 +446,7 @@ class Crawler(object):
         # After the crawling, the actual synchronization with the database, based on the
         # update list is carried out:
 
-        return self._synchronize(self.target_data, commit_changes)
+        return self._synchronize(self.target_data, commit_changes, unique_names=unique_names)
 
     def can_be_checked_externally(self, record: db.Record):
         """
@@ -766,7 +766,8 @@ class Crawler(object):
         return db.Entity(name=name).retrieve()
 
     @staticmethod
-    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None):
+    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: int = None,
+                                unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
                 entity = Crawler._get_entity_by_name(prop.name)
@@ -775,7 +776,7 @@ class Crawler(object):
         logger.debug(to_be_inserted)
         if len(to_be_inserted) > 0:
             if securityMode.value > SecurityMode.RETRIEVE.value:
-                db.Container().extend(to_be_inserted).insert()
+                db.Container().extend(to_be_inserted).insert(unique=unique_names)
             elif run_id is not None:
                 update_cache = UpdateCache()
                 update_cache.insert(to_be_inserted, run_id, insert=True)
@@ -793,18 +794,20 @@ class Crawler(object):
                     _resolve_datatype(prop, entity)
 
     @staticmethod
-    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None):
+    def execute_updates_in_list(to_be_updated, securityMode, run_id: int = None,
+                                unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
         logger.debug(to_be_updated)
         if len(to_be_updated) > 0:
             if securityMode.value > SecurityMode.INSERT.value:
-                db.Container().extend(to_be_updated).update()
+                db.Container().extend(to_be_updated).update(unique=unique_names)
             elif run_id is not None:
                 update_cache = UpdateCache()
                 update_cache.insert(to_be_updated, run_id)
 
-    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True):
+    def _synchronize(self, target_data: List[db.Record], commit_changes: bool = True,
+                     unique_names=True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in target_data.
@@ -840,9 +843,9 @@ class Crawler(object):
 
         if commit_changes:
             self.execute_inserts_in_list(
-                to_be_inserted, self.securityMode, self.run_id)
+                to_be_inserted, self.securityMode, self.run_id, unique_names=unique_names)
             self.execute_updates_in_list(
-                to_be_updated, self.securityMode, self.run_id)
+                to_be_updated, self.securityMode, self.run_id, unique_names=unique_names)
 
         update_cache = UpdateCache()
         pending_inserts = update_cache.get_inserts(self.run_id)
@@ -1014,7 +1017,9 @@ def crawler_main(crawled_directory_path: str,
                  provenance_file: str = None,
                  dry_run: bool = False,
                  prefix: str = "",
-                 securityMode: int = SecurityMode.UPDATE):
+                 securityMode: int = SecurityMode.UPDATE,
+                 unique_names=True,
+                 ):
     """
 
     Parameters
@@ -1035,6 +1040,8 @@ def crawler_main(crawled_directory_path: str,
         remove the given prefix from file paths
     securityMode : int
         securityMode of Crawler
+    unique_names : bool
+        whether or not to update or insert entities in spite of name conflicts
 
     Returns
     -------
@@ -1092,7 +1099,7 @@ def crawler_main(crawled_directory_path: str,
             raise RuntimeError("Missing RecordTypes: {}".
                                format(", ".join(notfound)))
 
-    crawler.synchronize(commit_changes=True)
+    crawler.synchronize(commit_changes=True, unique_names=unique_names)
 
     return 0
 
@@ -1118,9 +1125,9 @@ def parse_args():
 
     # TODO: load identifiables is a dirty implementation currently
     parser.add_argument("-i", "--load-identifiables",
-                        help="Load identifiables from "
-                        "the given yaml file.")
-
+                        help="Load identifiables from the given yaml file.")
+    parser.add_argument("-u", "--unique-names",
+                        help="Insert or update entities even if name conflicts exist.")
     parser.add_argument("-p", "--prefix",
                         help="Remove the given prefix from the paths "
                         "of all file objects.")
@@ -1142,16 +1149,17 @@ def main():
         logger.setLevel(logging.INFO)
 
     sys.exit(crawler_main(
-        args.crawled_directory_path,
-        args.cfood_file_name,
-        args.load_identifiables,
-        args.debug,
-        args.provenance,
-        args.dry_run,
-        args.prefix,
-        {"retrieve": SecurityMode.RETRIEVE,
-         "insert": SecurityMode.INSERT,
-         "update": SecurityMode.UPDATE}[args.security_mode]
+        crawled_directory_path=args.crawled_directory_path,
+        cfood_file_name=args.cfood_file_name,
+        identifiables_definition_file=args.load_identifiables,
+        debug=args.debug,
+        provenance_file=args.provenance,
+        dry_run=args.dry_run,
+        prefix=args.prefix,
+        securityMode={"retrieve": SecurityMode.RETRIEVE,
+                      "insert": SecurityMode.INSERT,
+                      "update": SecurityMode.UPDATE}[args.security_mode],
+        unique_names=args.unique_names,
     ))
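
A minimal usage sketch of the new `unique_names` parameter, based on the keyword call to `crawler_main` shown in `main()` above. The file paths are placeholders, a running server connection is assumed, and importing `SecurityMode` from `caoscrawler.crawl` is an assumption (the name is used inside `crawl.py` but its definition is not part of this diff):

```python
from caoscrawler.crawl import crawler_main, SecurityMode  # SecurityMode import location assumed

# Placeholder paths -- replace with your own crawled directory and cfood definition.
ret = crawler_main(
    crawled_directory_path="/path/to/crawled/data",
    cfood_file_name="cfood.yml",
    identifiables_definition_file="identifiables.yml",
    debug=False,
    provenance_file=None,
    dry_run=False,
    prefix="",
    securityMode=SecurityMode.UPDATE,
    # New in this patch: unique_names=False is forwarded to the container
    # insert()/update() calls as unique=False, so name conflicts no longer
    # abort the synchronization.
    unique_names=False,
)
print(ret)  # crawler_main returns 0 on success
```

The same switch is available directly on an existing `Crawler` instance via `synchronize(unique_names=False)`, and from the command line through the `-u`/`--unique-names` argument added to `parse_args()`.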