diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py
index f502527fd9f1f8f45693a73ac5497eaf4401be71..605f1463d9853a100443ea8ed698e4169266fa13 100644
--- a/src/newcrawler/crawl.py
+++ b/src/newcrawler/crawl.py
@@ -893,15 +893,21 @@ class Crawler(object):
         return self.updateList
 
 
-def main():
-    args = parse_args()
-    crawler = Crawler(debug=args.debug)
-    crawler.crawl_directory(args.path, args.cfood)
-    if args.provenance is not None:
-        crawler.save_debug_data(args.provenance)
-
-    if args.load_identifiables is not None:
-        with open(args.load_identifiables, "r") as f:
+def crawler_main(args_path,
+                 args_cfood,
+                 args_load_identifiables,
+                 args_debug,
+                 args_provenance,
+                 args_dry_sync,
+                 args_sync,
+                 args_prefix):
+    crawler = Crawler(debug=args_debug)
+    crawler.crawl_directory(args_path, args_cfood)
+    if args_provenance is not None:
+        crawler.save_debug_data(args_provenance)
+
+    if args_load_identifiables is not None:
+        with open(args_load_identifiables, "r") as f:
             identifiable_data = yaml.safe_load(f)
 
         ident = CaosDBIdentifiableAdapter()
@@ -914,7 +920,7 @@ def main():
                 rt.add_property(name=pn)
             ident.register_identifiable(k, rt)
 
-    if args.dry_sync:
+    if args_dry_sync:
         ins, upd = crawler.synchronize(commit_changes=False)
         inserts = [str(i) for i in ins]
         updates = [str(i) for i in upd]
@@ -922,12 +928,14 @@ def main():
             f.write(yaml.dump({
                 "insert": inserts,
                 "update": updates}))
-    elif args.sync:
+    elif args_sync:
         rtsfinder = dict()
         for elem in crawler.updateList:
             if isinstance(elem, db.File):
                 # correct the file path:
                 # elem.file = os.path.join(args.path, elem.file)
+                if args_prefix is not None and elem.path.startswith(args_prefix):
+                    elem.path = elem.path[len(args_prefix):]
                 elem.file = None
                 # TODO: as long as the new file backend is not finished
                 #       we are using the loadFiles function to insert symlinks.
@@ -982,8 +990,24 @@ def parse_args():
                         help="Do the synchronization. This is probably the expected "
                         "standard behavior of the crawler.")
+    parser.add_argument("-p", "--prefix",
+                        help="Remove the given prefix from the paths "
+                        "of all file objects.")
 
     return parser.parse_args()
 
 
+def main():
+    args = parse_args()
+    return crawler_main(
+        args.path,
+        args.cfood,
+        args.load_identifiables,
+        args.debug,
+        args.provenance,
+        args.dry_sync,
+        args.sync,
+        args.prefix
+    )
+
 if __name__ == "__main__":
     sys.exit(main())
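Note: with main() reduced to argument parsing, the new crawler_main() can also be driven programmatically. A minimal sketch of such a call, assuming the package layout implied by src/newcrawler/crawl.py; all values below are hypothetical placeholders, not part of this change:

    from newcrawler.crawl import crawler_main

    # Mirrors what main() forwards from parse_args(). With
    # args_dry_sync=True the crawler calls
    # synchronize(commit_changes=False) and only records the planned
    # inserts/updates instead of committing them.
    crawler_main(args_path="/data/experiments",      # directory to crawl
                 args_cfood="cfood.yml",             # cfood definition file
                 args_load_identifiables=None,       # skip identifiable registration
                 args_debug=False,
                 args_provenance=None,               # no provenance dump
                 args_dry_sync=True,
                 args_sync=False,
                 args_prefix=None)                   # no path prefix to strip

Using keyword arguments guards against the long positional signature, so callers do not have to remember the parameter order.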