Skip to content
Snippets Groups Projects
Commit 391d101f authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Merge branch 'f-stream-line-output' into 'master'

Streamline Crawler Output

See merge request caosdb/caosdb-advanced-user-tools!17
parents 4421e086 ac647773
Branches
Tags
No related merge requests found
...@@ -50,18 +50,21 @@ def access(path): ...@@ -50,18 +50,21 @@ def access(path):
if __name__ == "__main__": if __name__ == "__main__":
set_log_level(logging.DEBUG) logger = logging.getLogger("caosadvancedtools")
conlogger = logging.getLogger("connection")
conlogger.setLevel(level=logging.ERROR)
logger.setLevel(level=logging.WARN)
parser = get_parser() parser = get_parser()
args = parser.parse_args() args = parser.parse_args()
print("Starting query...") logger.info("Starting query...")
files = Crawler.query_files(args.path) files = Crawler.query_files(args.path)
print("Query done...") logger.info("Query done...")
config = db.configuration.get_config() config = db.configuration.get_config()
c = Crawler(use_cache=True, access=access, c = Crawler(use_cache=True, access=access,
food=[ProjectCFood, food=[ProjectCFood,
ExperimentCFood, AnalysisCFood, ExperimentCFood, AnalysisCFood,
PublicationCFood, SimulationCFood, PublicationCFood, SimulationCFood,
]) ])
c.crawl(files, interactive=False, security_level=UPDATE) c.crawl(files, interactive=False, security_level=INSERT, hideKnown=True)
...@@ -38,6 +38,7 @@ import logging ...@@ -38,6 +38,7 @@ import logging
import re import re
import caosdb as db import caosdb as db
from caosdb.exceptions import EntityDoesNotExist
from .guard import global_guard as guard from .guard import global_guard as guard
...@@ -77,7 +78,7 @@ class AbstractCFood(object): ...@@ -77,7 +78,7 @@ class AbstractCFood(object):
A function that takes a CaosDB path and returns a local path A function that takes a CaosDB path and returns a local path
""" """
self.access = access self.access = access
self.crawled_file = None self._crawled_file = None
self.crawled_path = crawled_file self.crawled_path = crawled_file
self.match = type(self).match(crawled_file) self.match = type(self).match(crawled_file)
self.to_be_updated = db.Container() self.to_be_updated = db.Container()
...@@ -85,6 +86,25 @@ class AbstractCFood(object): ...@@ -85,6 +86,25 @@ class AbstractCFood(object):
self.attached_ones = [] self.attached_ones = []
self.attached_filenames = [] self.attached_filenames = []
@property
def crawled_file(self):
    """Lazily resolve and cache the server-side File entity for this path.

    On first access, runs a query for a FILE entity stored exactly at
    ``self.crawled_path``.  If that raises ``EntityDoesNotExist``, a
    fallback query is issued with a leading ``**`` wildcard (plus a
    ``/`` separator when the path is relative) so the file can be
    matched at any depth.  The result is cached in
    ``self._crawled_file``, so the database is queried at most once per
    instance.

    Returns
    -------
    The entity returned by ``db.execute_query(..., unique=True)``;
    presumably a CaosDB File object — confirm against the caosdb client.

    Raises
    ------
    EntityDoesNotExist
        If the wildcard fallback query matches nothing either.
        NOTE(review): ``unique=True`` likely also raises when the
        fallback matches more than one file — verify.
    """
    if self._crawled_file is None:
        try:
            # First attempt: exact stored-at path.
            q = "FIND FILE WHICH IS STORED AT '{}'".format(
                self.crawled_path)
            self._crawled_file = db.execute_query(q, unique=True)
        except EntityDoesNotExist:
            # Fallback: '**' matches any leading directories; a
            # relative path additionally needs the '/' separator.
            path = "**"

            if not self.crawled_path.startswith("/"):
                path = path + "/"
            q = "FIND FILE WHICH IS STORED AT '{}{}'".format(path,
                                                             self.crawled_path)
            logger.debug(q)
            self._crawled_file = db.execute_query(q, unique=True)
    return self._crawled_file
def collect_information(self): def collect_information(self):
""" The CFood collects information for further processing. """ The CFood collects information for further processing.
......
...@@ -37,6 +37,7 @@ match. This occurs in basically three steps: ...@@ -37,6 +37,7 @@ match. This occurs in basically three steps:
import logging import logging
import os
import traceback import traceback
from datetime import datetime from datetime import datetime
...@@ -55,11 +56,15 @@ def separated(text): ...@@ -55,11 +56,15 @@ def separated(text):
class UnknownCache(object): class UnknownCache(object):
def __init__(self, interactive=False): def __init__(self, interactive=False, load=False):
if interactive and "y" == input( if interactive and "y" == input(
"\nDo you want to load filenames that previously were not " "\nDo you want to load filenames that previously were not "
"matched by any CFood?\nIn that case, they will not show up " "matched by any CFood?\nIn that case, they will not show up "
"again. (y)"): "again. (y)"):
load = True
if load and os.path.exists("known_cache.db"):
with open("known_cache.db") as fi: with open("known_cache.db") as fi:
self.filenames = [el.strip("\n") for el in fi.readlines()] self.filenames = [el.strip("\n") for el in fi.readlines()]
else: else:
...@@ -99,7 +104,7 @@ class Crawler(object): ...@@ -99,7 +104,7 @@ class Crawler(object):
if self.use_cache: if self.use_cache:
self.cache = Cache() self.cache = Cache()
def match(self, files, interactive): def match(self, files, interactive, hideKnown=False):
errors_occured = False errors_occured = False
tbs = [] tbs = []
cfoods = [] cfoods = []
...@@ -151,12 +156,12 @@ class Crawler(object): ...@@ -151,12 +156,12 @@ class Crawler(object):
# possibly load previously encountered "Missing matches" and # possibly load previously encountered "Missing matches" and
# "Multiple matches" # "Multiple matches"
ucache = UnknownCache(interactive=interactive) ucache = UnknownCache(interactive=interactive, load=hideKnown)
for crawled_file in files: for crawled_file in files:
if len(matches[crawled_file]) == 0: if len(matches[crawled_file]) == 0:
msg = ("ATTENTION: No matching cfood!\n" msg = ("ATTENTION: No matching cfood!\n"
"Tried to match {}".format(crawled_file)) "Tried to match {}\n".format(crawled_file))
if crawled_file in ucache.filenames: if crawled_file in ucache.filenames:
logger.debug(msg) logger.debug(msg)
...@@ -165,28 +170,30 @@ class Crawler(object): ...@@ -165,28 +170,30 @@ class Crawler(object):
ucache.add(crawled_file) ucache.add(crawled_file)
if len(matches[crawled_file]) > 1: if len(matches[crawled_file]) > 1:
msg = ("Attention: More than one matching cfood!" msg = ("Attention: More than one matching cfood!\n"
+ "Tried to match {}\n".format(crawled_file) + "Tried to match {}\n".format(crawled_file)
+ "\tRecordTypes:\t" + ", ".join( + "\tRecordTypes:\t" + ", ".join(
matches[crawled_file])) matches[crawled_file])+"\n")
if crawled_file in ucache.filenames: if crawled_file in ucache.filenames:
logger.debug(msg) logger.debug(msg)
else: else:
logger.warn(msg) logger.warn(msg)
ucache.add(crawled_file) ucache.add(crawled_file)
# Save the encountered prblem matches # Save the encountered prblem matches
ucache.save() ucache.save()
return cfoods, matches, tbs, errors_occured return cfoods, matches, tbs, errors_occured
def crawl(self, files, interactive=True, security_level=RETRIEVE): def crawl(self, files, interactive=True, hideKnown=False,
security_level=RETRIEVE):
guard.set_level(level=security_level) guard.set_level(level=security_level)
files = sorted([f.path for f in files]) files = sorted([f.path for f in files])
cfoods, matches, tbs, errors_occured = self.match(files, interactive) cfoods, matches, tbs, errors_occured = self.match(files, interactive,
hideKnown=hideKnown)
if interactive and "y" != input("Do you want to continue? (y)"): if interactive and "y" != input("Do you want to continue? (y)"):
return return
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment