Skip to content
Snippets Groups Projects

Create a new scanner module and move functions from crawl module there

Merged Alexander Schlemmer requested to merge f-refactor-scanner-crawler into dev
Compare changes: 24 files changed, +1110 −696 lines
Compare changes
  • Side-by-side
  • Inline
Files
24
@@ -27,6 +27,7 @@ an integration test module that does basic integration tests
"""
from caosadvancedtools.crawler import Crawler as OldCrawler
from caoscrawler.debug_tree import DebugTree
import os
from caosdb import EmptyUniqueQueryError
import argparse
@@ -36,6 +37,7 @@ from caoscrawler import Crawler, SecurityMode
from caoscrawler.identifiable import Identifiable
import caosdb as db
from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter
from caoscrawler.scanner import scan_directory
import pytest
from caosadvancedtools.models.parser import parse_model_from_yaml
import yaml
@@ -82,42 +84,46 @@ def ident():
return ident
def crawl_standard_test_directory(subdir: str = "examples_article",
                                  cfood: str = "scifolder_cfood.yml",
                                  debug_tree=None):
    """Scan one of the standard unittest directories with the given cfood.

    Parameters
    ----------
    subdir : str
        Name of the test directory below ``unittests/test_directories``.
    cfood : str
        File name of the cfood definition below ``unittests``.
    debug_tree : DebugTree, optional
        If given, collects debug information during the scan.

    Returns
    -------
    list
        The crawled records returned by ``scan_directory``.
    """
    # NOTE(review): the pre-refactor variant took a Crawler instance and
    # called cr.crawl_directory(); scanning now lives in the scanner module.
    return scan_directory(rfp("..", "..", "unittests", "test_directories", subdir),
                          rfp("..", "..", "unittests", cfood),
                          debug_tree=debug_tree)
@pytest.fixture
def crawler(ident):
    """Fixture: a Crawler plus data and debug tree from a standard scan.

    Returns
    -------
    tuple
        ``(crawler, crawled_data, debug_tree)`` — tests index into this
        tuple (``crawler[0]``, ``crawler[1]``).
    """
    cr = Crawler(identifiableAdapter=ident)
    debug_tree = DebugTree()
    crawled_data = crawl_standard_test_directory(debug_tree=debug_tree)
    return cr, crawled_data, debug_tree
@pytest.fixture
def crawler_extended(ident):
    """Fixture like ``crawler`` but scanning with the extended cfood.

    File records get their ``file`` attribute rewritten so the paths are
    valid relative to the current working directory.

    Returns
    -------
    tuple
        ``(crawler, crawled_data, debug_tree)``.
    """
    cr = Crawler(identifiableAdapter=ident)
    debug_tree = DebugTree()
    crawled_data = crawl_standard_test_directory(
        cfood="scifolder_extended.yml", debug_tree=debug_tree)
    # correct paths for current working directory
    file_list = [r for r in crawled_data if r.role == "File"]
    for f in file_list:
        f.file = rfp("..", "..", "unittests", "test_directories", f.file)
    return cr, crawled_data, debug_tree
def test_ambigious_lookup(clear_database, usemodel, crawler, ident):
    """Retrieving an ambiguous identifiable must raise a RuntimeError."""
    ins, ups = crawler[0].synchronize(crawled_data=crawler[1])

    proj = db.execute_query("FIND Project WITH identifier='SpeedOfLight'", unique=True)
    # NOTE(review): "unambigiously" is misspelled, but the pattern must match
    # the message raised by the production code — do not "fix" it here alone.
    with pytest.raises(RuntimeError, match=".*unambigiously.*"):
        print(crawler[0].identifiableAdapter.retrieve_identified_record_for_identifiable(
            Identifiable(properties={'project': proj.id})))
def test_single_insertion(clear_database, usemodel, crawler, ident):
ins, ups = crawler.synchronize()
ins, ups = crawler[0].synchronize(crawled_data=crawler[1])
# This test also generates the file records.xml used in some of the unittesets:
res = db.execute_query("FIND Record")
@@ -138,94 +144,93 @@ def test_single_insertion(clear_database, usemodel, crawler, ident):
assert len(ups) == 0
# Do a second run on the same data, there should be no changes:
crawler = Crawler(debug=True, identifiableAdapter=ident)
crawler.crawl_directory(rfp("../../unittests/test_directories", "examples_article"),
rfp("../../unittests/scifolder_cfood.yml"))
ins, ups = crawler.synchronize()
crawler = Crawler(identifiableAdapter=ident)
crawled_data = scan_directory(rfp("../../unittests/test_directories", "examples_article"),
rfp("../../unittests/scifolder_cfood.yml"))
ins, ups = crawler.synchronize(crawled_data=crawled_data)
assert len(ins) == 0
assert len(ups) == 0
def test_multiple_insertions(clear_database, usemodel, ident, crawler):
    """A second crawl of unchanged data must produce no inserts or updates."""
    ins, ups = crawler[0].synchronize(crawled_data=crawler[1])

    # Do a second run on the same data, there should be no changes:
    cr = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory()
    ins, ups = cr.synchronize(crawled_data=crawled_data)
    assert len(ins) == 0
    assert len(ups) == 0
def test_insertion(clear_database, usemodel, ident, crawler):
    """Crawling a directory with one new record must insert exactly once."""
    ins, ups = crawler[0].synchronize(crawled_data=crawler[1])

    # Do a second run on the same data, there should a new insert:
    cr = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory("example_insert")
    assert len(crawled_data) == 3
    ins, ups = cr.synchronize(crawled_data=crawled_data)
    assert len(ins) == 1
    assert len(ups) == 0

    # Do it again to check whether nothing is changed:
    cr = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory("example_insert")
    assert len(crawled_data) == 3
    ins, ups = cr.synchronize(crawled_data=crawled_data)
    assert len(ins) == 0
    assert len(ups) == 0
def test_insert_auth(clear_database, usemodel, ident, crawler):
    """RETRIEVE security mode must stage inserts for later authorization."""
    ins, ups = crawler[0].synchronize(crawled_data=crawler[1])

    # Do a second run on the same data, there should a new insert:
    cr = Crawler(identifiableAdapter=ident, securityMode=SecurityMode.RETRIEVE)
    crawled_data = crawl_standard_test_directory("example_insert")
    assert len(crawled_data) == 3
    ins, ups = cr.synchronize(crawled_data=crawled_data)
    assert len(ins) == 1
    # In RETRIEVE mode the insert is only staged, not committed:
    assert not ins[0].is_valid()
    nins, nups = OldCrawler.update_authorized_changes(cr.run_id)
    assert nins == 1

    # Do it again to check whether nothing is changed:
    cr = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory("example_insert")
    assert len(crawled_data) == 3
    ins, ups = cr.synchronize(crawled_data=crawled_data)
    assert len(ins) == 0
    assert len(ups) == 0
def test_insertion_and_update(clear_database, usemodel, ident, crawler):
    """An overwriting crawl after an insert must yield one update, no inserts."""
    ins, ups = crawler[0].synchronize(crawled_data=crawler[1])

    cr = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory("example_insert")
    ins, ups = cr.synchronize(crawled_data=crawled_data)

    cr = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory("example_overwrite_1")
    # cr.save_debug_data(rfp("provenance.yml"))
    assert len(crawled_data) == 3
    ins, ups = cr.synchronize(crawled_data=crawled_data)
    assert len(ins) == 0
    assert len(ups) == 1
def test_identifiable_update(clear_database, usemodel, ident, crawler):
ins, ups = crawler.synchronize()
ins, ups = crawler[0].synchronize(crawled_data=crawler[1])
# Do a second run on the same data with a change in one
# of the identifiables:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr)
cr = Crawler(identifiableAdapter=ident)
crawled_data = crawl_standard_test_directory()
# Test the addition of a single property:
l = cr.crawled_data
l = crawled_data
for record in l:
if (record.parents[0].name == "Measurement" and
record.get_property("date").value == "2020-01-03"):
@@ -234,28 +239,28 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
name="email", value="testperson@testaccount.test")
print("one change")
break
ins, ups = cr.synchronize()
ins, ups = cr.synchronize(crawled_data=crawled_data)
assert len(ins) == 0
assert len(ups) == 1
# Test the change within one property:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr)
l = cr.crawled_data
cr = Crawler(identifiableAdapter=ident)
crawled_data = crawl_standard_test_directory()
l = crawled_data
for record in l:
if (record.parents[0].name == "Measurement" and
record.get_property("date").value == "2020-01-03"):
record.add_property(name="email", value="testperson@coolmail.test")
print("one change")
break
ins, ups = cr.synchronize()
ins, ups = cr.synchronize(crawled_data=crawled_data)
assert len(ins) == 0
assert len(ups) == 1
# Changing the date should result in a new insertion:
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr)
l = cr.crawled_data
cr = Crawler(identifiableAdapter=ident)
crawled_data = crawl_standard_test_directory()
l = crawled_data
for record in l:
if (record.parents[0].name == "Measurement" and
record.get_property("date").value == "2020-01-03"):
@@ -263,30 +268,31 @@ def test_identifiable_update(clear_database, usemodel, ident, crawler):
record.get_property("date").value = "2012-01-02"
print("one change")
break
ins, ups = cr.synchronize()
ins, ups = cr.synchronize(crawled_data=crawled_data)
assert len(ins) == 1
assert len(ups) == 0
def test_file_insertion_dry(clear_database, usemodel, ident):
    """A dry run (``commit_changes=False``) must report 11 new File records."""
    crawler_extended = Crawler(identifiableAdapter=ident)
    crawled_data = crawl_standard_test_directory(
        cfood="scifolder_extended.yml")
    file_list = [r for r in crawled_data if r.role == "File"]
    assert len(file_list) == 11

    for f in file_list:
        assert f.path.endswith("README.md")
        # the target path equals the local path without the leading separator
        assert f.path[1:] == f.file

    ins, ups = crawler_extended.synchronize(crawled_data=crawled_data, commit_changes=False)
    assert len(ups) == 0
    file_list_ins = [r for r in ins if r.role == "File"]
    assert len(file_list_ins) == 11
def test_file_insertion(clear_database, usemodel, ident, crawler_extended):
ins, ups = crawler_extended.synchronize(commit_changes=True)
ins, ups = crawler_extended[0].synchronize(
crawled_data=crawler_extended[1], commit_changes=True)
file_list_ins = [r for r in ins if r.role == "File"]
assert len(file_list_ins) == 11
@@ -302,16 +308,17 @@ def test_file_insertion(clear_database, usemodel, ident, crawler_extended):
def test_file_update(clear_database, usemodel, ident, crawler_extended):
ins1, ups1 = crawler_extended.synchronize(commit_changes=True)
ins1, ups1 = crawler_extended[0].synchronize(
crawled_data=crawler_extended[1], commit_changes=True)
file_list_ins = [r for r in ins1 if r.role == "File"]
cr = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr, cfood="scifolder_extended.yml")
cr = Crawler(identifiableAdapter=ident)
crawled_data = crawl_standard_test_directory(cfood="scifolder_extended.yml")
file_list = [r for r in cr.crawled_data if r.role == "File"]
file_list = [r for r in crawled_data if r.role == "File"]
for f in file_list:
f.file = rfp("..", "..", "unittests", "test_directories", f.file)
ins2, ups2 = cr.synchronize(commit_changes=True)
ins2, ups2 = cr.synchronize(crawled_data=crawled_data, commit_changes=True)
assert len(ups1) == 0
assert len(ups2) == 0
@@ -320,13 +327,13 @@ def test_file_update(clear_database, usemodel, ident, crawler_extended):
assert len(res) == 11
assert len(res[0].parents) == 0
cr2 = Crawler(debug=True, identifiableAdapter=ident)
crawl_standard_test_directory(cr2, cfood="scifolder_extended2.yml")
cr2 = Crawler(identifiableAdapter=ident)
crawled_data = crawl_standard_test_directory(cfood="scifolder_extended2.yml")
file_list = [r for r in cr2.crawled_data if r.role == "File"]
file_list = [r for r in crawled_data if r.role == "File"]
for f in file_list:
f.file = rfp("..", "..", "unittests", "test_directories", f.file)
ins3, ups3 = cr2.synchronize(commit_changes=True)
ins3, ups3 = cr2.synchronize(crawled_data=crawled_data, commit_changes=True)
assert len(ups3) == 11
res = db.execute_query("Find File")
Loading