Skip to content
Snippets Groups Projects
Commit b709bd88 authored by Alexander Schlemmer's avatar Alexander Schlemmer
Browse files

ENH: provenance output works

parent f5426e39
No related branches found
No related tags found
1 merge request: !53 "Release 0.1"
...@@ -182,8 +182,7 @@ class Converter(object): ...@@ -182,8 +182,7 @@ class Converter(object):
def create_values(self, def create_values(self,
values: GeneralStore, values: GeneralStore,
element: StructureElement, element: StructureElement):
converters_path: list, structure_elements_path: list):
""" """
Extract information from the structure element and store them as values in the Extract information from the structure element and store them as values in the
general store. general store.
...@@ -195,7 +194,7 @@ class Converter(object): ...@@ -195,7 +194,7 @@ class Converter(object):
if m is None: if m is None:
# this should never happen as the condition was checked before already # this should never happen as the condition was checked before already
raise RuntimeError("Condition does not match.") raise RuntimeError("Condition does not match.")
values.update(m, converters_path, structure_elements_path) values.update(m)
@abstractmethod @abstractmethod
def create_children(self, values: GeneralStore, def create_children(self, values: GeneralStore,
...@@ -204,11 +203,16 @@ class Converter(object): ...@@ -204,11 +203,16 @@ class Converter(object):
def create_records(self, values: GeneralStore, def create_records(self, values: GeneralStore,
records: RecordStore, records: RecordStore,
element: StructureElement, element: StructureElement):
converters_path: list, structure_elements_path: list):
if "records" not in self.definition: if "records" not in self.definition:
return return []
# list of keys to identify, which variables have been set by which paths:
# these are tuples:
# 0: record name
# 1: property name
keys_modified = []
for name, record in self.definition["records"].items(): for name, record in self.definition["records"].items():
# whether the record already exists in the store or not are actually really # whether the record already exists in the store or not are actually really
...@@ -216,15 +220,16 @@ class Converter(object): ...@@ -216,15 +220,16 @@ class Converter(object):
if name not in records: if name not in records:
c_record = db.Record() c_record = db.Record()
# add the new record to the record store: # add the new record to the record store:
records.set_value(name, c_record, converters_path, structure_elements_path) records[name] = c_record
# additionally add the new record to the general store: # additionally add the new record to the general store:
values.set_value(name, c_record, converters_path, structure_elements_path) values[name] = c_record
c_record = records[name] c_record = records[name]
for key, value in record.items(): for key, value in record.items():
if key == "parents": if key == "parents":
continue continue
keys_modified.append((name, key))
propvalue, collection_mode = handle_value(value, values) propvalue, collection_mode = handle_value(value, values)
if c_record.get_property(key) is None: if c_record.get_property(key) is None:
...@@ -253,6 +258,7 @@ class Converter(object): ...@@ -253,6 +258,7 @@ class Converter(object):
else: else:
if not has_parent(c_record, name): if not has_parent(c_record, name):
c_record.add_parent(name) c_record.add_parent(name)
return keys_modified
......
...@@ -69,6 +69,7 @@ from .stores import GeneralStore, RecordStore ...@@ -69,6 +69,7 @@ from .stores import GeneralStore, RecordStore
from .structure_elements import StructureElement, Directory, File from .structure_elements import StructureElement, Directory, File
from .converters import Converter, DirectoryConverter from .converters import Converter, DirectoryConverter
from .identifiable_adapters import LocalStorageIdentifiableAdapter from .identifiable_adapters import LocalStorageIdentifiableAdapter
from collections import defaultdict
class Crawler(object): class Crawler(object):
...@@ -107,7 +108,9 @@ class Crawler(object): ...@@ -107,7 +108,9 @@ class Crawler(object):
# 0: generalStore # 0: generalStore
# 1: recordStore # 1: recordStore
self.debug_tree: dict[str, tuple] = dict() self.debug_tree: dict[str, tuple] = dict()
self.debug_metadata: dict[str, dict[str, tuple]] = dict() self.debug_metadata: dict[str, dict] = dict()
self.debug_metadata["copied"] = dict()
self.debug_metadata["modified"] = defaultdict(lambda: dict())
def crawl_directory(self, dirname: str, def crawl_directory(self, dirname: str,
cfood: str): cfood: str):
...@@ -169,6 +172,20 @@ class Crawler(object): ...@@ -169,6 +172,20 @@ class Crawler(object):
pass pass
def save_debug_data(self, filename: str):
paths: dict[str, dict] = {"provenance": dict()}
mod_info = self.debug_metadata["modified"]
for record_name in mod_info:
paths["provenance"][record_name] = dict()
for prop_name in mod_info[record_name]:
paths["provenance"][record_name][prop_name] = {
"structure_elements_path": "/".join(
mod_info[record_name][prop_name][0]),
"converters_path": "/".join(
mod_info[record_name][prop_name][1])}
with open(filename, "w") as f:
f.write(yaml.dump(paths))
def _crawl(self, items: list[StructureElement], def _crawl(self, items: list[StructureElement],
global_converters: list[Converter], global_converters: list[Converter],
...@@ -202,15 +219,22 @@ class Crawler(object): ...@@ -202,15 +219,22 @@ class Crawler(object):
# -> rather store it in the variable storage than in the converter? # -> rather store it in the variable storage than in the converter?
converter.create_values(generalStore_copy, element) converter.create_values(generalStore_copy, element)
converter.create_records(generalStore_copy, recordStore_copy, element) keys_modified = converter.create_records(
generalStore_copy, recordStore_copy, element)
children = converter.create_children(generalStore_copy, element) children = converter.create_children(generalStore_copy, element)
if self.debug: if self.debug:
# add provenance information for each variable # add provenance information for each variable
self.debug_tree[str(element)] = ( self.debug_tree[str(element)] = (
generalStore_copy.get_storage(), recordStore_copy.get_storage()) generalStore_copy.get_storage(), recordStore_copy.get_storage())
self.debug_metadata[str(element)]["copied"] = ( self.debug_metadata["copied"][str(element)] = (
generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied()) generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied())
mod_info = self.debug_metadata["modified"]
for record_name, prop_name in keys_modified:
internal_id = recordStore_copy.get_internal_id(record_name)
mod_info[record_name + "_" + str(internal_id)][prop_name] = (
structure_elements_path + [element.get_name()],
converters_path + [converter.name])
self._crawl(children, global_converters, converter.converters, self._crawl(children, global_converters, converter.converters,
generalStore_copy, recordStore_copy, generalStore_copy, recordStore_copy,
...@@ -220,9 +244,6 @@ class Crawler(object): ...@@ -220,9 +244,6 @@ class Crawler(object):
# to the general update container. # to the general update container.
scoped_records = recordStore.get_records_current_scope() scoped_records = recordStore.get_records_current_scope()
for record in scoped_records: for record in scoped_records:
print("/".join(structure_elements_path))
print("/".join(converters_path))
print(record)
self.updateList.append(record) self.updateList.append(record)
return self.updateList return self.updateList
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
# #
import caosdb as db import caosdb as db
from collections import defaultdict
class Store(object): class Store(object):
...@@ -37,8 +38,9 @@ class Store(object): ...@@ -37,8 +38,9 @@ class Store(object):
# This dict stores whether the corresponding dict item in _storage # This dict stores whether the corresponding dict item in _storage
# (same key) has been copied from another Store, or was created newly in this store. # (same key) has been copied from another Store, or was created newly in this store.
self._copied = dict() self._copied = dict()
self._provenance_structure_elements = dict() # This attribute stores an internal id for being able to distinguish multiple
self._provenance_converters = dict() # occurrences of the same thing in the store:
self._ids = defaultdict(lambda: 0)
def __getitem__(self, key: str): def __getitem__(self, key: str):
return self._storage[key] return self._storage[key]
...@@ -46,14 +48,16 @@ class Store(object): ...@@ -46,14 +48,16 @@ class Store(object):
def __contains__(self, key: str): def __contains__(self, key: str):
return key in self._storage return key in self._storage
def update(self, other: dict, converters_path: list, structure_elements_path: list): def update(self, other: dict):
self._storage.update(other) self._storage.update(other)
for key in other: for key in other:
self._copied[key] = False self._copied[key] = False
self._ids[key] += 1
def set_value(self, key: str, value, converters_path: list, structure_elements_path: list): def __setitem__(self, key: str, value):
self._storage[key] = value self._storage[key] = value
self._copied[key] = False self._copied[key] = False
self._ids[key] += 1
def get_storage(self): def get_storage(self):
return self._storage return self._storage
...@@ -62,8 +66,7 @@ class Store(object): ...@@ -62,8 +66,7 @@ class Store(object):
s_copy = self.__class__() s_copy = self.__class__()
s_copy._storage = dict(self._storage) s_copy._storage = dict(self._storage)
s_copy._copied = {key: True for key in self._copied} s_copy._copied = {key: True for key in self._copied}
s_copy._provenance_structure_elements = dict(self._provenance_structure_elements) s_copy._ids = self._ids
s_copy._provenance_converters = dict(self._provenance_converters)
return s_copy return s_copy
def get_dict_copied(self): def get_dict_copied(self):
...@@ -72,17 +75,11 @@ class Store(object): ...@@ -72,17 +75,11 @@ class Store(object):
""" """
return self._copied return self._copied
def get_provenance_structure_elements(self): def get_internal_id(self, key):
""" """
Only for debugging. Only for debugging.
""" """
return self._provenance_structure_elements return self._ids[key]
def get_provenance_converters(self):
"""
Only for debugging.
"""
return self._provenance_converters
class GeneralStore(Store): class GeneralStore(Store):
pass pass
......
...@@ -8,8 +8,15 @@ from newcrawler.converters import MarkdownFileConverter ...@@ -8,8 +8,15 @@ from newcrawler.converters import MarkdownFileConverter
from newcrawler.structure_elements import File, DictTextElement, DictListElement from newcrawler.structure_elements import File, DictTextElement, DictListElement
from newcrawler.identifiable_adapters import LocalStorageIdentifiableAdapter from newcrawler.identifiable_adapters import LocalStorageIdentifiableAdapter
from os.path import join, dirname, basename from os.path import join, dirname, basename
from collections import defaultdict
import yaml
import caosdb as db import caosdb as db
# Some notes:
# Track provenance information in two ways:
# - DONE: provenance in structure elements and converters for properties of records
# - TODO: list whether information from structure elements and converters was used
def rfp(*pathcomponents): def rfp(*pathcomponents):
""" """
Return full path. Return full path.
...@@ -29,7 +36,7 @@ def test_crawler(): ...@@ -29,7 +36,7 @@ def test_crawler():
rfp("scifolder_cfood.yml")) rfp("scifolder_cfood.yml"))
subd = crawler.debug_tree[dircheckstr("DataAnalysis")] subd = crawler.debug_tree[dircheckstr("DataAnalysis")]
subc = crawler.debug_copied[dircheckstr("DataAnalysis")] subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis")]
assert len(subd) == 2 assert len(subd) == 2
assert len(subd[0]) == 0 assert len(subd[0]) == 0
assert len(subd[1]) == 0 assert len(subd[1]) == 0
...@@ -38,7 +45,7 @@ def test_crawler(): ...@@ -38,7 +45,7 @@ def test_crawler():
assert len(subc[1]) == 0 assert len(subc[1]) == 0
subd = crawler.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] subd = crawler.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")]
subc = crawler.debug_copied[dircheckstr("DataAnalysis", "2020_climate-model-predict")] subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis", "2020_climate-model-predict")]
assert len(subd[1]) == 1 assert len(subd[1]) == 1
assert len(subd[1]["Project"].get_parents()) == 1 assert len(subd[1]["Project"].get_parents()) == 1
...@@ -64,7 +71,7 @@ def test_crawler(): ...@@ -64,7 +71,7 @@ def test_crawler():
subd = crawler.debug_tree[dircheckstr("DataAnalysis", subd = crawler.debug_tree[dircheckstr("DataAnalysis",
"2020_climate-model-predict", "2020_climate-model-predict",
"2020-02-08_prediction-errors")] "2020-02-08_prediction-errors")]
subc = crawler.debug_copied[dircheckstr("DataAnalysis", subc = crawler.debug_metadata["copied"][dircheckstr("DataAnalysis",
"2020_climate-model-predict", "2020_climate-model-predict",
"2020-02-08_prediction-errors")] "2020-02-08_prediction-errors")]
assert len(subd[0]) == 4 assert len(subd[0]) == 4
...@@ -160,3 +167,6 @@ def test_crawler_update_list(): ...@@ -160,3 +167,6 @@ def test_crawler_update_list():
ident.store_state(rfp("records.xml")) ident.store_state(rfp("records.xml"))
# ident.restore_state(rfp("records.xml")) # ident.restore_state(rfp("records.xml"))
assert len(ident.get_records()) == len(crawler.updateList) assert len(ident.get_records()) == len(crawler.updateList)
crawler.save_debug_data(rfp("provenance.yml"))
assert False
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment