diff --git a/src/newcrawler/crawl.py b/src/newcrawler/crawl.py index ebd3008b2233aae0dcf41e0cff789ce5991bbd79..bf6bf4c9fec14d42e3a972e0fe64c394bc3dd4bc 100644 --- a/src/newcrawler/crawl.py +++ b/src/newcrawler/crawl.py @@ -90,6 +90,9 @@ class Crawler(object): The debugging information tree is a variable stored in self.debug_tree. It is a dictionary mapping directory entries to a tuple of general stores and record stores which are valid for the directory scope. + Furthermore, a second tree named self.debug_copied stores whether the + objects in debug_tree have been copied from a higher level in the hierarchy + of the structure elements. """ self.converters = converters self.generalStore = generalStore @@ -101,6 +104,7 @@ class Crawler(object): self.debug = debug if self.debug: self.debug_tree: dict[str, tuple] = dict() + self.debug_copied: dict[str, tuple] = dict() def crawl_directory(self, dirname: str, cfood: str): @@ -178,6 +182,8 @@ class Crawler(object): if self.debug: self.debug_tree[str(element)] = ( generalStore_copy.get_storage(), recordStore_copy.get_storage()) + self.debug_copied[str(element)] = ( + generalStore_copy.get_dict_copied(), recordStore_copy.get_dict_copied()) self.crawl(children, global_converters, converter.converters, generalStore_copy, recordStore_copy) diff --git a/src/newcrawler/stores.py b/src/newcrawler/stores.py index 275a23461e33bc10112cc275083390f7e9e77432..64742069f59ea23f53632e6c3d436d3c821b0ce4 100644 --- a/src/newcrawler/stores.py +++ b/src/newcrawler/stores.py @@ -34,6 +34,9 @@ class Store(object): def __init__(self): self._storage = dict() + # This dict stores whether the corresponding dict item in _storage + # (same key) has been copied from another Store, or was newly created in this store. 
+ self._copied = dict() def __getitem__(self, key): return self._storage[key] @@ -43,9 +46,12 @@ class Store(object): def update(self, other): self._storage.update(other) + for key in other: + self._copied[key] = False def __setitem__(self, key, value): self._storage[key] = value + self._copied[key] = False def get_storage(self): return self._storage @@ -53,8 +59,15 @@ class Store(object): def create_scoped_copy(self): s_copy = self.__class__() s_copy._storage = dict(self._storage) + s_copy._copied = {key: True for key in self._copied} return s_copy + def get_dict_copied(self): + """ + Only for debugging. + """ + return self._copied + class GeneralStore(Store): pass diff --git a/tests/test_tool.py b/tests/test_tool.py index ef4b59297e70cb37d44b3a9b6e6010fbb5e2af84..9ba90527dea2fd73f0df9f31f08931ba62d7e727 100755 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -28,11 +28,16 @@ def test_crawler(): rfp("scifolder_cfood.yml")) subd = crawler.debug_tree[dircheckstr("DataAnalysis")] + subc = crawler.debug_copied[dircheckstr("DataAnalysis")] assert len(subd) == 2 assert len(subd[0]) == 0 assert len(subd[1]) == 0 + assert len(subc) == 2 + assert len(subc[0]) == 0 + assert len(subc[1]) == 0 subd = crawler.debug_tree[dircheckstr("DataAnalysis", "2020_climate-model-predict")] + subc = crawler.debug_copied[dircheckstr("DataAnalysis", "2020_climate-model-predict")] assert len(subd[1]) == 1 assert len(subd[1]["Project"].get_parents()) == 1 @@ -43,13 +48,24 @@ def test_crawler(): assert len(subd[0]) == 3 assert subd[0]["date"] == "2020" assert subd[0]["identifier"] == "climate-model-predict" - assert subd[0]["Project"].__class__ == db.Record + # Check the copy flags for the first level in the hierarchy: + assert len(subc[0]) == 3 + assert len(subc[1]) == 1 + assert subc[1]["Project"] == False + assert subc[0]["Project"] == False + assert subc[0]["date"] == False + assert subc[0]["identifier"] == False + + + subd = crawler.debug_tree[dircheckstr("DataAnalysis", 
"2020_climate-model-predict", "2020-02-08_prediction-errors")] - print(subd) + subc = crawler.debug_copied[dircheckstr("DataAnalysis", + "2020_climate-model-predict", + "2020-02-08_prediction-errors")] assert len(subd[0]) == 4 assert subd[0]["date"] == "2020-02-08" assert subd[0]["identifier"] == "prediction-errors" @@ -71,6 +87,14 @@ def test_crawler(): assert subd[1]["Measurement"].get_property("project").value.__class__ == db.Record assert subd[1]["Measurement"].get_property("project").value == subd[0]["Project"] + # Check the copy flags for the second level in the hierarchy: + assert subc[1]["Project"] == True + assert subc[0]["Project"] == True + assert subc[1]["Measurement"] == False + assert subc[0]["Measurement"] == False + assert subc[0]["date"] == False + assert subc[0]["identifier"] == False + def test_markdown_converter(): test_readme = File("README.md", rfp( "test_directories", "examples_article", "DataAnalysis",