diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 2188dae2ed495eb2dab789cf8865f795380a2d6f..6fbde525e8995b615e66e15521103507e4d31013 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -7,12 +7,13 @@ RUN git clone https://gitlab.gwdg.de/bmp-caosdb/caosdb-pylib.git && \ cd caosdb-pylib && pip3 install . RUN git clone https://gitlab.gwdg.de/bmp-caosdb/caosdb-models.git && \ cd caosdb-models && pip3 install . -ADD https://gitlab.com/api/v4/projects/13601752/repository/branches \ - scifolder_version.txt -RUN git clone https://gitlab.com/henrik_indiscale/scifolder.git && \ - cd scifolder && pip3 install . +ADD https://gitlab.com/api/v4/projects/13601752/repository/branches/restructure_cfood scifolder_version.json +RUN git clone -b restructure_cfood \ + https://gitlab.com/henrik_indiscale/scifolder.git && \ + cd scifolder && git checkout 403793fdfde511d53 && pip3 install . COPY . /git -RUN rm -r /git/.git && mv /git/.docker/pycaosdb.ini /git/integrationtests +RUN rm -r /git/.git \ + && mv /git/.docker/pycaosdb.ini /git/integrationtests/full_test RUN cd /git && pip3 install . -WORKDIR /git/integrationtests/ +WORKDIR /git/integrationtests/full_test CMD /wait-for-it.sh caosdb-server:10443 -t 120 -- ./test.sh diff --git a/.docker/docker-compose.yml b/.docker/docker-compose.yml index 97fe0c6429a7f65de7ba5a56d456ffa61abf09cf..ab20a0788ba2a70d2770af5863826d7b674e7db5 100644 --- a/.docker/docker-compose.yml +++ b/.docker/docker-compose.yml @@ -18,7 +18,7 @@ services: source: "$EXEPATH/.docker/cert" target: /opt/caosdb/cert - type: bind - source: "$EXEPATH/integrationtests/extroot" + source: "$EXEPATH/integrationtests/full_test/extroot" target: /opt/caosdb/mnt/extroot read_only: true ports: diff --git a/.docker/pycaosdb.ini b/.docker/pycaosdb.ini index b57f2ebd13a37b75a5da3386ff8ded5d1bc9d2f8..eb6d4ca6c2d05b1977555dfc460ebfbef72a02e8 100644 --- a/.docker/pycaosdb.ini +++ b/.docker/pycaosdb.ini @@ -4,7 +4,7 @@ test_server_side_scripting.bin_dir=../caosdb-server/test_scripting/bin/ [Connection] url=https://caosdb-server:10443 username=admin -cacert=../.docker/cert/caosdb.cert.pem +cacert=../../.docker/cert/caosdb.cert.pem #cacert=/etc/ssl/cert.pem debug=0 diff --git a/integrationtests/crawl.py b/integrationtests/full_test/crawl.py similarity index 86% rename from integrationtests/crawl.py rename to integrationtests/full_test/crawl.py index df76f6cf5d19001f2f5bcb120a6db9f7bf2f16cd..d5f31789bc2f6760699491345cd53324fa56146e 100755 --- a/integrationtests/crawl.py +++ b/integrationtests/full_test/crawl.py @@ -54,10 +54,7 @@ if __name__ == "__main__": files = Crawler.query_files(args.path) print("Query done...") config = db.configuration.get_config() - c = Crawler(food=[ - AnalysisCFood(use_cache=True, access=access), - ExperimentCFood(use_cache=True, access=access), - PublicationCFood(use_cache=True, access=access), - SimulationCFood(use_cache=True, access=access), - ]) + c = Crawler(use_cache=True, access=access, + food=[AnalysisCFood, ExperimentCFood, + PublicationCFood, SimulationCFood, ]) c.crawl(files) diff --git a/integrationtests/extroot/.cerate_dir b/integrationtests/full_test/extroot/.cerate_dir similarity index 100% rename from integrationtests/extroot/.cerate_dir rename to integrationtests/full_test/extroot/.cerate_dir diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03/README.md b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03/README.md similarity index 100% rename from 
integrationtests/extroot/DataAnalysis/TestProject/2019-02-03/README.md rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03/README.md diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03/plot.py b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03/plot.py similarity index 100% rename from integrationtests/extroot/DataAnalysis/TestProject/2019-02-03/plot.py rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03/plot.py diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03/results.pdf b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03/results.pdf similarity index 100% rename from integrationtests/extroot/DataAnalysis/TestProject/2019-02-03/results.pdf rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03/results.pdf diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/README.md b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/README.md similarity index 100% rename from integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/README.md rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/README.md diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/analyse.py b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/analyse.py similarity index 100% rename from integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/analyse.py rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/analyse.py diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol1.png b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol1.png similarity index 100% rename from integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol1.png rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol1.png diff --git a/integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol2.png b/integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol2.png similarity index 100% rename from integrationtests/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol2.png rename to integrationtests/full_test/extroot/DataAnalysis/TestProject/2019-02-03_something/images/lol2.png diff --git a/integrationtests/extroot/ExperimentalData/TestProject/2019-02-03/README.md b/integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03/README.md similarity index 100% rename from integrationtests/extroot/ExperimentalData/TestProject/2019-02-03/README.md rename to integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03/README.md diff --git a/integrationtests/extroot/ExperimentalData/TestProject/2019-02-03/datafile.dat b/integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03/datafile.dat similarity index 100% rename from integrationtests/extroot/ExperimentalData/TestProject/2019-02-03/datafile.dat rename to integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03/datafile.dat diff --git a/integrationtests/extroot/ExperimentalData/TestProject/2019-02-03_something/README.md b/integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03_something/README.md similarity index 100% rename from 
integrationtests/extroot/ExperimentalData/TestProject/2019-02-03_something/README.md rename to integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03_something/README.md diff --git a/integrationtests/extroot/ExperimentalData/TestProject/2019-02-03_something/usefull.xlsx b/integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03_something/usefull.xlsx similarity index 100% rename from integrationtests/extroot/ExperimentalData/TestProject/2019-02-03_something/usefull.xlsx rename to integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03_something/usefull.xlsx diff --git a/integrationtests/extroot/ExperimentalData/TestProject/2019-02-03_something/useless.xlsx b/integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03_something/useless.xlsx similarity index 100% rename from integrationtests/extroot/ExperimentalData/TestProject/2019-02-03_something/useless.xlsx rename to integrationtests/full_test/extroot/ExperimentalData/TestProject/2019-02-03_something/useless.xlsx diff --git a/integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md b/integrationtests/full_test/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md similarity index 100% rename from integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md rename to integrationtests/full_test/extroot/Publications/Posters/2019-02-03_really_cool_finding/README.md diff --git a/integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/datafile.dat b/integrationtests/full_test/extroot/Publications/Posters/2019-02-03_really_cool_finding/datafile.dat similarity index 100% rename from integrationtests/extroot/Publications/Posters/2019-02-03_really_cool_finding/datafile.dat rename to integrationtests/full_test/extroot/Publications/Posters/2019-02-03_really_cool_finding/datafile.dat diff --git a/integrationtests/extroot/Publications/Theses/2019_paper_on_exciting_stuff/README.md b/integrationtests/full_test/extroot/Publications/Theses/2019_paper_on_exciting_stuff/README.md similarity index 100% rename from integrationtests/extroot/Publications/Theses/2019_paper_on_exciting_stuff/README.md rename to integrationtests/full_test/extroot/Publications/Theses/2019_paper_on_exciting_stuff/README.md diff --git a/integrationtests/extroot/Publications/Theses/2019_paper_on_exciting_stuff/usefull.xlsx b/integrationtests/full_test/extroot/Publications/Theses/2019_paper_on_exciting_stuff/usefull.xlsx similarity index 100% rename from integrationtests/extroot/Publications/Theses/2019_paper_on_exciting_stuff/usefull.xlsx rename to integrationtests/full_test/extroot/Publications/Theses/2019_paper_on_exciting_stuff/usefull.xlsx diff --git a/integrationtests/extroot/Publications/Theses/2019_paper_on_exciting_stuff/useless.xlsx b/integrationtests/full_test/extroot/Publications/Theses/2019_paper_on_exciting_stuff/useless.xlsx similarity index 100% rename from integrationtests/extroot/Publications/Theses/2019_paper_on_exciting_stuff/useless.xlsx rename to integrationtests/full_test/extroot/Publications/Theses/2019_paper_on_exciting_stuff/useless.xlsx diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03/README.md b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03/README.md similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03/README.md rename to 
integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03/README.md diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03/sim.py b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03/sim.py similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03/sim.py rename to integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03/sim.py diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03/snapshots.dat b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03/snapshots.dat similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03/snapshots.dat rename to integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03/snapshots.dat diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/README.md b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/README.md similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/README.md rename to integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/README.md diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/large_sim.py b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/large_sim.py similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/large_sim.py rename to integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/large_sim.py diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/parameters.p b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/parameters.p similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/parameters.p rename to integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/parameters.p diff --git a/integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/timeseries.npy b/integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/timeseries.npy similarity index 100% rename from integrationtests/extroot/SimulationData/TestProject/2019-02-03_something/timeseries.npy rename to integrationtests/full_test/extroot/SimulationData/TestProject/2019-02-03_something/timeseries.npy diff --git a/integrationtests/filldb.sh b/integrationtests/full_test/filldb.sh similarity index 100% rename from integrationtests/filldb.sh rename to integrationtests/full_test/filldb.sh diff --git a/integrationtests/insert_model.py b/integrationtests/full_test/insert_model.py similarity index 100% rename from integrationtests/insert_model.py rename to integrationtests/full_test/insert_model.py diff --git a/integrationtests/insert_record.py b/integrationtests/full_test/insert_record.py similarity index 100% rename from integrationtests/insert_record.py rename to integrationtests/full_test/insert_record.py diff --git a/integrationtests/model.yml b/integrationtests/full_test/model.yml similarity index 100% rename from integrationtests/model.yml rename to integrationtests/full_test/model.yml diff --git a/integrationtests/test.sh b/integrationtests/full_test/test.sh similarity index 100% rename from integrationtests/test.sh rename to integrationtests/full_test/test.sh diff --git a/integrationtests/test_crawler.py 
b/integrationtests/full_test/test_crawler.py similarity index 96% rename from integrationtests/test_crawler.py rename to integrationtests/full_test/test_crawler.py index 5797ba42cb795b362ba7271a2bfced8109b402aa..d3952f09bde9a930b5b9bc28d196f1178b5a78bc 100755 --- a/integrationtests/test_crawler.py +++ b/integrationtests/full_test/test_crawler.py @@ -104,11 +104,11 @@ class CrawlerTest(unittest.TestCase): ######################### # # first publication # # ######################### - pub = db.execute_query("FIND really_cool_finding", unique=True) + pub = db.execute_query("FIND *really_cool_finding", unique=True) # There should be a Project with name TestProject which is referenced ########################## # # second publication # # ########################## - pub = db.execute_query("FIND paper_on_exciting_stuff ", unique=True) + pub = db.execute_query("FIND *paper_on_exciting_stuff ", unique=True) diff --git a/integrationtests/single_tests/test_cfood.py b/integrationtests/single_tests/test_cfood.py new file mode 100644 index 0000000000000000000000000000000000000000..a489b9b68a028ce273c20b44d3545b973d54c0b8 --- /dev/null +++ b/integrationtests/single_tests/test_cfood.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# ** header v3.0 +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2018 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# ** end header +import unittest +from tempfile import NamedTemporaryFile + +import caosdb as db + +from caosadvancedtools.cfood import AbstractCFood + + +class CFoodTest(unittest.TestCase): + def setUp(self): + pass + + def test_check_existence(self): + pass diff --git a/integrationtests/single_tests/test_crawler.py b/integrationtests/single_tests/test_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..1647b8ccc9a61e371a00c563f08fb36bb3bab979 --- /dev/null +++ b/integrationtests/single_tests/test_crawler.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# This file is a part of the CaosDB Project. +# +# Copyright (C) 2018 Research Group Biomedical Physics, +# Max-Planck-Institute for Dynamics and Self-Organization Göttingen +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. 
If not, see <https://www.gnu.org/licenses/>. +# +import unittest +from copy import deepcopy +from tempfile import NamedTemporaryFile + +import caosdb as db + +from caosadvancedtools.crawler import Crawler + + +def seek_and_destroy(names): + for name in names: + db.execute_query("FIND "+name).delete(raise_exception_on_error=False) + + +class CrawlerTest(unittest.TestCase): + def setUp(self): + # TODO replace by something more reasonable + seek_and_destroy(["Experiment", "Analysis", "Publication", "species"]) + self.rts = db.Container().extend([ + db.RecordType(name="Experiment").insert(), + db.RecordType(name="Analysis").insert(), + db.RecordType(name="Publication").insert(), + db.Property(name="species", datatype=db.TEXT).insert(), + ]) + self.exp = db.Record() + self.exp.add_parent(name="Experiment") + self.exp.add_property(name="species", value="microunicorn") + self.ana = db.Record() + self.ana.add_parent(name="Analysis") + self.pub = db.Record() + self.pub.add_parent(name="Publication") + + def test_check_existence(self): + assert Crawler.find_existing(self.exp) is None + + def test_find_or_insert_identifiables(self): + tmpexp = db.Record() + tmpexp.add_parent(name="Experiment") + tmpexp.add_property(name="species", value="microunicorn") + tmpana = db.Record() + tmpana.add_parent(name="Analysis") + tmpexp.insert() + tmpana.insert() + self.ana.id = tmpana.id + # exp inserted/no id; ana inserted/id; pub missing + identifiables = db.Container().extend([self.exp, self.ana, self.pub]) + old_id = id(identifiables[0]) + reference_to_first = identifiables[0] + assert reference_to_first is identifiables[0] + Crawler.find_or_insert_identifiables(identifiables) + + for el in identifiables: + assert el.is_valid() + + # check whether instance is the same + assert reference_to_first is identifiables[0] + assert old_id == id(identifiables[0]) + # order must not be changed + assert identifiables[0].get_parents()[0].name == "Experiment" + assert identifiables[1].get_parents()[0].name == "Analysis" + assert identifiables[2].get_parents()[0].name == "Publication" + + def tearDown(self): + for el in [self.exp, self.ana, self.pub, self.rts]: + try: + el.delete() + except: + pass + + +class CrawlerTestExist(CrawlerTest): + def setUp(self): + super().setUp() + self.exp.insert() + self.ana.insert() + self.pub.insert() + + def test_check_existence(self): + res = Crawler.find_existing(self.exp) + assert res.id == self.exp.id + + def tearDown(self): + for el in [self.exp, self.ana, self.pub, self.rts]: + try: + el.delete() + except: + pass diff --git a/src/caosadvancedtools/cache.py b/src/caosadvancedtools/cache.py index 932af01b2d3963b501dfbf1f101d95a55a766712..af434eb0e7a2cc73276934051c725e3a50b8181e 100644 --- a/src/caosadvancedtools/cache.py +++ b/src/caosadvancedtools/cache.py @@ -64,3 +64,30 @@ class Cache(object): return res else: return res[1] + + def update_ids_from_cache(self, entities): + """ sets ids of those entities that are in cache + + A list of hashes corresponding to the entities is returned + """ + hashes = [] + + for ent in entities: + ehash = Cache.hash_entity(ent) + hashes.append(ehash) + eid = self.check_existing(ehash) + + if eid is not None: + ent.id = eid + + return hashes + + def insert_list(self, hashes, entities): + """ Insert the ids of entities into the cache + + The hashes must correspond to the entities in the list + """ + + for ehash, ent in zip(hashes, entities): + if self.check_existing(ehash) is None: + self.insert(ehash, ent.id) diff --git a/src/caosadvancedtools/cfood.py 
b/src/caosadvancedtools/cfood.py
index 1f5602a01c32587d59f78a37cbc22e463bebb8c5..ac830012e950b756df0eafce5703eaf90040e126 100644
--- a/src/caosadvancedtools/cfood.py
+++ b/src/caosadvancedtools/cfood.py
@@ -22,7 +22,17 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 #
 # ** end header
-"""does something"""
+""" Defines how something that shall be inserted into CaosDB is treated.
+
+CaosDB can automatically be filled with Records based on some file structure.
+The Crawler will iterate over the files and test for each file whether a CFood
+exists that matches the file path. If one does, it is instantiated to treat the
+match. This occurs in basically three steps:
+1. create a list of identifiables, i.e. unique representations of CaosDB Records
+(such as an experiment belonging to a project and a date/time)
+2. the identifiables are either found in CaosDB or they are created.
+3. the identifiables are updated based on the data in the file structure
+"""
 
 import argparse
 import re
@@ -31,32 +41,12 @@ from copy import deepcopy
 from datetime import datetime
 
 import caosdb as db
-from caosdb.exceptions import TransactionError
 
 from caosadvancedtools.cache import Cache
 
 ENTITIES = {}
 
 
-def get_value(prop):
-    """ Returns the value of a Property
-
-    Parameters
-    ----------
-    prop : The property of which the value shall be returned.
-
-    Returns
-    -------
-    out : The value of the property; if the value is an entity, its ID.
-
-    """
-
-    if isinstance(prop.value, db.Entity):
-        return prop.value.id
-    else:
-        return prop.value
-
-
 def get_entity(name):
     """ Returns the entity with a given name, preferably from a local cache.
 
@@ -72,89 +62,57 @@ class AbstractCFood(object):
-    # TODO restructure this class such that no instance is needed to check for
-    # a match
-    # instances shall be used to keep track of a match; i.e. entities can be
-    # object variable
+    # contains the compiled regular expression after the first execution of the
+    # function match()
+    _pattern = None
 
-    def __init__(self, pattern, use_cache=False, access=lambda x: x):
-        """Abstract base class for Crawler food (CFood).
+    def __init__(self, match, access=lambda x: x):
+        """ Abstract base class for Crawler food (CFood).
 
         Parameters
         ----------
-        pattern : str
-            The regex pattern for matching against file names.
-
-        use_cache : bool, optional
-            Whether to use caching (not re-inserting probably existing
-            objects into CaosDB), defaults to False.
+        match : match object of a regular expression match
+            the result from matching a path against the pattern of this
+            class
 
         access : callable, optional
-            Only used by child classes?
-
+            A function that takes a CaosDB path and returns a local path
         """
-        self.pattern = re.compile(pattern)
-        self.use_cache = use_cache
         self.access = access
+        self.crawled_file = match.string
+        self.match = match
 
-        if self.use_cache:
-            self.identifiable_cache = Cache()
-    def treat_match(self, crawled_file, match):
-        print(crawled_file)
-
-        entities = self.create_identifiables(crawled_file, match)
-
-        for key, identifiable in entities.items():
-
-            if identifiable is None:
-                print("THIS IS STRANGE. No identifiables found in {}.".format(
-                    crawled_file))
-
-                continue
-            existing = None
-
-            print("Looking for \n", identifiable)
-
-            if self.use_cache:
-                identifiable_cache = Cache()
-                identifier = Cache.hash_entity(identifiable)
-                cached_id = self.identifiable_cache.check_existing(identifier)
-
-                # retrieve entity for the cached id
-
-                if cached_id is not None:
-                    existing = db.execute_query("FIND {}".format(cached_id),
-                                                unique=True)
-                    print("Found Entity in cache; Id:", cached_id)
-
-            # Nothing in cache or cache not used. Check in CaosDB
-
-            if existing is None:
-                existing = AbstractCFood.find_existing(identifiable)
-
-            # No record matching the identifiable was found. Insert the record
+    @staticmethod
+    def get_re():
+        """ Returns the regular expression used to identify files that shall be
+        processed
 
-            if existing is None:
-                identifiable.insert()
-                entities[key] = identifiable
-            else:
-                entities[key] = existing
+        This function shall be implemented by subclasses.
+        """
+        raise NotImplementedError()
 
-            print("Got\n", identifiable)
+    @classmethod
+    def match(cls, string):
+        """ Matches the regular expression of this class against file names
 
-            if self.use_cache:
-                print("cid", cached_id)
+        Parameters
+        ----------
+        string : str
+            The path of the file that shall be matched.
+        """
 
-            if self.use_cache and cached_id is None:
-                identifiable_cache.insert(identifier, entities[key].id)
+        if cls._pattern is None:
+            cls._pattern = re.compile(cls.get_re())
 
-        self.update_identifiables(entities, crawled_file, match)
+        return cls._pattern.match(string)
 
-    def create_identifiables(self, crawled_file, match):
+    def create_identifiables(self):
+        """
+        must return a Container with the identifiables
+        """
         raise NotImplementedError()
 
-    def update_identifiables(self, entities, crawled_file, match):
+    def update_identifiables(self):
         raise NotImplementedError()
 
     @staticmethod
@@ -187,48 +145,3 @@ class AbstractCFood(object):
             entity.add_property(prop, value, datatype=datatype)
         else:
             entity.add_property(prop, value)
-
-    @staticmethod
-    def find_existing(entity):
-        """searches for an entity that matches the identifiable in CaosDB
-
-        Characteristics of the identifiable like, properties, name or id are
-        used for the match.
-        """
-
-        if entity.name is None:
-            # TODO multiple parents are ignored! Sufficient?
-            query_string = "FIND Record " + entity.get_parents()[0].name
-            query_string += " WITH " + " AND ".join(
-                ["'" + p.name + "'='"
-                 + str(get_value(p)) + "'" for p in entity.get_properties()])
-        else:
-            query_string = "FIND '{}'".format(entity.name)
-
-        print(query_string)
-        q = db.Query(query_string)
-        # the identifiable should identify an object uniquely. Thus the query
-        # is using the unique keyword
-        try:
-            r = q.execute(unique=True)
-        except TransactionError:
-            r = None
-
-        if r is not None:
-            print("Found Entity with id:", r.id)
-        else:
-            print("Did not find an existing entity.")
-
-        return r
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(description=__doc__,
-                                     formatter_class=RawTextHelpFormatter)
-
-    return parser
-
-
-if __name__ == "__main__":
-    parser = get_parser()
-    args = parser.parse_args()
diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index d908b8285f184f6207e185f2d96594ce482c92b7..783767247221e9ede556868a1941c5bea00328ba 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -22,16 +22,48 @@
 #
 # ** end header
 #
-"""does something"""
+""" Crawls a file structure and inserts Records into CaosDB based on what is
+found.
+
+CaosDB can automatically be filled with Records based on some file structure.
+The Crawler will iterate over the files and test for each file whether a CFood
+exists that matches the file path. If one does, it is instantiated to treat the
+match. This occurs in basically three steps:
+1. create a list of identifiables, i.e. unique representations of CaosDB Records
+(such as an experiment belonging to a project and a date/time)
+2. the identifiables are either found in CaosDB or they are created.
+3. the identifiables are updated based on the data in the file structure
+"""
 
 import caosdb as db
+from caosdb.exceptions import TransactionError
+
+from .cache import Cache
+
 
 class Crawler(object):
-    def __init__(self, food):
+    def __init__(self, food, access=lambda x: x, use_cache=False):
+        """
+        Parameters
+        ----------
+        food : list
+            The CFood classes that shall be used to treat the crawled files.
+
+        use_cache : bool, optional
+            Whether to use caching (not re-inserting probably existing
+            objects into CaosDB), defaults to False.
+
+        access : callable, optional
+            A function that takes a CaosDB path and returns a local path
+        """
         self.food = food
+        self.access = access
         self.report = db.Container()
+        self.use_cache = use_cache
+
+        if self.use_cache:
+            self.cache = Cache()
 
     def crawl(self, files):
         for crawled_file in files:
@@ -42,11 +74,82 @@
             # continue
 
-            for cfood in self.food:
-                match = cfood.pattern.match(crawled_file.path)
+            for Cfood in self.food:
+                match = Cfood.match(crawled_file.path)
 
                 if match is not None:
-                    cfood.treat_match(crawled_file, match)
+                    cfood = Cfood(match, access=self.access)
+                    identifiables = cfood.create_identifiables()
+
+                    if self.use_cache:
+                        hashes = self.cache.update_ids_from_cache(
+                            identifiables)
+
+                    self.find_or_insert_identifiables(identifiables)
+
+                    if self.use_cache:
+                        self.cache.insert_list(hashes, identifiables)
+
+                    cfood.update_identifiables()
+
+    @staticmethod
+    def find_or_insert_identifiables(identifiables):
+        """ Sets the ids of identifiables (that do not already have an id from
+        the cache) based on searching CaosDB and retrieves those entities.
+        The remaining entities (those which cannot be retrieved) have no
+        correspondence in CaosDB and are thus inserted.
+        """
+        # looking for matching entities in CaosDB when there is no valid id
+        # i.e. there was none set from a cache
+
+        for ent in identifiables:
+            if ent.id is None or ent.id < 0:
+                existing = Crawler.find_existing(ent)
+
+                if existing is not None:
+                    ent.id = existing.id
+
+        # this makes entities with existing ids valid
+        # identifiables.retrieve(unique=True, raise_exception_on_error=False)
+
+        # insert missing, i.e. those which are not valid
+        missing_identifiables = db.Container()
+        missing_identifiables.extend([ent for ent in identifiables
+                                      if ent.id is None or ent.id < 0])
+        missing_identifiables.insert()
+        identifiables.retrieve(unique=True, raise_exception_on_error=False)
+
+    @staticmethod
+    def find_existing(entity):
+        """searches for an entity that matches the identifiable in CaosDB
+
+        Characteristics of the identifiable, like properties, name or id, are
+        used for the match.
+        """
+
+        if entity.name is None:
+            # TODO multiple parents are ignored! Sufficient?
+            query_string = "FIND Record " + entity.get_parents()[0].name
+            query_string += " WITH " + " AND ".join(
+                ["'" + p.name + "'='"
+                 + str(get_value(p)) + "'" for p in entity.get_properties()])
+        else:
+            query_string = "FIND '{}'".format(entity.name)
+
+        q = db.Query(query_string)
+        # the identifiable should identify an object uniquely. Thus the query
+        # is using the unique keyword
+        try:
+            r = q.execute(unique=True)
+        except TransactionError:
+            r = None
+
+        # if r is not None:
+        #     print("Found Entity with id:", r.id)
+        # else:
+        #     print("Did not find an existing entity.")
+
+        return r
 
     @staticmethod
     def query_files(path):
@@ -57,3 +160,22 @@
         print("{} FILES TO BE PROCESSED.".format(len(files)))
 
         return files
+
+
+def get_value(prop):
+    """ Returns the value of a Property
+
+    Parameters
+    ----------
+    prop : The property of which the value shall be returned.
+
+    Returns
+    -------
+    out : The value of the property; if the value is an entity, its ID.
+
+    """
+
+    if isinstance(prop.value, db.Entity):
+        return prop.value.id
+    else:
+        return prop.value
diff --git a/src/caosadvancedtools/utils.py b/src/caosadvancedtools/utils.py
index b030a6c29d98388fae4e17fef37a60d8fa2e980c..64f828f94a09ef22daef3606490580e7fb8f0c52 100644
--- a/src/caosadvancedtools/utils.py
+++ b/src/caosadvancedtools/utils.py
@@ -118,3 +118,35 @@ def return_field_or_property(value, prop=None):
         return value[prop]
     else:
         return value
+
+
+def find_records_that_reference_ids(referenced_ids, rt="", step_size=50):
+    """ Returns a list with the ids of Records that reference entities with the
+    supplied ids
+
+    Sometimes a file or folder is referenced in a README.md (e.g. in an
+    Analysis), but it is not those files that shall be referenced but the
+    corresponding object (e.g. the Experiment). Thus, the ids of all Records
+    (of a suitable type) that reference one or more of the supplied ids are
+    collected.
+    This is done in chunks as the ids are passed in the header of the http
+    request.
+ """ + record_ids = set() + index = 0 + + while index < len(referenced_ids): + subset = referenced_ids[index:min( + index+step_size, len(referenced_ids))] + try: + q_string = ("FIND Record {} which references \n".format(rt) + + " or which references \n".join( + [str(el) for el in subset])) + exps = db.execute_query(q_string) + record_ids.update([exp.id for exp in exps]) + except Exception as e: + print(e) + pass + + index += step_size + + return list(record_ids) diff --git a/unittests/test_cache.py b/unittests/test_cache.py index 9e26cadde4217311ce7a6195cfeda3185d2b8363..c1c92330b5fba47b0a19a89913ded43ef59d3197 100644 --- a/unittests/test_cache.py +++ b/unittests/test_cache.py @@ -26,6 +26,7 @@ from copy import deepcopy from tempfile import NamedTemporaryFile import caosdb as db + from caosadvancedtools.cache import Cache @@ -45,9 +46,7 @@ class CacheTest(unittest.TestCase): ent2 = db.Record() ent2.add_parent(name="Experiment") ent_hash = Cache.hash_entity(ent) - print(ent_hash) ent2_hash = Cache.hash_entity(ent2) - print(ent2_hash) self.cache.insert(ent2_hash, 1235) assert type(self.cache.check_existing(ent2_hash)) is int assert self.cache.check_existing(ent_hash) is None @@ -57,3 +56,22 @@ class CacheTest(unittest.TestCase): def tearDown(self): os.remove(self.cache.db_file) + + def test_update_ids_from_cache(self): + ent = db.Record() + ent2 = db.Record() + ent2.add_parent(name="Experiment") + ent3 = db.Record() + ent3.add_parent(name="Analysis") + test_id = 2353243 + self.cache.insert(Cache.hash_entity(ent2), test_id) + entities = [ent, ent2, ent3] + hashes = self.cache.update_ids_from_cache(entities) + self.assertEqual(ent2.id, test_id) + + # test + ent.id = 1001 + ent3.id = 1003 + self.cache.insert_list(hashes, entities) + self.assertEqual(self.cache.check_existing(hashes[0]), 1001) + self.assertEqual(self.cache.check_existing(hashes[2]), 1003) diff --git a/unittests/test_cfood.py b/unittests/test_cfood.py index 8894e1514e28dbed9bdafe148dbaff39d07d1fd6..b69e4f0ee7df79e20e732a22c30f667ca318d4d0 100644 --- a/unittests/test_cfood.py +++ b/unittests/test_cfood.py @@ -25,29 +25,25 @@ import unittest from tempfile import NamedTemporaryFile import caosdb as db -from caosadvancedtools.cfood import AbstractCFood -# TODO this is more like an integration test. 
should be moved +from caosadvancedtools.cfood import AbstractCFood -class CFoodTest(unittest.TestCase): - def setUp(self): - self.exp = db.Record() - self.exp.add_parent(name="Experiment") - self.exp.add_property(name="species", value="microunicorn") +PATTERN = "h.*" - def test_check_existence(self): - assert AbstractCFood.find_existing(self.exp) is None +class TestCFood(AbstractCFood): -class CFoodTestExist(CFoodTest): - def setUp(self): - super().setUp() - self.exp.insert() + @staticmethod + def get_re(): + return PATTERN - def test_check_existence(self): - res = AbstractCFood.find_existing(self.exp) - assert res.id == self.exp.id - def tearDown(self): - self.exp.delete() +class CFoodReTest(unittest.TestCase): + def test(self): + self.assertEquals(TestCFood.get_re(), PATTERN) + self.assertEqual(TestCFood._pattern, None) + self.assertIsNotNone(TestCFood.match("hallo")) + self.assertIsNotNone(TestCFood._pattern) + self.assertIsNotNone(TestCFood.match("hallo")) + self.assertIsNone(TestCFood.match("allo")) diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index 9f0d618c1d7eba9dd688fe8022d34ec5282acf6f..dbf593de4b63e031777c109c26b971171e660638 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -26,6 +26,8 @@ from tempfile import NamedTemporaryFile import caosdb as db import pandas as pd +from caosdb.apiutils import compare_entities + from caosadvancedtools.table_converter import (from_table, from_tsv, to_table, to_tsv) @@ -78,14 +80,16 @@ class ToTsvTest(unittest.TestCase): c.append(r) to_tsv(NamedTemporaryFile().name, c) - -class IntegrationTest(unittest.TestCase): - """ converts tsv to a container and back and compares origin with - result """ - - def test_backandforth(self): - cont = from_tsv(TEST_TABLE, "Measurement") - tempfile = NamedTemporaryFile(delete=False) - to_tsv(tempfile.name, cont) - with open(TEST_TABLE, "r") as no1, open(tempfile.name, "r") as no2: - assert no1.read() == no2.read() +# TODO reactivate this test +# class IntegrationTest(unittest.TestCase): +# """ converts tsv to a container and back and compares origin with +# result """ +# +# def test_backandforth(self): +# cont = from_tsv(TEST_TABLE, "Measurement") +# tempfile = NamedTemporaryFile(delete=False) +# to_tsv(tempfile.name, cont) +# cont_new = from_tsv(tempfile.name, "Measurement") +# +# for ent1, ent2 in zip(cont_new, cont): +# assert compare_entities(ent1, ent2) == ([], [])
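
For reference, a minimal CFood under the restructured interface introduced in cfood.py could look like the following sketch. The class name, the regular expression and the property names are illustrative only and are not part of this changeset; get_re(), create_identifiables() and update_identifiables() are the hooks a subclass now has to provide, while match() is inherited from AbstractCFood.

import caosdb as db

from caosadvancedtools.cfood import AbstractCFood


class ToyExperimentCFood(AbstractCFood):
    """Hypothetical CFood for files below ExperimentalData/<project>/<date>/."""

    @staticmethod
    def get_re():
        # compiled lazily by AbstractCFood.match() and cached in cls._pattern
        return r".*/ExperimentalData/(?P<project>[^/]+)/(?P<date>\d{4}-\d{2}-\d{2})/.*"

    def create_identifiables(self):
        # step 1: build the unique representation of the Record(s)
        self.experiment = db.Record()
        self.experiment.add_parent(name="Experiment")
        self.experiment.add_property(name="date", value=self.match.group("date"))

        return db.Container().extend([self.experiment])

    def update_identifiables(self):
        # step 3: runs after the Crawler has found or inserted the
        # identifiables, i.e. self.experiment.id is set at this point
        pass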
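
Driving the crawler then mirrors the updated integrationtests/full_test/crawl.py: the CFood classes themselves (not instances) are handed to the Crawler, which instantiates one per matching file. The query path and the access mapping below are placeholders.

from caosadvancedtools.crawler import Crawler

# collect the File entities below a CaosDB path (placeholder path)
files = Crawler.query_files("/ExperimentalData")

c = Crawler(use_cache=True,
            access=lambda path: "/local/extroot" + path,  # CaosDB path -> local path
            food=[ToyExperimentCFood])                    # the sketch class from above
c.crawl(files)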
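
The two new Cache methods are meant to be used as a pair around the lookup/insertion step, as crawler.py now does. A rough sketch, assuming the Cache can be constructed with its default cache file and using invented ids:

import caosdb as db

from caosadvancedtools.cache import Cache

cache = Cache()  # assumption: the default cache file location is fine here

rec = db.Record()
rec.add_parent(name="Experiment")
rec.add_property(name="date", value="2019-02-03")
identifiables = db.Container().extend([rec])

# sets rec.id if an entity with the same hash was cached earlier
hashes = cache.update_ids_from_cache(identifiables)

# in the crawler, find_or_insert_identifiables() would set the missing ids here;
# for this sketch we fake that step with an invented id
rec.id = 1234

# store the now known ids; hashes[i] corresponds to identifiables[i]
cache.insert_list(hashes, identifiables)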
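
Finally, find_records_that_reference_ids() from utils.py can be used to map the ids of referenced File entities back to the Records of a given type that reference them. The ids and the RecordType below are invented:

from caosadvancedtools.utils import find_records_that_reference_ids

# ids of File entities that a README.md points to (invented values)
analysis_ids = find_records_that_reference_ids([2001, 2002, 2003], rt="Analysis")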