diff --git a/.gitignore b/.gitignore index 0848d3fbca6412aac998fdbaa45ea03835fe67d0..4c175607e5327472c301949f187c58d925f0d05e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ build/ # documentation _apidoc /dist/ +*~ diff --git a/CHANGELOG.md b/CHANGELOG.md index 372f900d6f2a7eec500670097b8a255b74809699..8d3998f6225e3e8dfbe81fd98bf3152dc51ce42c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,25 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.8.0] - 2023-05-30 ## +(Florian Spreckelsen) + +### Added ### + +- TableImporter now accepts an `existing_columns` argument which demands that certain columns exist +- The `JsonSchemaParser` class supports `patternProperties` +- The `JsonSchemaParser` class supports json-schema references (`$ref`) + +### Changed ### + +- The `converters` and `datatypes` arguments of TableImporter may now have keys for non-existing columns +- The `JsonSchemaParser` class does not require the top-level entry of a json + schema definition to specify a RecordType. + +### Fixed ### + +- Refactored to work with the new default keyword in FIND queries: RECORD + ## [0.7.0] - 2023-03-09 ## (Florian Spreckelsen) diff --git a/CITATION.cff b/CITATION.cff index 83f3a0b8fe6ff860e494351c6e903c6331668118..bd468e5ad1704db033f8e81bac5277194adc2158 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -20,6 +20,6 @@ authors: given-names: Stefan orcid: https://orcid.org/0000-0001-7214-8125 title: CaosDB - Advanced User Tools -version: 0.7.0 +version: 0.8.0 doi: 10.3390/data4020083 -date-released: 2023-01-20 \ No newline at end of file +date-released: 2023-05-30 \ No newline at end of file diff --git a/integrationtests/test.sh b/integrationtests/test.sh index 36730cc948d308659f01f6153f86a917ab1909d0..9f8d003c9219f7e243fd50c5d846b9a7450b9c7b 100755 --- a/integrationtests/test.sh +++ b/integrationtests/test.sh @@ -43,21 +43,23 @@ mv DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx \ DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx_back cd .. echo "run crawler" -./crawl.py / | tee $OUT +./crawl.py / | tee "$OUT" # rename the moved file mv extroot/DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx_back \ extroot/DataAnalysis/2010_TestProject/2019-02-03_something/README.xlsx # check whether there was something UNAUTHORIZED -grep "There where unauthorized changes" $OUT +grep "There where unauthorized changes" "$OUT" # get the id of the run which is the last field of the output string -RUN_ID=$(grep "run id:" $OUT | awk '{ print $NF }') +RUN_ID=$(grep "run id:" "$OUT" | awk '{ print $NF }') echo $RUN_ID echo "run crawler again" echo "./crawl.py -a $RUN_ID /" -./crawl.py -a $RUN_ID / | tee $OUT +./crawl.py -a $RUN_ID / | tee "$OUT" set +e -if grep "There where unauthorized changes" $OUT +if grep "There where unauthorized changes" "$OUT" then + echo "There still were unauthorized changes, which should not have happened!" 
+ echo "Test FAILED" exit 1 fi set -e diff --git a/integrationtests/test_assure_functions.py b/integrationtests/test_assure_functions.py index b1c731dbbf25f33b54fc3a005402f292525d2d05..e04d481f230936ae96b02fe910401f50e7138a01 100644 --- a/integrationtests/test_assure_functions.py +++ b/integrationtests/test_assure_functions.py @@ -33,7 +33,7 @@ from caosadvancedtools.guard import (global_guard, RETRIEVE, UPDATE) def setup(): """Delete all test entities.""" - db.execute_query("FIND Test*").delete(raise_exception_on_error=False) + db.execute_query("FIND ENTITY Test*").delete(raise_exception_on_error=False) def setup_module(): @@ -105,13 +105,13 @@ def test_add_to_empty_list(): db.Record(name="TestReferencingRecord").add_parent( referencing_rt).add_property(list_prop, value=[]).insert() - referenced_rec = db.execute_query("FIND TestReferencedRecord", unique=True) + referenced_rec = db.execute_query("FIND ENTITY TestReferencedRecord", unique=True) referencing_rec = db.execute_query( - "FIND TestReferencingRecord", unique=True) + "FIND ENTITY TestReferencingRecord", unique=True) assure_object_is_in_list(referenced_rec, referencing_rec, list_prop.name) referencing_rec = db.execute_query( - "FIND TestReferencingRecord", unique=True) + "FIND ENTITY TestReferencingRecord", unique=True) assert referencing_rec.get_property(list_prop.name).value == [ referenced_rec.id] diff --git a/integrationtests/test_base_table_exporter_integration.py b/integrationtests/test_base_table_exporter_integration.py index 9d79e857fe706d78103ade3b92ee38498a2a1607..5af9caa3e83184f77c37c24073d85ee5aae2184b 100644 --- a/integrationtests/test_base_table_exporter_integration.py +++ b/integrationtests/test_base_table_exporter_integration.py @@ -81,7 +81,7 @@ def insert_entities(): def setup_module(): """Clear all test entities""" try: - db.execute_query("FIND Test*").delete() + db.execute_query("FIND ENTITY Test*").delete() except BaseException: pass @@ -146,7 +146,7 @@ def test_queries(): "Test_Property_2").value # test guessing of selector - del(export_dict["Test_Property_2"]["selector"]) + del (export_dict["Test_Property_2"]["selector"]) my_exporter = te.BaseTableExporter( export_dict=export_dict, record=rec1, raise_error_if_missing=True) assert my_exporter.export_dict["Test_Property_2"]["selector"] == "Test_Property_2" diff --git a/integrationtests/test_cache.py b/integrationtests/test_cache.py index 4b0a6cedc390b1268e8d2d89393e19a27a83b3be..da1824e8c3cdf3f68bb82f8c7f39e6eecb111f92 100644 --- a/integrationtests/test_cache.py +++ b/integrationtests/test_cache.py @@ -33,7 +33,7 @@ from caosadvancedtools.cache import UpdateCache class CacheTest(unittest.TestCase): def empty_db(self): try: - db.execute_query("FIND Test*").delete() + db.execute_query("FIND ENTITY Test*").delete() except Exception: pass diff --git a/integrationtests/test_crawl_with_datamodel_problems.py b/integrationtests/test_crawl_with_datamodel_problems.py index 0c6a145afdab682f82af09a17fb9aa0770769959..8623d57d60ded38987953ffaf78b1d30e15a8011 100644 --- a/integrationtests/test_crawl_with_datamodel_problems.py +++ b/integrationtests/test_crawl_with_datamodel_problems.py @@ -74,7 +74,7 @@ def test_crawler_with_data_model_problems(): deleted_entities = {"Experiment", "Poster", "results"} for ent in deleted_entities: - db.execute_query("FIND "+ent).delete() + db.execute_query("FIND ENTITY "+ent).delete() # Do the crawling def access(x): return "extroot" + x diff --git a/integrationtests/test_crawler_basics.py b/integrationtests/test_crawler_basics.py index 
7da90844f14cf0d1eaded9d4fc8f37320da46aad..60c09d73e954c39d752b5fa4ae5e272d28000ca1 100644 --- a/integrationtests/test_crawler_basics.py +++ b/integrationtests/test_crawler_basics.py @@ -40,7 +40,7 @@ def setup_module(): """Clear all test entities. Allow insertions.""" guard.set_level(INSERT) try: - db.execute_query("FIND Test*").delete() + db.execute_query("FIND ENTITY Test*").delete() except Exception: pass diff --git a/integrationtests/test_crawler_with_cfoods.py b/integrationtests/test_crawler_with_cfoods.py index 19b1f8ff10e365031d6940c7b904e3656eda2861..1fa5eaa5a4f050d7282b863aae626982ff738c43 100755 --- a/integrationtests/test_crawler_with_cfoods.py +++ b/integrationtests/test_crawler_with_cfoods.py @@ -30,7 +30,7 @@ from caosdb.apiutils import retrieve_entity_with_id def get_entity_with_id(eid): - return db.execute_query("FIND "+str(eid), unique=True) + return db.execute_query("FIND ENTITY "+str(eid), unique=True) class LoadFilesTest(unittest.TestCase): @@ -49,7 +49,7 @@ class CrawlerTest(unittest.TestCase): # # dummy for dependency test experiment # # ######################## exp = db.execute_query( - "FIND Experiment with date=2019-02-04 and identifier=empty_identifier", + "FIND ENTITY Experiment with date=2019-02-04 and identifier=empty_identifier", unique=True) ######################## @@ -59,7 +59,7 @@ class CrawlerTest(unittest.TestCase): # vanishing of the property # thus an x is used here. Needs to be fixed. exp = db.execute_query( - "FIND Experiment with date=2019-02-03 and identifier=empty_identifier", + "FIND ENTITY Experiment with date=2019-02-03 and identifier=empty_identifier", unique=True) # There should be a Project with name TestProject which is referenced @@ -99,7 +99,7 @@ class CrawlerTest(unittest.TestCase): # # second experiment # # ######################### exp = db.execute_query( - "FIND Experiment with date=2019-02-03 and identifier='something'", + "FIND ENTITY Experiment with date=2019-02-03 and identifier='something'", unique=True) # Should be the same project @@ -120,7 +120,7 @@ class CrawlerTest(unittest.TestCase): # # first analysis # # ###################### ana = db.execute_query( - "FIND Analysis with date=2019-02-03 and identifier='empty_identifier'", + "FIND ENTITY Analysis with date=2019-02-03 and identifier='empty_identifier'", unique=True) # There should be a Project with name TestProject which is referenced @@ -164,7 +164,7 @@ class CrawlerTest(unittest.TestCase): # # second analysis # # ####################### ana = db.execute_query( - "FIND Analysis with date=2019-02-03 and identifier='something'", + "FIND ENTITY Analysis with date=2019-02-03 and identifier='something'", unique=True) # Should be the same project @@ -197,7 +197,7 @@ class CrawlerTest(unittest.TestCase): # # first simulation # # ###################### sim = db.execute_query( - "FIND Simulation with date=2019-02-03 and identifier='empty_identifier'", + "FIND ENTITY Simulation with date=2019-02-03 and identifier='empty_identifier'", unique=True) # There should be a Project with name TestProject which is referenced @@ -228,7 +228,7 @@ class CrawlerTest(unittest.TestCase): # # second simulation # # ######################### sim = db.execute_query( - "FIND Simulation with date=2019-02-03 and identifier='something'", + "FIND ENTITY Simulation with date=2019-02-03 and identifier='something'", unique=True) sources = [get_entity_with_id(el) for el in @@ -273,7 +273,7 @@ class CrawlerTest(unittest.TestCase): ######################### # # first publication # # ######################### - 
pub = db.execute_query("FIND *really_cool_finding", unique=True) + pub = db.execute_query("FIND ENTITY *really_cool_finding", unique=True) # There should be a file as result attached with path poster.pdf datfile_id = pub.get_property("results").value[0] @@ -291,7 +291,7 @@ class CrawlerTest(unittest.TestCase): ########################## # # second publication # # ########################## - pub = db.execute_query("FIND *paper_on_exciting_stuff ", unique=True) + pub = db.execute_query("FIND ENTITY *paper_on_exciting_stuff ", unique=True) # Test type self.assertEqual(pub.parents[0].name, "Thesis") @@ -311,10 +311,10 @@ class CrawlerTest(unittest.TestCase): # # first software version # # ############################## ana = db.execute_query( - "FIND Software with version='V1.0-rc1'", unique=True) + "FIND ENTITY Software with version='V1.0-rc1'", unique=True) sw = db.execute_query( - "FIND Software with name='2010_TestSoftware'", unique=True) + "FIND ENTITY Software with name='2010_TestSoftware'", unique=True) assert sw.get_property("alias").value == "TestSoftware" # The software record should inherit from the correct software @@ -360,10 +360,10 @@ class CrawlerTest(unittest.TestCase): # # second software version # # ####################### ana = db.execute_query( - "FIND Software with version='v0.1'", unique=True) + "FIND ENTITY Software with version='v0.1'", unique=True) sw = db.execute_query( - "FIND Software with name='2010_TestSoftware'", unique=True) + "FIND ENTITY Software with name='2010_TestSoftware'", unique=True) # The software record should inherit from the correct software assert sw.id == ana.get_parents()[0].id @@ -393,11 +393,11 @@ class CrawlerTest(unittest.TestCase): # # third software version # # ####################### ana = db.execute_query( - "FIND Software with date='2020-02-04' and not version", + "FIND ENTITY Software with date='2020-02-04' and not version", unique=True) sw = db.execute_query( - "FIND Software with name='2020NewProject0X'", unique=True) + "FIND ENTITY Software with name='2020NewProject0X'", unique=True) # The software record should inherit from the correct software assert sw.id == ana.get_parents()[0].id @@ -438,11 +438,11 @@ class CrawlerTest(unittest.TestCase): # # fourth software version # # ####################### ana = db.execute_query( - "FIND Software with date='2020-02-03' and not version", + "FIND ENTITY Software with date='2020-02-03' and not version", unique=True) sw = db.execute_query( - "FIND Software with name='2020NewProject0X'", unique=True) + "FIND ENTITY Software with name='2020NewProject0X'", unique=True) assert sw.get_property("alias").value == "NewProject0X" # The software record should inherit from the correct software @@ -479,10 +479,10 @@ class CrawlerTest(unittest.TestCase): # # fifth software version # # ############################## ana = db.execute_query( - "FIND Software with version='second'", unique=True) + "FIND ENTITY Software with version='second'", unique=True) sw = db.execute_query( - "FIND Software with name='2020NewProject0X'", unique=True) + "FIND ENTITY Software with name='2020NewProject0X'", unique=True) assert sw.get_property("alias").value == "NewProject0X" # The software record should inherit from the correct software diff --git a/integrationtests/test_data_model.py b/integrationtests/test_data_model.py index 2949fa81727a6c61a8646a48c249204fa87542d8..5bf168cd25873975d73cbbaa0249f2fd4c21299b 100644 --- a/integrationtests/test_data_model.py +++ b/integrationtests/test_data_model.py @@ -57,7 +57,7 @@ class 
DataModelTest(unittest.TestCase): def tearDown(self): try: - tests = db.execute_query("FIND test*") + tests = db.execute_query("FIND ENTITY test*") tests.delete() except Exception: pass diff --git a/integrationtests/test_datamodel_problems.py b/integrationtests/test_datamodel_problems.py index 3bca302dd2a337cee7fd023ee6a64c5185bc99f5..855170338fbc81493e407fbe235415d60958c0f0 100644 --- a/integrationtests/test_datamodel_problems.py +++ b/integrationtests/test_datamodel_problems.py @@ -39,7 +39,7 @@ def setup_module(): """Clear problem sets and delete possible test entities""" DataModelProblems.missing.clear() try: - db.execute_query("FIND Test*").delete() + db.execute_query("FIND Entity Test*").delete() except Exception as delete_exc: print(delete_exc) diff --git a/integrationtests/test_im_und_export.py b/integrationtests/test_im_und_export.py index 8ea45fd2cebbcb2c3be6c8cb79805204486f7862..407faa1a1d3eb609ffd01b9c78d74f1c6a9b231b 100644 --- a/integrationtests/test_im_und_export.py +++ b/integrationtests/test_im_und_export.py @@ -8,13 +8,14 @@ from caosadvancedtools.import_from_xml import import_xml if __name__ == "__main__": print("Conducting im- and export tests") - rec = db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True) + rec = db.execute_query("FIND ENTITY 2019-02-03_really_cool_finding", unique=True) directory = TemporaryDirectory() export_related_to(rec.id, directory=directory.name) # delete everything print("Clearing database") recs = db.execute_query("FIND entity with id>99") - recs.delete() + if len(recs) > 0: + recs.delete() assert 0 == len(db.execute_query("FIND File which is stored at " "**/poster.pdf")) print("Importing stored elements") @@ -22,7 +23,7 @@ if __name__ == "__main__": # The following tests the existence of some required entities. # However, this is not a full list. 
- db.execute_query("FIND 2019-02-03_really_cool_finding", unique=True) + db.execute_query("FIND ENTITY 2019-02-03_really_cool_finding", unique=True) db.execute_query("FIND RecordType Poster", unique=True) db.execute_query("FIND RecordType Analysis", unique=True) db.execute_query("FIND RecordType Person", unique=True) diff --git a/integrationtests/test_table.py b/integrationtests/test_table.py index 15b851fb5c81611d0faba93edfc58f46f9d75e79..b8dfe349f3dac3be9bb741f937f2be4f73b6b2af 100644 --- a/integrationtests/test_table.py +++ b/integrationtests/test_table.py @@ -34,14 +34,14 @@ if __name__ == "__main__": table = pd.read_csv("example_table.csv") - assert 0 == len(db.execute_query("FIND Person with firstname=Henrik")) + assert 0 == len(db.execute_query("FIND ENTITY Person with firstname=Henrik")) first = table.loc[table.firstName == "Henrik"] tcr = TableCrawler(table=first, unique_cols=["firstName", "lastName"], recordtype="Person", interactive=False) tcr.crawl(security_level=UPDATE) - assert 1 == len(db.execute_query("FIND Person with firstname=Henrik")) + assert 1 == len(db.execute_query("FIND ENTITY Person with firstname=Henrik")) tcr = TableCrawler(table=table, unique_cols=["firstName", "lastName"], recordtype="Person", interactive=False) tcr.crawl(security_level=UPDATE) - assert 1 == len(db.execute_query("FIND Person with firstname=Henrik")) - assert 1 == len(db.execute_query("FIND Person with firstname=Max")) + assert 1 == len(db.execute_query("FIND ENTITY Person with firstname=Henrik")) + assert 1 == len(db.execute_query("FIND ENTITY Person with firstname=Max")) diff --git a/integrationtests/update_analysis.py b/integrationtests/update_analysis.py index bd18ab375437bec02320dcfd269896c2ba7e2bb0..ddebc049f449026400278a26226d341d64e678c8 100644 --- a/integrationtests/update_analysis.py +++ b/integrationtests/update_analysis.py @@ -39,7 +39,7 @@ from caosadvancedtools.serverside.generic_analysis import run def main(): - da = db.execute_query("FIND Analysis with identifier=TEST", unique=True) + da = db.execute_query("FIND ENTITY Analysis with identifier=TEST", unique=True) run(da) diff --git a/setup.py b/setup.py index 02b4f6f163fd14024b1b24e316c78014e993312a..3487be6a3e4eaa3fc3f96e3654985c6e53f81747 100755 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ from setuptools import find_packages, setup ######################################################################## MAJOR = 0 -MINOR = 7 +MINOR = 8 MICRO = 0 PRE = "" # e.g. 
rc0, alpha.1, 0.beta-23 ISRELEASED = True @@ -156,6 +156,7 @@ def setup_package(): author_email='h.tomwoerden@indiscale.com', python_requires='>=3.7', install_requires=["caosdb>=0.11.0", + "jsonref", "jsonschema>=4.4.0", "numpy>=1.17.3", "openpyxl>=3.0.7", @@ -163,7 +164,7 @@ def setup_package(): "xlrd>=2.0", ], extras_require={"h5-crawler": ["h5py>=3.3.0", ], - "gitignore-parser ": ["gitignore-parser >=0.1.0", ], + "gitignore-parser": ["gitignore-parser >=0.1.0", ], }, packages=find_packages('src'), package_dir={'': 'src'}, diff --git a/src/caosadvancedtools/cache.py b/src/caosadvancedtools/cache.py index db189b16e5755094ff0d6816aa0806b197b1e883..2b79f9ae7eedaf6e7d6896450a8e7b14e1dc9b30 100644 --- a/src/caosadvancedtools/cache.py +++ b/src/caosadvancedtools/cache.py @@ -344,7 +344,7 @@ class UpdateCache(AbstractCache): old_ones = db.Container() for ent in cont: - old_ones.append(db.execute_query("FIND {}".format(ent.id), + old_ones.append(db.execute_query("FIND ENTITY {}".format(ent.id), unique=True)) return old_ones diff --git a/src/caosadvancedtools/cfood.py b/src/caosadvancedtools/cfood.py index 4a9f955a17fc429deb6cdd10c3645700e579b4df..c0da4f0156dc2af48a4ba80b4d0af69c62cd5c3e 100644 --- a/src/caosadvancedtools/cfood.py +++ b/src/caosadvancedtools/cfood.py @@ -807,7 +807,7 @@ class RowCFood(AbstractCFood): def update_identifiables(self): rec = self.identifiables[0] - for key, value in self.item.iteritems(): + for key, value in self.item.items(): if key in self.unique_cols: continue assure_property_is(rec, key, diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 7b81b81ee9a717c226de7f3f3c0e5f28e3e6e789..5e84bc8a60c1b358150c4db389efb62656af0631 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -212,15 +212,14 @@ class Crawler(object): new_cont.insert(unique=False) logger.info("Successfully inserted {} records!".format(len(new_cont))) all_inserts += len(new_cont) - logger.info("Finished with authorized updates.") + logger.info("Finished with authorized inserts.") changes = cache.get_updates(run_id) for _, _, old, new, _ in changes: - new_cont = db.Container() - new_cont = new_cont.from_xml(new) + new_cont = db.Container.from_xml(new) ids = [] - tmp = [] + tmp = db.Container() update_incomplete = False # remove duplicate entities for el in new_cont: @@ -230,14 +229,14 @@ class Crawler(object): else: update_incomplete = True new_cont = tmp - if new[0].version: + if new_cont[0].version: valids = db.Container() nonvalids = db.Container() for ent in new_cont: remote_ent = db.Entity(id=ent.id).retrieve() if ent.version == remote_ent.version: - valids.append(remote_ent) + valids.append(ent) else: update_incomplete = True nonvalids.append(remote_ent) diff --git a/src/caosadvancedtools/export_related.py b/src/caosadvancedtools/export_related.py index 69b588c34cc7c8123ab4291f6d8f76f06e7400be..7ae3a4dbba65faed551f75a1627eb504a3275f48 100755 --- a/src/caosadvancedtools/export_related.py +++ b/src/caosadvancedtools/export_related.py @@ -99,7 +99,7 @@ def invert_ids(entities): def export_related_to(rec_id, directory="."): if not isinstance(rec_id, int): raise ValueError("rec_id needs to be an integer") - ent = db.execute_query("FIND {}".format(rec_id), unique=True) + ent = db.execute_query("FIND ENTITY {}".format(rec_id), unique=True) cont = recursively_collect_related(ent) export(cont, directory=directory) diff --git a/src/caosadvancedtools/models/parser.py b/src/caosadvancedtools/models/parser.py index 
c9b890de570d29e4a013b14ebe4579e956277ed2..b77b37669b27ee0d2ddf749eeae54915714e54ec 100644 --- a/src/caosadvancedtools/models/parser.py +++ b/src/caosadvancedtools/models/parser.py @@ -35,8 +35,9 @@ not defined, simply the name can be supplied with no value. Parents can be provided under the 'inherit_from_xxxx' keywords. The value needs to be a list with the names. Here, NO NEW entities can be defined. """ -import json import argparse +import json +import jsonref import re import sys import yaml @@ -76,7 +77,8 @@ JSON_SCHEMA_ATOMIC_TYPES = [ "string", "boolean", "integer", - "number" + "number", + "null" ] @@ -152,13 +154,29 @@ def parse_model_from_string(string): return parser.parse_model_from_string(string) -def parse_model_from_json_schema(filename: str): +def parse_model_from_json_schema( + filename: str, + top_level_recordtype: bool = True, + types_for_missing_array_items: dict = {}, + ignore_unspecified_array_items: bool = False +): """Return a datamodel parsed from a json schema definition. Parameters ---------- filename : str The path of the json schema file that is to be parsed + top_level_recordtype : bool, optional + Whether there is a record type defined at the top level of the + schema. Default is true. + types_for_missing_array_items : dict, optional + dictionary containing fall-back types for json entries with `type: + array` but without `items` specification. Default is an empty dict. + ignore_unspecified_array_items : bool, optional + Whether to ignore `type: array` entries the type of which is not + specified by their `items` property or given in + `types_for_missing_array_items`. An error is raised if they are not + ignored. Default is False. Returns ------- @@ -174,10 +192,10 @@ def parse_model_from_json_schema(filename: str): """ # @author Florian Spreckelsen # @date 2022-02-17 - # @review Daniel Hornung 2022-02-18 - parser = JsonSchemaParser() + # @review Timm Fitschen 2023-05-25 + parser = JsonSchemaParser(types_for_missing_array_items, ignore_unspecified_array_items) - return parser.parse_model_from_json_schema(filename) + return parser.parse_model_from_json_schema(filename, top_level_recordtype) class Parser(object): @@ -258,9 +276,9 @@ class Parser(object): self.model[name] = db.Property(name=name).retrieve() continue for role in ("Property", "RecordType", "Record", "File"): - if db.execute_query("COUNT {} {}".format(role, name)) > 0: + if db.execute_query("COUNT {} \"{}\"".format(role, name)) > 0: self.model[name] = db.execute_query( - "FIND {} WITH name={}".format(role, name), unique=True) + f"FIND {role} WITH name=\"{name}\"", unique=True) break else: raise Exception("Did not find {}".format(name)) @@ -600,14 +618,13 @@ class Parser(object): class JsonSchemaParser(Parser): """Extends the yaml parser to read in datamodels defined in a json schema. - **EXPERIMENTAL:** While this calss can already be used to create data models + **EXPERIMENTAL:** While this class can already be used to create data models from basic json schemas, there are the following limitations and missing features: * Due to limitations of json-schema itself, we currently do not support inheritance in the imported data models * The same goes for suggested properties of RecordTypes - * Currently, ``$defs`` and ``$ref`` in the input schema are not resolved. * Already defined RecordTypes and (scalar) Properties can't be re-used as list properties * Reference properties that are different from the referenced RT. 
(Although @@ -615,15 +632,18 @@ class JsonSchemaParser(Parser): * Values * Roles * The extern keyword from the yaml parser - * Currently, a json-schema cannot be transformed into a data model if its - root element isn't a RecordType (or Property) with ``title`` and ``type``. """ # @author Florian Spreckelsen # @date 2022-02-17 - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 + + def __init__(self, types_for_missing_array_items={}, ignore_unspecified_array_items=False): + super().__init__() + self.types_for_missing_array_items = types_for_missing_array_items + self.ignore_unspecified_array_items = ignore_unspecified_array_items - def parse_model_from_json_schema(self, filename: str): + def parse_model_from_json_schema(self, filename: str, top_level_recordtype: bool = True): """Return a datamodel created from the definition in the json schema in `filename`. @@ -631,6 +651,9 @@ class JsonSchemaParser(Parser): ---------- filename : str The path to the json-schema file containing the datamodel definition + top_level_recordtype : bool, optional + Whether there is a record type defined at the top level of the + schema. Default is true. Returns ------- @@ -639,13 +662,13 @@ class JsonSchemaParser(Parser): """ # @author Florian Spreckelsen # @date 2022-02-17 - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 with open(filename, 'r') as schema_file: - model_dict = json.load(schema_file) + model_dict = jsonref.load(schema_file) - return self._create_model_from_dict(model_dict) + return self._create_model_from_dict(model_dict, top_level_recordtype=top_level_recordtype) - def _create_model_from_dict(self, model_dict: [dict, List[dict]]): + def _create_model_from_dict(self, model_dict: [dict, List[dict]], top_level_recordtype: bool = True): """Parse a dictionary and return the Datamodel created from it. The dictionary was typically created from the model definition in a json schema file. @@ -654,36 +677,68 @@ class JsonSchemaParser(Parser): ---------- model_dict : dict or list[dict] One or several dictionaries read in from a json-schema file + top_level_recordtype : bool, optional + Whether there is a record type defined at the top level of the + schema. Default is true. 
Returns ------- our : DataModel The datamodel defined in `model_dict` """ - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 if isinstance(model_dict, dict): model_dict = [model_dict] for ii, elt in enumerate(model_dict): - if "title" not in elt: - raise JsonSchemaDefinitionError( - f"Object {ii+1} is lacking the `title` key word") - if "type" not in elt: - raise JsonSchemaDefinitionError( - f"Object {ii+1} is lacking the `type` key word") - # Check if this is a valid Json Schema try: jsonschema.Draft202012Validator.check_schema(elt) except jsonschema.SchemaError as err: + key = elt["title"] if "title" in elt else f"element {ii}" raise JsonSchemaDefinitionError( - f"Json Schema error in {elt['title']}:\n{str(err)}") from err - name = self._stringify(elt["title"], context=elt) - self._treat_element(elt, name) + f"Json Schema error in {key}:\n{str(err)}") from err + + if top_level_recordtype: + if "title" not in elt: + raise JsonSchemaDefinitionError( + f"Object {ii+1} is lacking the `title` key word") + if "type" not in elt: + raise JsonSchemaDefinitionError( + f"Object {ii+1} is lacking the `type` key word") + # Check if this is a valid Json Schema + name = self._stringify(elt["title"], context=elt) + self._treat_element(elt, name) + elif "properties" in elt or "patternProperties" in elt: + # No top-level type but there are entities + if "properties" in elt: + for key, prop in elt["properties"].items(): + name = self._get_name_from_property(key, prop) + self._treat_element(prop, name) + if "patternProperties" in elt: + # See also treatment in ``_treat_record_type``. Since here, + # there is no top-level RT we use the prefix `__Pattern`, + # i.e., the resulting Record Types will be called + # `__PatternElement`. + self._treat_pattern_properties( + elt["patternProperties"], name_prefix="__Pattern") + else: + # Neither RecordType itself, nor further properties in schema, + # so nothing to do here. Maybe add something in the future. + continue return DataModel(self.model.values()) + def _get_name_from_property(self, key: str, prop: dict): + # @review Timm Fitschen 2023-05-25 + if "title" in prop: + name = self._stringify(prop["title"]) + else: + name = self._stringify(key) + + return name + def _get_atomic_datatype(self, elt): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 if elt["type"] == "string": if "format" in elt and elt["format"] in ["date", "date-time"]: return db.DATETIME @@ -695,11 +750,15 @@ class JsonSchemaParser(Parser): return db.DOUBLE elif elt["type"] == "boolean": return db.BOOLEAN + elif elt["type"] == "null": + # This could be any datatype since a valid json will never have a + # value in a null property. We use TEXT for convenience. + return db.TEXT else: raise JsonSchemaDefinitionError(f"Unkown atomic type in {elt}.") def _treat_element(self, elt: dict, name: str): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 force_list = False if name in self.model: return self.model[name], force_list @@ -710,12 +769,17 @@ class JsonSchemaParser(Parser): if name == "name": # This is identified with the CaosDB name property as long as the # type is correct. - if not elt["type"] == "string": + if not elt["type"] == "string" and "string" not in elt["type"]: raise JsonSchemaDefinitionError( "The 'name' property must be string-typed, otherwise it cannot " "be identified with CaosDB's name property." 
) return None, force_list + # LinkAhead suports null for all types, so in the very special case of + # `"type": ["null", "<other_type>"]`, only consider the other type: + if isinstance(elt["type"], list) and len(elt["type"]) == 2 and "null" in elt["type"]: + elt["type"].remove("null") + elt["type"] = elt["type"][0] if "enum" in elt: ent = self._treat_enum(elt, name) elif elt["type"] in JSON_SCHEMA_ATOMIC_TYPES: @@ -733,11 +797,12 @@ class JsonSchemaParser(Parser): # treat_something function ent.description = elt["description"] - self.model[name] = ent + if ent is not None: + self.model[name] = ent return ent, force_list def _treat_record_type(self, elt: dict, name: str): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 rt = db.RecordType(name=name) if "required" in elt: required = elt["required"] @@ -745,10 +810,7 @@ class JsonSchemaParser(Parser): required = [] if "properties" in elt: for key, prop in elt["properties"].items(): - if "title" in prop: - name = self._stringify(prop["title"]) - else: - name = self._stringify(key) + name = self._get_name_from_property(key, prop) prop_ent, force_list = self._treat_element(prop, name) if prop_ent is None: # Nothing to be appended since the property has to be @@ -762,6 +824,17 @@ class JsonSchemaParser(Parser): rt.add_property(prop_ent, importance=importance, datatype=db.LIST(prop_ent)) + if "patternProperties" in elt: + + pattern_property_rts = self._treat_pattern_properties( + elt["patternProperties"], name_prefix=name) + for ppr in pattern_property_rts: + # add reference to pattern property type. These can never be + # obligatory since pattern properties cannot be required in the + # original schema (since their actual names are not known a + # priori). + rt.add_property(ppr) + if "description" in elt: rt.description = elt["description"] return rt @@ -783,28 +856,96 @@ class JsonSchemaParser(Parser): return rt def _treat_list(self, elt: dict, name: str): - # @review Timm Fitschen 2022-02-30 + # @review Timm Fitschen 2023-05-25 - if "items" not in elt: + if "items" not in elt and name not in self.types_for_missing_array_items: + if self.ignore_unspecified_array_items: + return None, False raise JsonSchemaDefinitionError( f"The definition of the list items is missing in {elt}.") - items = elt["items"] - if "enum" in items: - return self._treat_enum(items, name), True - if items["type"] in JSON_SCHEMA_ATOMIC_TYPES: - datatype = db.LIST(self._get_atomic_datatype(items)) + if "items" in elt: + items = elt["items"] + if "enum" in items: + return self._treat_enum(items, name), True + if items["type"] in JSON_SCHEMA_ATOMIC_TYPES: + datatype = db.LIST(self._get_atomic_datatype(items)) + return db.Property(name=name, datatype=datatype), False + if items["type"] == "object": + if "title" not in items or self._stringify(items["title"]) == name: + # Property is RecordType + return self._treat_record_type(items, name), True + else: + # List property will be an entity of its own with a name + # different from the referenced RT + ref_rt = self._treat_record_type( + items, self._stringify(items["title"])) + self.model[ref_rt.name] = ref_rt + return db.Property(name=name, datatype=db.LIST(ref_rt)), False + else: + # Use predefined type: + datatype = db.LIST(self.types_for_missing_array_items[name]) return db.Property(name=name, datatype=datatype), False - if items["type"] == "object": - if "title" not in items or self._stringify(items["title"]) == name: - # Property is RecordType - return self._treat_record_type(items, name), True + + 
def _get_pattern_prop(self): + # @review Timm Fitschen 2023-05-25 + if "__pattern_property_pattern_property" in self.model: + return self.model["__pattern_property_pattern_property"] + pp = db.Property(name="__matched_pattern", datatype=db.TEXT) + self.model["__pattern_property_pattern_property"] = pp + return pp + + def _treat_pattern_properties(self, pattern_elements, name_prefix=""): + """Special Treatment for pattern properties: A RecordType is created for + each pattern property. In case of a `type: object` PatternProperty, the + remaining properties of the JSON entry are appended to the new + RecordType; in case of an atomic type PatternProperty, a single value + Property is added to the RecordType. + + Raises + ------ + NotImplementedError + In case of patternProperties with non-object, non-atomic type, e.g., + array. + + """ + # @review Timm Fitschen 2023-05-25 + num_patterns = len(pattern_elements) + pattern_prop = self._get_pattern_prop() + returns = [] + for ii, (key, element) in enumerate(pattern_elements.items()): + if "title" not in element: + name_suffix = f"_{ii+1}" if num_patterns > 1 else "" + name = name_prefix + "Entry" + name_suffix + else: + name = element["title"] + if element["type"] == "object": + # simple, is already an object, so can be treated like any other + # record type. + pattern_type = self._treat_record_type(element, name) + elif element["type"] in JSON_SCHEMA_ATOMIC_TYPES: + # create a property that stores the actual value of the pattern + # property. + propname = f"{name}_value" + prop = db.Property(name=propname, datatype=self._get_atomic_datatype(element)) + self.model[propname] = prop + pattern_type = db.RecordType(name=name) + pattern_type.add_property(prop) + else: + raise NotImplementedError( + "Pattern properties are currently only supported for types " + + ", ".join(JSON_SCHEMA_ATOMIC_TYPES) + ", and object.") + + # Add pattern property and description + pattern_type.add_property(pattern_prop, importance=db.OBLIGATORY) + if pattern_type.description: + pattern_type.description += f"\n\npattern: {key}" else: - # List property will be an entity of its own with a name - # different from the referenced RT - ref_rt = self._treat_record_type( - items, self._stringify(items["title"])) - self.model[ref_rt.name] = ref_rt - return db.Property(name=name, datatype=db.LIST(ref_rt)), False + pattern_type.description = f"pattern: {key}" + + self.model[name] = pattern_type + returns.append(pattern_type) + + return returns if __name__ == "__main__": diff --git a/src/caosadvancedtools/scifolder/utils.py b/src/caosadvancedtools/scifolder/utils.py index afa671af85506a57a06ad5198bec4495823c76f1..50e897c7d2f19c6269ec622489c5a2c6ce1a28e0 100644 --- a/src/caosadvancedtools/scifolder/utils.py +++ b/src/caosadvancedtools/scifolder/utils.py @@ -154,7 +154,7 @@ def create_files_list(df, ftype): files = [] for indx, src in df.loc[ftype, - pd.notnull(df.loc[ftype])].iteritems(): + pd.notnull(df.loc[ftype])].items(): desc = df.loc[ftype+" description", indx] if pd.notnull(desc): diff --git a/src/caosadvancedtools/table_converter.py b/src/caosadvancedtools/table_converter.py index 76f4dfcdb5f040d81d923289a7a730806ad8681b..4b8591ed009ee8e63b328ad43e0d458b3e805ce7 100644 --- a/src/caosadvancedtools/table_converter.py +++ b/src/caosadvancedtools/table_converter.py @@ -79,7 +79,7 @@ def from_table(spreadsheet, recordtype): rec = db.Record() rec.add_parent(name=recordtype) - for key, value in row.iteritems(): + for key, value in row.items(): if key.lower() == "description": 
rec.description = value continue diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 1f515e78e3ddbd198fa0336589a359ba9154f038..8f793584051386796bce18bdbaded6c7e34c06ca 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -210,7 +210,7 @@ class TableImporter(): """ def __init__(self, converters, obligatory_columns=None, unique_keys=None, - datatypes=None): + datatypes=None, existing_columns=None): """ Parameters ---------- @@ -221,7 +221,7 @@ class TableImporter(): value check is not necessary. obligatory_columns : list, optional - List of column names, each listed column must not have missing values. + List of column names that (if they exist) must not have missing values. unique_keys : list, optional List of column names that in combination must be unique: each row has a unique @@ -232,22 +232,31 @@ class TableImporter(): checked whether they have the provided datatype. This dict also defines what columns are required to exist throught the existing keys. + existing_columns : list, optional + List of column names that must exist but may have missing (NULL) values """ if converters is None: converters = {} + self.converters = converters + + if obligatory_columns is None: + obligatory_columns = [] + self.obligatory_columns = obligatory_columns + + if unique_keys is None: + unique_keys = [] + self.unique_keys = unique_keys if datatypes is None: datatypes = {} + self.datatypes = datatypes + + if existing_columns is None: + existing_columns = [] + self.existing_columns = existing_columns self.sup = SuppressKnown() - self.required_columns = list(converters.keys())+list(datatypes.keys()) - self.obligatory_columns = ([] - if obligatory_columns is None - else obligatory_columns) - self.unique_keys = [] if unique_keys is None else unique_keys - self.converters = converters - self.datatypes = datatypes def read_file(self, filename, **kwargs): raise NotImplementedError() @@ -263,7 +272,7 @@ class TableImporter(): """ - for col in self.required_columns: + for col in self.existing_columns: if col not in df.columns: errmsg = "Column '{}' missing in ".format(col) errmsg += ("\n{}.\n".format(filename) if filename @@ -323,6 +332,8 @@ class TableImporter(): """ for key, datatype in self.datatypes.items(): + if key not in df.columns: + continue # Check for castable numeric types first: We unconditionally cast int to the default # float, because CaosDB does not have different sizes anyway. 
col_dtype = df.dtypes[key] @@ -333,8 +344,7 @@ class TableImporter(): df[key] = df[key].astype(datatype) # Now check each element - for idx, val in df.loc[ - pd.notnull(df.loc[:, key]), key].iteritems(): + for idx, val in df.loc[pd.notnull(df.loc[:, key]), key].items(): if not isinstance(val, datatype): msg = ( @@ -363,22 +373,20 @@ class TableImporter(): for index, row in df.iterrows(): # if none of the relevant information is given, skip - - if np.array([pd.isnull(row.loc[key]) for key in - self.obligatory_columns]).all(): - + if pd.isnull(row.loc[[key for key in self.obligatory_columns if key in df.columns]]).all(): df = df.drop(index) continue # if any of the relevant information is missing, report it - i = 0 okay = True while okay and i < len(self.obligatory_columns): key = self.obligatory_columns[i] i += 1 + if key not in df.columns: + continue if pd.isnull(row.loc[key]): errmsg = ( @@ -449,7 +457,10 @@ class XLSImporter(TableImporter): "All but the first are being ignored.".format(filename)) try: - df = xls_file.parse(converters=self.converters, **kwargs) + tmpdf = xls_file.parse(**kwargs) + applicable_converters = {k: v for k, v in self.converters.items() + if k in tmpdf.columns} + df = xls_file.parse(converters=applicable_converters, **kwargs) except Exception as e: logger.warning( "Cannot parse {}.\n{}".format(filename, e), @@ -465,7 +476,11 @@ class XLSImporter(TableImporter): class CSVImporter(TableImporter): def read_file(self, filename, sep=",", **kwargs): try: - df = pd.read_csv(filename, sep=sep, converters=self.converters, + tmpdf = pd.read_csv(filename, sep=sep, converters=self.converters, + **kwargs) + applicable_converters = {k: v for k, v in self.converters.items() + if k in tmpdf.columns} + df = pd.read_csv(filename, sep=sep, converters=applicable_converters, **kwargs) except ValueError as ve: logger.warning( @@ -482,6 +497,10 @@ class CSVImporter(TableImporter): class TSVImporter(TableImporter): def read_file(self, filename, **kwargs): try: + tmpdf = pd.read_csv(filename, sep="\t", converters=self.converters, + **kwargs) + applicable_converters = {k: v for k, v in self.converters.items() + if k in tmpdf.columns} df = pd.read_csv(filename, sep="\t", converters=self.converters, **kwargs) except ValueError as ve: diff --git a/src/doc/conf.py b/src/doc/conf.py index 7d3533b06abdbb7f28217364fa1762c770a9a145..9db07d72a4178e2d09761aef752ebe13b20a8856 100644 --- a/src/doc/conf.py +++ b/src/doc/conf.py @@ -27,9 +27,9 @@ copyright = '2021, IndiScale GmbH' author = 'Daniel Hornung' # The short X.Y version -version = '0.7.0' +version = '0.8.0' # The full version, including alpha/beta/rc tags -release = '0.7.0' +release = '0.8.0' # -- General configuration --------------------------------------------------- diff --git a/src/doc/index.rst b/src/doc/index.rst index 5fdb78da4eddfd0145d0357202246d4b5352dcf4..6c2c5f9894ad5c0f5dc3f124de726d264f46d452 100644 --- a/src/doc/index.rst +++ b/src/doc/index.rst @@ -15,6 +15,7 @@ This documentation helps you to :doc:`get started<getting_started>`, explains th Concepts <concepts> The Caosdb Crawler <crawler> YAML data model specification <yaml_interface> + Specifying a datamodel with JSON schema <json_schema_interface> _apidoc/modules diff --git a/src/doc/json_schema_interface.rst b/src/doc/json_schema_interface.rst new file mode 100644 index 0000000000000000000000000000000000000000..0e8aebd3a4204f29608212f7ed0c115fd1d4a134 --- /dev/null +++ b/src/doc/json_schema_interface.rst @@ -0,0 +1,75 @@ +Defining datamodels with a JSON schema 
specification +==================================================== + +TODO, see https://gitlab.com/caosdb/caosdb-advanced-user-tools/-/issues/42 + +Further information +################### + +Pattern Properties +%%%%%%%%%%%%%%%%%% + +The JSON-schema parser has rudimentary support for ``patternProperties``. Since +their names are not known a priori (only the pattern that their names will have +to match is known), we create RecordTypes for all pattern properties. The names +of these RecordTypes are created from their parent element's name by appending +the string ``"Entry"`` and possibly a number if there is more than one pattern +property for one parent. + +All the RecordTypes created for pattern properties have at least an obligatory +``__matched_pattern`` property which will -- as the name suggests -- store the +matched pattern of an actual data entry. + +.. note:: + + The ``__matched_pattern`` property is added automatically to your datamodel + as soon as there is at least one pattern property in your JSON schema. So be + sure that you don't happen to have an entity with exactly this name in your + database. + +E.g., a json schema with + +.. code-block:: json + + "dataset": { + "patternProperties": { + "^[0-9]{4,4}": { + "type": "boolean" + }, + "^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}": { + "type": "object", + "properties": { + "date_id": { + "$ref": "#/definitions/uuid" + } + } + } + } + } + +would result in a ``Dataset`` RecordType that has the two properties +``DatasetEntry_1`` and ``DatasetEntry_2`` (as always, the names can be overwritten +explicitly by specifying the ``title`` property), referencing corresponding +``DatasetEntry_1`` and ``DatasetEntry_2`` Records. + +Apart from the aforementioned ``__matched_pattern`` property, ``DatasetEntry_1`` +also has the ``DatasetEntry_1_value`` property with datatype ``BOOLEAN``, which +stores the actual value. In turn, ``DatasetEntry_2`` is of ``type: object`` and +is treated like any other RecordType. Consequently, it has, apart from the +``__matched_pattern`` property, a ``date_id`` property as specified in its +``properties``. + +Array entries without ``items`` specification +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +JSON schema allows for properties of ``type: array`` without the ``items`` +specification, which consequently can be arrays of any (and of mixed) types. While +this is in general problematic when specifying a data model, sometimes these +properties cannot be specified further, e.g., when you're using an external +schema that you cannot change. + +These properties can still be added to your datamodel by specifying their types +explicitly in a dictionary or, alternatively, they can be ignored. See the +``types_for_missing_array_items`` and ``ignore_unspecified_array_items`` +parameters of ``models.parser.JsonSchemaParser``, respectively, for more +information. 
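+Minimal usage example
+%%%%%%%%%%%%%%%%%%%%%
+
+The following sketch shows how a data model could be created from a schema file
+using the options described above. The file name ``my_schema.json`` and the
+property name ``some_array_prop`` are placeholders that have to be adapted to
+your own schema:
+
+.. code-block:: python
+
+   import caosdb as db
+   from caosadvancedtools.models.parser import parse_model_from_json_schema
+
+   # Read a schema whose top level does not define a RecordType and provide a
+   # fall-back type for an array property that lacks an `items` specification.
+   model = parse_model_from_json_schema(
+       "my_schema.json",
+       top_level_recordtype=False,
+       types_for_missing_array_items={"some_array_prop": db.TEXT},
+   )
+
+   # The result is a DataModel object which can, e.g., be synchronized with
+   # the server.
+   model.sync_data_model()
+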
diff --git a/unittests/json-schema-models/datamodel_atomic_properties.schema.json b/unittests/json-schema-models/datamodel_atomic_properties.schema.json index 3828f131180a839d5c9b8bc5aa1a1285717da723..7b4a23e5bb48b995d07a261bcae0a8a486b7969a 100644 --- a/unittests/json-schema-models/datamodel_atomic_properties.schema.json +++ b/unittests/json-schema-models/datamodel_atomic_properties.schema.json @@ -18,7 +18,8 @@ "date": { "type": "string", "format": "date" }, "integer": { "type": "integer", "description": "Some integer property" }, "boolean": { "type": "boolean" }, - "number_prop": { "type": "number", "description": "Some float property" } + "number_prop": { "type": "number", "description": "Some float property" }, + "null_prop": { "type": "null", "description": "This property will never have a value." } } } ] diff --git a/unittests/json-schema-models/datamodel_missing_array_items.schema.json b/unittests/json-schema-models/datamodel_missing_array_items.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..8ac17ac3162def3dbf070d7027fd318366bb4682 --- /dev/null +++ b/unittests/json-schema-models/datamodel_missing_array_items.schema.json @@ -0,0 +1,9 @@ +{ + "title": "something_with_missing_array_items", + "type": "object", + "properties": { + "missing": { + "type": "array" + } + } +} diff --git a/unittests/json-schema-models/datamodel_no_toplevel_entity.schema.json b/unittests/json-schema-models/datamodel_no_toplevel_entity.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..35240d765479b719576e6ee67e387790d3d6d160 --- /dev/null +++ b/unittests/json-schema-models/datamodel_no_toplevel_entity.schema.json @@ -0,0 +1,56 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://my-schema-id.net", + "type": "object", + "definitions": { + "uuid": { + "type": [ + "string", + "null" + ], + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + }, + "datetime": { + "type": "string", + "format": "date-time" + } + }, + "properties": { + "Dataset1": { + "title": "Dataset1", + "description": "Some description", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "full dataset title" + }, + "campaign": { + "type": "string", + "description": "FIXME" + }, + "number_prop": { + "type": "number", + "description": "Some float property" + }, + "user_id": { + "$ref": "#/definitions/uuid" + } + }, + "required": ["title", "number_prop"] + } + }, + "patternProperties": { + "^[0-9]{4,4}": { + "type": "boolean" + }, + "^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}": { + "type": "object", + "properties": { + "date_id": { + "$ref": "#/definitions/uuid" + } + } + } + } +} diff --git a/unittests/json-schema-models/datamodel_pattern_properties.schema.json b/unittests/json-schema-models/datamodel_pattern_properties.schema.json new file mode 100644 index 0000000000000000000000000000000000000000..9b85c7b80cf0990713f8f130050c21751e311b42 --- /dev/null +++ b/unittests/json-schema-models/datamodel_pattern_properties.schema.json @@ -0,0 +1,39 @@ +[ + { + "title": "Dataset", + "type": "object", + "patternProperties": { + "^[0-9]{4,4}": { + "type": "boolean" + }, + "^[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}": { + "type": "object", + "properties": { + "date_id": { + "type": [ + "string", + "null" + ], + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" + } + } + } + } + }, + { + "title": "Dataset2", + "type": "object", + "properties": { + "datetime": { + "type": "string", + "format": 
"date-time" + } + }, + "patternProperties": { + ".*": { + "title": "Literally anything", + "type": "object" + } + } + } +] diff --git a/unittests/test_json_schema_model_parser.py b/unittests/test_json_schema_model_parser.py index a136f9ba2ae9965978c7f1234acb16289a3ca305..a991076e6a1e1a3e92cafc7f1bb88b42b4b2ab3d 100644 --- a/unittests/test_json_schema_model_parser.py +++ b/unittests/test_json_schema_model_parser.py @@ -103,7 +103,7 @@ def test_datamodel_with_atomic_properties(): assert isinstance(rt2, db.RecordType) assert rt2.name == "Dataset2" assert not rt2.description - assert len(rt2.get_properties()) == 5 + assert len(rt2.get_properties()) == 6 date_prop = rt2.get_property("date") assert date_prop.datatype == db.DATETIME @@ -121,6 +121,9 @@ def test_datamodel_with_atomic_properties(): float_prop2 = rt2.get_property("number_prop") assert float_prop.datatype == float_prop2.datatype + null_prop = rt2.get_property("null_prop") + assert null_prop.datatype == db.TEXT + def test_required_no_list(): """Exception must be raised when "required" is not a list.""" @@ -356,3 +359,130 @@ def test_name_property(): assert str(err.value).startswith( "The 'name' property must be string-typed, otherwise it cannot be identified with CaosDB's " "name property.") + + +def test_no_toplevel_entity(): + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_no_toplevel_entity.schema.json"), top_level_recordtype=False) + + assert "Dataset1" in model + rt1 = model["Dataset1"] + + assert rt1.name == "Dataset1" + assert rt1.description == "Some description" + assert len(rt1.get_properties()) == 4 + + assert rt1.get_property("title") is not None + assert rt1.get_property("campaign") is not None + assert rt1.get_property("number_prop") is not None + assert rt1.get_property("user_id") is not None + + title_prop = rt1.get_property("title") + assert title_prop.datatype == db.TEXT + assert rt1.get_importance(title_prop.name) == db.OBLIGATORY + + campaign_prop = rt1.get_property("campaign") + assert campaign_prop.datatype == db.TEXT + assert rt1.get_importance(campaign_prop.name) == db.RECOMMENDED + + float_prop = rt1.get_property("number_prop") + assert float_prop.datatype == db.DOUBLE + assert rt1.get_importance(float_prop.name) == db.OBLIGATORY + + uid_prop = rt1.get_property("user_id") + assert uid_prop.datatype == db.TEXT + assert rt1.get_importance(uid_prop.name) == db.RECOMMENDED + + # pattern properties without top-level entity: + assert "__PatternEntry_1" in model + assert "__PatternEntry_2" in model + + pattern_boolean_rt = model["__PatternEntry_1"] + assert "pattern: " in pattern_boolean_rt.description + assert len(pattern_boolean_rt.properties) == 2 + pp = pattern_boolean_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_boolean_rt.get_importance(pp.name) == db.OBLIGATORY + value_prop = pattern_boolean_rt.get_property("__PatternEntry_1_value") + assert value_prop.datatype == db.BOOLEAN + + pattern_object_rt = model["__PatternEntry_2"] + assert "pattern: " in pattern_object_rt.description + assert len(pattern_object_rt.properties) == 2 + pp = pattern_object_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_object_rt.get_importance(pp.name) == db.OBLIGATORY + date_id_prop = pattern_object_rt.get_property("date_id") + assert date_id_prop.datatype == db.TEXT + + +def test_missing_array_items(): + + # strict behavior + with pytest.raises(JsonSchemaDefinitionError) as err: + parse_model_from_json_schema(os.path.join( + 
FILEPATH, "datamodel_missing_array_items.schema.json")) + + assert "{'type': 'array'}" in str(err) + + # ignore all problems, so a RT is created that does not have the property + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_missing_array_items.schema.json"), ignore_unspecified_array_items=True) + assert "something_with_missing_array_items" in model + rt = model["something_with_missing_array_items"] + assert isinstance(rt, db.RecordType) + assert rt.get_property("missing") is None + + # specify the type: + type_dict = {"missing": db.FILE} + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_missing_array_items.schema.json"), types_for_missing_array_items=type_dict) + assert "something_with_missing_array_items" in model + rt = model["something_with_missing_array_items"] + assert rt.get_property("missing") is not None + assert rt.get_property("missing").datatype == db.LIST(db.FILE) + + +def test_pattern_properties(): + + model = parse_model_from_json_schema(os.path.join( + FILEPATH, "datamodel_pattern_properties.schema.json")) + + assert "Dataset" in model + rt1 = model["Dataset"] + assert len(rt1.properties) == 2 + for name in ["DatasetEntry_1", "DatasetEntry_2"]: + assert rt1.get_property(name) is not None + assert rt1.get_property(name).is_reference() + + pattern_boolean_rt = model["DatasetEntry_1"] + assert "pattern: " in pattern_boolean_rt.description + assert len(pattern_boolean_rt.properties) == 2 + pp = pattern_boolean_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_boolean_rt.get_importance(pp.name) == db.OBLIGATORY + value_prop = pattern_boolean_rt.get_property("DatasetEntry_1_value") + assert value_prop.datatype == db.BOOLEAN + + pattern_object_rt = model["DatasetEntry_2"] + assert "pattern: " in pattern_object_rt.description + assert len(pattern_object_rt.properties) == 2 + pp = pattern_object_rt.get_property("__matched_pattern") + assert pp.datatype == db.TEXT + assert pattern_object_rt.get_importance(pp.name) == db.OBLIGATORY + date_id_prop = pattern_object_rt.get_property("date_id") + assert date_id_prop.datatype == db.TEXT + + assert "Dataset2" in model + rt2 = model["Dataset2"] + assert len(rt2.properties) == 2 + # This has been tested elsewhere, just make sure that it is properly created + # in the presence of pattern properties, too. 
+ assert rt2.get_property("datetime") is not None + + assert rt2.get_property("Literally anything") is not None + assert rt2.get_property("Literally anything").is_reference() + + pattern_named_rt = model["Literally anything"] + assert len(pattern_named_rt.properties) == 1 + assert pattern_named_rt.get_property("__matched_pattern") is not None diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 70f0f87f8706d72c386b18f54b7a9a10908eb477..0b3f0d7c7fc81b2a9d64e24fb2262c686ea669da 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -41,6 +41,16 @@ from caosadvancedtools.table_importer import (CSVImporter, TableImporter, from test_utils import BaseMockUpTest +# For testing the table importer +IMPORTER_KWARGS = dict( + converters={'c': float, 'd': yes_no_converter, 'x': float}, # x does not exist + datatypes={'a': str, 'b': int, 'x': int}, # x does not exist + obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')], + existing_columns=['e'], +) +VALID_DF = pd.DataFrame( + [['a', 1, 2.0, 'yes', np.nan]], columns=['a', 'b', 'c', 'd', 'e']) + class ConverterTest(unittest.TestCase): def test_yes_no(self): @@ -143,22 +153,16 @@ class ConverterTest(unittest.TestCase): class TableImporterTest(unittest.TestCase): def setUp(self): - self.importer_kwargs = dict( - converters={'c': float, 'd': yes_no_converter}, - datatypes={'a': str, 'b': int}, - obligatory_columns=['a', 'b'], unique_keys=[('a', 'b')]) - self.valid_df = pd.DataFrame( - [['a', 1, 2.0, 'yes']], columns=['a', 'b', 'c', 'd']) + self.importer_kwargs = IMPORTER_KWARGS + self.valid_df = VALID_DF def test_missing_col(self): - # check missing from converters - df = pd.DataFrame(columns=['a', 'b', 'c']) - importer = TableImporter(**self.importer_kwargs) - self.assertRaises(ValueError, importer.check_columns, df) - # check missing from datatypes - df = pd.DataFrame(columns=['a', 'd', 'c']) + # check missing from existing + df = pd.DataFrame(columns=['a', 'b']) importer = TableImporter(**self.importer_kwargs) - self.assertRaises(ValueError, importer.check_columns, df) + with pytest.raises(DataInconsistencyError) as die: + importer.check_columns(df) + assert "Column 'e' missing" in str(die.value) # check valid importer.check_columns(self.valid_df) @@ -193,6 +197,35 @@ class TableImporterTest(unittest.TestCase): self.assertEqual(df_new.shape[0], 1) +def test_check_dataframe_existing_obligatory_columns(caplog): + """Needs caplog so remove from above class.""" + # stricter test case; column 'a' must exist and have a value + strict_kwargs = IMPORTER_KWARGS.copy() + strict_kwargs["existing_columns"].append('a') + + importer = TableImporter(**strict_kwargs) + + # the valid df is still valid, since 'a' has a value + importer.check_dataframe(VALID_DF) + + # Now 'a' doesn't + df_missing_a = pd.DataFrame( + [[np.nan, 1, 2.0, 'yes', 'e']], columns=['a', 'b', 'c', 'd', 'e']) + + new_df = importer.check_dataframe(df_missing_a) + # Column is removed and a warning is in the logger: + assert new_df.shape[0] == 0 + assert "Required information is missing (a) in 1. row" in caplog.text + + df_missing_c = pd.DataFrame( + [['a', 1, 'yes', np.nan]], columns=['a', 'b', 'd', 'e']) + new_df = importer.check_dataframe(df_missing_c) + assert new_df.shape[0] == 1 + assert new_df.shape[1] == 4 + + caplog.clear() + + class XLSImporterTest(TableImporterTest): def test_full(self): """ test full run with example data """