diff --git a/CHANGELOG.md b/CHANGELOG.md index d04329ed20eb29f51a4f9d149c2dca7a4cf6f240..00252658b4fe2117100a695f270bc66b752431bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Registered identifiables can also be used by children of the given RecordType if no registered identifiable is defined for them. +- ROCrate converter supports dereferencing property values with a single "@id"-property during + subtree generation. +- ROCrate converter supports the special property "variableMeasured" in addition to "hasPart". - `None` and other NA values (i.e., values where `pandas.isna` is `True`) are now interpreted as empty strings in `converters.match_name_and_value` instead of being cast to string naïvely diff --git a/setup.cfg b/setup.cfg index d05f2acb1e8d5afafa5a1003c6da2dff0980c126..ae138a9ace0dcc6d88bb1f8fb46f77ac094f73a0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,4 +51,4 @@ h5-crawler = spss = pandas[spss] rocrate = - rocrate @ git+https://github.com/salexan2001/ro-crate-py.git@f-automatic-dummy-ids + rocrate diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 8a45af753312a2bf29c1ddb9e6bcb15458c3ebde..7dcad86589961f03f1e755ddbc0b60742cf4ed4a 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -196,7 +196,21 @@ class ROCrateEntityConverter(Converter): # Add the properties: for name, value in eprops.items(): - children.append(convert_basic_element(value, name)) + if isinstance(value, dict): + # This is - according to the standard - only allowed, if it's flat, i.e. 
+ # it contains a single element with key == "@id" and the id as value which + # is supposed to be dereferenced: + if not (len(value) == 1 and "@id" in value): + raise RuntimeError("The JSON-LD is not flat.") + dereferenced = element.entity.crate.dereference(value["@id"]) + if dereferenced is not None: + children.append( + ROCrateEntity(element.folder, dereferenced)) + else: + # This is just an external ID and will be added as simple DictElement + children.append(convert_basic_element(value, name)) + else: + children.append(convert_basic_element(value, name)) # Add the files: if isinstance(element.entity, rocrate.model.file.File): @@ -204,10 +218,12 @@ class ROCrateEntityConverter(Converter): children.append(File(name, os.path.join(element.folder, path, name))) # Parts of this entity are added as child entities: - if "hasPart" in eprops: - for p in eprops["hasPart"]: - children.append( - ROCrateEntity(element.folder, element.entity.crate.dereference( - p["@id"]))) + for sublist in ("hasPart", "variableMeasured"): + if sublist in eprops: + for p in eprops[sublist]: + children.append( + ROCrateEntity(element.folder, element.entity.crate.dereference( + p["@id"]))) + # TODO: See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/195 for discussion. 
return children diff --git a/unittests/eln_cfood.yaml b/unittests/eln_cfood.yaml index ab8e7108f511b0450d37c3e60162e412d4a1bf3b..bb29b7da7c1e6c3fc555038412f42ff2ab4d28fa 100644 --- a/unittests/eln_cfood.yaml +++ b/unittests/eln_cfood.yaml @@ -26,11 +26,18 @@ DataDir: "@id": records-example/$ name: (?P<name>.*) keywords: (?P<keywords>.*) - description: (?P<description>.*) dateModified: (?P<dateModified>.*) records: Dataset: name: $name keywords: $keywords - description: $description dateModified: $dateModified + subtree: + Description: + type: ROCrateEntity + match_type: TextObject + match_properties: + text: (?P<description>.*) + records: + Dataset: + description: $description diff --git a/unittests/eln_files/PASTA.eln b/unittests/eln_files/PASTA.eln deleted file mode 100644 index 61866e7d5f57cb32191af6663be230153092e712..0000000000000000000000000000000000000000 Binary files a/unittests/eln_files/PASTA.eln and /dev/null differ diff --git a/unittests/eln_files/records-example.eln b/unittests/eln_files/records-example.eln index 09ed53fc179e80a240ab773247d6f9adee71b429..4907bcc4e88e2152fdf2675a50ca661b666c947d 100644 Binary files a/unittests/eln_files/records-example.eln and b/unittests/eln_files/records-example.eln differ diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index dc7cef9f6d396c73a2a285d3f60fd587863237ac..4b6bde171c789017e95a38729ae93f49ecf3f97b 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -76,6 +76,12 @@ def eln_entities(basic_eln_converter): return entities +@pytest.mark.xfail( + reason="The example files for PASTA have not yet been updated in:" + "https://github.com/TheELNConsortium/TheELNFileFormat/tree/master/examples/PASTA" + "However, there was the announcement that these files are going to follow the" + "flattened structure soon: https://github.com/TheELNConsortium/TheELNFileFormat/issues/98" +) def test_load_pasta(basic_eln_converter): """ Test for loading the 
.eln example export from PASTA. @@ -99,7 +105,7 @@ def test_load_kadi4mat(basic_eln_converter): match = basic_eln_converter.match(f_k4mat) assert match is not None entities = basic_eln_converter.create_children(GeneralStore(), f_k4mat) - assert len(entities) == 10 + assert len(entities) == 17 assert isinstance(entities[0], ROCrateEntity) assert isinstance(entities[0].folder, str) assert isinstance(entities[0].entity, Entity) @@ -131,15 +137,15 @@ match_properties: match = ds2.match(eln_entities[1]) assert match is not None - assert match["dateCreated"] == "2024-08-21T12:07:45.115990+00:00" + assert match["dateCreated"] == "2024-11-19T13:44:35.476888+00:00" children = ds2.create_children(GeneralStore(), eln_entities[1]) assert len(children) == 8 assert isinstance(children[0], TextElement) assert children[0].name == "@id" assert children[0].value == "ro-crate-metadata.json" - assert isinstance(children[5], DictElement) - assert children[5].value == {'@id': 'https://kadi.iam.kit.edu'} + assert isinstance(children[5], ROCrateEntity) + assert children[5].name == "https://kadi.iam.kit.edu" def test_file(eln_entities): @@ -178,13 +184,20 @@ match_properties: assert match is not None children = ds_parts.create_children(GeneralStore(), ent_parts) + # Number of children = number of properties + number of parts + + # number of variables measured + number of files + assert len(children) == (len(ent_parts.entity.properties()) + + len(ent_parts.entity.properties()["hasPart"]) + + len(ent_parts.entity.properties()["variableMeasured"])) - # Number of children = number of properties + number of parts: - assert len(children) == len(ent_parts.entity.properties()) + 4 entity_children = [f for f in children if isinstance(f, ROCrateEntity)] - assert len(entity_children) == 4 + assert len(entity_children) == 13 + file_counter = 0 + for f in entity_children: - assert isinstance(f.entity, rocrate.model.file.File) + if isinstance(f.entity, rocrate.model.file.File): + file_counter += 1 + 
assert file_counter == 4 def test_scanner(): @@ -193,7 +206,14 @@ def test_scanner(): assert len(rlist) == 1 assert isinstance(rlist[0], db.Record) assert rlist[0].name == "records-example" - assert rlist[0].description == "This is a sample record." + # This assertion was moved to a different test, see below: + # assert rlist[0].description == "This is a sample record." assert rlist[0].parents[0].name == "Dataset" assert rlist[0].get_property("keywords").value == "sample" assert rlist[0].get_property("dateModified").value == "2024-08-21T11:43:17.626965+00:00" + + +def test_description_reference(): + rlist = scanner.scan_directory(os.path.join(UNITTESTDIR, "eln_files/"), + os.path.join(UNITTESTDIR, "eln_cfood.yaml")) + assert rlist[0].description == "This is a sample record." diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py index 9bc8b8804e299387157869f0dc8b11a9c2a8c6f8..451d23c93bfc15889d5b7a9f97ef1f157aece6ee 100644 --- a/unittests/test_zipfile_converter.py +++ b/unittests/test_zipfile_converter.py @@ -50,6 +50,12 @@ def converter_registry(): return converter_registry +@pytest.mark.xfail( + reason="The example files for PASTA have not yet been updated in:" + "https://github.com/TheELNConsortium/TheELNFileFormat/tree/master/examples/PASTA" + "However, there was the announcement that these files are going to follow the" + "flattened structure soon: https://github.com/TheELNConsortium/TheELNFileFormat/issues/98" +) def test_zipfile_converter(converter_registry): zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln")) zip_conv = ZipFileConverter(yaml.safe_load(""" @@ -77,3 +83,40 @@ match: ^PASTA$ assert isinstance(children[i], Directory) for i in range(2, 5): assert isinstance(children[i], File) + + +def test_zipfile_minimal(converter_registry): + zipfile = File("empty.zip", os.path.join(UNITTESTDIR, "zip_minimal", "empty.zip")) + zip_conv = ZipFileConverter(yaml.safe_load(""" +type: ZipFile +match: .*$ 
+"""), "TestZipFileConverter", converter_registry) + + match = zip_conv.match(zipfile) + assert match is not None + + children = zip_conv.create_children(GeneralStore(), zipfile) + assert len(children) == 2 + + file_obj = None + dir_obj = None + for ch in children: + if isinstance(ch, File): + file_obj = ch + elif isinstance(ch, Directory): + dir_obj = ch + else: + assert False + assert file_obj is not None and dir_obj is not None + assert file_obj.name == "empty.txt" + + dir_conv = DirectoryConverter(yaml.safe_load(""" +type: Directory +match: ^folder$ +"""), "TestDirectory", converter_registry) + match = dir_conv.match(dir_obj) + assert match is not None + children = dir_conv.create_children(GeneralStore(), dir_obj) + assert len(children) == 3 + for i in range(3): + assert isinstance(children[i], File) diff --git a/unittests/zip_minimal/empty.zip b/unittests/zip_minimal/empty.zip new file mode 100644 index 0000000000000000000000000000000000000000..3eb2cee755e1b0265b13b1ee8f31c2aa1abe62de Binary files /dev/null and b/unittests/zip_minimal/empty.zip differ