diff --git a/CHANGELOG.md b/CHANGELOG.md index 71fdba54f1a1df425fcce9defc37810120d3f5e2..25a434be72057f6081aee8ab85453b8d12d1ba84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `match_attrib` in XMLConverter. - `match_properties` is a method of class Converter and can for example be used by CustomConverters. +- ZipFileConverter that opens zip files and exposes their contents as + File and Directory structure elements. ### Changed ### diff --git a/src/caoscrawler/converters/__init__.py b/src/caoscrawler/converters/__init__.py index 670d4e966c72c6bcf45d0d46c1db715fb79d8ab5..edb7b3633cea2657dc3b9638379a3e57c37c87e4 100644 --- a/src/caoscrawler/converters/__init__.py +++ b/src/caoscrawler/converters/__init__.py @@ -18,11 +18,12 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Submdule containing all default and optional converters.""" +"""Submodule containing all default and optional converters.""" from .. import utils from .converters import * from .xml_converter import * +from .zipfile_converter import ZipFileConverter try: from .spss import SPSSConverter diff --git a/src/caoscrawler/converters/zipfile_converter.py b/src/caoscrawler/converters/zipfile_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..7073e66a266168e17eb9b6143e7dc6292b5149dc --- /dev/null +++ b/src/caoscrawler/converters/zipfile_converter.py @@ -0,0 +1,82 @@ +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. 
class ZipFileConverter(SimpleFileConverter):
    """Convert zip files.

    The archive is extracted into a temporary directory and its top-level
    entries are exposed as File and Directory structure elements.  The
    temporary directory is kept until ``cleanup`` is called or until the
    same converter instance extracts another archive.
    """

    def setup(self):
        """Reset the converter state: no archive has been extracted yet."""
        self._tempdir = None

    def cleanup(self):
        """Remove the temporary directory of the last extracted archive, if any.

        Calling this before any archive was extracted, or calling it
        repeatedly, is a no-op.
        """
        # getattr also covers the case that ``setup`` was never called.
        tempdir = getattr(self, "_tempdir", None)
        if tempdir is not None:
            tempdir.cleanup()
            self._tempdir = None

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """Extract a zip file and return its top-level entries.

        Parameters
        ----------
        generalStore : GeneralStore
            Not used by this converter.
        element : StructureElement
            Must be a File structure element; its ``path`` is opened as a
            zip archive.

        Returns
        -------
        list
            One Directory element for each top-level directory and one File
            element for every other top-level entry of the archive, sorted
            by name.

        Raises
        ------
        ValueError
            If ``element`` is not a File.
        """
        if not isinstance(element, File):
            raise ValueError("create_children was called with wrong type of StructureElement")

        # Release the directory of a previous call explicitly instead of
        # leaving its removal to garbage collection of the replaced object.
        self.cleanup()

        self._tempdir = tempfile.TemporaryDirectory()
        unzd_path = self._tempdir.name
        # NOTE(review): extractall does not limit the unpacked size; archives
        # from untrusted sources may need to be inspected beforehand.
        with ZipFile(element.path) as zipf:
            zipf.extractall(unzd_path)

        entity_ls = []
        # os.listdir returns entries in arbitrary order; sort for
        # reproducible results.
        for el in sorted(os.listdir(unzd_path)):
            path = join(unzd_path, el)
            if isdir(path):
                entity_ls.append(Directory(el, path))
            else:
                entity_ls.append(File(el, path))

        return entity_ls
diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index 06ce187eae1ec4936ce3aff3f2799c15be428b6c..dc7cef9f6d396c73a2a285d3f60fd587863237ac 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -27,21 +27,16 @@ import importlib import os from pathlib import Path -import jsonschema import linkahead as db import pytest import rocrate import yaml from caoscrawler import scanner from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter -from caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, TextElement) -from linkahead.high_level_api import convert_to_python_object -from lxml.etree import fromstring from rocrate.model.entity import Entity -from rocrate.rocrate import ROCrate UNITTESTDIR = Path(__file__).parent diff --git a/unittests/test_zipfile_converter.py b/unittests/test_zipfile_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc8b8804e299387157869f0dc8b11a9c2a8c6f8 --- /dev/null +++ b/unittests/test_zipfile_converter.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# +# This file is a part of the LinkAhead Project. +# +# Copyright (C) 2024 Indiscale GmbH <info@indiscale.com> +# Copyright (C) 2024 Alexander Schlemmer <a.schlemmer@indiscale.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
UNITTESTDIR = Path(__file__).parent


@pytest.fixture
def converter_registry():
    """Return a converter registry that only knows the ZipFileConverter."""
    converter_registry: dict[str, dict[str, str]] = {
        "ZipFile": {
            "converter": "ZipFileConverter",
            "package": "caoscrawler.converters"},
    }

    # Resolve each converter class from its package and store it under "class".
    for value in converter_registry.values():
        module = importlib.import_module(value["package"])
        value["class"] = getattr(module, value["converter"])
    return converter_registry


def test_zipfile_converter(converter_registry):
    """Unzip PASTA.eln and check the exposed directory and file elements."""
    zipfile = File("PASTA.eln", os.path.join(UNITTESTDIR, "eln_files", "PASTA.eln"))
    zip_conv = ZipFileConverter(yaml.safe_load("""
type: ZipFile
match: .*$
"""), "TestZipFileConverter", converter_registry)

    match = zip_conv.match(zipfile)
    assert match is not None

    children = zip_conv.create_children(GeneralStore(), zipfile)
    try:
        assert len(children) == 1
        assert children[0].name == "PASTA"

        dir_conv = DirectoryConverter(yaml.safe_load("""
type: Directory
match: ^PASTA$
"""), "TestDirectory", converter_registry)
        match = dir_conv.match(children[0])
        assert match is not None
        children = dir_conv.create_children(GeneralStore(), children[0])
        assert len(children) == 5
        # The first two children are expected to be directories, the
        # remaining three files.
        for child in children[:2]:
            assert isinstance(child, Directory)
        for child in children[2:]:
            assert isinstance(child, File)
    finally:
        # Remove the temporary directory holding the extracted archive.
        zip_conv.cleanup()