diff --git a/.docker/Dockerfile b/.docker/Dockerfile index ca59395a90c747fc60a155c3fb5f8f264c60d42a..d5d2fe66770b2d37f7ecbb718a2260cdd7f501c1 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -20,15 +20,6 @@ ADD https://gitlab.com/api/v4/projects/13656973/repository/branches/dev \ pylib_version.json RUN git clone https://gitlab.com/caosdb/caosdb-pylib.git && \ cd caosdb-pylib && git checkout dev && pip3 install . -ADD https://gitlab.com/api/v4/projects/13656965/repository/branches/master \ - model_version.json -RUN git clone https://gitlab.com/caosdb/caosdb-models.git && \ - cd caosdb-models && pip3 install . -ADD https://gitlab.com/api/v4/projects/13601752/repository/branches/master \ - scifolder_version.json -RUN git clone \ - https://gitlab.com/henrik_indiscale/scifolder.git && \ - cd scifolder && pip3 install . COPY . /git RUN rm -r /git/.git \ && mv /git/.docker/pycaosdb.ini /git/integrationtests diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9f746e473a799c027e40b84e3f3a6b36e7539c62..9b573a53f424ccdbe3d47c426e497df15dbc1257 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -78,6 +78,7 @@ build-testenv: stage: setup only: - schedules + - web script: - df -h - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY diff --git a/CHANGELOG.md b/CHANGELOG.md index cfebbbcf981a7e96c18ea5a12bfd8c515f37759b..106d703caba5475999ce57a488be03677f84e547 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- included the scifolder module - included the caosmodels module * `send_mail` function in `caosadvancedtools.serverside.helper` module - New class to collect possible problems with the data model diff --git a/README_SETUP.md b/README_SETUP.md index b9db16a9feba246aeae8e59574047ba0f9380a38..243fba2dd1259aaefbe6c7163a242b700eb5a66e 100644 --- a/README_SETUP.md +++ b/README_SETUP.md @@ -15,17 +15,12 @@ Dependencies will be installed automatically if 
you use the below described proc For testing: - `tox` -- `scifolder`from https://gitlab.com/henrik_indiscale/scifolder ## Installation - `pip install . --user` - `pip install tox --user` -In order to run the tests you need to install the [scifolder -package](https://gitlab.com/henrik_indiscale/scifolder) by Henrik tom -Wörden. - ## Run Unit Tests `tox` diff --git a/integrationtests/crawl.py b/integrationtests/crawl.py index e4bf311e6700448aab0ebf1a5ab72bad6bf1296e..bf72b5f74b463f9ece2bd047548dcb22e8d71dac 100755 --- a/integrationtests/crawl.py +++ b/integrationtests/crawl.py @@ -32,8 +32,9 @@ import caosdb as db from caosadvancedtools.cfood import fileguide from caosadvancedtools.crawler import FileCrawler from caosadvancedtools.guard import INSERT, UPDATE -from scifolder import (AnalysisCFood, ExperimentCFood, PublicationCFood, - SimulationCFood, SoftwareCFood) +from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood, + PublicationCFood, SimulationCFood, + SoftwareCFood) try: from sss_helper import get_argument_parser, print_success diff --git a/integrationtests/insert_model.py b/integrationtests/insert_model.py index 2289f72e83545db0e7eacedfa52868507b6c4760..270a08a36d7512a8642c2ca08a9ec6ea93b81bd9 100755 --- a/integrationtests/insert_model.py +++ b/integrationtests/insert_model.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 import caosdb as db -from caosmodels.parser import parse_model_from_yaml +from caosadvancedtools.models.parser import parse_model_from_yaml model = parse_model_from_yaml("model.yml") model.sync_data_model(noquestion=True) diff --git a/integrationtests/test_crawl_with_datamodel_problems.py b/integrationtests/test_crawl_with_datamodel_problems.py index 3089bf4ce60093206e42477d740ead5854a9debc..6c212e36084430e5f7c3362a04e78565561019b2 100644 --- a/integrationtests/test_crawl_with_datamodel_problems.py +++ b/integrationtests/test_crawl_with_datamodel_problems.py @@ -30,9 +30,9 @@ from caosadvancedtools.cfood import fileguide from 
caosadvancedtools.crawler import FileCrawler from caosadvancedtools.datamodel_problems import DataModelProblems from caosadvancedtools.guard import INSERT -from caosmodels.parser import parse_model_from_yaml -from scifolder import (AnalysisCFood, ExperimentCFood, PublicationCFood, - SimulationCFood) +from caosadvancedtools.models.parser import parse_model_from_yaml +from caosadvancedtools.scifolder import (AnalysisCFood, ExperimentCFood, + PublicationCFood, SimulationCFood) def setup_module(): diff --git a/src/caosadvancedtools/import_from_xml.py b/src/caosadvancedtools/import_from_xml.py index 9942a9a9f38de90d62471cc86d32c25d55c9cba9..0bf9b1c0cbb478bb75687f9f3e41ca2d4960d2c0 100755 --- a/src/caosadvancedtools/import_from_xml.py +++ b/src/caosadvancedtools/import_from_xml.py @@ -33,7 +33,7 @@ from tempfile import NamedTemporaryFile import caosdb as db from caosdb.apiutils import apply_to_ids -from caosmodels.data_model import DataModel +from caosadvancedtools.models.data_model import DataModel def create_dummy_file(text="Please ask the administrator for this file."): diff --git a/src/caosadvancedtools/scifolder/__init__.py b/src/caosadvancedtools/scifolder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d67937b42ca23173fc93d4e704411f33d80bc4 --- /dev/null +++ b/src/caosadvancedtools/scifolder/__init__.py @@ -0,0 +1,5 @@ +from .analysis_cfood import AnalysisCFood +from .experiment_cfood import ExperimentCFood +from .publication_cfood import PublicationCFood +from .simulation_cfood import SimulationCFood +from .software_cfood import SoftwareCFood diff --git a/src/caosadvancedtools/scifolder/analysis_cfood.py b/src/caosadvancedtools/scifolder/analysis_cfood.py new file mode 100644 index 0000000000000000000000000000000000000000..27cb871aed08f41531c367567ea36ea9a3faaf69 --- /dev/null +++ b/src/caosadvancedtools/scifolder/analysis_cfood.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright (C) 2019 Henrik tom 
class AnalysisCFood(AbstractFileCFood, WithREADME):
    """CFood for README files located below a ``DataAnalysis`` folder.

    From the matched path a Project record and an Analysis record are created
    as identifiables.  The README header supplies the description, the
    responsible persons and the files/records that are referenced.
    """

    _prefix = ".*/DataAnalysis/"

    # win_paths can be used to define fields that will contain windows style
    # path instead of the default unix ones. Possible fields are:
    # ["results", "sources", "scripts","revisionOf"]
    win_paths = []

    def __init__(self, *args, **kwargs):
        """Initialise the file cfood first and the README mixin afterwards."""
        super().__init__(*args, **kwargs)
        WithREADME.__init__(self)

    def collect_information(self):
        """Look up the files named in the results, sources and scripts fields."""
        self.find_referenced_files([RESULTS, SOURCES, SCRIPTS])

    @staticmethod
    def name_beautifier(name):
        """Hook for renaming a project.

        Can be replaced when the project shall be named differently in CaosDB
        than in the folder structure.  The default returns ``name`` unchanged.
        Use discouraged.
        """

        return name

    @staticmethod
    def get_re():
        """Return the regular expression a crawled path has to match."""
        return AnalysisCFood._prefix + full_pattern

    def create_identifiables(self):
        """Create the Project, Analysis and Person identifiables."""
        match = self.match

        # The project is identified by its (optionally beautified) folder name.
        project_name = AnalysisCFood.name_beautifier(
            match.group("project_identifier"))
        self.project = db.Record(name=project_name)
        self.project.add_parent(name=dm.Project)
        self.identifiables.append(self.project)

        # The analysis is identified by date, project and identifier.
        self.analysis = db.Record()
        self.analysis.add_parent(name=dm.Analysis)
        self.analysis.add_property(name=dm.date, value=match.group("date"))
        self.analysis.add_property(name=dm.Project, value=self.project)
        self.identifiables.append(self.analysis)

        suffix = match.group("suffix")
        # TODO empty string causes an error in search, hence the placeholder
        # that is used when the folder name carries no suffix.
        self.analysis.add_property(
            name=dm.identifier,
            value="empty_identifier" if suffix is None else suffix)

        # parse people and add them to identifiables
        # TODO People are currently 'identifiable' due to their first and last
        # names. There will be conflicts
        self.people = parse_responsibles(self.header)
        self.identifiables.extend(self.people)

    def update_identifiables(self):
        """Attach description, responsibles and references to the analysis."""
        analysis = self.analysis

        assure_has_property(analysis, "description",
                            self.header["description"][0],
                            to_be_updated=self.to_be_updated)
        assure_object_is_in_list(obj=self.people,
                                 containing_object=analysis,
                                 property_name=dm.responsible,
                                 to_be_updated=self.to_be_updated,
                                 datatype=db.LIST(db.REFERENCE))
        self.reference_included_records(analysis,
                                        [RESULTS, SOURCES, SCRIPTS],
                                        to_be_updated=self.to_be_updated)

        # Records whose files are listed as sources become ``sources``.
        if SOURCES.key in self.header:
            source_types = [dm.Experiment, dm.Publication, dm.Simulation,
                            dm.Analysis]
            reference_records_corresponding_to_files(
                record=analysis,
                recordtypes=source_types,
                globs=get_glob(self.header[SOURCES.key]),
                property_name=dm.sources,
                path=self.crawled_path,
                to_be_updated=self.to_be_updated)

        self.reference_files_from_header(record=analysis)

        # An analysis can only be the revision of another analysis.
        if REVISIONOF.key in self.header:
            reference_records_corresponding_to_files(
                record=analysis,
                recordtypes=[dm.Analysis],
                property_name=dm.revisionOf,
                globs=get_glob(self.header[REVISIONOF.key]),
                path=self.crawled_path,
                to_be_updated=self.to_be_updated)
class ExperimentCFood(AbstractFileCFood, WithREADME):
    """CFood for README files located below an ``ExperimentalData`` folder.

    From the matched path a Project record and an Experiment record are
    created as identifiables.  The README header supplies the description,
    the responsible persons and the referenced files.
    """

    # win_paths can be used to define fields that will contain windows style
    # path instead of the default unix ones. Possible fields are:
    # ["results", "revisionOf"]
    win_paths = []

    @staticmethod
    def name_beautifier(x):
        """Hook for renaming a project; the default returns ``x`` unchanged."""
        return x

    def __init__(self, *args, **kwargs):
        """Initialise the file cfood first and the README mixin afterwards."""
        super().__init__(*args, **kwargs)
        WithREADME.__init__(self)

        # A stray trailing comma used to turn this into the tuple ``({},)``
        # instead of an empty dict.
        self.name_map = {}

    @staticmethod
    def get_re():
        """Return the regular expression a crawled path has to match."""
        return ".*/ExperimentalData/"+full_pattern

    def collect_information(self):
        """Look up the files named in the results field of the header."""
        self.find_referenced_files([RESULTS])

    @staticmethod
    def create_identifiable_experiment(match):
        """Build the experiment and project identifiables from a path match.

        Parameters
        ----------
        match : re.Match
            Match providing the groups ``project_identifier``, ``date`` and
            ``suffix``.

        Returns
        -------
        list
            ``[experiment, project]``, both ``db.Record`` objects.
        """
        # create the project identifiable
        name = ExperimentCFood.name_beautifier(
            match.group("project_identifier"))
        project = db.Record(name=name)
        project.add_parent(name=dm.Project)

        experiment = db.Record()
        experiment.add_parent(name=dm.Experiment)
        experiment.add_property(name=dm.date, value=match.group("date"))
        experiment.add_property(name=dm.Project, value=project)

        # A folder without suffix gets a placeholder identifier.
        suffix = match.group("suffix")
        experiment.add_property(
            name="identifier",
            value="empty_identifier" if suffix is None else suffix)

        return [experiment, project]

    def create_identifiables(self):
        """Create the Experiment, Project and Person identifiables."""
        self.experiment, self.project = (
            ExperimentCFood.create_identifiable_experiment(self.match))

        self.identifiables.extend([self.experiment, self.project])
        self.people = parse_responsibles(self.header)
        self.identifiables.extend(self.people)

    def update_identifiables(self):
        """Attach description, responsibles and references to the experiment."""
        # set description
        assure_has_property(self.experiment, "description",
                            self.header["description"][0],
                            to_be_updated=self.to_be_updated)

        # set responsible people
        assure_object_is_in_list(self.people, self.experiment, dm.responsible,
                                 to_be_updated=self.to_be_updated,
                                 datatype=db.LIST(db.REFERENCE))

        self.reference_files_from_header(record=self.experiment)

        # Guard with the same key that is read below (the original tested the
        # literal "revisionOf" but then indexed with REVISIONOF.key).
        if REVISIONOF.key in self.header:
            reference_records_corresponding_to_files(
                record=self.experiment,
                recordtypes=[dm.Experiment],
                globs=get_glob(self.header[REVISIONOF.key]),
                path=self.crawled_path,
                property_name=dm.revisionOf,
                to_be_updated=self.to_be_updated)
This should be removed or enabled via a + # configuration + + "(.*)" + + readme_pattern) diff --git a/src/caosadvancedtools/scifolder/publication_cfood.py b/src/caosadvancedtools/scifolder/publication_cfood.py new file mode 100644 index 0000000000000000000000000000000000000000..fc78e5b759e98e8989c952ccbafeef117e2ed33d --- /dev/null +++ b/src/caosadvancedtools/scifolder/publication_cfood.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright (C) 2019 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
# Maps the name of a folder below ``Publications/`` to the record type that is
# used as parent of the publication record.
_FOLDER_TO_TYPE = {
    "Theses": "Thesis",
    "Articles": "Article",
    "Posters": "Poster",
    "Presentations": "Presentation",
    "Reports": "Report",
}


def folder_to_type(name):
    """Return the publication record type that belongs to a folder name.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known publication folders.
    """
    try:
        return _FOLDER_TO_TYPE[name]
    except KeyError:
        raise ValueError(
            "Unknown publication folder: {!r}".format(name)) from None


class PublicationCFood(AbstractFileCFood, WithREADME):
    """CFood for README files located below a ``Publications`` folder.

    The matched path yields one publication record whose parent is derived
    from the folder (Theses, Articles, ...).  The README header supplies the
    description, the responsible persons and the referenced files.
    """

    # win_paths can be used to define fields that will contain windows style
    # path instead of the default unix ones. Possible fields are:
    # ["results", "sources", "scripts", "revisionOf"]
    win_paths = []

    def __init__(self, *args, **kwargs):
        """Initialise the file cfood first and the README mixin afterwards."""
        super().__init__(*args, **kwargs)
        WithREADME.__init__(self)

    def collect_information(self):
        """Look up the files named in the results, sources and scripts fields."""
        self.find_referenced_files([RESULTS, SOURCES, SCRIPTS])

    @staticmethod
    def get_re():
        """Return the regular expression a crawled path has to match.

        The date may be a bare year; month and day are optional.
        """
        _prefix = ".*/Publications/"
        _type = r"(?P<type>Theses|Articles|Posters|Presentations|Reports)/"
        _partial_date = r"(?P<date>\d{2,4}([-_]\d{1,2}[-_]\d{1,2})?)"

        return _prefix+_type+_partial_date+date_suffix_pattern+readme_pattern

    def create_identifiables(self):
        """Create the publication and Person identifiables."""
        # NOTE(review): the header is re-read here although the sibling cfoods
        # use self.header -- confirm whether that is intentional.
        header = get_header(fileguide.access(self.crawled_path))

        # The suffix group is optional in the pattern.  Concatenating None
        # used to raise a TypeError for folders that consist of a date only.
        name = self.match.group("date")
        suffix = self.match.group("suffix")

        if suffix is not None:
            name += "_" + suffix

        self.publication = db.Record(name=name)
        self.publication.add_parent(name=folder_to_type(
            self.match.group("type")))
        self.identifiables.append(self.publication)

        self.people = parse_responsibles(header)
        self.identifiables.extend(self.people)

    def update_identifiables(self):
        """Attach description, responsibles and references to the publication."""
        header = get_header(fileguide.access(self.crawled_path))
        self.publication.description = header["description"][0]

        assure_object_is_in_list(self.people, self.publication,
                                 "responsible",
                                 self.to_be_updated,
                                 datatype=db.LIST(db.REFERENCE))

        # Records whose files are listed as sources become ``sources``.
        if SOURCES.key in self.header:
            reference_records_corresponding_to_files(
                record=self.publication,
                recordtypes=[dm.Experiment, dm.Publication, dm.Simulation,
                             dm.Analysis],
                globs=get_glob(self.header[SOURCES.key]),
                property_name=dm.sources,
                path=self.crawled_path,
                to_be_updated=self.to_be_updated)
        self.reference_files_from_header(record=self.publication)

        # A publication can only be the revision of another publication.
        if REVISIONOF.key in self.header:
            reference_records_corresponding_to_files(
                record=self.publication,
                recordtypes=[dm.Publication],
                property_name=dm.revisionOf,
                globs=get_glob(self.header[REVISIONOF.key]),
                path=self.crawled_path,
                to_be_updated=self.to_be_updated)
class SimulationCFood(AbstractFileCFood, WithREADME):
    """CFood for README files located below a ``SimulationData`` folder.

    From the matched path a Project record and a Simulation record are
    created as identifiables.  The README header supplies the description,
    the responsible persons and the referenced files.
    """

    # win_paths can be used to define fields that will contain windows style
    # path instead of the default unix ones. Possible fields are:
    # ["results", "sources", "scripts", "revisionOf"]
    win_paths = []

    def __init__(self, *args, **kwargs):
        """Initialise the file cfood first and the README mixin afterwards."""
        super().__init__(*args, **kwargs)
        WithREADME.__init__(self)

    def collect_information(self):
        """Look up the files named in the results, sources and scripts fields."""
        self.find_referenced_files([RESULTS, SOURCES, SCRIPTS])

    @staticmethod
    def get_re():
        """Return the regular expression a crawled path has to match."""
        return ".*/SimulationData/" + full_pattern

    def create_identifiables(self):
        """Create the Project, Simulation and Person identifiables."""
        # create the project identifiable
        self.project = db.Record(name=self.match.group("project_identifier"))
        self.project.add_parent(name="Project")
        self.identifiables.append(self.project)

        # The simulation is identified by date, project and identifier.
        self.simulation = db.Record()
        self.simulation.add_parent(name="Simulation")
        self.simulation.add_property(
            name="date", value=self.match.group("date"))
        self.simulation.add_property(name="Project", value=self.project)

        suffix = self.match.group("suffix")
        # TODO empty string causes an error in search, hence the placeholder
        # that is used when the folder name carries no suffix.
        self.simulation.add_property(
            name="identifier",
            value="empty_identifier" if suffix is None else suffix)

        self.identifiables.append(self.simulation)
        self.people = parse_responsibles(self.header)
        self.identifiables.extend(self.people)

    def update_identifiables(self):
        """Attach description, responsibles and references to the simulation."""
        assure_has_property(self.simulation, "description",
                            self.header["description"][0],
                            to_be_updated=self.to_be_updated)

        # TODO why is here no db.LIST("Person") possible?

        assure_object_is_in_list(self.people, self.simulation,
                                 "responsible",
                                 self.to_be_updated,
                                 datatype=db.LIST(db.REFERENCE))

        # Records whose files are listed as sources become ``sources``.
        if SOURCES.key in self.header:
            reference_records_corresponding_to_files(
                record=self.simulation,
                recordtypes=["Experiment", "Publication", "Simulation",
                             "Analysis"],
                globs=get_glob(self.header[SOURCES.key]),
                property_name=dm.sources,
                path=self.crawled_path,
                to_be_updated=self.to_be_updated)
        self.reference_files_from_header(record=self.simulation)

        # A simulation is the revision of another simulation.  The original
        # searched for dm.Software records here, unlike the sibling cfoods
        # which search for their own record type, and it read the header with
        # dm.revisionOf although the guard tests REVISIONOF.key.
        if REVISIONOF.key in self.header:
            reference_records_corresponding_to_files(
                record=self.simulation,
                recordtypes=[dm.Simulation],
                property_name=dm.revisionOf,
                globs=get_glob(self.header[REVISIONOF.key]),
                path=self.crawled_path,
                to_be_updated=self.to_be_updated)
class SoftwareCFood(AbstractFileCFood, WithREADME):
    """CFood for README files located below a ``Software`` folder.

    The software itself is a RecordType (inserted if it does not exist yet);
    every dated folder becomes a record of that type, the software version.
    """

    _prefix = ".*/Software/"
    # win_paths can be used to define fields that will contain windows style
    # path instead of the default unix ones. Possible fields are:
    # ["binaries", "sourceCode","revisionOf"]
    win_paths = []

    def __init__(self, *args, **kwargs):
        """Initialise the file cfood first and the README mixin afterwards."""
        super().__init__(*args, **kwargs)
        WithREADME.__init__(self)

    def collect_information(self):
        """Look up the files named in the binaries and sourceCode fields."""
        self.find_referenced_files([BINARIES, SOURCECODE])

    @staticmethod
    def get_re():
        """Return the regular expression a crawled path has to match."""

        return SoftwareCFood._prefix + full_pattern

    def create_identifiables(self):
        """Find or insert the software RecordType and create the identifiables.

        Raises
        ------
        RuntimeError
            If more than one RecordType matches the project identifier.
        """
        project_identifier = self.match.group("project_identifier")

        # The software is a record type. Let's try to find it.
        # NOTE(review): the name is inserted into the query unquoted --
        # confirm that identifiers with spaces or special characters work.
        candidates = db.execute_query(
            "FIND RecordType Software with name = {}".format(
                project_identifier))

        if len(candidates) == 0:
            # Software not found insert if allowed
            self.software = db.RecordType(name=project_identifier)
            self.software.add_parent(name="Software")
            self.software.add_property(name="alias",
                                       value=self.match.group("project_name"))
            guard.safe_insert(self.software)
        elif len(candidates) == 1:
            self.software = candidates[0]
        else:
            # The two literals are concatenated; the separating blank was
            # missing ("Multiplematches").
            raise RuntimeError("Cannot identify software record type. Multiple "
                               "matches for {}".format(project_identifier))

        # create the software version
        # identifiable is made from parent and date and suffix
        self.softwareversion = db.Record()
        self.softwareversion.add_parent(self.software)
        self.softwareversion.add_property("date", self.match.group("date"))

        if self.match.group("suffix"):
            self.softwareversion.add_property(
                "version", self.match.group("suffix"))

        self.identifiables.append(self.softwareversion)

        # parse people and add them to identifiables
        # TODO People are currently 'identifiable' with their first and last
        # names. There will be conflicts
        self.people = parse_responsibles(self.header)
        self.identifiables.extend(self.people)

    def update_identifiables(self):
        """Set name, description, responsibles and files of the version."""
        # The version is named "<project_name>_<suffix>", falling back to the
        # date when the folder name has no suffix.
        version_name = self.match.group("project_name")

        if self.match.group("suffix"):
            version_name += "_"+self.match.group("suffix")
        else:
            version_name += "_"+self.match.group("date")

        assure_name_is(self.softwareversion, version_name,
                       to_be_updated=self.to_be_updated)
        assure_has_property(self.softwareversion, "description",
                            self.header["description"][0],
                            to_be_updated=self.to_be_updated)
        assure_object_is_in_list(obj=self.people,
                                 containing_object=self.softwareversion,
                                 property_name="responsible",
                                 to_be_updated=self.to_be_updated,
                                 datatype=db.LIST(db.REFERENCE)
                                 )

        self.reference_files_from_header(record=self.softwareversion)
def parse_responsibles(header):
    """
    Extract the responsible person(s) from the yaml header.

    If field responsible is a list every entry from that list will be added as
    a person.
    Currently only the format <Firstname> <Lastname> <*> is supported.
    If it is a simple string, it is added as the only person.
    """

    return [string_to_person(person)
            for person in read_field_as_list(header["responsible"])]


def get_files_referenced_by_field(globs, prefix="", final_glob=None):
    """
    returns all file entities at paths described by given globs

    This function assumes that the supplied globs is a list of
    filenames, directories or globs.  ``None`` entries are skipped.

    prefix should be the path of the crawled file to supply a context for
    relative paths.

    If final_glob is given it is appended to every glob that does not end with
    it already.

    The return value is a list with one query result per glob.
    """
    referenced_files = []

    for glob in globs:
        if glob is None:
            continue

        # TODO extract glob manipulation

        if final_glob is not None and not glob.endswith(final_glob):
            glob += final_glob

        # relative globs are interpreted relative to prefix
        if not glob.startswith("/"):
            glob = os.path.normpath(os.path.join(prefix, glob))
        else:
            glob = os.path.normpath(glob)

        query_string = "FIND file which is stored at {}".format(glob)
        referenced_files.append(db.execute_query(query_string))

    return referenced_files


# Path fragment a file has to contain in order to be accepted for a record
# type (keys are lower case record type names).
_ALLOWED_PATH_MARKERS = {
    "experiment": "ExperimentalData",
    "analysis": "DataAnalysis",
    "publication": "Publication",
    "simulation": "Simulation",
}


def is_filename_allowed(path, recordtype):
    """Return True if ``path`` lies in the folder tree of ``recordtype``.

    The record type is compared case-insensitively; unknown record types are
    never allowed.
    """
    marker = _ALLOWED_PATH_MARKERS.get(recordtype.lower())

    return marker is not None and marker in path


def get_entity_ids_from_include_file(prefix, file_path):
    """reads version ids from include file

    The include file is a tab separated table (lines starting with ``#`` are
    comments) with an ``ID`` column.  Relative paths are interpreted relative
    to ``prefix``.

    Raises ValueError if the table has no ``ID`` column.
    """

    if not file_path.startswith("/"):
        file_path = os.path.normpath(os.path.join(prefix, file_path))
    else:
        file_path = os.path.normpath(file_path)
    df = pd.read_csv(fileguide.access(file_path), sep="\t", comment="#")

    if "ID" not in df.columns:
        raise ValueError("Include file must have an ID column")

    return list(df.ID)


def reference_records_corresponding_to_files(record, recordtypes, globs, path,
                                             to_be_updated, property_name):
    """Reference the records that belong to the files described by globs.

    For every record type the files matching ``globs`` (directly, or below a
    matching folder) are filtered with is_filename_allowed; the records of
    that type which reference those files are then added to the list property
    ``property_name`` of ``record``.
    """
    # TODO this function needs to be refactored:
    # the treatment of keys like 'results' should be separated from searching
    # entities (see setting of globs and includes below).

    # The file lookup does not depend on the record type, so it is done once
    # instead of once per record type.
    prefix = os.path.dirname(path)
    directly_named_files = list(chain(*get_files_referenced_by_field(
        globs,
        prefix=prefix)))
    files_in_folders = list(chain(*get_files_referenced_by_field(
        globs,
        prefix=prefix,
        final_glob="**")))
    candidates = directly_named_files + files_in_folders

    for recordtype in recordtypes:
        file_ids = {f.id for f in candidates
                    if is_filename_allowed(f.path, recordtype=recordtype)}
        entities = find_records_that_reference_ids(list(file_ids),
                                                   rt=recordtype)

        if len(entities) == 0:
            continue

        assure_object_is_in_list(entities,
                                 record,
                                 property_name,
                                 to_be_updated,
                                 datatype=db.LIST(db.REFERENCE))


def create_files_list(df, ftype):
    """Collect the entries of row ``ftype`` together with their descriptions.

    ``df`` is the table read by get_xls_header (row labels in the index).
    Every non-empty cell of row ``ftype`` yields one entry: a dict
    ``{'file': ..., 'description': ...}`` if the same column of the row
    "<ftype> description" is filled, otherwise just the cell value.  A missing
    description row is treated like an empty one.
    """
    files = []
    desc_label = ftype+" description"
    has_descriptions = desc_label in df.index

    # Series.iteritems() was removed in pandas 2.0; items() is equivalent.
    for indx, src in df.loc[ftype, pd.notnull(df.loc[ftype])].items():
        desc = df.loc[desc_label, indx] if has_descriptions else None

        if pd.notnull(desc):
            files.append({'file': src, 'description': desc})
        else:
            files.append(src)

    return files


def add_value_list(header, df, name):
    """Store the non-empty cells of row ``name`` as a list in ``header``.

    Nothing is stored if ``df`` has no row labelled ``name``.
    """

    if name in df.index:
        header[name] = list(df.loc[name, pd.notnull(df.loc[name])])


def get_xls_header(filepath):
    """
    This function reads an xlsx file and creates a dictionary analogous to the
    one created by the yaml headers in README.md files read with the get_header
    function of caosdb-advancedtools.
    As xlsx files lack the hierarchical structure, the information that can be
    provided is less complex. See the possibility to use the xlsx files as a
    less powerful version for people who are not comfortable with the
    README.md files.

    The xlsx file has a defined set of rows. In each row a list of entries can
    be given. This structure is converted to a dictionary with a fixed
    structure.

    Raises ValueError if more than one description is given.
    """

    header = {}

    df = pd.read_excel(filepath, index_col=0, header=None)
    add_value_list(header, df, "responsible")
    add_value_list(header, df, "description")

    # Explicit check instead of an assert, which is stripped under -O.
    if len(header.get("description", [])) > 1:
        raise ValueError(
            "At most one description may be given in {}".format(filepath))

    for ftype in ["sources", "scripts", "results", "sourceCode", "binaries"]:
        if ftype not in df.index:
            continue
        files = create_files_list(df, ftype)

        if len(files) > 0:
            header[ftype] = files

    add_value_list(header, df, "revisionOf")
    # there should be only one revision of

    if "revisionOf" in header:
        if len(header["revisionOf"]) > 0:
            header["revisionOf"] = header["revisionOf"][0]
    add_value_list(header, df, "tags")

    return header
LOGGER = logging.getLogger("withreadme")
LOGGER.setLevel(level=logging.ERROR)


@dataclass
class DataModel:
    """Names from the data model that the README handling refers to.

    Every field defaults to the name used in the standard model; create an
    instance with other values to adapt to a differently named model.
    """
    results: str = "results"
    scripts: str = "scripts"
    sources: str = "sources"
    date: str = "date"
    Project: str = "Project"
    Analysis: str = "Analysis"
    identifier: str = "identifier"
    responsible: str = "responsible"
    revisionOf: str = "revisionOf"
    Experiment: str = "Experiment"
    Publication: str = "Publication"
    Simulation: str = "Simulation"
    binaries: str = "binaries"
    sourcecode: str = "sourceCode"
    description: str = "description"


DATAMODEL = DataModel()
dm = DATAMODEL


class HeaderField:
    """Pairs a key of the README header with a name from the data model.

    `model` is None for keys that only structure the header (e.g. "file" or
    "include") and have no counterpart in the data model.
    """

    def __init__(self, key, model):
        self.key = key
        self.model = model


RESULTS = HeaderField("results", dm.results)
SCRIPTS = HeaderField("scripts", dm.scripts)
SOURCES = HeaderField("sources", dm.sources)
FILE = HeaderField("file", None)
INCLUDE = HeaderField("include", None)
REVISIONOF = HeaderField("revisionOf", dm.revisionOf)
BINARIES = HeaderField("binaries", dm.binaries)
SOURCECODE = HeaderField("sourceCode", dm.sourcecode)
DESCRIPTION = HeaderField("description", dm.description)
RECORDTYPE = HeaderField("recordtype", None)


def _is_include_entry(value):
    """Tell whether a header entry is a dict that carries an include key."""

    return isinstance(value, dict) and INCLUDE.key in value


def get_glob(field):
    """ takes a field which must be a list of globs or dicts.

    if it is a dict, it must have either an include or a file key

    Entries with an include key are skipped; for all others the result of
    `return_field_or_property(value, "file")` is collected.
    """

    return [return_field_or_property(value, FILE.key)
            for value in field
            if not _is_include_entry(value)]


def get_description(value):
    """Return the description of a header entry, or None if it has none."""

    if isinstance(value, dict) and DESCRIPTION.key in value:
        return value[DESCRIPTION.key]

    return None


def get_rt(value):
    """Return the record type given in a header entry, or None."""

    if isinstance(value, dict) and RECORDTYPE.key in value:
        return value[RECORDTYPE.key]

    return None


class WithREADME:
    """Adds README (markdown or xlsx) header handling to a cfood.

    The methods use `self.crawled_path`, `self.attached_filenames`,
    `self.to_be_updated` and `self.win_paths`, none of which is set here.
    NOTE(review): presumably they are provided by the cfood class this is
    combined with -- confirm against the users of this class.
    """

    def __init__(self):
        self._header = None
        self.ref_files = {}

    @property
    def header(self):
        """The parsed README header; read lazily on first access.

        The reader is chosen by the (case-insensitive) file extension of
        `self.crawled_path`. Windows paths in the fields listed in
        `self.win_paths` are converted right after reading.

        Raises RuntimeError for any extension other than .md or .xlsx.
        """

        if self._header is None:
            lowered = self.crawled_path.lower()

            if lowered.endswith(".md"):
                read_header = get_md_header
            elif lowered.endswith(".xlsx"):
                read_header = get_xls_header
            else:
                raise RuntimeError("Readme format not recognized.")
            self._header = read_header(fileguide.access(self.crawled_path))
            self.convert_win_paths()

        return self._header

    def find_referenced_files(self, fields):
        """ iterates over given fields in the header and searches for files

        if the field contains a glob. For every field, `self.ref_files` gets
        a list of (files, description, recordtype) tuples under the key
        `field.model`, and the paths of all files found are appended to
        `self.attached_filenames`.
        """

        for field in fields:

            if field.key not in self.header:
                continue

            # Only entries that actually yield a glob take part. Keeping the
            # entry next to its glob guarantees that each query result is
            # combined with the description and record type of the entry it
            # came from, even if include entries are interspersed.
            entries = [value for value in self.header[field.key]
                       if not _is_include_entry(value)]
            pairs = [(glob, value)
                     for glob, value in zip(get_glob(entries), entries)
                     if glob is not None]
            files = get_files_referenced_by_field(
                [glob for glob, _ in pairs],
                prefix=os.path.dirname(self.crawled_path))

            self.ref_files[field.model] = [
                (found, get_description(value), get_rt(value))
                for found, (_, value) in zip(files, pairs)]
            # flatten returned list of file lists
            flat_list = [f.path for sublist in files
                         for f in sublist]

            if len(flat_list) == 0:
                LOGGER.warning("ATTENTION: the field {} does not reference any "
                               "known files".format(field.key))

            self.attached_filenames.extend(flat_list)

    def convert_path(self, el):
        """ converts the path in el to unix type

        el can be a dict or a string. If el is a dict it must have an include
        or a file key; an include key takes precedence.

        returns: same type as el

        Raises ValueError for a dict with neither an include nor a file key.
        """

        if not isinstance(el, dict):
            return win_path_converter(el)

        if INCLUDE.key in el:
            key = INCLUDE.key
        elif FILE.key in el:
            key = FILE.key
        else:
            raise ValueError("field should have a 'file' attribute")
        el[key] = win_path_converter(el[key])

        return el

    def convert_win_paths(self):
        """Convert the paths of all header fields named in `self.win_paths`.

        List valued fields are converted entry by entry.
        """

        for field in self.win_paths:
            if field not in self.header:
                continue
            value = self.header[field]

            if isinstance(value, list):
                self.header[field] = [self.convert_path(el) for el in value]
            else:
                self.header[field] = self.convert_path(value)

    def reference_files_from_header(self, record):
        """adds properties that reference the files collected in ref_files

        ref_files is expected to map property names to lists of
        (files, description, recordtype) tuples, where files is the list of
        file entities, description the description that shall be added to
        each and recordtype the recordtype that the files shall get as
        parent. files may be an empty list and description and recordtype
        may be None.

        The files will be grouped according to the keys used in ref_files and
        the record types. The record types take precedence: files with a
        record type are referenced by a property named like the record type,
        all others by the property named like their ref_files key.
        """
        references = {}

        for prop_name, ref_tuples in self.ref_files.items():
            generic_references = []

            for files, description, recordtype in ref_tuples:
                if len(files) == 0:
                    continue

                if description is not None:
                    for fi in files:
                        assure_has_description(fi, description, force=True)

                if recordtype is None:
                    generic_references.extend(files)

                    continue

                for fi in files:
                    # fix parent
                    assure_has_parent(fi, recordtype, force=True,
                                      unique=False)
                references.setdefault(recordtype, []).extend(files)

            if len(generic_references) > 0:
                assure_object_is_in_list(
                    generic_references,
                    record,
                    prop_name,
                    to_be_updated=self.to_be_updated,
                    datatype=db.LIST(db.REFERENCE),
                )

        for ref_type, typed_files in references.items():
            assure_object_is_in_list(
                typed_files,
                record,
                ref_type,
                to_be_updated=self.to_be_updated,
            )

    def reference_included_records(self, record, fields, to_be_updated):
        """ iterates over given fields in the header and references the
        entities listed in include files

        For every field present in the header, the ids read from the include
        files of all its include entries are added to the list property
        `field.model` of `record`.
        """

        for field in fields:

            if field.key not in self.header:
                continue
            included = []

            for item in self.header[field.key]:
                # Plain string entries are globs, never includes; testing
                # them with `in` would be a substring search.
                if _is_include_entry(item):
                    included.extend(
                        get_entity_ids_from_include_file(
                            os.path.dirname(self.crawled_path),
                            item[INCLUDE.key]))

            assure_object_is_in_list(included,
                                     record,
                                     field.model,
                                     to_be_updated,
                                     datatype=db.LIST(db.REFERENCE))
In case you are happy with our suggestion of a standard crawler, feel -free to use the standard crawler. The standard crawler lives in this git -repository maintained by Henrik tom Wörden: -https://gitlab.com/henrik_indiscale/scifolder +free to use the standard crawler. The standard crawler lives in the submodule +`caosadvancedtools.scifolder` Usage ===== diff --git a/unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md b/unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md new file mode 100644 index 0000000000000000000000000000000000000000..71454e8909393b432ca74fa01e77b33d8b0644d5 --- /dev/null +++ b/unittests/data/DataAnalysis/2010_TestProject/2019-02-03_something/README.md @@ -0,0 +1,15 @@ +--- +responsible: +- Only Responsible +description: A description of another example analysis. + +sources: +- file: "/ExperimentalData/2010_TestProject/2019-02-03/*.dat" + description: an example reference to a results file + +scripts: +- file: plot.py + description: a plotting script +results: +- file: results.pdf +... diff --git a/unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md b/unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b7e5051c7bdbcdafb1bbd3a870b00feecfb109ff --- /dev/null +++ b/unittests/data/ExperimentalData/2010_TestProject/2019-02-03_something/README.md @@ -0,0 +1,9 @@ +--- +responsible: +- Only Responsible +description: A description of another example experiment. + +results: +- file: "/ExperimentalData/2010_TestProject/2019-02-03/*.dat" + description: an example reference to a results file +... 
diff --git a/unittests/data/Publications/Posters/2019-02-03_something/README.md b/unittests/data/Publications/Posters/2019-02-03_something/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c95e37ecc569103d8c3a812e45f1a5110781ea26 --- /dev/null +++ b/unittests/data/Publications/Posters/2019-02-03_something/README.md @@ -0,0 +1,11 @@ +--- +responsible: +- Only Responsible +description: A description of another example experiment. + +sources: +- /DataAnalysis/2010_TestProject/2019-02-03/results.pdf + +results: +- "*.pdf" +... diff --git a/unittests/data/README.md b/unittests/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a2e0ce6e319219c10bc61653510ad53dd2ab958a --- /dev/null +++ b/unittests/data/README.md @@ -0,0 +1,14 @@ +--- +responsible: Ana Lytic +description: An examplary analysis of very exciting research. The analysis was conducted following state of the art best practices of scientific methodology. +sources: + - /ExperimentalData/2010_TestProject/2019-02-03_something/ + - file: /ExperimentalData/2010_TestProject/2019-02-03_something/ + description: An example reference to an experiment. The experimental data was analysed with statistical methods using proper error calculations. +scripts: + - file: scripts + description: all the files needed to run the analysis +results: + - file: results.pdf + description: a plot of the statistical analysis +... 
diff --git a/unittests/data/README.xlsx b/unittests/data/README.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..a909347789edc1d5a1bbaacd998744cee83d5f6b Binary files /dev/null and b/unittests/data/README.xlsx differ diff --git a/unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md b/unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fba1bd48a89514cbff92f9d8bd518484ecaa624b --- /dev/null +++ b/unittests/data/SimulationData/2010_TestProject/2019-02-03_something/README.md @@ -0,0 +1,12 @@ +--- +responsible: +- Only Responsible +description: A description of another example experiment. + +results: +- file: "*.dat" + description: an example reference to a results file + +scripts: +- sim.py +... diff --git a/unittests/test_cfoods.py b/unittests/test_cfoods.py new file mode 100644 index 0000000000000000000000000000000000000000..87e6d6d2da0254e134def92c098b1568c26863ab --- /dev/null +++ b/unittests/test_cfoods.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright (C) 2019 Henrik tom Wörden +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data")


class CFoodTest(unittest.TestCase):
    """Checks that each scifolder cfood matches and accepts its example README."""

    def _check_cfood(self, cfood_class, relative_readme):
        """Run the checks shared by all cfood tests.

        The cfood must reject a string that is no path, match the README
        below the test data folder and be constructible from that path.
        """
        self.assertFalse(cfood_class.match_item("nopath"))
        readme = data_path + relative_readme
        self.assertTrue(cfood_class.match_item(readme))
        cfood_class(readme)

    def test_analysis(self):
        self._check_cfood(
            AnalysisCFood,
            "/DataAnalysis/2010_TestProject/2019-02-03_something/README.md")

    def test_experiment(self):
        self._check_cfood(
            ExperimentCFood,
            "/ExperimentalData/2010_TestProject/2019-02-03_something/README.md")

    def test_publication(self):
        self._check_cfood(
            PublicationCFood,
            "/Publications/Posters/2019-02-03_something/README.md")

    def test_simulation(self):
        self._check_cfood(
            SimulationCFood,
            "/SimulationData/2010_TestProject/2019-02-03_something/README.md")
class XLSTest(unittest.TestCase):
    """Reads the example README.xlsx and checks the resulting header."""

    def _check_file_entries(self, entries, file_is_expected, description_part):
        """Check one file field of the header.

        The field must be a list of dicts whose 'file' value satisfies
        `file_is_expected` and whose 'description' contains
        `description_part`.
        """
        self.assertIsInstance(entries, list)

        for el in entries:
            self.assertIsInstance(el, dict)
            self.assertTrue(file_is_expected(el["file"]))
            self.assertIn(description_part, el["description"])

    def test_read(self):
        filename = os.path.join(os.path.dirname(__file__),
                                "data/README.xlsx")
        self.assertTrue(os.path.exists(filename))

        header = get_xls_header(filename)
        self.assertIsNotNone(header)
        self.assertIsInstance(header, dict)

        # responsible
        self.assertEqual(header['responsible'], ["Ana Lytic"])

        # description
        description = header['description']
        self.assertEqual(len(description), 1)
        self.assertIsInstance(description[0], str)
        self.assertGreater(len(description[0]), 20)
        self.assertIn("exciting", description[0])

        # sources
        self._check_file_entries(
            header['sources'],
            lambda name: "TestProject" in name,
            "example")

        # scripts
        self._check_file_entries(
            header['scripts'],
            lambda name: "scripts" == name,
            "all the files")

        # results
        # NOTE(review): the README.md fixture names "results.pdf"; confirm
        # that the xlsx fixture really contains "result.pdf".
        self._check_file_entries(
            header['results'],
            lambda name: "result.pdf" == name,
            "plot")