diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f295e0b9480d56359ec1f1d30bc3be5bd54aea57..32e69420872cbadf0f91452eaf15463e7af97210 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -315,7 +315,7 @@ pylint: optional: true allow_failure: true script: - - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler + - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler unittests # Build the sphinx documentation and make it ready for deployment by Gitlab Pages # Special job for serving a static website. See https://docs.gitlab.com/ee/ci/yaml/README.html#pages diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c616e2af9790490f404845d488e5b2715434835..45f7afce0801dbee32adc632541c5560a352d583 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### +- SPSS tools can now be configured to give custom names to labels. + ### Changed ### ### Deprecated ### diff --git a/Makefile b/Makefile index 7167ebfdf106f5129ce7941706b9e871d51e551f..d3cd8a84f10e1583eb71aa0b9f0c3127ef4934d8 100644 --- a/Makefile +++ b/Makefile @@ -36,11 +36,11 @@ check: style lint .PHONY: check style: - pycodestyle --count src unittests + pycodestyle --count src/caoscrawler unittests .PHONY: style lint: - pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler + pylint --unsafe-load-any-extension=y -d all -e E,F src/caoscrawler unittests .PHONY: lint unittest: diff --git a/setup.cfg b/setup.cfg index e246ef6a45950046ddb4b3522f23efad649b5d2d..81588ee4da4b5cdb01b43231080e951e58672695 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,5 +50,6 @@ h5-crawler = numpy spss = pandas[spss] + toml rocrate = rocrate diff --git a/src/caoscrawler/converters/converters.py b/src/caoscrawler/converters/converters.py index 8adacb70444936c8e000d2334de7e7b3675e4746..61ab4545c7c9ad3e34882f3030ca63c9b20a2d6b 100644 --- a/src/caoscrawler/converters/converters.py +++ b/src/caoscrawler/converters/converters.py @@ -27,7 +27,6 @@ from __future__ import annotations import datetime import json import logging -from datetime import date import os import re import warnings @@ -36,6 +35,7 @@ from inspect import signature from string import Template from typing import Any, Callable, Optional, Union from caosadvancedtools.table_importer import XLSImporter + import linkahead as db import pandas as pd import yaml @@ -44,7 +44,7 @@ from jsonschema import ValidationError, validate from ..stores import GeneralStore, RecordStore from ..structure_elements import (BooleanElement, DictElement, Directory, File, - FloatElement, IntegerElement, JSONFile, + FloatElement, IntegerElement, ListElement, NoneElement, StructureElement, TextElement) from ..utils import has_parent @@ -99,8 +99,8 @@ def str_to_bool(x): # However, we should not narrow down the type of the arguments compared to the function definitions # in the parent Converter class.
See # - https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides -# - https://stackoverflow.com/questions/56860/what-is-an-example-of-the-liskov-substitution-principle -# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-the-ultimate-python-guide-8fabc0c24278 +# - https://stackoverflow.com/questions/56860/example-of-the-liskov-substitution-principle +# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-8fabc0c24278 # Thus, the problem lies in the following design: # Converter instances are supposed to be used by the Crawler in a generic way (The crawler calls # `match` and `typecheck` etc) but the functions are not supposed to be called with generic @@ -264,10 +264,12 @@ def create_records(values: GeneralStore, Parameters ---------- values: GeneralStore - This GeneralStore will be used to access variables that are needed during variable substitution - in setting the properties of records and files. - Furthermore, the records that are generated in this function will be stored in this GeneralStore - **additionally to** storing them in the RecordStore given as the second argument to this function. + This GeneralStore will be used to access variables that are needed during variable + substitution in setting the properties of records and files. + + Furthermore, the records that are generated in this function will be stored in this + GeneralStore **additionally to** storing them in the RecordStore given as the second argument + to this function. records: RecordStore The RecordStore where the generated records will be stored. @@ -275,8 +277,9 @@ def create_records(values: GeneralStore, Returns ------- : list[tuple[str, str]] - A list of tuples containing the record names (1st element of tuple) and respective property names - as 2nd element of the tuples. This list will be used by the scanner for creating the debug tree. + A list of tuples containing the record names (1st element of tuple) and respective property + names as 2nd element of the tuples. This list will be used by the scanner for creating the + debug tree. """ keys_modified = [] @@ -548,8 +551,8 @@ class Converter(object, metaclass=ABCMeta): # This matcher works analogously to the attributes matcher in the XMLConverter for prop_def_key, prop_def_value in self.definition[label].items(): match_counter = 0 - matched_m_prop = None - matched_m_prop_value = None + matched_m_prop: Optional[re.Match] = None + matched_m_prop_value: Optional[re.Match] = None for prop_key, prop_value in properties.items(): # print("{} = {}".format(prop_key, prop_value)) # TODO: automatic conversion to str ok? @@ -564,15 +567,19 @@ class Converter(object, metaclass=ABCMeta): matched_m_prop_value = m_prop_value # TODO: How to deal with multiple matches? # There are multiple options: - # - Allow multiple attribute-key matches: Leads to possible overwrites of variables + # - Allow multiple attribute-key matches: Leads to possible overwrites of + # variables # - Require unique attribute-key and attribute-value matches: Very complex - # - Only allow one single attribute-key to match and run attribute-value match separately. + # - Only allow one single attribute-key to match and run attribute-value match + # separately. # Currently the latter option is implemented. # TODO: The ROCrateEntityConverter implements a very similar behavior. 
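# Illustration (hypothetical cfood snippet, not taken from this repository): with
#   match_properties:
#     "(?P<key>prop_.*)": "(?P<val>.*)"
# and element properties {"prop_a": "x", "prop_b": "y"}, both property keys match
# the single definition entry, so match_counter reaches 2 and the RuntimeError
# below is raised; exactly one attribute-key match is allowed.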
if match_counter == 0: return False elif match_counter > 1: raise RuntimeError("Multiple properties match the same {} entry.".format(label)) + assert matched_m_prop is not None + assert matched_m_prop_value is not None vardict.update(matched_m_prop.groupdict()) vardict.update(matched_m_prop_value.groupdict()) return True @@ -629,7 +636,7 @@ class Converter(object, metaclass=ABCMeta): tr_func = transformer_functions[tr_func_key] # Call the function: sig = signature(tr_func) - if len(sig.parameters) == 1 and len(tr_func_params) == 0: + if len(sig.parameters) == 1 and not tr_func_params: # FIXME Test for this line. out_value = tr_func(in_value) else: out_value = tr_func(in_value, tr_func_params) @@ -1069,7 +1076,7 @@ class PropertiesFromDictConverter(DictElementConverter): """ def __init__(self, definition: dict, name: str, converter_registry: dict, - referenced_record_callback: Optional[callable] = None): + referenced_record_callback: Optional[Callable] = None): super().__init__(definition, name, converter_registry) self.referenced_record_callback = referenced_record_callback @@ -1077,15 +1084,16 @@ class PropertiesFromDictConverter(DictElementConverter): def _recursively_create_records(self, subdict: dict, root_record: db.Record, root_rec_name: str, values: GeneralStore, records: RecordStore, - referenced_record_callback: callable, + referenced_record_callback: Optional[Callable], keys_modified: list = [] ): """Create a record from the given `subdict` and recursively create referenced records.""" blacklisted_keys = self.definition["record_from_dict"][ - "properties_blacklist"] if "properties_blacklist" in self.definition["record_from_dict"] else [] - special_references = self.definition["record_from_dict"]["references"] if "references" in self.definition["record_from_dict"] else [ - ] + "properties_blacklist"] if "properties_blacklist" in self.definition[ + "record_from_dict"] else [] + special_references = self.definition["record_from_dict"][ + "references"] if "references" in self.definition["record_from_dict"] else [] for key, value in subdict.items(): @@ -1155,7 +1163,7 @@ class PropertiesFromDictConverter(DictElementConverter): records: RecordStore, values: GeneralStore, keys_modified: list, - referenced_record_callback: callable + referenced_record_callback: Optional[Callable], ): """Create the referenced Record and forward the stores etc. to ``_recursively_create_records``. @@ -1176,7 +1184,7 @@ class PropertiesFromDictConverter(DictElementConverter): ValueStore for entering new Records keys_modified : list List for keeping track of changes - referenced_record_callback : callable + referenced_record_callback : Callable Advanced treatment of referenced records as given in the converter initialization. """ diff --git a/src/caoscrawler/converters/rocrate.py b/src/caoscrawler/converters/rocrate.py index 7dcad86589961f03f1e755ddbc0b60742cf4ed4a..728cef34d991791ae0aeb45ec0016f8216947ae4 100644 --- a/src/caoscrawler/converters/rocrate.py +++ b/src/caoscrawler/converters/rocrate.py @@ -224,6 +224,7 @@ class ROCrateEntityConverter(Converter): children.append( ROCrateEntity(element.folder, element.entity.crate.dereference( p["@id"]))) - # TODO: See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/195 for discussion. + # TODO: See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/195 for + # discussion.
return children diff --git a/src/caoscrawler/converters/spss.py b/src/caoscrawler/converters/spss.py index 00742e91506245435ed0c590f68ea9ffce65717a..aa542aa0e19066fd1f6784648b515a8f2a8cd93b 100644 --- a/src/caoscrawler/converters/spss.py +++ b/src/caoscrawler/converters/spss.py @@ -16,17 +16,40 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Converter for SAV files (stored by SPSS).""" +"""Converter for SAV files (stored by SPSS). + +Configuration +------------- + +SPSS interpretation can be configured via a file. This TOML file has the following sections: + +[labels] +======== + +Each entry is a list of strings with the possible values for the label. The entry's key shall be +used as RecordType name. + +Example +======= + +:: + + [labels] + + Hometown = ["Moscow", "Rome", "Paris", "Lisbon"] + +""" from __future__ import annotations # Can be removed with 3.10. import argparse from collections import OrderedDict from typing import Any, Optional, Union import numpy as np import pandas as pd import pyreadstat +import toml import yaml from ..stores import GeneralStore @@ -40,6 +63,7 @@ READSTAT_TYPES = { ORIGINAL_TYPES = { "EDATE8": "DATETIME", } +STRICT_LABEL_CHOICES = ["NONE", "WARN", "FULL"] class SPSSConverter(converters.TableConverter): @@ -93,17 +117,31 @@ out : dict[str, str] return dtypes -def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None) -> None: +def spss_to_yaml(savfile: str, yamlfile: str, cfood: Optional[str] = None, + conf: Optional[Union[str, dict]] = None, strict: str = "NONE") -> None: """Parse the *.sav and create basic datamodel in ``yamlfile``. Parameters ---------- -cfood: str - If given, also create a cfood skeleton. +cfood: str, optional + If given, also create a cfood skeleton. + +conf: Union[str, dict], optional + If given, a dict with more options, or the location of a corresponding config.toml file. + +strict: str, default="NONE" + If NONE, nothing special happens if the configuration is not sufficient. If WARN, the function + emits a warning. If FULL, all labels must be given in the configuration. Missing labels will + lead to a help message and an exception will be raised.
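Examples
--------

A minimal usage sketch (file names and label values are illustrative only)::

    conf = {"labels": {"gender": ["Male", "Female"]}}
    spss_to_yaml("data.sav", "datamodel.yaml", cfood="cfood.yaml",
                 conf=conf, strict="WARN")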
""" _, meta = pyreadstat.read_sav(savfile, metadataonly=True) dtypes = read_column_types(meta=meta) + if conf is None: + conf = {} + elif isinstance(conf, str): + conf = toml.load(conf) + cfood_str = """ --- metadata: @@ -178,6 +216,8 @@ directory: # corresponds to the directory given to the crawler enums: dict[str, list[str]] = {} properties = OrderedDict() + missing = {} + ambiguous = {} for name in meta.column_names: prop = { "datatype": dtypes[name], @@ -187,6 +227,29 @@ directory: # corresponds to the directory given to the crawler prop["description"] = desc # Handle categorial variables if var_label := meta.variable_to_label.get(name): + guessed_label = _handle_label(var_label, meta, conf) + if guessed_label["status"] == "OK": + var_label = guessed_label["label"] + elif guessed_label["status"] == "AMBIGUOUS": + ambiguous[var_label] = meta.value_labels[var_label].values() + sorted_guessed = sorted(guessed_label["candidates"].items(), + key=lambda x: x[1], reverse=True) + if not strict == "NONE": + converters.logger.warning(f"Ambiguous label heuristic for label {var_label}\n" + f"with values {list(ambiguous[var_label])}:\n" + f"{guessed_label['candidates']}\n") + if sorted_guessed[0][1] > sorted_guessed[1][1]: + var_label = sorted_guessed[0][0] + else: + raise ValueError("Hard ambiguity: label {var_label}\n" + f"with values {list(ambiguous[var_label])}\ncannot decide\n" + f"between {sorted_guessed[0][0]} and {sorted_guessed[1][0]}") + elif guessed_label["status"] == "NOT_FOUND": + missing[var_label] = meta.value_labels[var_label].values() + if not strict == "NONE": + converters.logger.warning( + "Label not found in configuration:\n" + f"{var_label} with {list(missing[var_label])}") vvl = meta.variable_value_labels[name] # reproducible (and sensible) order label_values = [vvl[key] for key in sorted(vvl.keys())] @@ -197,6 +260,19 @@ directory: # corresponds to the directory given to the crawler prop["datatype"] = var_label properties[name] = prop + if strict == "FULL" and (missing or ambiguous): + error_msg = ["\n\nStrict label handling failed:"] + if missing: + error_msg.append("\nMISSING\n=======\n") + for var_label, values in missing.items(): + error_msg.append(f"\n{var_label}\n{'-' * len(var_label)}\n") + error_msg.append(str(list(values)) + "\n") + if ambiguous: + error_msg.append("\nAMBIGUOUS\n=========\n") + for var_label, values in ambiguous.items(): + error_msg.append(f"\n{var_label}\n{'-' * len(var_label)}\n") + error_msg.append(str(list(values)) + "\n") + raise ValueError("".join(error_msg)) output = f"""# auto-generated data model from file "{savfile}". # To insert a datamodel into LinkAhead, run: # @@ -215,6 +291,10 @@ directory: # corresponds to the directory given to the crawler output += f"""{name}: description: # possible values: {values}\n""" + enum_lines = [] + for value in values: + enum_lines.append(f" - \"{value}\"") + output += " enums:\n" + "\n".join(enum_lines) + "\n\n" output += (""" ############### @@ -222,7 +302,7 @@ directory: # corresponds to the directory given to the crawler ############### DummyRT: - description: Note: Change name and enter description. + description: "Note: Change name and enter description." 
recommended_properties: """ + " ".join(yaml.dump(dict(properties), # from OrderedDict to dict @@ -230,20 +310,20 @@ DummyRT: sort_keys=False).splitlines(keepends=True))) # Experimental: Enum creation - output += """ -############### -# Enum values # -############### -""" - for name, values in enums.items(): - output += f"\n# ### {name} ###\n" - for value in values: - output += f""" -{value}: - role: Record - inherit_from_suggested: - - {name} -""" +# output += """ +# ############### +# # Enum values # +# ############### +# """ +# for name, values in enums.items(): +# output += f"\n# ### {name} ###\n" +# for value in values: +# output += f""" +# {value}: +# role: Record +# inherit_from_suggested: +# - {name} +# """ with open(yamlfile, encoding="utf-8", mode="w") as myfile: myfile.write(output) @@ -283,12 +363,72 @@ DummyRT: myfile.write(cfood_str) +def _handle_label(var_label: str, meta, conf: dict) -> dict[str, Any]: + """Guess how to handle a label. + + Parameters + ---------- + var_label : str + The (native) variable label. + + meta: pyreadstat.metadata_container + PyReadstat's metadata object. + + conf : dict + The configuration + + Returns + ------- + out : dict + A dict with one or more of these fields (``status`` always exists): + - ``status``: May be one of ``OK``, ``NOT_FOUND``, ``AMBIGUOUS``. + - ``label``: A suggested name for the label. + - ``values``: The union of the values in the metadata and the configuration. + - ``candidates``: If ambiguous, a dict with label names and their ratings. + """ + result: dict[str, Any] = {} + value_labels = meta.value_labels[var_label] + values = value_labels.values() + matching: dict[str, float] = {} + label_name = "" + for target_label, target_values in conf.get("labels", {}).items(): + # Stronger punishment for entries that can not be found. + tversky_index = len(set(values).intersection(target_values)) / ( + len(set(values).intersection(target_values)) + + 0.2 * len(set(target_values) - set(values)) + + 0.8 * len(set(values) - set(target_values)) + ) + if tversky_index > 0: + matching[target_label] = tversky_index + label_name = target_label + if len(matching) == 0: + result["status"] = "NOT_FOUND" + elif len(matching) > 1: + result["status"] = "AMBIGUOUS" + result["candidates"] = matching + else: + result["status"] = "OK" + result["label"] = label_name + result["values"] = set(values).union(conf["labels"][label_name]) + return result + + def _parse_arguments(): """Parse the arguments.""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('-i', '--input', help="The *.sav file.", required=True) parser.add_argument('-o', '--outfile', help="Yaml filename to save the result", required=True) parser.add_argument('--cfood', help="Yaml filename to create cfood output in", required=False) + parser.add_argument('-c', '--conf', help="Config filename for more options", required=False) + parser.add_argument('--strict-labels', + help="""The strictness for label matching has several grades: + - NONE: (the default), labels do not need to be configured at all. + - WARN: If no label can be found, a warning will be emitted. The same holds for ambiguities, + but the best match will be used. + - FULL: Every label must be defined in the config, without ambiguities (overlaps). Otherwise the + program will terminate with an error. 
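Example (hypothetical file names): -i data.sav -o model.yaml -c conf.toml --strict-labels WARN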
+ """, + choices=STRICT_LABEL_CHOICES, default=STRICT_LABEL_CHOICES[0]) return parser.parse_args() @@ -296,7 +436,8 @@ def _parse_arguments(): def spss_to_datamodel_main(): """The main function of this script.""" args = _parse_arguments() - spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood) + spss_to_yaml(savfile=args.input, yamlfile=args.outfile, cfood=args.cfood, conf=args.conf, + strict=args.strict_labels) print(f"Written datamodel to: {args.outfile}") if args.cfood: print(f"Written cfood to: {args.cfood}") diff --git a/src/caoscrawler/converters/xml_converter.py b/src/caoscrawler/converters/xml_converter.py index 60d7b49431fb011a06b7105a16471b0b3c7b2268..cc4e8e50ff7328e1247ab78ca147c9607be66f6f 100644 --- a/src/caoscrawler/converters/xml_converter.py +++ b/src/caoscrawler/converters/xml_converter.py @@ -84,14 +84,14 @@ class XMLTagConverter(Converter): - When text_as_children is set to true, text nodes will be generated that contain the text contained in the matched tags. - - When attribs_as_children is set to true, attribute nodes will be generated from the attributes - of the matched tags. + - When attribs_as_children is set to true, attribute nodes will be generated from the + attributes of the matched tags. Notes ----- The default is to take the namespace map from the current node and use it in xpath queries. - Because default namespaces cannot be handled by xpath, it is possible to remap the default namespace - using the key ``default_namespace``. + Because default namespaces cannot be handled by xpath, it is possible to remap the default + namespace using the key ``default_namespace``. The key ``nsmap`` can be used to define additional nsmap entries. """ diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py index 04d0eaf3d32a2ed8cfafdfc54c9f9acd6fe6ca1a..a0e2620d76230099456ac498a4cca0e32c33a1ba 100644 --- a/src/caoscrawler/crawl.py +++ b/src/caoscrawler/crawl.py @@ -195,8 +195,8 @@ class Crawler(object): Deprecated arguments: - The debug argument does not have an effect anymore. - - generalStore: This argument does not have an effect anymore. It might be added to the scanning - functions in the scanner module in the future, if needed. + - generalStore: This argument does not have an effect anymore. It might be added to the + scanning functions in the scanner module in the future, if needed. Parameters ---------- @@ -524,8 +524,9 @@ one with the entities that need to be updated and the other with entities to be If an Entity is identified, then the string value is replaced by the ID. 
""" if get_list_datatype(prop.datatype) is None: # not a list + # datatype is a non-generic reference and value is a string if (isinstance(prop.value, str) and is_reference(prop.datatype) and - prop.datatype != db.FILE and prop.datatype != db.REFERENCE): # datatype is a non-generic reference and value is a string + prop.datatype != db.FILE and prop.datatype != db.REFERENCE): try: # the get_entity function will raise an error if not unique prop.value = Crawler._get_property_id_for_datatype( @@ -910,7 +911,8 @@ the CaosDB Crawler successfully crawled the data and domain = get_config_setting("public_host_url") if get_config_setting("create_crawler_status_records"): text += ("You can checkout the CrawlerRun Record for more information:\n" - f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n") + f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27" + "\n\n") text += (f"You can download the logfile here:\n{get_shared_resource_link(domain, logfile)}") send_mail( from_addr=get_config_setting("sendmail_from_address"), @@ -1144,9 +1146,12 @@ def crawler_main(crawled_directory_path: Union[str, list[str]], if "SHARED_DIR" in os.environ: # pylint: disable=E0601 domain = get_config_setting("public_host_url") - logger.error("Unexpected Error: Please tell your administrator about this and provide " - f"the following path.\n{get_shared_resource_link(domain, debuglog_public)}") + logger.error("Unexpected Error: Please tell your administrator about this and " + "provide the following path.\n" + f"{get_shared_resource_link(domain, debuglog_public)}") _update_status_record(crawler.run_id, 0, 0, status="FAILED") + if debug: + raise return 1 @@ -1168,7 +1173,7 @@ def parse_args(): help="Path name of the provenance yaml file. " "This file will only be generated if this option is set.") parser.add_argument("--debug", required=False, action="store_true", - help="Path name of the cfood yaml file to be used.") + help="More debugging.") # TODO allow to provide multiple directories to be crawled on the commandline parser.add_argument("crawled_directory_path", help="The subtree of files below the given path will " @@ -1240,7 +1245,7 @@ def main(): if args.restrict: restricted_path = args.restrict - sys.exit(crawler_main( + return_code = crawler_main( crawled_directory_path=args.crawled_directory_path, cfood_file_name=args.cfood_file_name, identifiables_definition_file=args.load_identifiables, @@ -1254,7 +1259,8 @@ def main(): restricted_path=restricted_path, remove_prefix=args.remove_prefix, add_prefix=args.add_prefix, - )) + ) + sys.exit(return_code) if __name__ == "__main__": diff --git a/src/caoscrawler/identifiable.py b/src/caoscrawler/identifiable.py index 80f83f9c4ea252155392638d0bb20acb5eff1edf..67947a31e04c43625da43c1b2bce51d59ac3dfb2 100644 --- a/src/caoscrawler/identifiable.py +++ b/src/caoscrawler/identifiable.py @@ -84,8 +84,8 @@ class Identifiable(): def _value_representation(value) -> str: """returns the string representation of property values to be used in the hash function - The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an ID) - and the string representation of None, bool, float, int, datetime and str. + The string is the LinkAhead ID in case of SyncNode objects (SyncNode objects must have an + ID) and the string representation of None, bool, float, int, datetime and str. 
""" if value is None: diff --git a/src/caoscrawler/identifiable_adapters.py b/src/caoscrawler/identifiable_adapters.py index 444b73f5d9a42cf8ec23eec7cb718b1fc183dd30..4dc8ccf248c673ee756506a5202f17944ea2065a 100644 --- a/src/caoscrawler/identifiable_adapters.py +++ b/src/caoscrawler/identifiable_adapters.py @@ -49,6 +49,7 @@ def _retrieve_RecordType(id=None, name=None): """ Retrieve the RecordType from LinkAhead. For mocking purposes. """ + # FIXME cached retrieval return db.RecordType(name=name, id=id).retrieve() @@ -223,8 +224,8 @@ class IdentifiableAdapter(metaclass=ABCMeta): def __create_pov_snippet(pname: str, pvalue, startswith: bool = False): """Return something like ``'name'='some value'`` or ``'name' LIKE 'some*'``. - If ``startswith`` is True, the value of strings will be cut off at 200 characters and a ``LIKE`` - operator will be used to find entities matching at the beginning. + If ``startswith`` is True, the value of strings will be cut off at 200 characters and a + ``LIKE`` operator will be used to find entities matching at the beginning. """ if startswith and isinstance(pvalue, str) and len(pvalue) > 200: operator_value_str = f" LIKE '{escape_squoted_text(pvalue[:200])}*'" @@ -244,7 +245,8 @@ class IdentifiableAdapter(metaclass=ABCMeta): The Identifiable whose properties shall be used. startswith: bool, optional - If True, check string typed properties against the first 200 characters only. Default is False. + If True, check string typed properties against the first 200 characters only. Default is + False. """ query_string = "" pov = IdentifiableAdapter.__create_pov_snippet # Shortcut diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py index 4cd435486aca26e20e785bbbeb65c013d8e727cb..b02bed380826c51f42fd5321598b215cb8a920c0 100644 --- a/src/caoscrawler/version.py +++ b/src/caoscrawler/version.py @@ -66,7 +66,8 @@ Crawler version installed on your system: {installed_version} elif cfood_version < installed_version: # only warn if major or minor of installed version are newer than # specified in cfood - if (cfood_version.major < installed_version.major) or (cfood_version.minor < installed_version.minor): + if (cfood_version.major < installed_version.major) or ( + cfood_version.minor < installed_version.minor): msg = f""" The cfood was written for a previous crawler version. 
Running the crawler in a newer version than specified in the cfood definition may lead to unwanted or diff --git a/unittests/test_converters.py b/unittests/test_converters.py index e4b442d91060c7ba98cb1a910156b1800f050be3..50538e2484ac5b24e1f58b5b23f311eba6f84123 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -50,7 +50,6 @@ from caoscrawler.converters import (Converter, ConverterValidationError, replace_variables) from caoscrawler.converters.converters import \ _AbstractScalarValueElementConverter -from caoscrawler.crawl import Crawler from caoscrawler.scanner import (_load_definition_from_yaml_dict, create_converter_registry, create_transformer_registry, load_definition, @@ -357,9 +356,9 @@ def test_variable_replacement(): values["cm"] = "cm" # basic values stay unchanged - assert replace_variables(5, values) is 5 + assert replace_variables(5, values) == 5 assert replace_variables(True, values) is True - assert replace_variables("$a", values) is 4 + assert replace_variables("$a", values) == 4 assert replace_variables("${b}", values) == "68" # values given as simple strings never have units @@ -376,13 +375,15 @@ def test_variable_replacement(): assert handle_value({"value": "b", "collection_mode": "list"}, values) == ("b", None, "list") assert handle_value({"value": "b", - "collection_mode": "multiproperty"}, values) == ("b", None, "multiproperty") + "collection_mode": "multiproperty"}, values) == ( + "b", None, "multiproperty") assert handle_value({"value": "$b", "collection_mode": "single"}, values) == ("68", None, "single") assert handle_value({"value": "$b", "collection_mode": "list"}, values) == ("68", None, "list") assert handle_value({"value": "$b", - "collection_mode": "multiproperty"}, values) == ("68", None, "multiproperty") + "collection_mode": "multiproperty"}, values) == ( + "68", None, "multiproperty") # Unit specified in the same way as value: assert handle_value({"value": 5, "unit": "m"}, values) == (5, "m", "single") @@ -409,9 +410,9 @@ def test_apply_transformers(converter_registry): conv = ListElementConverter(definition=cfood_def, name='test', converter_registry=converter_registry) - assert values['a'] is "a|b|c" + assert values['a'] == "a|b|c" conv.apply_transformers(values, transformer_functions) - assert values['a'] is "a|b|c" + assert values['a'] == "a|b|c" assert values['b'] == ["a", "b", "c"] # Check replacing of existing variable @@ -612,6 +613,7 @@ def test_match_debug(converter_registry, caplog): if m and mn: with pytest.raises(RuntimeError) as err: mtch = dc.match(IntegerElement(name="a", value=4)) + assert str(err.value) == "Do not supply both, 'match_name' and 'match'." 
continue else: mtch = dc.match(IntegerElement(name="a", value=4)) @@ -1066,7 +1068,8 @@ def test_dict_match_properties(converter_registry): records = scan_structure_elements(root_dict_element, def_dict, converter_registry) assert len(records) == 0 - with pytest.raises(RuntimeError, match="Multiple properties match the same match_properties entry."): + with pytest.raises(RuntimeError, + match="Multiple properties match the same match_properties entry."): root_dict_element = DictElement("RootDict", { "prop_a": "value", "prop_b": "25", @@ -1112,13 +1115,14 @@ def test_directory_converter_change_date(caplog, converter_registry): # Match but warn with open(tmpfi.name, "w") as fi: - fi.write(f"This is garbage.\n") + fi.write("This is garbage.\n") with pytest.raises(ValueError): dc.match(test_dir_element) assert len(caplog.record_tuples) == 1 assert caplog.record_tuples[0][1] == logging.ERROR assert tmpfi.name in caplog.record_tuples[0][2] - assert "doesn't contain a ISO formatted datetime in its first line" in caplog.record_tuples[0][2] + assert ("doesn't contain a ISO formatted datetime in its first line" + in caplog.record_tuples[0][2]) # Match anything since file doesn't exist, inform in debug log. os.remove(tmpfi.name) diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py index ee88857adc8ca44a29712ed61c3241b0b497b9d5..a77b7a6bdba2907ff5e5c19f23ee6d40369cbfbb 100644 --- a/unittests/test_crawler.py +++ b/unittests/test_crawler.py @@ -43,12 +43,14 @@ from linkahead.exceptions import EmptyUniqueQueryError from pytest import raises import caoscrawler +from caoscrawler.converters import ConverterValidationError from caoscrawler.crawl import (Crawler, SecurityMode, _treat_deprecated_prefix, crawler_main, split_restricted_path) from caoscrawler.debug_tree import DebugTree from caoscrawler.exceptions import (ImpossibleMergeError, MissingIdentifyingProperty, - MissingReferencingEntityError) + MissingReferencingEntityError, + ) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import (CaosDBIdentifiableAdapter, IdentifiableAdapter, @@ -738,13 +740,15 @@ def test_validation_error_print(caplog): # occurs during the data collection stage DATADIR = os.path.join(os.path.dirname(__file__), "test_data", "failing_validation") for fi in ["cfood.yml", "cfood2.yml"]: - ret = crawler_main(DATADIR, - os.path.join(DATADIR, fi), - os.path.join(DATADIR, "identifiables.yml"), - True, - None, - False) - assert "Couldn't validate" in caplog.text + with raises(ConverterValidationError) as err: + crawler_main(DATADIR, + os.path.join(DATADIR, fi), + os.path.join(DATADIR, "identifiables.yml"), + debug=True, + provenance_file=None, + dry_run=False) + assert "Couldn't validate" in err.value.message + assert err.value.message.endswith("5 is not of type 'string'") caplog.clear() @@ -908,14 +912,14 @@ def test_replace_name_with_referenced_entity(): assert prop.value is test_int # no LinkAhead access until here - assert caoscrawler.crawl.cached_get_entity_by.call_count == 0 + assert caoscrawler.crawl.cached_get_entity_by.call_count == 0 # pylint: disable=no-member # change Properties with custom dt and str value prop = db.Property(name='a', datatype="RT", value=test_name) Crawler.replace_name_with_referenced_entity_id(prop) assert isinstance(prop.value, int) assert prop.value == test_id - assert caoscrawler.crawl.cached_get_entity_by.call_count == 1 + assert caoscrawler.crawl.cached_get_entity_by.call_count == 1 # pylint: disable=no-member # do not touch Properties with non-ref
datatype (LIST) prop = db.Property(name='a', datatype=db.LIST(db.TEXT), value=[test_text]) @@ -947,4 +951,4 @@ def test_replace_name_with_referenced_entity(): assert prop.value[1].name == "hi" assert isinstance(prop.value[2], int) assert prop.value[2] == test_id - assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 + assert caoscrawler.crawl.cached_get_entity_by.call_count == 3 # pylint: disable=no-member diff --git a/unittests/test_identifiable_adapters.py b/unittests/test_identifiable_adapters.py index 1c7733acfe952a2f47eff2853c2b90684c098dbf..656cade1177bf9604d8faa692435b5640b2c510a 100644 --- a/unittests/test_identifiable_adapters.py +++ b/unittests/test_identifiable_adapters.py @@ -23,6 +23,9 @@ # ** end header # +# For test_retrieve_identified_record_for_identifiable(): +# pylint: disable=undefined-variable + """ test identifiable_adapters module """ @@ -257,7 +260,9 @@ def test_retrieve_identified_record_for_identifiable(): # TODO modify this such that it becomes a test that actually tests (sufficiently) the # retrieve_identified_record_for_identifiable function idr_r0_test = ident.retrieve_identified_record_for_identifiable(id_r0) - idr_r0 = ident.retrieve_identified_record_for_record(r_cur) + idr_r0 = ident.retrieve_identified_record_for_record( + r_cur) # pylint: disable=used-before-assignment + assert idr_r0 == idr_r0_test # take the first measurement in the list of records: diff --git a/unittests/test_issues.py b/unittests/test_issues.py index 779f77711fe18df2433f03580e7e3e4f2035f0f4..01911300e46267276c21dd06e1c097e96ac869ce 100644 --- a/unittests/test_issues.py +++ b/unittests/test_issues.py @@ -208,8 +208,10 @@ def test_issue_112(converter_registry): assert matches["content"] == ' ' # Cfood definition for CSV example file - records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / "ExperimentalData", - UNITTESTDIR / "test_directories" / "examples_tables" / "crawler_for_issue_112.yml") + records = scan_directory(UNITTESTDIR / "test_directories" / "examples_tables" / + "ExperimentalData", + UNITTESTDIR / "test_directories" / "examples_tables" / + "crawler_for_issue_112.yml") assert records for rec in records: print(rec.name) diff --git a/unittests/test_rocrate_converter.py b/unittests/test_rocrate_converter.py index 4b6bde171c789017e95a38729ae93f49ecf3f97b..9312c9d3b576700b2df3f74f27c6324edc87d457 100644 --- a/unittests/test_rocrate_converter.py +++ b/unittests/test_rocrate_converter.py @@ -34,7 +34,7 @@ import yaml from caoscrawler import scanner from caoscrawler.converters import ELNFileConverter, ROCrateEntityConverter from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import (DictElement, File, ROCrateEntity, +from caoscrawler.structure_elements import (File, ROCrateEntity, TextElement) from rocrate.model.entity import Entity @@ -42,7 +42,7 @@ UNITTESTDIR = Path(__file__).parent @pytest.fixture -def converter_registry(): +def converter_registry() -> dict: converter_registry: dict[str, dict[str, str]] = { "ELNFile": { "converter": "ELNFileConverter", @@ -149,7 +149,7 @@ def test_file(eln_entities): - ds_csv = ROCrateEntityConverter(yaml.safe_load(""" + ds_csv = ROCrateEntityConverter(yaml.safe_load(r""" type: ROCrateEntity match_type: File match_properties: diff --git a/unittests/test_scalars_cfood.py b/unittests/test_scalars_cfood.py index 577fcd5f6c93bee2bc05451983d358aa2e07f798..f414a99f5dc58c2a1fbfd7086bf10196bde5564a 100644 --- a/unittests/test_scalars_cfood.py +++ 
b/unittests/test_scalars_cfood.py @@ -4,12 +4,10 @@ # A. Schlemmer, 06/2021 from pathlib import Path -import pytest from utils import dircheckstr # The main function that is affected by this issue: from caoscrawler.converters import handle_value -from caoscrawler.crawl import Crawler from caoscrawler.debug_tree import DebugTree from caoscrawler.scanner import scan_directory # We need the store for the above function @@ -56,7 +54,7 @@ def test_record_structure_generation(): assert "RecordThatGetsParentsLater" in subddata prop = subddata["RecordThatGetsParentsLater"].get_property("someId") - assert type(prop.value) == int + assert type(prop.value) is int assert prop.value == 23 # record store on Data Analysis node of debug tree diff --git a/unittests/test_spss_converter.py b/unittests/test_spss_converter.py index 59fe723849dadcda21a699416372f08f2756f4e1..ec55715a2566efa31f4dcfc3afb9872f2f2128fd 100644 --- a/unittests/test_spss_converter.py +++ b/unittests/test_spss_converter.py @@ -20,11 +20,14 @@ import datetime import importlib +import logging from pathlib import Path import numpy as np import pytest +import yaml +import caoscrawler.converters.spss as spss_conv from caoscrawler.converters import ConverterValidationError, SPSSConverter from caoscrawler.structure_elements import (BooleanElement, DictElement, Directory, File, FloatElement, @@ -77,3 +80,72 @@ def test_spss_converter(converter_registry): assert isinstance(my_dict["mytime"], datetime.time) or np.isnan(my_dict["mytime"]) assert isinstance(my_dict["mylabl"], thistype), f"{type(my_dict['mylabl'])}" assert isinstance(my_dict["myord"], thistype), f"{type(my_dict['myord'])}" + + +def test_spss_to_yaml(tmpdir, caplog): + """Test datamodel and identifiables creation from SAV files. + """ + caplog.set_level(logging.DEBUG, logger="caoscrawler.converters") + spss_dir = UNITTESTDIR / "test_tables" / "spss" + conf = { + "labels": { + "gender": ["Male", "Female"], + "gender2": ["Male", "Female", "Nonbinary", "Other"], + } + } + with pytest.raises(ValueError) as exc: + spss_conv.spss_to_yaml(spss_dir/"sample.sav", yamlfile=tmpdir/"datamodel.yaml", + cfood=tmpdir/"cfood.yaml", conf=conf, strict="FULL") + assert str(exc.value) == """ + +Strict label handling failed: +MISSING +======= + +labels1 +------- +['low', 'medium', 'high'] + +AMBIGUOUS +========= + +labels0 +------- +['Male', 'Female'] +""" + + caplog.clear() + spss_conv.spss_to_yaml(spss_dir/"sample.sav", yamlfile=tmpdir/"datamodel.yaml", + cfood=tmpdir/"cfood.yaml", conf=conf, strict="WARN") + + messages = caplog.get_records("call") + assert len(messages) == 2 + assert messages[0].message == ("Ambiguous label heuristic for label labels0\n" + "with values ['Male', 'Female']:\n" + "{'gender': 1.0, 'gender2': 0.8333333333333334}\n") + assert messages[1].message == ("Label not found in configuration:\n" + "labels1 with ['low', 'medium', 'high']") + caplog.clear() + + conf = { + "labels": { + "gender": ["Male", "Female", "Nonbinary", "Other"], + "strength": ["low", "high"], + } + } + spss_conv.spss_to_yaml(spss_dir/"sample.sav", yamlfile=tmpdir/"datamodel.yaml", + cfood=tmpdir/"cfood.yaml", conf=conf) + assert caplog.messages == [] + with open(tmpdir/"datamodel.yaml", encoding="utf-8") as dm_file: + dm_loaded = yaml.safe_load(dm_file) + assert dm_loaded["DummyRT"]["recommended_properties"]["mylabl"] == { + 'datatype': 'gender', 'description': 'labeled'} + assert dm_loaded["DummyRT"]["recommended_properties"]["myord"] == { + 'datatype': 'strength', 'description': 'ordinal'} + assert 
dm_loaded["strength"] == {"description": None, "enums": ["low", "medium", "high"]} + + with open(tmpdir/"cfood.yaml", encoding="utf-8") as dm_file: + cfoods_loaded = list(yaml.safe_load_all(dm_file)) + assert cfoods_loaded[1]["directory"]["subtree"]["thisfile"]["subtree"]["entry"][ + "subtree"]["mylabl"]["records"] == { + 'gender': {'name': '$val'}, 'BaseElement': {'mylabl': '$gender'}} diff --git a/unittests/test_sync_graph.py b/unittests/test_sync_graph.py index 030306b95578865cdbfe19bdef2998a573848bd5..901d86eef94f7fe4178f14fb3766496440e455ab 100644 --- a/unittests/test_sync_graph.py +++ b/unittests/test_sync_graph.py @@ -27,14 +27,16 @@ from unittest.mock import MagicMock, Mock, patch import linkahead as db import pytest from test_crawler import (basic_retrieve_by_name_mock_up, + crawler_mocked_identifiable_retrieve, mock_cached_only_rt_allow_empty, mock_get_entity_by) +from caoscrawler.crawl import (Crawler, ) from caoscrawler.exceptions import (MissingIdentifyingProperty, MissingRecordType) from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable_adapters import CaosDBIdentifiableAdapter from caoscrawler.sync_graph import SyncGraph, _set_each_scalar_value -from caoscrawler.sync_node import SyncNode, parent_in_list, property_in_list +from caoscrawler.sync_node import SyncNode @pytest.fixture @@ -629,11 +631,10 @@ def test_ignoring_irrelevant_references(simple_adapter): # now a nolonger relies on unchecked assert not st._identity_relies_on_unchecked_entity(st.nodes[0]) -# 'is implementation insufficient' - @pytest.mark.xfail() def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog): + # 'is implementation insufficient' crawler = crawler_mocked_identifiable_retrieve crawler.identifiableAdapter.get_registered_identifiable = Mock( side_effect=lambda x: db.Record().add_parent('C').add_property(name='C')) @@ -644,10 +645,10 @@ def test_detect_circular_dependency(crawler_mocked_identifiable_retrieve, caplog d = db.Record(name='c').add_parent("C") a.add_property(name="C", value=c) flat = [a, b, c] - circle = Crawler.detect_circular_dependency(flat) + circle = Crawler.detect_circular_dependency(flat) # pylint: disable=no-member assert [id(el) for el in circle] == [id(el) for el in [a, c, b, a]] - assert Crawler.detect_circular_dependency([d]) is None + assert Crawler.detect_circular_dependency([d]) is None # pylint: disable=no-member st = SyncGraph(flat, crawler.identifiableAdapter) with pytest.raises(RuntimeError): _, _ = crawler._split_into_inserts_and_updates(st) diff --git a/unittests/test_table_converter.py b/unittests/test_table_converter.py index 8a7f9bfbdfd36e7145e1ec1699f5575413317fd0..1c6a452ba5d83a89c216407ce48b8db6f88e5f3c 100644 --- a/unittests/test_table_converter.py +++ b/unittests/test_table_converter.py @@ -46,10 +46,7 @@ from caoscrawler.identifiable_adapters import (IdentifiableAdapter, LocalStorageIdentifiableAdapter) from caoscrawler.scanner import scan_directory from caoscrawler.stores import GeneralStore -from caoscrawler.structure_elements import (BooleanElement, DictElement, - Directory, File, FloatElement, - IntegerElement, ListElement, - TextElement) +from caoscrawler.structure_elements import (File, ) UNITTESTDIR = Path(__file__).parent @@ -115,17 +112,17 @@ def test_convert_table(converter_registry): assert len(res) == 5 for i in range(5): assert res[i].name == str(i) - assert type(res[i].name) == str - assert type(res[i].value) == dict + assert type(res[i].name) is str + assert type(res[i].value) is dict assert 
len(res[i].value) == 6 - assert type(res[i].value["Col_1"]) == int + assert type(res[i].value["Col_1"]) is int assert res[i].value["Col_1"] == i - assert type(res[i].value["Col_2"]) == float - assert type(res[i].value["Col_3"]) == int + assert type(res[i].value["Col_2"]) is float + assert type(res[i].value["Col_3"]) is int if i != 3: - assert type(res[i].value["text"]) == str + assert type(res[i].value["text"]) is str else: - assert type(res[i].value["text"]) == float # the nan value + assert type(res[i].value["text"]) is float # the nan value assert math.isnan(res[i].value["text"]) # Using an index col: diff --git a/unittests/test_utilities.py b/unittests/test_utilities.py index a9b052524957b6f8c1e0378e3153fc06f4f36806..ec2a446bdfb44c1f03c95ac1274435f96e8341d5 100644 --- a/unittests/test_utilities.py +++ b/unittests/test_utilities.py @@ -46,7 +46,7 @@ def test_dummy_class(): with pytest.raises(RuntimeError) as err_info_2: Missing() with pytest.raises(RuntimeError) as err_info_3: - print(Missing.foo) + print(Missing.foo) # pylint: disable=no-member for err_info in (err_info_1, err_info_2, err_info_3): msg = str(err_info.value) @@ -60,7 +60,7 @@ def test_dummy_class(): with pytest.raises(RuntimeError) as err_info_2: MissingErr() with pytest.raises(RuntimeError) as err_info_3: - print(MissingErr.foo) + print(MissingErr.foo) # pylint: disable=no-member for err_info in (err_info_1, err_info_2, err_info_3): msg = str(err_info.value) @@ -77,7 +77,9 @@ def test_shared_resource_link(): "https://example.com", "file.txt") == "https://example.com/Shared/file.txt" assert get_shared_resource_link( "https://example.com", "path/to/file.txt") == "https://example.com/Shared/path/to/file.txt" - assert get_shared_resource_link( - "https://example.com/context-root", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt" - assert get_shared_resource_link( - "https://example.com/context-root/", "path/to/file.txt") == "https://example.com/context-root/Shared/path/to/file.txt" + assert (get_shared_resource_link( + "https://example.com/context-root", "path/to/file.txt") + == "https://example.com/context-root/Shared/path/to/file.txt") + assert (get_shared_resource_link( + "https://example.com/context-root/", "path/to/file.txt") + == "https://example.com/context-root/Shared/path/to/file.txt") diff --git a/unittests/test_xml_converter.py b/unittests/test_xml_converter.py index e8869ef6ffad511159a583a14fd49d2fad48766b..f6475c17cbbfdce72aaa765e68632238fb29e416 100644 --- a/unittests/test_xml_converter.py +++ b/unittests/test_xml_converter.py @@ -32,7 +32,6 @@ from lxml.etree import fromstring from caoscrawler.converters import (XMLAttributeNodeConverter, XMLTagConverter, XMLTextNodeConverter) -from caoscrawler.scanner import load_definition from caoscrawler.stores import GeneralStore from caoscrawler.structure_elements import XMLTagElement @@ -40,7 +39,7 @@ UNITTESTDIR = Path(__file__).parent @pytest.fixture -def converter_registry(): +def converter_registry() -> dict: converter_registry: dict[str, dict[str, str]] = { "XMLTag": { "converter": "XMLTagConverter", @@ -66,7 +65,8 @@ def basic_xmltag_converter(converter_registry): type: XMLTag match_tag: a match_attrib: # default is the empty dictionary - "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + # either the "href" or the "url" attribute must be set + "(?P<ref>(href|url))": "test(?P<number>[0-9])" alt: (.+) # this attribute must be present and contain at least one character match_text: 
\\s*(?P<node_text>.+)\\s* @@ -85,7 +85,8 @@ def basic_xpath_xmltag_converter(converter_registry): type: XMLTag match_tag: a match_attrib: # default is the empty dictionary - "(?P<ref>(href|url))": "test(?P<number>[0-9])" # either the "href" or the "url" attribute must be set + # either the "href" or the "url" attribute must be set + "(?P<ref>(href|url))": "test(?P<number>[0-9])" alt: (.+) # this attribute must be present and contain at least one character match_text: \\s*(?P<node_text>.+)\\s* xpath: child::*/* @@ -163,7 +164,7 @@ def test_not_matching(basic_xmltag_converter): # TODO: adapt converter -> empty (==None) text node is equivalent to empty string text node # TODO: adapt tests - # TODO: how to match " ajskdlfjaldsf ajsdklfjadkl " without the whitespaces in regexp correctly? + # TODO: how to match " ajsjaldsf ajsdklfja " without the whitespaces in regexp correctly? def test_nested_simple_xml(basic_xmltag_converter, basic_xpath_xmltag_converter): @@ -370,7 +371,7 @@ text_as_children: true children = converter.create_children(GeneralStore(), tag) assert len(children) == 1 - attrib_converter = XMLTextNodeConverter(yaml.safe_load(""" + attrib_converter = XMLTextNodeConverter(yaml.safe_load(r""" type: XMLTextNode match_text: \s*(?P<val>\w*)\s* """), "TestXMLTextNodeConverter", converter_registry)
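The label heuristic introduced in src/caoscrawler/converters/spss.py scores each configured label with a Tversky-style asymmetric set similarity: data values that are missing from the configuration weigh 0.8, while configured values that do not occur in the data weigh only 0.2. A self-contained sketch of the same scoring (the label names and values below are made up)::

    def tversky_score(values: set, target_values: set) -> float:
        """Asymmetric similarity as in _handle_label(); 1.0 means a perfect match."""
        common = len(values & target_values)
        return common / (common
                         + 0.2 * len(target_values - values)   # unused config values
                         + 0.8 * len(values - target_values))  # unconfigured data values

    conf_labels = {"gender": {"Male", "Female"},
                   "strength": {"low", "medium", "high"}}
    data_values = {"Male", "Female", "Other"}

    scores = {}
    for name, targets in conf_labels.items():
        score = tversky_score(data_values, targets)
        if score > 0:
            scores[name] = score
    print(scores)  # {'gender': 0.7142857142857143}; "strength" shares no values.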