diff --git a/CHANGELOG.md b/CHANGELOG.md index 68583486202354aae54304d7972bc5ea47bef48b..c8de6a9d0bc2dcee8b646ad270d09ca34de47273 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - JSON schema validation can also be used in the DictElementConverter - YAMLFileConverter class; to parse YAML files - Variables can now be substituted within the definition of yaml macros +- debugging option for the match step of Converters ### Changed ### diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 0b31bad57f94efb1a1d9a1a9e7cf40e5601fd7a1..7920f386fc00202249b29a048e02450ec70e5fc1 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -390,6 +390,56 @@ class Converter(object, metaclass=ABCMeta): """ pass + @staticmethod + def _debug_matching_template(name: str, regexp: list[str], matched: list[str], result: Optional[dict]): + """ Template for the debugging output for the match function """ + print("\n--------", name, "-----------") + for pattern, text in zip(regexp, matched): + print("matching against:\n" + pattern) + print("matching:\n" + text) + print("---------") + if result is None: + print("No match") + else: + print("Matched groups:") + print(result) + print("----------------------------------------") + + @staticmethod + def debug_matching(kind=None): + def debug_matching_decorator(func): + """ + decorator for the match function of Converters that implements debug for the match of + StructureElements + """ + + def inner(self, element: StructureElement): + mr = func(self, element) + if "debug_match" in self.definition and self.definition["debug_match"]: + if kind == "name": + self._debug_matching_template(name=self.__class__.__name__, + regexp=[self.definition["match"]], + matched=[element.name], + result=mr) + elif kind == "name_and_value": + self._debug_matching_template( + name=self.__class__.__name__, + regexp=[self.definition["match"] 
if "match" in self.definition else "", + self.definition["match_name"] + if "match_name" in self.definition else "", + self.definition.get("match_value", "")], + matched=[element.name, element.name, str(element.value)], + result=mr) + else: + self._debug_matching_template(name=self.__class__.__name__, + regexp=[self.definition.get("match", "")], + matched=[str(element)], + result=mr) + return mr + return inner + return debug_matching_decorator + @abstractmethod def match(self, element: StructureElement) -> Optional[dict]: """ @@ -425,6 +475,7 @@ class DirectoryConverter(Converter): # TODO basically all converters implement such a match function. Shouldn't this be the one # of the parent class and subclasses can overwrite if needed? + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, Directory): @@ -463,10 +514,10 @@ class SimpleFileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) - def create_children(self, generalStore: GeneralStore, - element: StructureElement): + def create_children(self, generalStore: GeneralStore, element: StructureElement): return list() + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, File): @@ -512,6 +563,7 @@ class MarkdownFileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, File): @@ -601,6 +653,7 @@ class DictElementConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, DictElement) + @Converter.debug_matching("name_and_value") def match(self, element: StructureElement): """ Allways matches if the element has the right type. 
@@ -629,6 +682,7 @@ class JSONFileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not self.typecheck(element): @@ -656,6 +710,7 @@ class YAMLFileConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not self.typecheck(element): @@ -749,6 +804,7 @@ class _AbstractScalarValueElementConverter(Converter): self.definition) return self._typecheck(element, allowed_matches) + @Converter.debug_matching("name_and_value") def match(self, element: StructureElement): """ Try to match the given structure element. @@ -900,6 +956,7 @@ class ListElementConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, ListElement) + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, ListElement): @@ -956,6 +1013,7 @@ class TableConverter(Converter): def typecheck(self, element: StructureElement): return isinstance(element, File) + @Converter.debug_matching("name") def match(self, element: StructureElement): # TODO: See comment on types and inheritance if not isinstance(element, File): diff --git a/src/doc/converters.rst b/src/doc/converters.rst index ae84644072ebbd53f1325d1f9d1d0ef8e5dc6de6..b4ba89ced3b5858ca2f8abe7bc724d6710d9203b 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -483,3 +483,22 @@ Let's formulate that using `create_records` (again, `dir_name` is constant here) keys_modified = create_records(values, records, record_def) +Debugging +========= + +You can add the key `debug_match` to the definition of a Converter in order to create debugging +output for the match step. 
The following snippet illustrates this: + +.. code-block:: yaml + + DirConverter: + type: Directory + match: (?P<dir_name>.*) + debug_match: True + records: + Project: + identifier: project_name + + +Whenever this Converter tries to match a StructureElement, it prints what it tried to match against +what, and what the result was. diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 71cb42cf1841aa95e901047cc5037a1a123e3a72..5a1c8b690690c5cce0373a5fa25d954fb89988f6 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -358,7 +358,7 @@ def test_variable_replacement(): assert handle_value(["$a", "$b"], values) == (["4", "68"], "single") -def test_filter_children_of_directory(converter_registry): +def test_filter_children_of_directory(converter_registry, capsys): """Verify that children (i.e., files) in a directory are filtered or sorted correctly. @@ -369,6 +369,7 @@ def test_filter_children_of_directory(converter_registry): dc = DirectoryConverter( definition={ "match": "(.*)", + "debug_match": True, "filter": { "expr": "test_(?P<date>[0-9]{4,4}-[0-9]{2,2}-[0-9]{2,2}).json", "group": "date", @@ -381,6 +382,14 @@ def test_filter_children_of_directory(converter_registry): m = dc.match(test_dir) assert m is not None + # checking debug output + captured = capsys.readouterr() + # the name + assert "examples_filter_children" in captured.out + # the regexp + assert "(.*)" in captured.out + # the empty result set + assert "{}" in captured.out # This should only contain the youngest json and the csv that doesn't match # the above filter expression. 
@@ -491,10 +500,11 @@ end""") assert val["text"] == "\nbla\n" -def test_converter_value_match(converter_registry): +def test_converter_value_match(converter_registry, capsys): # test with defaults dc = FloatElementConverter( definition={ + "debug_match": True, "match_name": "(.*)", "match_value": "(.*)", }, @@ -503,6 +513,14 @@ def test_converter_value_match(converter_registry): ) m = dc.match(IntegerElement(name="a", value=4)) assert m is not None + # checking debug output + captured = capsys.readouterr() + # the name + assert "a" in captured.out + # the regexp + assert "(.*)" in captured.out + # the empty result set + assert "{}" in captured.out # overwrite default with no match for int dc = FloatElementConverter(