From 1a0b03fee6571caa86bd6b7523811ad92dbfa0da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com> Date: Fri, 11 Nov 2022 14:29:59 +0100 Subject: [PATCH] FIX: floats can be interpreted as integers and vice versa --- CHANGELOG.md | 3 ++ src/caoscrawler/converters.py | 59 +++++++++++++++++++++++++++++------ src/doc/converters.rst | 20 ++++++++---- unittests/test_converters.py | 42 ++++++++++++++++++++++++- 4 files changed, 108 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2587531b..37f081ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * [#30](https://gitlab.com/caosdb/caosdb-crawler/-/issues/30) * [#23](https://gitlab.com/caosdb/caosdb-crawler/-/issues/23) Crawler may overwrite and delete existing data in case of manually added properties +* [#10](https://gitlab.com/caosdb/caosdb-crawler/-/issues/10) floats can be + interpreted as integers and vice versa, there are defaults for allowing other + types and this can be changed per converter ### Security ### diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py index 90b894c5..e025ab27 100644 --- a/src/caoscrawler/converters.py +++ b/src/caoscrawler/converters.py @@ -644,25 +644,66 @@ class _AbstractDictElementConverter(Converter): values.update(m2.groupdict()) return values + def _typecheck(self, element: StructureElement, default: Dict, definition: Dict): + allowed_matches = self._merge_match_definition_with_default(default, definition) + if (bool(allowed_matches["accept_text"]) and isinstance(element, DictTextElement)): + return True + elif (bool(allowed_matches["accept_bool"]) and isinstance(element, DictBooleanElement)): + return True + elif (bool(allowed_matches["accept_int"]) and isinstance(element, DictIntegerElement)): + return True + elif (bool(allowed_matches["accept_float"]) and isinstance(element, 
DictFloatElement)): + return True + else: + return False + + def _merge_match_definition_with_default(self, default: Dict, definition: Dict): + result = {} + for key in default: + if key in definition: + result[key] = definition[key] + else: + result[key] = default[key] + return result -class DictBooleanElementConverter(_AbstractDictElementConverter): def typecheck(self, element: StructureElement): - return isinstance(element, DictBooleanElement) + return self._typecheck(element, self.default_matches, self.definition) + + +class DictBooleanElementConverter(_AbstractDictElementConverter): + default_matches = { + "accept_text": True, + "accept_bool": True, + "accept_int": True, + "accept_float": False, + } class DictFloatElementConverter(_AbstractDictElementConverter): - def typecheck(self, element: StructureElement): - return isinstance(element, DictFloatElement) + default_matches = { + "accept_text": True, + "accept_bool": False, + "accept_int": True, + "accept_float": True, + } class DictTextElementConverter(_AbstractDictElementConverter): - def typecheck(self, element: StructureElement): - return isinstance(element, DictTextElement) + default_matches = { + "accept_text": True, + "accept_bool": True, + "accept_int": True, + "accept_float": True, + } class DictIntegerElementConverter(_AbstractDictElementConverter): - def typecheck(self, element: StructureElement): - return isinstance(element, DictIntegerElement) + default_matches = { + "accept_text": True, + "accept_bool": True, + "accept_int": True, + "accept_float": True, + } class DictListElementConverter(Converter): @@ -747,7 +788,7 @@ class TableConverter(Converter): The rows can be matched using a DictDictElementConverter. 
""" - @abstractmethod + @ abstractmethod def get_options(self): """ This method needs to be overwritten by the specific table converter to provide diff --git a/src/doc/converters.rst b/src/doc/converters.rst index 640a1dde..277abd88 100644 --- a/src/doc/converters.rst +++ b/src/doc/converters.rst @@ -77,12 +77,20 @@ Dict Converter Typical Subtree converters -------------------------- -DictBooleanElementConverter -DictFloatElementConverter -DictTextElementConverter -DictIntegerElementConverter -DictListElementConverter -DictDictElementConverter +- DictBooleanElementConverter +- DictFloatElementConverter +- DictTextElementConverter +- DictIntegerElementConverter +- DictListElementConverter +- DictDictElementConverter + +These converters expect `match_name` and `match_value` in their definition +which allow matching the key and the value, respectively. + +Note that there are defaults for accepting other types. For example, +DictFloatElementConverter also accepts DictIntegerElements. The default +behavior can be adjusted with the fields `accept_text`, `accept_int`, +`accept_float`, and `accept_bool`. 
YAMLFileConverter ================= diff --git a/unittests/test_converters.py b/unittests/test_converters.py index 802483c1..ccba2220 100644 --- a/unittests/test_converters.py +++ b/unittests/test_converters.py @@ -31,7 +31,7 @@ import yaml from caoscrawler.converters import (Converter, ConverterValidationError, DictConverter, DirectoryConverter, handle_value, MarkdownFileConverter, - JSONFileConverter) + DictFloatElementConverter, JSONFileConverter) from caoscrawler.converters import _AbstractDictElementConverter from caoscrawler.crawl import Crawler from caoscrawler.stores import GeneralStore @@ -404,3 +404,43 @@ end""") val = converter.match(element) assert val is not None assert val["text"] == "\nbla\n" + + +def test_converter_value_match(converter_registry): + # test with defaults + dc = DictFloatElementConverter( + definition={ + "match_name": "(.*)", + "match_value": "(.*)", + }, + name="Test", + converter_registry=converter_registry + ) + m = dc.match(DictIntegerElement(name="a", value=4)) + assert m is not None + + # overwrite default with no match for int + dc = DictFloatElementConverter( + definition={ + "match_name": "(.*)", + "match_value": "(.*)", + "accept_int": False, + }, + name="Test", + converter_registry=converter_registry + ) + with pytest.raises(RuntimeError) as err: + m = dc.match(DictIntegerElement(name="a", value=4)) + + # overwrite default with match for float + dc = DictFloatElementConverter( + definition={ + "match_name": "(.*)", + "match_value": "(.*)", + "accept_float": True, + }, + name="Test", + converter_registry=converter_registry + ) + m = dc.match(DictFloatElement(name="a", value=4.0)) + assert m is not None -- GitLab