From 1a0b03fee6571caa86bd6b7523811ad92dbfa0da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20tom=20W=C3=B6rden?= <h.tomwoerden@indiscale.com>
Date: Fri, 11 Nov 2022 14:29:59 +0100
Subject: [PATCH] FIX: floats can be interpreted as integers and vice versa

---
 CHANGELOG.md                  |  3 ++
 src/caoscrawler/converters.py | 59 +++++++++++++++++++++++++++++------
 src/doc/converters.rst        | 20 ++++++++----
 unittests/test_converters.py  | 42 ++++++++++++++++++++++++-
 4 files changed, 108 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2587531b..37f081ef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * [#30](https://gitlab.com/caosdb/caosdb-crawler/-/issues/30)
 * [#23](https://gitlab.com/caosdb/caosdb-crawler/-/issues/23) Crawler may
   overwrite and delete existing data in case of manually added properties
+* [#10](https://gitlab.com/caosdb/caosdb-crawler/-/issues/10) floats can be
+  interpreted as integers and vice versa; each converter has defaults for which
+  other types it accepts, and these can be changed per converter
 
 ### Security ###
 
diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index 90b894c5..e025ab27 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -644,25 +644,66 @@ class _AbstractDictElementConverter(Converter):
         values.update(m2.groupdict())
         return values
 
+    def _typecheck(self, element: StructureElement, default: Dict, definition: Dict):
+        """Return True if ``element`` has a type this converter is configured to accept.
+
+        The ``accept_*`` flags are taken from ``default`` unless set in ``definition``.
+        """
+        allowed = self._merge_match_definition_with_default(default, definition)
+        accepted_types = (("accept_text", DictTextElement),
+                          ("accept_bool", DictBooleanElement),
+                          ("accept_int", DictIntegerElement),
+                          ("accept_float", DictFloatElement))
+        return any(bool(allowed[flag]) and isinstance(element, element_type)
+                   for flag, element_type in accepted_types)
+
+    def _merge_match_definition_with_default(self, default: Dict, definition: Dict):
+        """Return the entries of ``default``, overridden by ``definition`` where set.
+
+        Only keys of ``default`` appear in the result; keys that occur solely in
+        ``definition`` are ignored.
+        """
+        return {key: definition[key] if key in definition else fallback
+                for key, fallback in default.items()}
 
-class DictBooleanElementConverter(_AbstractDictElementConverter):
     def typecheck(self, element: StructureElement):
-        return isinstance(element, DictBooleanElement)
+        return self._typecheck(element, self.default_matches, self.definition)
+
+
+class DictBooleanElementConverter(_AbstractDictElementConverter):
+    default_matches = {
+        "accept_text": True,
+        "accept_bool": True,
+        "accept_int": True,
+        "accept_float": False,
+    }
 
 
 class DictFloatElementConverter(_AbstractDictElementConverter):
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, DictFloatElement)
+    default_matches = {
+        "accept_text": True,
+        "accept_bool": False,
+        "accept_int": True,
+        "accept_float": True,
+    }
 
 
 class DictTextElementConverter(_AbstractDictElementConverter):
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, DictTextElement)
+    default_matches = {
+        "accept_text": True,
+        "accept_bool": True,
+        "accept_int": True,
+        "accept_float": True,
+    }
 
 
 class DictIntegerElementConverter(_AbstractDictElementConverter):
-    def typecheck(self, element: StructureElement):
-        return isinstance(element, DictIntegerElement)
+    default_matches = {
+        "accept_text": True,
+        "accept_bool": True,
+        "accept_int": True,
+        "accept_float": True,
+    }
 
 
 class DictListElementConverter(Converter):
@@ -747,7 +788,7 @@ class TableConverter(Converter):
 
     The rows can be matched using a DictDictElementConverter.
     """
-    @abstractmethod
+    @ abstractmethod
     def get_options(self):
         """
         This method needs to be overwritten by the specific table converter to provide
diff --git a/src/doc/converters.rst b/src/doc/converters.rst
index 640a1dde..277abd88 100644
--- a/src/doc/converters.rst
+++ b/src/doc/converters.rst
@@ -77,12 +77,20 @@ Dict Converter
 Typical Subtree converters
 --------------------------
 
-DictBooleanElementConverter
-DictFloatElementConverter
-DictTextElementConverter
-DictIntegerElementConverter
-DictListElementConverter
-DictDictElementConverter
+- DictBooleanElementConverter
+- DictFloatElementConverter
+- DictTextElementConverter
+- DictIntegerElementConverter
+- DictListElementConverter
+- DictDictElementConverter
+
+These converters expect `match_name` and `match_value` in their definition,
+which allow matching the key and the value, respectively.
+
+Note that there are defaults for accepting other types. For example,
+DictFloatElementConverter also accepts DictIntegerElements. The default
+behavior can be adjusted with the fields `accept_text`, `accept_int`,
+`accept_float`, and `accept_bool`.
 
 YAMLFileConverter
 =================
diff --git a/unittests/test_converters.py b/unittests/test_converters.py
index 802483c1..ccba2220 100644
--- a/unittests/test_converters.py
+++ b/unittests/test_converters.py
@@ -31,7 +31,7 @@ import yaml
 from caoscrawler.converters import (Converter, ConverterValidationError,
                                     DictConverter, DirectoryConverter,
                                     handle_value, MarkdownFileConverter,
-                                    JSONFileConverter)
+                                    DictFloatElementConverter, JSONFileConverter)
 from caoscrawler.converters import _AbstractDictElementConverter
 from caoscrawler.crawl import Crawler
 from caoscrawler.stores import GeneralStore
@@ -404,3 +404,43 @@ end""")
     val = converter.match(element)
     assert val is not None
     assert val["text"] == "\nbla\n"
+
+
+def test_converter_value_match(converter_registry):
+    """Check that ``accept_*`` defaults and per-definition overrides control matching."""
+    # defaults: the float converter also accepts integer elements
+    dc = DictFloatElementConverter(
+        definition={
+            "match_name": "(.*)",
+            "match_value": "(.*)",
+        },
+        name="Test", converter_registry=converter_registry
+    )
+    m = dc.match(DictIntegerElement(name="a", value=4))
+    assert m is not None
+
+    # overwrite default with no match for int
+    dc = DictFloatElementConverter(
+        definition={
+            "match_name": "(.*)",
+            "match_value": "(.*)",
+            "accept_int": False,
+        },
+        name="Test",
+        converter_registry=converter_registry
+    )
+    with pytest.raises(RuntimeError):
+        dc.match(DictIntegerElement(name="a", value=4))
+
+    # setting accept_float explicitly (True is already the default) still matches floats
+    dc = DictFloatElementConverter(
+        definition={
+            "match_name": "(.*)",
+            "match_value": "(.*)",
+            "accept_float": True,
+        },
+        name="Test",
+        converter_registry=converter_registry
+    )
+    m = dc.match(DictFloatElement(name="a", value=4.0))
+    assert m is not None
-- 
GitLab