From 985cb826b9b4b5464caf55316b8c8416eadfdc4c Mon Sep 17 00:00:00 2001
From: Daniel <d.hornung@indiscale.com>
Date: Wed, 2 Mar 2022 13:52:58 +0100
Subject: [PATCH] FIX: Be less strict about numeric datatypes form xlsx files.

---
 CHANGELOG.md                            |   2 +
 src/caosadvancedtools/crawler.py        |   2 +
 src/caosadvancedtools/table_importer.py | 115 +++++++++++++++++-------
 unittests/data/datatypes.xlsx           | Bin 0 -> 5482 bytes
 unittests/test_table_importer.py        |  16 +++-
 5 files changed, 100 insertions(+), 35 deletions(-)
 create mode 100644 unittests/data/datatypes.xlsx

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44629bd9..a6b2de73 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed ###
 
+- `TableConverter` now converts int to float and vice versa to match the desired dtype.
+
 ### Deprecated ###
 
 ### Removed ###
diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py
index 5d91d85c..87b91a52 100644
--- a/src/caosadvancedtools/crawler.py
+++ b/src/caosadvancedtools/crawler.py
@@ -279,6 +279,8 @@ class Crawler(object):
                     except DataInconsistencyError as e:
                         logger.debug(traceback.format_exc())
                         logger.debug(e)
+                        # TODO: Generally: in which cases should exceptions be raised? When is
+                        # errors_occured set to True? The expected behavior must be documented.
                     except Exception as e:
                         try:
                             DataModelProblems.evaluate_exception(e)
diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py
index 0b55252b..654d28b4 100755
--- a/src/caosadvancedtools/table_importer.py
+++ b/src/caosadvancedtools/table_importer.py
@@ -205,27 +205,32 @@ def string_in_list(val, options, ignore_case=True):
     return val
 
 
-class TableImporter(object):
+class TableImporter():
+    """Abstract base class for importing data from tables.
+    """
     def __init__(self, converters, obligatory_columns=None, unique_keys=None,
                  datatypes=None):
         """
-        converters: dict with column names as keys and converter functions as
-                    values
-                    This dict also defines what columns are required to exist
-                    throught the existing keys. The converter functions are
-                    applied to the cell values. They should also check for
-                    ValueErrors, such that a separate value check is not
-                    necessary.
-        obligatory_columns: list of column names, optional
-                            each listed column must not have missing values
-        unique_columns : list of column names that in
-                            combination must be unique; i.e. each row has a
-                            unique combination of values in those columns.
-        datatypes: dict with column names as keys and datatypes as values
-                   All non-null values will be checked whether they have the
-                   provided datatype.
-                   This dict also defines what columns are required to exist
-                   throught the existing keys.
+        Parameters
+        ----------
+        converters : dict
+          Dict with column names as keys and converter functions as values. This dict also defines
+          what columns are required to exist throught the existing keys. The converter functions are
+          applied to the cell values. They should also check for ValueErrors, such that a separate
+          value check is not necessary.
+
+        obligatory_columns : list, optional
+          List of column names, each listed column must not have missing values.
+
+        unique_keys : list, optional
+          List of column names that in combination must be unique: each row has a unique
+          combination of values in those columns.
+
+        datatypes : dict, optional
+          Dict with column names as keys and datatypes as values.  All non-null values will be
+          checked whether they have the provided datatype.  This dict also defines what columns are
+          required to exist throught the existing keys.
+
         """
 
         if converters is None:
@@ -247,11 +252,14 @@ class TableImporter(object):
         raise NotImplementedError()
 
     def check_columns(self, df, filename=None):
-        """
-        checks whether all required columns, i.e. columns for which converters
-        were defined exist.
+        """Check whether all required columns exist.
+
+        Required columns are columns for which converters are defined.
+
+        Raises
+        ------
+        DataInconsistencyError
 
-        Raises: DataInconsistencyError
         """
 
         for col in self.required_columns:
@@ -267,12 +275,11 @@ class TableImporter(object):
                 raise DataInconsistencyError(errmsg)
 
     def check_unique(self, df, filename=None):
-        """
-        Check whether value combinations that shall be unique for each row are
-        unique.
+        """Check whether value combinations that shall be unique for each row are unique.
 
         If a second row is found, that uses the same combination of values as a
         previous one, the second one is removed.
+
         """
         df = df.copy()
         uniques = []
@@ -299,13 +306,33 @@ class TableImporter(object):
 
         return df
 
-    def check_datatype(self, df, filename=None):
-        """
-        Check for each column whether non-null fields are have the correct
-        datatype.
-        """
+    def check_datatype(self, df, filename=None, strict=False):
+        """Check for each column whether non-null fields have the correct datatype.
+
+        .. note::
+
+          If columns are float, but should be integer or vice versa, this method converts the
+          respective columns in place.
 
+        Parameters
+        ----------
+
+        strict: boolean, optional
+          If False (the default), try to convert columns, otherwise raise an error.
+
+        """
         for key, datatype in self.datatypes.items():
+            # Check for castable numeric types first: We unconditionally cast float to int and vice
+            # versa, because CaosDB does not have different sizes anyway.
+            col_dtype = df.dtypes[key]
+            if not strict and not np.issubdtype(col_dtype, datatype):
+                issub = np.issubdtype
+                #  These special cases should be fine.
+                if ((issub(col_dtype, np.integer) and issub(datatype, np.floating))
+                        or (issub(col_dtype, np.floating) and issub(datatype, np.integer))):
+                    df[key] = df[key].astype(datatype)
+
+            # Now check each element
             for idx, val in df.loc[
                     pd.notnull(df.loc[:, key]), key].iteritems():
 
@@ -326,6 +353,11 @@ class TableImporter(object):
         Check in each row whether obligatory fields are empty or null.
 
         Rows that have missing values are removed.
+
+        Returns
+        -------
+        out : pandas.DataFrame
+          The input DataFrame with incomplete rows removed.
         """
         df = df.copy()
 
@@ -362,10 +394,26 @@ class TableImporter(object):
 
         return df
 
-    def check_dataframe(self, df, filename):
+    def check_dataframe(self, df, filename=None, strict=False):
+        """Check if the dataframe conforms to the restrictions.
+
+        Checked restrictions are: Columns, data types, uniqueness requirements.
+
+        Parameters
+        ----------
+
+        df: pandas.DataFrame
+          The dataframe to be checked.
+
+        filename: string, optional
+          The file name, only used for output in case of problems.
+
+        strict: boolean, optional
+          If False (the default), try to convert columns, otherwise raise an error.
+        """
         self.check_columns(df, filename=filename)
         df = self.check_missing(df, filename=filename)
-        self.check_datatype(df, filename=filename)
+        self.check_datatype(df, filename=filename, strict=strict)
 
         if len(self.unique_keys) > 0:
             df = self.check_unique(df, filename=filename)
@@ -378,8 +426,7 @@ class XLSImporter(TableImporter):
         return self.read_xls(filename=filename, **kwargs)
 
     def read_xls(self, filename, **kwargs):
-        """
-        converts an xls file into a Pandas DataFrame.
+        """Convert an xls file into a Pandas DataFrame.
 
         The converters of the XLSImporter object are used.
 
diff --git a/unittests/data/datatypes.xlsx b/unittests/data/datatypes.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..34fc4cf43092a68b630e0e04ebc43609b8a0b17b
GIT binary patch
literal 5482
zcmaJ_1ymFZ_g|1$8d*}hr5ou~bYYj0Mmm*PVx^m1r9%NFrBRenLMaIeNeMx62?6Ow
zO5nfi`_6;U@BiMNGrKeA%)K+WfA?x@T*09P0D(XNmc6we;1Us`uPr?6oIQmEF~22G
znl(QO6F=JYkK#R$m{J1f6xKoR)Uk>)HF3DoS4HNvhVT5m38Em5efKHOJt+9hx&Q@n
z<wpr~X_!aq8%JU{aM=VFwRWGm`wB_+<_5K2mAb79Q|0xD*HYi>nN{w}_lGB_6E5vS
zH3qPL*sR$C<dq(KnCW|qp4mn$B8=B!DBK+kaz}%4>Jd;ECka<DGlQRk*qkBTQ&z9f
z>X5vEBB^=`61zu)z9_rEn(pb%BvLwJ;XBZ)&sY~XK2s09k7GSe>Rx{S98_Dg=;3C)
zhu2$Afe?0p`$O3S??5(QP%(;Gs7IS*{pwx>X=~sEk!j*-8)zf+VFLi#|7a#M^b<bL
zLf$YBCu<nYNzli|IreU+>x3|Q!vP--%0<BZ0e#99!?*;%6nRl+>Jv3;rzpzL+bOx{
zsH=7bJmacllF~%e^{Kuls0^oN#%7}&{hk)9COPdILBx35+JueFbCEA>hPcm6lzRj%
z$9h+$cJg}_hWs`M{OxQESS5+3-}~Z9yjPtH$-VOm+M5We*mbc9p{d4hGC9^Ehn%Jn
zwaFlmN6f@#DQu5p`^Y4&uPG7ViEBj}<i2&T;zuHiN?GG?^g5V>))CzqN3RXXEn=T-
z2LwCncEuTY7T`Xp94rzV^ZuMDGp8$BHUpfMqdnjWR&vaT@T&HWY;(-m26vbav7+v?
zn^&p`zONb3Z71%XHHnSL{d`U}Rvqp)pSqs;!y$(-Fp92ZX=gFW<C}wy<t?I&IQ#D?
zZ+vS*s~-DVIcuh!wH!vh9xg0-KIGe_C@D(KOoq^vDr$-O#^ulJZxrPh@K($p3Fymg
z5*4q(!zg74`9#4ny(-#GlQfB{YgR9(G8To%i5Iw!vm;bSJw&yG262_&5UgJdHtQQk
zh1}HSc&tobUGCN%Pd(+U|KuzDTm!0Wizjm`#Urw5d9x>|{)O6*tz<0TfD8jk%a)hu
zq3|^=yk(cP!I>veQLW%7Zx~j>g21<@b8}$xSY7K#1OE+<YOkb#u9Y)Ju|~tz6&g&;
zMEi)XA7B9hu2=poW=Q^u8Be&cvz;d<V%Ck7wI+mV8#WD%MoJSZ^LZd(a+{UPZbbH-
zr{wrgr6Q3wx2uT+8iTwt@(C&-l=)OV+!J4>Pk#Mks7s;G5F{dGDo;REkjIgl0fnP|
zRg9=xR*%+ZXt7_BYmhGxKXyZM%Ef-f(Ji_Du}j22$-z<-QKP$Xqa)Ehj);TIH!^ht
zTys5P5Sl3cj*^yhFoAF-#_XFn;Z52Y?CR_G$R{j|M1?9#RpjMuWjCF6<$=z%o>`7O
ziXxw|b(Wlov-ZMaADMhMI@ebjg4sK!DCDOY0+$SMin8uf=4X@?NrqBKU?1Tykq<{Y
z2kBW(D1cfCAv(%SU12s?TBRv;>0R#+vq%n;5=_bGO^=asQjhKhW=rc^rG#pa3x1+C
zVIfq{r=)f+q|Yw>Vpw3VRoLjAUKn+98_w1LKs*Z$)X(W?_|7(P@|w;UhaSj*W0%3|
zXZA?=mY(&fisB)=Q`7ZqDSer@EXJ<tZ5kaMI#D#WgKj<jlAfjmpE{lo@>Lk0=dX^{
z$t;To*3=jc_dA3Pm>aqj)`oUHDYc*Ec$LzHJ^ea?39bfiC;Y{~rQ(LP(a)5iV!(Y#
zfFn&a4=$1<dOm#el>r*niuZ08sWe+qJ5M;qZ!(mpv~VxHBoe_n+*GTz7*>8F{UWj2
zbV9FM6tJ)@uX!y|8Z2G~TIL^e(5!KKp;mXSw;JH8N3$U_9Dw)Yc8?Ev#jTz`e<yOW
z2)b5cYdn#<i0!za%g+Ys*ORZo%D%=NfO|WGslUU^Hh+pv*K}xkH?#{4J?pCTL-Cwe
zOf9c|3u~@^KfXv-y>2}LFJI40|LULa!tBi(tnlqubJc;%Ky%f4sq}tq{%+qWN;6<o
zIn?(P$t-NwQd72lN&u*j7)#lG;?@_9EwP;kj(4j}2Dna6q|6`OIa3={0rt%4cc=c8
zLOErij5_2|aL^`pv{9L|hV-(|8qv-;WXkA#nBo){lNOojny5*187-sxx3r+WD5ai`
zc6M-2q2GTZm<(w!H1kZCq70n2Xx{2CDmi+$tG^a!^9?Es&WIhde-9G{<`J;)63BLz
zZ6@Q;=0UhCD#qkm7neLXhHDHD$%6uV#GIn(xGQNVP)$_=_il=Fwb?rd*O}HUz7-JE
zzM2<9?c=Oy<r<vv>*~gJ{~4b47VNOF_;tOAogR0eq&bFdECF{cJ_0*w3b#&4QXx9E
zx3xT97^WY_XD}qTj1W7lNic|m?;w7N7kk_TLR;J(;Us;ztGn}PT_lLgqNks<di?7U
zk9(=1i2}@VYk|W$Y3L>~TiUmR)H=C`nq)syZ<q1#-(Jh%oxb8ukZ$@kCW2c(Po9A4
zh@^tz2|-V(BXJ;DyItf>vSYZRQ2^g??T5S@0P*fV4){tsw-rQ5Wy~vj*4A08p<;RX
zn^ODBgiV}HpC5H?cWX_Rkou2&R9`lIiD`Y7-xNY<FgznL)$8bmrQ78EkS55o{|y%H
zSibgmPR*MdJpF!1F$Mw#?@dz!ws4NuMFG%?6qa^^Vo_F&5O|WrAvG`41d>wxin>V`
zrC5fYM_x5m=crIquy|#OgZi#MLNuY%bE<E@s<t6sU0G-VRFcHMUo1!Jq|VI2;cGU=
zRYk<(7CArg>UFsQ&Jy+NgGJ>7lj#TBFN!&LEvH4mlc%Uk`y6|`G&4Rd-pQv%`P$EJ
z(BiVg;Pde&nb~|aMh{54&j4{h#o&3$4^CfEA5*~G@wSVz1@Y+i&N9NWxmY8*hJ$%v
zkJJU}jh?De(TJDv4GE}&8NSc8Rq{H-G3BTqav+cA=fG9K*3uia#E*(}bIv6TEQ<Cf
zlFk$gZb;F66t#<O;C0acL2N$GwKotPp7BZMM|m$@y|1PepKGSqUR{ZdU&>xMErB=z
z-IOupODM1Lv+T!ZE-=}V25a~hZ1JAH;~J=1$WwD7-%Si3F6*x9!1wwMd*8#>m!Mfj
zwH^rRxv+I9dz{SGW~<h{9rcb6#bVcLC36Z0rk?j(aKW8a_&~K{IWVrhDZ~bJ2?jdu
zKm^hD4}V4UK^^T?JVD2{Dny`TXB8T*y*9*0FBbgv51Dq2ifz4MzdVtj`ixoE%2ndG
zvu_ms5Nh4S8;e<rUA}i4Co@8a-kn8hOOc4Yo<(VsGHaE#5j3RO$<<xXtz-#w6L&K!
z;|?40txb-hv^M6YaVoG(Yx3QIj@ZM>kAX%FKbLRQ`4%FT#)#jD*k}HbIP_O$^K2dn
z+9rHU655*j&T3=kaNbAHzv7hh1I>Pr@J{@M;!=Map<|~8->aP<|1gB^K#iA4nMtx`
zf_jg^JY5iprgD88od)o`ib?4zuZ7JiQFegKcaN3sk&)4qd^6Ex*}}Rx!L~LE$`G&>
z4vzLFp-o*rMEGV?T!|Q^JS6&1fit(Ni@Kf;4!MuB=&a^>G}v5F#t(QcWxn@%^}V$@
z#dDmR#6o<3j$F>>2T4jqB_k^LZQD4&*;+{LUnftl`OtN|s6O!&A-K=r^jhBc@y^aC
z$Z7CFMef`u#?b&*v+&Ty0%_i}`ZjvEEr?LD)E7hMq5iR<J8EV<jW$VXMk^<=xg|9!
zMRi&qoGsY78__TkAGkfkeNu=607Q}gTdlkHSFQUCU|$B0e}U~psMBw-eM+Tv&<4i0
ziIHlEQ7$_@#A?@>>u;5OGE#LOBuH;cXt&n?wEx@g$2<Eg;;}0ct*XR$#kn}h<~A>2
zL+I(o`YfG6Z!T)aku$!Wnc{UlXf%WdEM!u#D`hNb-I5?{mfxwa`RZF|ZTPy5Q1=da
zC<CjP>TZeYl}cAW?K;nv^KzSSB%~I3{x5_-lMwWJFeQo&=*C;m@Ck|H@3|Z;$S)|e
zF^;l#m4(01mGmYUwy<Z~^do{1Oq6H`t^MNI-FcsStE3Dy6@ZPgjMmnX-ym9V&VL2e
zl>Y%oj#eIaws+wk_gx)+W2DT4<|u3t^Z=r8t21zYjg4(^Bzy++#(_X=fCn-(#>Kum
zVTy}v;Gg3;>O5Z{b5EZZe=bH9nJOR9M=#v!vcdL3ZmrRqnNK+PGb3qg=O-kNq}Afh
zuX;&ss%0{zCCSxA2+=R>?wJgp;FtO-Pm|x0dkuo_Ut2RA;&*BJY&kCX*3T#;Rq6~<
z8MrjVf#)4>)E(ghEtN%drLr(Qj`35Ho}_n>kn*oM55E~i_Hu#qWH(<iq{M>I5TiR^
zsBvp8S~D9oIQ-{f;wt(HTbPZu2h7b=$Oh(NhiQF*x@KA}!jyprhWvrd>ymnomK;%X
zVC%V<_V?rF*|iU$rnJBO>tadt<|gD1GO|vFJDn}&n@MAm3(|5Om9zm-&s2){DI08~
z-4NKT&nftYs2~Ga{AmS-pi&+o61h~}aCZvlnb@R)ds+kS(om7=fGfBaSIV@r?j;vm
z_Ni91r)>*W<1v6d!in4<!X0idv@58qi~_iu0{bKLv7s}x9@mwmqw|yzS|<2~<uuIV
zJ*n>!HnL}PE7|?`QC;Rn`or1k*IsQZw!peOe%OE2F80%B(3X*wRH?yQRKh`pR4>RZ
zO4Ph?AWCZI;9{kFv@I3go-{e=w4>16i2C-(<z8B)y;f5PHvbJ>+2BtpzY@2k93C|6
z6KUZhb|0$^7OmV}?5QnKJXR0;AnOxPR8yKa_xvO{Uy1{KXO3Te$?YaMKvLJCUr)09
z>Hf6FWANRoj)xOoC))&*Rbu;19h$Wa<8Yd@qlD!f7*FvUOLjP+y@bIl|LiI9%bv1w
zbHg|)GXV-~!Hmc!RQjz|;OlQxuak_6uOliMWhzaRIR$MWC?w?r&rY!&n|u|<f=W7m
zb{pV}QThXQWGAwD3SR>FG6%gTvcLQ+xtGnxWtB{ufy85E&7}&qWN!8af>rrNo7?f$
zb$2vWawkKtUUe9=FYs=TNa`-<EZwX^7;@jfBAFO9nAXG^s-LYMjGIg3UZZu6<mJ{n
zXoLFirFIw-wUe=7In1Y23r6M)<N3&^+s&H1bJ)ZexYd=lRdq(X6=bfGCEVtD4>~d)
z_jtoe$9up21O3ttNRb7U1K$x{QBI~}KPU(JdWxiLF^lxNGaasqc<d&h|4wh`>y9TU
zXtQMeYlwp`8_wpTCmiPT+jf0#3DH50oSiBHj!o#{Ck|FdOUddTB%rFXJYk=q(SiQ#
zVH<IenH4mL<WE)KkdIw`P$R_00}vn~<raXQ7D$1cJ7+AJ5ZMcO{B>U6?y=)WbYeyG
zvp7i?Q9dXb?VBNaCT%0geha3X<*q9p*VmG|Hiz|8MsHYW3<)}73y?nAUXeJ}Ahn&|
zJVe1*{ax~c943Cxbh9IXkZ0(x-bSDQ91s}2OjTj7a64DHg`TgQou}Dv+^W~B^@)YD
zakI?vLt4vXGwqNozG7T$tFI)mOrSIXXLRz+KWEr3waU}@dt{S;qmVgEx{Gb*n`W6D
z{Wvb47z%BOXHFMG13%NClX!aEyS9-K4V5wv>EikuHxN<=q4}gvS9tXZT{!oxoHVSj
zn+{TZUhoOH_1u6dNh@$(X=sg<u(mkRqmiJa#5E*uzIr`cS30--YnTpm7l#3uyH91^
zFT|;hY_h<j&rL~EzLcWf>{qF!o-YfFErj|@-PKLJT2?5lWmd~W(`MGxyQ|Evh)L!p
zxLAE=<jmC5(*hjp0*3q|Ps$opp)^R#CJEC{Ky#l__M|?*eB^`i+k6$q0q-EwaC(yl
z)>G9TyqP-{5Io7Z-xcTb7!S@E&hGD(=29P4yq^suQwR;*P+-PVd_VqSOUy`Ws?~cV
zLKDh@y?X0>$vZ0ac<n3sv<!Ywv81VV*}~Y4K(E%ej#`?9-!TCuYv*L&li8vJ?<+3G
zNm$sFfQzlx<wXu=tM%V>X~*>^<K<}{v*)=WZuG1E*!uj*e0k=<4Ez^lie3(&nJ>ow
zKUptB2+UA)K^}O2zxRJeq(7xxZnqe?aY5#2E1*B%AHc_-axS+nOmn^<AL73h`Lk93
zDdBQS#k9B!f}<_-KMn6s_REC@Q+F?j3GJ1?+5c08|Kz@$zL+GvAVajUzqv0;(Vqe?
hht_`!5JD^S|Kz;31}@r7001HS2tjv|uN0WO{{wdF3XuQ+

literal 0
HcmV?d00001

diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py
index 9c8a379d..f2727472 100644
--- a/unittests/test_table_importer.py
+++ b/unittests/test_table_importer.py
@@ -23,7 +23,6 @@ import unittest
 from functools import partial
 from tempfile import NamedTemporaryFile
 
-import caosdb as db
 import numpy as np
 import pandas as pd
 import pytest
@@ -211,6 +210,21 @@ class XLSImporterTest(TableImporterTest):
         self.assertRaises(DataInconsistencyError, importer.read_xls,
                           tmp.name)
 
+    def test_datatypes(self):
+        """Test datataypes in columns."""
+        importer = XLSImporter(converters={},
+                               obligatory_columns=["float_as_float"],
+                               datatypes={
+                                   "float_as_float": float,
+                                   "int_as_float": float,
+                                   "int_as_int": int,
+                                   "float_as_int": int,
+                               }
+                               )
+        df = importer.read_xls(os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx"))
+        assert np.issubdtype(df.loc[0, "int_as_float"], float)
+        assert df.loc[1, "float_as_int"] == 6  # This is an acceptable rounding error.
+
 
 class CSVImporterTest(TableImporterTest):
     def test_full(self):
-- 
GitLab