From 985cb826b9b4b5464caf55316b8c8416eadfdc4c Mon Sep 17 00:00:00 2001 From: Daniel <d.hornung@indiscale.com> Date: Wed, 2 Mar 2022 13:52:58 +0100 Subject: [PATCH] FIX: Be less strict about numeric datatypes form xlsx files. --- CHANGELOG.md | 2 + src/caosadvancedtools/crawler.py | 2 + src/caosadvancedtools/table_importer.py | 115 +++++++++++++++++------- unittests/data/datatypes.xlsx | Bin 0 -> 5482 bytes unittests/test_table_importer.py | 16 +++- 5 files changed, 100 insertions(+), 35 deletions(-) create mode 100644 unittests/data/datatypes.xlsx diff --git a/CHANGELOG.md b/CHANGELOG.md index 44629bd9..a6b2de73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed ### +- `TableConverter` now converts int to float and vice versa to match the desired dtype. + ### Deprecated ### ### Removed ### diff --git a/src/caosadvancedtools/crawler.py b/src/caosadvancedtools/crawler.py index 5d91d85c..87b91a52 100644 --- a/src/caosadvancedtools/crawler.py +++ b/src/caosadvancedtools/crawler.py @@ -279,6 +279,8 @@ class Crawler(object): except DataInconsistencyError as e: logger.debug(traceback.format_exc()) logger.debug(e) + # TODO: Generally: in which cases should exceptions be raised? When is + # errors_occured set to True? The expected behavior must be documented. except Exception as e: try: DataModelProblems.evaluate_exception(e) diff --git a/src/caosadvancedtools/table_importer.py b/src/caosadvancedtools/table_importer.py index 0b55252b..654d28b4 100755 --- a/src/caosadvancedtools/table_importer.py +++ b/src/caosadvancedtools/table_importer.py @@ -205,27 +205,32 @@ def string_in_list(val, options, ignore_case=True): return val -class TableImporter(object): +class TableImporter(): + """Abstract base class for importing data from tables. + """ def __init__(self, converters, obligatory_columns=None, unique_keys=None, datatypes=None): """ - converters: dict with column names as keys and converter functions as - values - This dict also defines what columns are required to exist - throught the existing keys. The converter functions are - applied to the cell values. They should also check for - ValueErrors, such that a separate value check is not - necessary. - obligatory_columns: list of column names, optional - each listed column must not have missing values - unique_columns : list of column names that in - combination must be unique; i.e. each row has a - unique combination of values in those columns. - datatypes: dict with column names as keys and datatypes as values - All non-null values will be checked whether they have the - provided datatype. - This dict also defines what columns are required to exist - throught the existing keys. + Parameters + ---------- + converters : dict + Dict with column names as keys and converter functions as values. This dict also defines + what columns are required to exist throught the existing keys. The converter functions are + applied to the cell values. They should also check for ValueErrors, such that a separate + value check is not necessary. + + obligatory_columns : list, optional + List of column names, each listed column must not have missing values. + + unique_keys : list, optional + List of column names that in combination must be unique: each row has a unique + combination of values in those columns. + + datatypes : dict, optional + Dict with column names as keys and datatypes as values. All non-null values will be + checked whether they have the provided datatype. This dict also defines what columns are + required to exist throught the existing keys. + """ if converters is None: @@ -247,11 +252,14 @@ class TableImporter(object): raise NotImplementedError() def check_columns(self, df, filename=None): - """ - checks whether all required columns, i.e. columns for which converters - were defined exist. + """Check whether all required columns exist. + + Required columns are columns for which converters are defined. + + Raises + ------ + DataInconsistencyError - Raises: DataInconsistencyError """ for col in self.required_columns: @@ -267,12 +275,11 @@ class TableImporter(object): raise DataInconsistencyError(errmsg) def check_unique(self, df, filename=None): - """ - Check whether value combinations that shall be unique for each row are - unique. + """Check whether value combinations that shall be unique for each row are unique. If a second row is found, that uses the same combination of values as a previous one, the second one is removed. + """ df = df.copy() uniques = [] @@ -299,13 +306,33 @@ class TableImporter(object): return df - def check_datatype(self, df, filename=None): - """ - Check for each column whether non-null fields are have the correct - datatype. - """ + def check_datatype(self, df, filename=None, strict=False): + """Check for each column whether non-null fields have the correct datatype. + + .. note:: + + If columns are float, but should be integer or vice versa, this method converts the + respective columns in place. + Parameters + ---------- + + strict: boolean, optional + If False (the default), try to convert columns, otherwise raise an error. + + """ for key, datatype in self.datatypes.items(): + # Check for castable numeric types first: We unconditionally cast float to int and vice + # versa, because CaosDB does not have different sizes anyway. + col_dtype = df.dtypes[key] + if not strict and not np.issubdtype(col_dtype, datatype): + issub = np.issubdtype + # These special cases should be fine. + if ((issub(col_dtype, np.integer) and issub(datatype, np.floating)) + or (issub(col_dtype, np.floating) and issub(datatype, np.integer))): + df[key] = df[key].astype(datatype) + + # Now check each element for idx, val in df.loc[ pd.notnull(df.loc[:, key]), key].iteritems(): @@ -326,6 +353,11 @@ class TableImporter(object): Check in each row whether obligatory fields are empty or null. Rows that have missing values are removed. + + Returns + ------- + out : pandas.DataFrame + The input DataFrame with incomplete rows removed. """ df = df.copy() @@ -362,10 +394,26 @@ class TableImporter(object): return df - def check_dataframe(self, df, filename): + def check_dataframe(self, df, filename=None, strict=False): + """Check if the dataframe conforms to the restrictions. + + Checked restrictions are: Columns, data types, uniqueness requirements. + + Parameters + ---------- + + df: pandas.DataFrame + The dataframe to be checked. + + filename: string, optional + The file name, only used for output in case of problems. + + strict: boolean, optional + If False (the default), try to convert columns, otherwise raise an error. + """ self.check_columns(df, filename=filename) df = self.check_missing(df, filename=filename) - self.check_datatype(df, filename=filename) + self.check_datatype(df, filename=filename, strict=strict) if len(self.unique_keys) > 0: df = self.check_unique(df, filename=filename) @@ -378,8 +426,7 @@ class XLSImporter(TableImporter): return self.read_xls(filename=filename, **kwargs) def read_xls(self, filename, **kwargs): - """ - converts an xls file into a Pandas DataFrame. + """Convert an xls file into a Pandas DataFrame. The converters of the XLSImporter object are used. diff --git a/unittests/data/datatypes.xlsx b/unittests/data/datatypes.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..34fc4cf43092a68b630e0e04ebc43609b8a0b17b GIT binary patch literal 5482 zcmaJ_1ymFZ_g|1$8d*}hr5ou~bYYj0Mmm*PVx^m1r9%NFrBRenLMaIeNeMx62?6Ow zO5nfi`_6;U@BiMNGrKeA%)K+WfA?x@T*09P0D(XNmc6we;1Us`uPr?6oIQmEF~22G znl(QO6F=JYkK#R$m{J1f6xKoR)Uk>)HF3DoS4HNvhVT5m38Em5efKHOJt+9hx&Q@n z<wpr~X_!aq8%JU{aM=VFwRWGm`wB_+<_5K2mAb79Q|0xD*HYi>nN{w}_lGB_6E5vS zH3qPL*sR$C<dq(KnCW|qp4mn$B8=B!DBK+kaz}%4>Jd;ECka<DGlQRk*qkBTQ&z9f z>X5vEBB^=`61zu)z9_rEn(pb%BvLwJ;XBZ)&sY~XK2s09k7GSe>Rx{S98_Dg=;3C) zhu2$Afe?0p`$O3S??5(QP%(;Gs7IS*{pwx>X=~sEk!j*-8)zf+VFLi#|7a#M^b<bL zLf$YBCu<nYNzli|IreU+>x3|Q!vP--%0<BZ0e#99!?*;%6nRl+>Jv3;rzpzL+bOx{ zsH=7bJmacllF~%e^{Kuls0^oN#%7}&{hk)9COPdILBx35+JueFbCEA>hPcm6lzRj% z$9h+$cJg}_hWs`M{OxQESS5+3-}~Z9yjPtH$-VOm+M5We*mbc9p{d4hGC9^Ehn%Jn zwaFlmN6f@#DQu5p`^Y4&uPG7ViEBj}<i2&T;zuHiN?GG?^g5V>))CzqN3RXXEn=T- z2LwCncEuTY7T`Xp94rzV^ZuMDGp8$BHUpfMqdnjWR&vaT@T&HWY;(-m26vbav7+v? zn^&p`zONb3Z71%XHHnSL{d`U}Rvqp)pSqs;!y$(-Fp92ZX=gFW<C}wy<t?I&IQ#D? zZ+vS*s~-DVIcuh!wH!vh9xg0-KIGe_C@D(KOoq^vDr$-O#^ulJZxrPh@K($p3Fymg z5*4q(!zg74`9#4ny(-#GlQfB{YgR9(G8To%i5Iw!vm;bSJw&yG262_&5UgJdHtQQk zh1}HSc&tobUGCN%Pd(+U|KuzDTm!0Wizjm`#Urw5d9x>|{)O6*tz<0TfD8jk%a)hu zq3|^=yk(cP!I>veQLW%7Zx~j>g21<@b8}$xSY7K#1OE+<YOkb#u9Y)Ju|~tz6&g&; zMEi)XA7B9hu2=poW=Q^u8Be&cvz;d<V%Ck7wI+mV8#WD%MoJSZ^LZd(a+{UPZbbH- zr{wrgr6Q3wx2uT+8iTwt@(C&-l=)OV+!J4>Pk#Mks7s;G5F{dGDo;REkjIgl0fnP| zRg9=xR*%+ZXt7_BYmhGxKXyZM%Ef-f(Ji_Du}j22$-z<-QKP$Xqa)Ehj);TIH!^ht zTys5P5Sl3cj*^yhFoAF-#_XFn;Z52Y?CR_G$R{j|M1?9#RpjMuWjCF6<$=z%o>`7O ziXxw|b(Wlov-ZMaADMhMI@ebjg4sK!DCDOY0+$SMin8uf=4X@?NrqBKU?1Tykq<{Y z2kBW(D1cfCAv(%SU12s?TBRv;>0R#+vq%n;5=_bGO^=asQjhKhW=rc^rG#pa3x1+C zVIfq{r=)f+q|Yw>Vpw3VRoLjAUKn+98_w1LKs*Z$)X(W?_|7(P@|w;UhaSj*W0%3| zXZA?=mY(&fisB)=Q`7ZqDSer@EXJ<tZ5kaMI#D#WgKj<jlAfjmpE{lo@>Lk0=dX^{ z$t;To*3=jc_dA3Pm>aqj)`oUHDYc*Ec$LzHJ^ea?39bfiC;Y{~rQ(LP(a)5iV!(Y# zfFn&a4=$1<dOm#el>r*niuZ08sWe+qJ5M;qZ!(mpv~VxHBoe_n+*GTz7*>8F{UWj2 zbV9FM6tJ)@uX!y|8Z2G~TIL^e(5!KKp;mXSw;JH8N3$U_9Dw)Yc8?Ev#jTz`e<yOW z2)b5cYdn#<i0!za%g+Ys*ORZo%D%=NfO|WGslUU^Hh+pv*K}xkH?#{4J?pCTL-Cwe zOf9c|3u~@^KfXv-y>2}LFJI40|LULa!tBi(tnlqubJc;%Ky%f4sq}tq{%+qWN;6<o zIn?(P$t-NwQd72lN&u*j7)#lG;?@_9EwP;kj(4j}2Dna6q|6`OIa3={0rt%4cc=c8 zLOErij5_2|aL^`pv{9L|hV-(|8qv-;WXkA#nBo){lNOojny5*187-sxx3r+WD5ai` zc6M-2q2GTZm<(w!H1kZCq70n2Xx{2CDmi+$tG^a!^9?Es&WIhde-9G{<`J;)63BLz zZ6@Q;=0UhCD#qkm7neLXhHDHD$%6uV#GIn(xGQNVP)$_=_il=Fwb?rd*O}HUz7-JE zzM2<9?c=Oy<r<vv>*~gJ{~4b47VNOF_;tOAogR0eq&bFdECF{cJ_0*w3b#&4QXx9E zx3xT97^WY_XD}qTj1W7lNic|m?;w7N7kk_TLR;J(;Us;ztGn}PT_lLgqNks<di?7U zk9(=1i2}@VYk|W$Y3L>~TiUmR)H=C`nq)syZ<q1#-(Jh%oxb8ukZ$@kCW2c(Po9A4 zh@^tz2|-V(BXJ;DyItf>vSYZRQ2^g??T5S@0P*fV4){tsw-rQ5Wy~vj*4A08p<;RX zn^ODBgiV}HpC5H?cWX_Rkou2&R9`lIiD`Y7-xNY<FgznL)$8bmrQ78EkS55o{|y%H zSibgmPR*MdJpF!1F$Mw#?@dz!ws4NuMFG%?6qa^^Vo_F&5O|WrAvG`41d>wxin>V` zrC5fYM_x5m=crIquy|#OgZi#MLNuY%bE<E@s<t6sU0G-VRFcHMUo1!Jq|VI2;cGU= zRYk<(7CArg>UFsQ&Jy+NgGJ>7lj#TBFN!&LEvH4mlc%Uk`y6|`G&4Rd-pQv%`P$EJ z(BiVg;Pde&nb~|aMh{54&j4{h#o&3$4^CfEA5*~G@wSVz1@Y+i&N9NWxmY8*hJ$%v zkJJU}jh?De(TJDv4GE}&8NSc8Rq{H-G3BTqav+cA=fG9K*3uia#E*(}bIv6TEQ<Cf zlFk$gZb;F66t#<O;C0acL2N$GwKotPp7BZMM|m$@y|1PepKGSqUR{ZdU&>xMErB=z z-IOupODM1Lv+T!ZE-=}V25a~hZ1JAH;~J=1$WwD7-%Si3F6*x9!1wwMd*8#>m!Mfj zwH^rRxv+I9dz{SGW~<h{9rcb6#bVcLC36Z0rk?j(aKW8a_&~K{IWVrhDZ~bJ2?jdu zKm^hD4}V4UK^^T?JVD2{Dny`TXB8T*y*9*0FBbgv51Dq2ifz4MzdVtj`ixoE%2ndG zvu_ms5Nh4S8;e<rUA}i4Co@8a-kn8hOOc4Yo<(VsGHaE#5j3RO$<<xXtz-#w6L&K! z;|?40txb-hv^M6YaVoG(Yx3QIj@ZM>kAX%FKbLRQ`4%FT#)#jD*k}HbIP_O$^K2dn z+9rHU655*j&T3=kaNbAHzv7hh1I>Pr@J{@M;!=Map<|~8->aP<|1gB^K#iA4nMtx` zf_jg^JY5iprgD88od)o`ib?4zuZ7JiQFegKcaN3sk&)4qd^6Ex*}}Rx!L~LE$`G&> z4vzLFp-o*rMEGV?T!|Q^JS6&1fit(Ni@Kf;4!MuB=&a^>G}v5F#t(QcWxn@%^}V$@ z#dDmR#6o<3j$F>>2T4jqB_k^LZQD4&*;+{LUnftl`OtN|s6O!&A-K=r^jhBc@y^aC z$Z7CFMef`u#?b&*v+&Ty0%_i}`ZjvEEr?LD)E7hMq5iR<J8EV<jW$VXMk^<=xg|9! zMRi&qoGsY78__TkAGkfkeNu=607Q}gTdlkHSFQUCU|$B0e}U~psMBw-eM+Tv&<4i0 ziIHlEQ7$_@#A?@>>u;5OGE#LOBuH;cXt&n?wEx@g$2<Eg;;}0ct*XR$#kn}h<~A>2 zL+I(o`YfG6Z!T)aku$!Wnc{UlXf%WdEM!u#D`hNb-I5?{mfxwa`RZF|ZTPy5Q1=da zC<CjP>TZeYl}cAW?K;nv^KzSSB%~I3{x5_-lMwWJFeQo&=*C;m@Ck|H@3|Z;$S)|e zF^;l#m4(01mGmYUwy<Z~^do{1Oq6H`t^MNI-FcsStE3Dy6@ZPgjMmnX-ym9V&VL2e zl>Y%oj#eIaws+wk_gx)+W2DT4<|u3t^Z=r8t21zYjg4(^Bzy++#(_X=fCn-(#>Kum zVTy}v;Gg3;>O5Z{b5EZZe=bH9nJOR9M=#v!vcdL3ZmrRqnNK+PGb3qg=O-kNq}Afh zuX;&ss%0{zCCSxA2+=R>?wJgp;FtO-Pm|x0dkuo_Ut2RA;&*BJY&kCX*3T#;Rq6~< z8MrjVf#)4>)E(ghEtN%drLr(Qj`35Ho}_n>kn*oM55E~i_Hu#qWH(<iq{M>I5TiR^ zsBvp8S~D9oIQ-{f;wt(HTbPZu2h7b=$Oh(NhiQF*x@KA}!jyprhWvrd>ymnomK;%X zVC%V<_V?rF*|iU$rnJBO>tadt<|gD1GO|vFJDn}&n@MAm3(|5Om9zm-&s2){DI08~ z-4NKT&nftYs2~Ga{AmS-pi&+o61h~}aCZvlnb@R)ds+kS(om7=fGfBaSIV@r?j;vm z_Ni91r)>*W<1v6d!in4<!X0idv@58qi~_iu0{bKLv7s}x9@mwmqw|yzS|<2~<uuIV zJ*n>!HnL}PE7|?`QC;Rn`or1k*IsQZw!peOe%OE2F80%B(3X*wRH?yQRKh`pR4>RZ zO4Ph?AWCZI;9{kFv@I3go-{e=w4>16i2C-(<z8B)y;f5PHvbJ>+2BtpzY@2k93C|6 z6KUZhb|0$^7OmV}?5QnKJXR0;AnOxPR8yKa_xvO{Uy1{KXO3Te$?YaMKvLJCUr)09 z>Hf6FWANRoj)xOoC))&*Rbu;19h$Wa<8Yd@qlD!f7*FvUOLjP+y@bIl|LiI9%bv1w zbHg|)GXV-~!Hmc!RQjz|;OlQxuak_6uOliMWhzaRIR$MWC?w?r&rY!&n|u|<f=W7m zb{pV}QThXQWGAwD3SR>FG6%gTvcLQ+xtGnxWtB{ufy85E&7}&qWN!8af>rrNo7?f$ zb$2vWawkKtUUe9=FYs=TNa`-<EZwX^7;@jfBAFO9nAXG^s-LYMjGIg3UZZu6<mJ{n zXoLFirFIw-wUe=7In1Y23r6M)<N3&^+s&H1bJ)ZexYd=lRdq(X6=bfGCEVtD4>~d) z_jtoe$9up21O3ttNRb7U1K$x{QBI~}KPU(JdWxiLF^lxNGaasqc<d&h|4wh`>y9TU zXtQMeYlwp`8_wpTCmiPT+jf0#3DH50oSiBHj!o#{Ck|FdOUddTB%rFXJYk=q(SiQ# zVH<IenH4mL<WE)KkdIw`P$R_00}vn~<raXQ7D$1cJ7+AJ5ZMcO{B>U6?y=)WbYeyG zvp7i?Q9dXb?VBNaCT%0geha3X<*q9p*VmG|Hiz|8MsHYW3<)}73y?nAUXeJ}Ahn&| zJVe1*{ax~c943Cxbh9IXkZ0(x-bSDQ91s}2OjTj7a64DHg`TgQou}Dv+^W~B^@)YD zakI?vLt4vXGwqNozG7T$tFI)mOrSIXXLRz+KWEr3waU}@dt{S;qmVgEx{Gb*n`W6D z{Wvb47z%BOXHFMG13%NClX!aEyS9-K4V5wv>EikuHxN<=q4}gvS9tXZT{!oxoHVSj zn+{TZUhoOH_1u6dNh@$(X=sg<u(mkRqmiJa#5E*uzIr`cS30--YnTpm7l#3uyH91^ zFT|;hY_h<j&rL~EzLcWf>{qF!o-YfFErj|@-PKLJT2?5lWmd~W(`MGxyQ|Evh)L!p zxLAE=<jmC5(*hjp0*3q|Ps$opp)^R#CJEC{Ky#l__M|?*eB^`i+k6$q0q-EwaC(yl z)>G9TyqP-{5Io7Z-xcTb7!S@E&hGD(=29P4yq^suQwR;*P+-PVd_VqSOUy`Ws?~cV zLKDh@y?X0>$vZ0ac<n3sv<!Ywv81VV*}~Y4K(E%ej#`?9-!TCuYv*L&li8vJ?<+3G zNm$sFfQzlx<wXu=tM%V>X~*>^<K<}{v*)=WZuG1E*!uj*e0k=<4Ez^lie3(&nJ>ow zKUptB2+UA)K^}O2zxRJeq(7xxZnqe?aY5#2E1*B%AHc_-axS+nOmn^<AL73h`Lk93 zDdBQS#k9B!f}<_-KMn6s_REC@Q+F?j3GJ1?+5c08|Kz@$zL+GvAVajUzqv0;(Vqe? hht_`!5JD^S|Kz;31}@r7001HS2tjv|uN0WO{{wdF3XuQ+ literal 0 HcmV?d00001 diff --git a/unittests/test_table_importer.py b/unittests/test_table_importer.py index 9c8a379d..f2727472 100644 --- a/unittests/test_table_importer.py +++ b/unittests/test_table_importer.py @@ -23,7 +23,6 @@ import unittest from functools import partial from tempfile import NamedTemporaryFile -import caosdb as db import numpy as np import pandas as pd import pytest @@ -211,6 +210,21 @@ class XLSImporterTest(TableImporterTest): self.assertRaises(DataInconsistencyError, importer.read_xls, tmp.name) + def test_datatypes(self): + """Test datataypes in columns.""" + importer = XLSImporter(converters={}, + obligatory_columns=["float_as_float"], + datatypes={ + "float_as_float": float, + "int_as_float": float, + "int_as_int": int, + "float_as_int": int, + } + ) + df = importer.read_xls(os.path.join(os.path.dirname(__file__), "data", "datatypes.xlsx")) + assert np.issubdtype(df.loc[0, "int_as_float"], float) + assert df.loc[1, "float_as_int"] == 6 # This is an acceptable rounding error. + class CSVImporterTest(TableImporterTest): def test_full(self): -- GitLab