From 90b97e23e928d4b45b1ab4dc5b877f0ec8499bfa Mon Sep 17 00:00:00 2001 From: hannaeko Date: Mon, 22 Sep 2025 16:34:43 +0200 Subject: [PATCH] fix text extraction for cells with partial styling (#23) --- scripts/shared.js | 54 +++++++++++++------------- tests/fixtures/cellule avec style.ods | Bin 0 -> 8104 bytes tests/ods-files.js | 12 +++++- 3 files changed, 38 insertions(+), 28 deletions(-) create mode 100644 tests/fixtures/cellule avec style.ods diff --git a/scripts/shared.js b/scripts/shared.js index 6a97bbd..9b2b9be 100644 --- a/scripts/shared.js +++ b/scripts/shared.js @@ -11,8 +11,8 @@ import {parseXML} from './DOMUtils.js' const TEXT_NODE = 3 /** - * - * @param {Element} cell + * + * @param {Element} cell * @returns {string} */ function extraxtODSCellText(cell) { @@ -33,7 +33,7 @@ function extraxtODSCellText(cell) { text += pChild.nodeValue; // Append text inside } else if (pChild.nodeName === 'text:line-break') { text += '\n'; // Append newline for - } else if (pChild.nodeName === 'text:a') { + } else if (pChild.nodeName === 'text:a' || pChild.nodeName === 'text:span') { text += pChild.textContent } } @@ -41,7 +41,7 @@ function extraxtODSCellText(cell) { text += '\n'; // Append newline for directly under } } - + return text.trim(); } @@ -127,7 +127,7 @@ export async function getODSTableRawContent(arrayBuffer) { /** * Converts a cell value to the appropriate JavaScript type based on its cell type. - * @param {SheetCellRawContent} _ + * @param {SheetCellRawContent} _ * @returns {number | boolean | string | Date} The converted value. */ export function convertCellValue({value, type}) { @@ -163,10 +163,10 @@ export function convertCellValue({value, type}) { /** * @param {unknown} value - * @returns {value is OdfjsImage} + * @returns {value is OdfjsImage} */ export function isOdfjsImage(value) { - if (typeof value === 'object' && value!==null + if (typeof value === 'object' && value!==null && "content" in value && value.content instanceof ArrayBuffer && "fileName" in value && typeof value.fileName === 'string' && "mediaType" in value && typeof value.mediaType === 'string' @@ -183,15 +183,15 @@ export function isOdfjsImage(value) { /** - * - * @param {Map} rawContentSheets + * + * @param {Map} rawContentSheets * @returns {Map[][]>} */ export function tableRawContentToValues(rawContentSheets){ return new Map( [...rawContentSheets].map(([sheetName, rawContent]) => { return [ - sheetName, + sheetName, rawContent .map(row => row.map(c => convertCellValue(c))) ] @@ -204,7 +204,7 @@ export function tableRawContentToValues(rawContentSheets){ */ /** - * + * * @param {SheetCellRawContent} rawContentCell * @returns {string} */ @@ -213,8 +213,8 @@ export function cellRawContentToStrings(rawContentCell){ } /** - * - * @param {SheetRowRawContent} rawContentRow + * + * @param {SheetRowRawContent} rawContentRow * @returns {string[]} */ export function rowRawContentToStrings(rawContentRow){ @@ -222,8 +222,8 @@ export function rowRawContentToStrings(rawContentRow){ } /** - * - * @param {SheetRawContent} rawContentSheet + * + * @param {SheetRawContent} rawContentSheet * @returns {string[][]} */ export function sheetRawContentToStrings(rawContentSheet){ @@ -231,8 +231,8 @@ export function sheetRawContentToStrings(rawContentSheet){ } /** - * - * @param {Map} rawContentSheets + * + * @param {Map} rawContentSheets * @returns {Map} */ export function tableRawContentToStrings(rawContentSheets){ @@ -253,16 +253,16 @@ export function tableRawContentToStrings(rawContentSheets){ /** * This function expects the first row to contain string values which are used as column names - * It outputs an array of objects which keys are + * It outputs an array of objects which keys are * - * @param {SheetRawContent} rawContent + * @param {SheetRawContent} rawContent * @returns {any[]} */ export function sheetRawContentToObjects(rawContent){ let [firstRow, ...dataRows] = rawContent /** @type {string[]} */ - + const columns = firstRow.map((r, i) => { if (r.value === undefined || r.value === null || r.value === "") { return `Column ${i+1}` @@ -284,8 +284,8 @@ export function sheetRawContentToObjects(rawContent){ } /** - * - * @param {Map} rawContentSheets + * + * @param {Map} rawContentSheets * @returns {Map} */ export function tableRawContentToObjects(rawContentSheets){ @@ -312,7 +312,7 @@ export function isCellFilled({value}){ } /** - * @param {SheetRowRawContent} rawContentRow + * @param {SheetRowRawContent} rawContentRow * @returns {boolean} */ export function isRowNotEmpty(rawContentRow){ @@ -320,7 +320,7 @@ export function isRowNotEmpty(rawContentRow){ } /** - * @param {SheetRawContent} sheet + * @param {SheetRawContent} sheet * @returns {SheetRawContent} */ export function removeEmptyRowsFromSheet(sheet){ @@ -329,8 +329,8 @@ export function removeEmptyRowsFromSheet(sheet){ /** - * - * @param {Map} rawContentTable + * + * @param {Map} rawContentTable * @returns {Map} */ export function tableWithoutEmptyRows(rawContentTable){ @@ -339,4 +339,4 @@ export function tableWithoutEmptyRows(rawContentTable){ return [sheetName, removeEmptyRowsFromSheet(rawContent)] }) ) -} \ No newline at end of file +} diff --git a/tests/fixtures/cellule avec style.ods b/tests/fixtures/cellule avec style.ods new file mode 100644 index 0000000000000000000000000000000000000000..33071acce46436ef9380c213c298fb98a222b741 GIT binary patch literal 8104 zcmb6;1z4O*l0$&t1P$&48G^gJ4esu4gAEP=g1ZEFch}$!0fGdVAPEk^Jqd8|?y`Bi zdvEVu&DY;w|4dg^ch_{Oq6{<)761Sb02DfN=mgnvhcf^GfT!{J6~NBQ&J5z=U}ogt zU~6S!1hKLQGrEFJ8SIUmt(+O`9n8R{_9iZNW?%?|vxAeFk*Tw#nHfa!pLjm9`3Imq zVxsn7b1MrMr$4wkvoQYmYmkg=*gqg zX;breP>dI^5}U0e+NsN=UzR3rmgoj#z<1|77gahl>7+JDWR#u=BM7vVPQn~9OP^ku z-noE^9A~tzhqwl4Nn5^QzK&_XMcyHclFHKk_;9rs<;>Bg`>>rc$c%6b%=^O8$L$7s zll;}`dA~;s5SPIS-LYjTX^So#q+I2c@WpAM51ZZrETy5BmS|c&*H@=|5^)=RZoJjn zwsK_I%$4`rQP9T$_T!hEq_*e0MoI}+MsQip4w#%Ggxs7k$NIS8%P13`v_H3;_CO`V zQI;GvjmOV|1MG$Trwo(HC}6Y8I(w_=h*gs03SYi=SwcP?^OLtY;suO!t#+{AjHIiy zSgp<9D1qpCg>#r9U840aJEm-b#mg)%#`}S6bfipGQykMPH1k9hgsca<37hkM(JvJb zEQ4M1fwx(74%YF3`SF1?D2LnF*0pA(V&Nh<#hNtwt@m;EeOi;k?l#V~{aMW^;B%== zY%@V`9YYBi_`-PG*@av%r-v%_P@zzj$~h>jqrZJ1b#Ph~dH2~UrvJf^iR~V!pcSt8 z4C8(5%KlpkTXZ>U12GqE^%Hb%1>Inzk@qfxax9D3oWtGH@SLG?uHj-}cU4?)os3e1 zTQGMT=(WQ^11&%4dzY)yL&Y_gHFJ#=xhtZC0QtZnnBu`7bG{w%iu81 zW?j?j)MaCKY*e^S5Y<>E$Xv+@i?+V&4jUy~?6($bb=XlJ}F<|SH}R{IO-81oJ0OCRhIOQ@8<^Q zBnBuWa}pWkpY?_vub8Gf203XY5hmqgYee2K(XNnn9k|R2;n|TB=G&Dm1MBNwsuK%S z_gnFB?!p?=7?mt=!HSWaX9zJC3HhP4P4VDxr%pw>qFy$q=7ESE^MK>KpDButyfF zXY7Lc9%A{*48>3K==j>ZBR^~1jMPW+ zP)58x9{7550Yfxlk=H7$3{pYzzu(= z%KBV>Vhf9&!J7*$#<;@ua2#Z0Byr_Ow(*jh3GmG0ALIfFoxX)BX&c-4I_Hgy6#HmL zWmnmS5$*B>V!v`eBph!dO(?Tgt=Z{)-&HC9*`YJU6ieZQdZalGiJbc3G|}e`NO6iN zGo*@nv~VjyK+o}FPxY*G@IlOXO}<<+*okHj-@R~5kprQVFWvIfH*WjHxVsU}5n-3t zgEb;*Z}wI8jLuZ}8B5bnv~$Pc;uCpg5w3bfO!+DMWr+d7fYLll|n)rI*9Hsf4lLRrIs;GyD^@h^z_9VBiqPB1atN&sH5`Qm!02cP@W$oGBh5=uG z4~iU_@qizF%#9x!6H+X4p?sXk0rWGN^H%qb$*Pm*h|;DysxAwS9|!z(Cn|!^Y&7cI zzJ(W=SDRmQzHdY|yvz2zTRvJ?GF^dwAH%&Pnkwc>M%CkaVnw%iL3itG?qk-DjNJtN zfg-QF1fY1 z;saKJ_!3jRfoPA5ajzUL@oU>LZ4APLajnYIp(<`OG}+lRc}~E3u_aS_dFH-QX>o8G zy^TX-x$y4VviXev3ca_xZv;6vB>s%JdjCqMIkO-T+I5GG$^2e8i&N=*mhU*bkzSi| ztfRZEGxKt`8KDSyP!7*K|M4Nah-8Z@_D=2=VblFhn0eYaAJ$pQ#lk5s*e;PTjWiY> zGZ_}Wz)eLFXPS78(!Nwhh9M$cxSAl8G|`&;*BBd#ZWkEJ)3W2L^*9z;H{4uB-vXF# zQ^8A|31>eY$A08k4DDKdoGlX!%x}PNXd5s^11lsBrex(E3GrNdUM|<=>{8cWwuuE| zlDhN^W_Ye1i+|c^OWA>A($1htzo3xx)_GGty#FFW0&ci!R)A73C>Nv=gF{zbZ2dhY zrRFrLUR&i-%$!NWf9V94=ABgj~pwbGHnHN+!{pzqi_RRj#JbOZz zeRuk__iv0HG2OF+DEARz*4B{^4$!POy2{e~QDDsG>-?{ibeTVkBw38#1FE`?V(DJuM<;K3-GkXOV$#sj}7dD za`~!e?9rV#iPq2BEys*S!}QxFz#bEK&dh7k)*EHIW>_%_YoJkJ zEbLU7!lzWbm!hq{OMbbjY7*#hy-zNJUgsG5!jfCxy_P!%^4-jYuXn_!WF%63(jai@d&H!o*-fLpSb3#O0lN)sBXH()(;ylYEE$89)iPyA^T zbc8fs@=kH7QY<(UVRMThPj!pzcqj6<`aO3Pi>`?mYXAQBG#%|5R_(PPYb!wUfX zbx?`?7*yIBfvwEVoFNQOrsiYs#=rqgXo1IkL1yC%LX5DmPBbGe*4f?c!J4p)XK5I# zozd^ql?uxCvu8G{Bv{v?8`2Mr2tP#BmGr2xnb5u(k+N*uqvL5~XenIgSz2hKBS|AV zwGJrE2DZ9^f!qd3LG+x+wiESvH128Vd^ z;6UrR*1!c$67(&|`$eU@zi3+!O~05d5UOV5jt?so^DlV)W6g~Q#8RlK-w5wnOFy2T zDmjj`H8e4#OOI+;F*Bf2*WM!cQqh6lG>-q3Z-);>cKpP0HnB?@2$dk$BT@7V{CrT~ zF|0L?s|tS2O~0;50n2~{E8Ss{A@@Sm27#W)r#j(rCe5uz}7(hmDMUM30UVyLdqFIH~gWFQAX?6B*es)6oXK>XPpe8VH+?eF*6Eh~FSqUWEm_g(ajGUw zJFlqr<)bp^G&<%ac$j=j5?^;JP-i7y`Eyiswia&DU#5g2bd3bE4tQ-?SZd+T`oR?E zblZQP+8V24WULO^nj=mg{{B36zYHzcwePm47KWlDv{b^5wFWIa-G!Q+cMgSX&ecLd zP}P-Uf+sWExTJaChXwiWlF%y$g(iAk*MKYW3lym!_Ybj@O- zToUhJ%uZ!_n38d5sotV~{ASwiab0Dqi~#x=QA^cA1h-%V=UizH(R(k`w6a1fizf3U z2Q03Dq}sF4$7QnzecDPq{z zaA`yOIV>32E3>k&5n7CvXy_DTD8a!lQ?2qtly`+o$NTp1df*^g8f>yo=tO-$`he?d z@TD6uDR&h)*y1Flwm`+veqL*fO&^((Ix16uEN!AE>%IQfxc0qVs&TWTnm%QwdAk%y zR*OmgtzLm=J!^O8J0e3bk$_bfi`t-hx3YCrsG&)jhUtEYbW|XG{>_K{mZqZJ1g|t3&-8D@n%AA z2Q{R$3O`$_dE^) z&Qv#6G!Yi6*{e6!{zhH$)&wa<;mU{SBiXY4DrGd%FdQT;n`lwmW1>mCeM5%2O`LAQ z%8};f@NOb#W zZ~M^KlaYFh|+%3C{R8&-7xoNG*^nJ*-#wNOr z>5uK=Ns2?Id2IFJIU^<1lZJD#q0f>s?+?F#nsw}T`=O$rFOZh*k{<5%laMX@J{os8 zcz*%kc^4P}KpOo&S*t&47-kToCsP%rG74U1LTkUKgW=E!FEFLUAfpp{!Cxrl&m#m5 zmIY%Kv025cZvOPSgy;)g6zPpSxSE%qZ9^tMA#qh#ERw!d!SY8}2Tt2g-d$0Tf0Dbb zY>0^g0g}}l;FzFDNN1E3*t0n0K^?*hWQ5#gdh-K%_&W2kfl~==Vsp8JjEJqrmgdNt zDAt)@nyyBiRYJ=vPggKfH0Ok(k zPd}p`PBP=m%_wq|#~Woys|GpwD}j0#m5|0QGbYJpv_RD|b=_2IvA~Onp*oihz1d^C z(}V956(?I46w>X8D4ig|+?}gHyKW(crm9voFF3b+`}m3tz9SqrmvY(1Y+`efK}Y-; ztu4QPg7lfz&c(iDc|*T#l1H1PGFO)($#*gzMRNQfxvMtay-idXrTL|_LI)H4Z6pav z4B`dA;gGvnnFo=}7f;IE{sAwO{OEe5|D)@9^4!j55Qr7n;*WyRp|*_51{YfAadnP& z2xYsqlyH#F}8ooU)ZlPUUc6T%X1fG)j_VD`#*=oW=@&sJY5Co2`06W=QBs1VMpq$p}` zcvzakt1T@&Q~{ITbzF-M3a zLKVZCZJ{TE&723I>WEfCnBgO2AvF`> zdfU=cSi^yyqTiPQ8^NNDNd=KJ=B`l~m+xjzb_km)C?O_{-Oo)x!&P;|_a6RA#?M7M zoei2=!80JyFprSyaQF7zPLQeHlC>PD7XyjdFetw#&K>K8u{s)W?AANGVx9L!2 zKgZubM`XRfywb}WcnBYS1>s~iB?O-^Dv5^~eh)4UQAX!b?Ml})QSzeEr={Y&z1x5; zgN|-COhe_k3b*>o7q*EuO=n0p>nk66!wL|+pMvrPY>l6;A^LhL|=-|VJT(KqACXM&L%RMS>!OdCrh_9E)6saV2V zRTw1m-E2*`qxZ*QHJFDu-5h1u_zH8^hzxfn>sZ}k)eQs7GXj!&>4WhZSTf_rthzYz(h$u<$@aRJPTD(LA z2Ugc_%-82l4~}yxbW&5RShWbLHXcGux$}(Qf<7bAAXei9+`mju^YN+u*}XZ9;Ff^* zfB-Td@{Ik^O_}QJDzd8=9)1m)%h15WIw3jw8vAtsCHPfV``J(#yQS%uSb2i@4To-z zm*v7fg&|+n_48Q<{RVOvuq90Chrhdp(w}L=Pp}b^WH}Gd!h~>4MMgQsZMs8jG3dS0z>D=ZTdM#gPzvk0H-P&#|T-FfjL4c%?^lj39avd*% zN7Lug;H1X9dL%?F^-&X5Y8gBk!a?)wUXaY*VEM9eRGpVfCdrd|GsV&6FMRY>9-8;Sy)*3`1r)c#AIY-l$Dir zb#+ZlOl)jyoSmILJv{>g0wN+J;^X5pGBWb>^D8SW8yg#2T3Y)0`bI`ZW@cuVmzOs- zHVzIB&d<;9@9!VwJUl!Qe_7yv#1I^$w44C|B+RD)1xU+y+yZpAw3x7}`_kdUN>!Q= zmft5lR43J+!kaX}y0r&-q%;n0xirhEh$5}EpmzWby7W{-#Jf9;XJrqm`|gC>Tb^Q9 zY5U|jW?Gy3^NqO#U4oE9lM=8aeqLoizC^)5`$4^!Z)&q%N*_($%a8ITn!KCZxr`4#BSPdA@uPcb z7+0}e!Y#2`tE6|)a{B~Woq5w{Qy?vTA-#U7P_LiXj(YaARC}y!daA!Pv=i63gY0VD zioJAo>Y`|V&t?fKc%guSq11Dge?oLg-mO?p$UZ~;e5xQ%_R`yWYg)62W#`_*&BI{U zQL3_UztXsV<3Ldzq?MrYfEY zR8uMhsi~GKWB>23WqF{)vJ&M&V3}%ZI8g2Hz5XYLe=Y9s^Z(Og{;&1_@8bS{+xaE# zZyeXZi~Ikd-jle$QpdmZ|FiD?o_2n1=YKT&H*hN_*O4$nEt=OCDTupBIJ^ zOuZ#4OJofK*Re>Katkvq1{Yp#Mlu2IG-mB!O^9a9__FMu%eq#fhNGfjy7{b#0PNJ{ zb^s_UWL(S9!Tsp$B&_S_vmS`(3G#3NW8<(yg_uEC?&ymPD&mG%ma}-kL{hVJFUzTz z1)LRNC9W{O3U?`@W|}BwiLXW>)l;iNsi}zUrq~NkSs{dq6xelmD}DT;gIZ*x$DUjWD|v#u{;L4uOIEbq72kCEWls)UY`{78-8_S|CQ?Z z3;^J%yZRecAJKpBwf?uvU)|O}m&~4=%Wru5i2m-q{*wBO?$5QNr?C7t%s;aE$N%K9 zy!Nk@f6m*UYDvH0FvJ&2#H<| literal 0 HcmV?d00001 diff --git a/tests/ods-files.js b/tests/ods-files.js index fe7aa79..1ffd101 100644 --- a/tests/ods-files.js +++ b/tests/ods-files.js @@ -75,4 +75,14 @@ test('.ods cells with mails should be recognized', async t => { const row3 = feuille1[2] t.deepEqual(row3[0].value, 'Fanny') t.deepEqual(row3[1].value, 'lemaildeFanny@example.com') -}); \ No newline at end of file +}); + +test('.ods cells with partially styled content should be recognized', async t => { + const odsFileWithStyle = (await readFile('./tests/fixtures/cellule avec style.ods')).buffer; + const table = await getODSTableRawContent(odsFileWithStyle); + + const feuille1 = table.get('Feuille1'); + + const row1 = feuille1[0]; + t.deepEqual(row1[0].value, 'Toto titi'); +});