From 56d34688894de040ed0431a095045420dbe2bb0f Mon Sep 17 00:00:00 2001 From: Marcel Gansfusz Date: Wed, 22 Oct 2025 23:26:33 +0200 Subject: [PATCH 1/3] changed the censoring mode to built in censoring with pymupdf --- app/__pycache__/main.cpython-313.pyc | Bin 33746 -> 33234 bytes app/main.py | 66 ++++++--------------------- 2 files changed, 13 insertions(+), 53 deletions(-) diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index 5371756fec9970efe85deb984b71113ef50dd3bb..dab1333e555aa09c5b5ab1ade37bfede83d66d3b 100644 GIT binary patch delta 2211 zcmaJ>du$s;8lTyRy~zAZV^FCJ@UVyug6~XyjXyb8%DuJj5DOo_XeIK zw*miT{Jn5Di~nq_lpNylg3(g(C5u;#s;Xlwb{b=T!GZ5LrnVmv@gIzjnj3h$OzyP2 zM;z)on4r&hL8uz?0xJWZVdgAmr%sYf%0KXC3ECtYyAP-{WwAb z>QVNv1317>HgzS_s+Lp|(m*B|OAn7GrSzyYF{~!yQYIBw(yC60^pGm)X-!o|$g`~j zp08j@{S3^#f~L_m;@S5WE+rrC+aZQ+GNw5t?PSOPbpat_ry0-|7VhO)ggSA^1sX@+ z@TxZKAgA}2*S?NWUw|@=Xzw^e$}0#(oK`T6cifAYP}ojx>@Okj?=LQDp?aC2SP$By zX!!ygS9x}gcdpJ{ghGcw*xp-0K93d^xn;X7v@jCt?nTIntjrJ86q8FmPQOFu>OmbTvw{G^)=IKt zE47748F4?bZtE|-F6xa`{5+<&l4QFpc1m!t-350Svh{AfC;G8(S}PVv#n$VmN$$b2 zTlT+j)AzKjp8=Rgw>BeZ;ibBZbu;Pzh#RKcb576s(-(G62Xn51 zOFb8QraPA1h2(h0m8m6fP1akJD=yEKSLb=;+j0}JzQP*?sNA0~MQ-1c%b#`m7hRij zjZO1A{+d}7*3Wikh0SwEvqD|2Y1i8LSXQXdH8!t}sef76Gza3kC7~%RG%X1`Z`2}B zMZOO4!V*`U<%(w>>|Nx_a-99C(37F_eY2Zr!Xs%Sm&kS#H}h zw{Z>z+m^VxELZnmu0ew@0cqDsQ)hGQbue|Epgf%%iYmJl9-G=9@VZ-EMPg3p9YjVPo;p7dS@Ck@2JZ@NM$i(GHP=ZEF>jVrg;uL#BUQ-toE zner4i8&xAw7K`MGR4eWvZ>DPTUgLV|Gp6!4qR!JO3f;yRdI1Ab-((K)VVJ(pI5_qy z#xl7*zOzuID<7fI5842u9VPqDRN-L~Ig{~r0UzMCQ{d_8>Cw8#wKMP9tP>Q;-zK(W zH@P~YmD@mN`sDCK?FWuT;n&f5sJE*_*6c)?EVH@j+zv7|`ARRXX_Gq)rK!w2vdwvP1zm0YA?9mUhsvbGl{(`T#246FeENG~TZpRMO_+7Nab%V$UU zDF-f^({CZ@@?{OUF(8E)ptY|Z6J*6YtYtMhis3-E}{6wkzF zcg>v2`rt9C&O0bC0-twM%Y{6?TW)IgV~p3&$aw_aZ1@)K@`4?g%yvF+eDoyC`~Y~t BF_ZuR delta 2639 zcma)7eM}qY8Gqk9+t}x`jX(HI0tX2VAH_6*&_Eg*3g)9}OoDMr1&ec-15C`B+@1MI z+nFkzl4g@a_Npq=CY4H?mTG@A*=b#?DXTPT(yB~q$e8e)bj`F*9o-a>g{?`XMtj~P z(6ausv-R`y{+{3S@jma%as4!N{S4#0>u_)g9;f%-NZ-qsoj-QrUR`&c!&oIv?)UM_ z#52v4u4PosLpT38S=2< zLyXAp8uw#NK2)lzkr}G&!L$}sLm{S?ebO|6UN;M<&**^iVM7&S$+3OaE_CQ6gsdni zKFJQ@3KRMLz9Hu_Bv7Y7xr}B|njCoQ3bV?7zk3HC-pDOmNK?N+s>*Dstgt17X&sea!nXD9yr>;_j6)RHJNhV^>$7d>m~~yF2O;qyLcNt#Rf8HQVLgvf zpM6c64IARt#R3Xj$(8*T-bN}8F%+voDBFR*zrUh*jb}G_el4vRbsPbegETjlq#SG8 z3EQhd7IuV8R2}9Yt&FV#Yc_IF&t{z?WZCc-F?$}3M0SR)-8m5OKTp^h%7Txzffdl9 z5JD+Cf^C?}AIZcyD8YqH)v)f%U5^bL^}r2ssC@4u;6}now zxy~UWBt?^oB*avqM@dS8HX;c_vA97*6Y_+lXwsmdC4|GVDJiZR;h>QPMT#nt2+n;1 z@N0olqYmiRUMZ@@5>Q2i+@ncV6QT*o6_baJ3Ys*f2?J6*F=-@>iZZMONmOI;*fgXb z1M#3R8Pi4t$v^BDI^`2;R2-MMkY00O8K;m+3>Y%8JHE%N+Yh09vlQiDCPD7kU)nIpHr_6`J zBPejI(W&f4p(PLub#-^zgY80VS1`~MtniYf&on(}8&t%}2#iEiYG`#oK!#N%EyE2< zT4X#nH71T{oG|}HREcPbfoZ6voTn;lYa*H)lVmMpQpE|W!cA^Ivz^OW5=m`5sVPeY zw-lCrN|okuS|DQ?h-oyN4YEayM5-Iyfk}UT~&; z1?j@=>4MUgEaWb}i%f3M9TzGn`A%i7vljUBi@6K@6Y08!jdk7v zzbjq4cVkWcH~F0xLHxuLU$?;5E%NnHI=AqR$c4y7?`-5+PW9QgbbirNe)U2szdBt| zlrE@QF{1+i3StYKXFF-5-X(t90>AA?!%Ic}>qY)cv4x@oi+nRQ?(ob8=1S(oAC%4I z&Exs9`Mhi8d#`yKmiWe7yk{+JE;=tR@#S|qF>)5JjALZBpE>x-!CC8~so>99j&x1! zqIJt$`+}9c+Uj%t(YhPTTb;^bn6Gl1{Ij(V-zFu2qJn<|aTo@y(l;gii?J83X_m-e z27M$R_|`7vPc-HRg_{&U2B=_^zf*ge+N`of?SE0Y3qTs$wwS*Fqep+C?I13^2c`-K zi3Ft`P&u&LNf7sK)+Du?(uyE=zGB5bZvEDav8F|=S&t*{^h2Q8+FLW@#p6T;*9AzndtbfpV zo3)hEburN+txO3f_ak-KMEu8_ut3I+zqhr5ddmSST*?F5?siJNPpuuKS={Tdq&}yz zi_*IRGL{pQqB5*z%<%~^HmE#iaPbLQ9Fr6u`Kfq-E2CNppX+zTUorM7x~AY}^T0BLJ3a0!x5rZLSw6PvJRz z`-upPd2(9bhug>paupuX{~>?E-~hRLatluCpPh6u;QvDHG9QIRGx|a8GR8gR%j9kk zPiurJ3_vb8>E}uFL=nC~j!z`Bp9Q|ctc-#u7z`@UlD|#-)?!SECzmEGv5VZ8RIrr< zr}Bzzpv`y!y^+@Dogfso>)$?fFTsdwt=^~;#(t4X0ZX6of;$gcl&ddysE#ol>!6~WI+ZZF2QY0k02 zQsb7>y<(;YNu+Y>sX_k~-Lt;g=-i&!(FJ$O0$;jfGaMW^R_un!Bd2@SVVF~l{%&e3 F^Iu8YiF*J5 diff --git a/app/main.py b/app/main.py index 41d7fc7..215e881 100644 --- a/app/main.py +++ b/app/main.py @@ -14,6 +14,8 @@ import asyncio # import fastapi from fastapi.staticfiles import StaticFiles import pymupdf + +# import fitz as pymupdf import json import re @@ -487,6 +489,7 @@ async def get_censor_status(file_id: str): async def yield_censor_status(file_id: str): + """Internal function to yield updates to the stream""" while True: await censor_status_update_events[file_id].wait() censor_status_update_events[file_id].clear() @@ -505,9 +508,7 @@ def censor_pdf( secure: bool, file_id: str, ): - """Censors pdf and runs OCR - If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape - Saves the file to the given Destpath. + """Censors pdf and saves the file to the given Destpath. Args: path: path to the pdf document destpath: Path where the result is supposed to be saved to @@ -518,14 +519,8 @@ def censor_pdf( None """ doc = pymupdf.open(path) - output = pymupdf.open() page = doc[0] - # width = page.rect.width - # height = page.rect.height - # print(width, height) npage = doc.page_count - # pages = [] - # tasks = [] for i in range(npage): page = doc[i] if i < len(rects) and rects[i] != []: @@ -539,54 +534,19 @@ def censor_pdf( (rect[0] + rect[2]) * wfac, (rect[1] + rect[3]) * hfac, ) - page.draw_rect( + page.add_redact_annot( prect, - color=(0, 0, 0), fill=(0, 0, 0), ) - if secure: - censor_status_datas[file_id]["page"] = i + 1 - censor_status_datas[file_id]["pages"] = npage - censor_status_datas[file_id]["done"] = False - censor_status_update_events[file_id].set() - - # pages.append(page) - # THis Costs us dearly - bitmap = page.get_pixmap(dpi=400) - pdf_bytes = bitmap.pdfocr_tobytes( - language="deu", - tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files - ) - output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # End of the costly part - # tasks.append(asyncio.create_task(censor_page(page))) - print(f"Page {i + 1}/{npage}: CENSORING DONE") - else: - output.insert_pdf(doc, i, i) - - # if secure: - # pages_bytes: List[bytes] = [] - # censor_page(pages[0]) - # with multiprocessing.Pool(npage) as p: - # pages_bytes = p.map(censor_page, pages) - # pages_bytes = p.map(test_function, [1, 2, 3, 4]) - # for pdf_bytes in pages_bytes: - # output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # with concurrent.futures.ThreadPoolExecutor() as executor: - # futures = [] - # for page in pages: - # futures.append(executor.submit(censor_page, page)) - # for future in futures: - # output.insert_pdf(pymupdf.Document(stream=future.result())) - # - # for task in tasks: - # output.insert_pdf(pymupdf.Document(stream=await task)) - # print("CENSORING DONE") - output.save(destpath) - if secure: - censor_status_datas[file_id]["done"] = True + page.apply_redactions() + censor_status_datas[file_id]["page"] = i + 1 + censor_status_datas[file_id]["pages"] = npage + censor_status_datas[file_id]["done"] = False censor_status_update_events[file_id].set() - # censor_finished_flags[file_id].set() + doc.set_metadata({}) + doc.save(destpath, garbage=4, deflate=True, clean=True) + censor_status_datas[file_id]["done"] = True + censor_status_update_events[file_id].set() def test_function(i: int) -> bytes: From c30d69d205c88088253d6917e1fa96113c370bf4 Mon Sep 17 00:00:00 2001 From: Marcel Gansfusz Date: Thu, 23 Oct 2025 00:06:25 +0200 Subject: [PATCH 2/3] added back option to run OCR --- app/__pycache__/main.cpython-313.pyc | Bin 33234 -> 35835 bytes app/main.py | 93 +++++++++++++++++++++++---- index.html | 5 +- static/app.js | 4 +- 4 files changed, 84 insertions(+), 18 deletions(-) diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index dab1333e555aa09c5b5ab1ade37bfede83d66d3b..098bd0564940a80d154e7ba45b66adaf6ec2b140 100644 GIT binary patch delta 3746 zcmaJ@32;-#8Qy);(=FMuB_AS=o_t{BOU6e)fDFbqBBj{iXMr6HWFgDO*p|$bY#;}A zN;9F!BoWG%eHrf3gT6 z$ByQE|9=0myZ`>Xt2YmEpFGByE*p(H0?i^`-Q7HQ#8g6`e7bs@+SuxMx9G$>Y-Ep8 z+QPo3Zm!BQgEF&K^!q(xgTJ>$DejONQQYY->V1lB-BxJN@&TFmi9>!jShj&>P-eEV zl-fhLvm#A>S-kSN(GdG(eKALr$#ne> zIh{{03ak1F2`*y?8(Yg1tt!kpA-5CJzFR@a41HA5%^Qe7daBu6qr)o5Y5-rp&+Ie! zj6U#TuQXPaI(!Nf5zBpM6KPhq(yX081}m7O(vm+|b=}b{R*02s+47)%Rztc6RwzcO zvx+5_@8{KZLNt`jZ~8jrm3`85m<>0tq=oEz&CxtlC>)7JrNGqiNFW-Ln7-wV?kt3S z7zT=c$j-OywuZS3AtT%ve!|=y{AkD7*4EK4)+FC(T}lHfweIl9p?lsjSr<(9H%<0; z47QZPk}~9{427~#aM`HN(|ttLTHPgtD+MI4C}&&rrM4N_*2ZdA)N$>U^{-gDBOgN) z09=jrz|6nVajUnD{;IMO@I|Z1Ya}mLv=>9?>?-PAr98X38swMCsdtU?rHUGmUpDHz zD-wxWEt>cK9e1^%kXY0vTs)XKcBUyfQxHf{kc zJ!@}s%<8kV_G#dhi)6$vOJu8Hw6zAbjc@dFEPMT%6S!Pzx8= zV)H54jnrTalh$>YC_Ty4>#OPO%(ecM!zkSck)(47ClTfm;B*t| zy9f)6cG%qSftco}q;M!6GlDGK*s`_q0$L*w4vOeIP(^d(N_o=P@hB|`VdioTZ)mtB z$mLUtZ}(Ojd?dHY5uUaA1y)<7H9}!6mYCYdv(+A#x(_$Ie}L`vILuEG((lBOi$!yS z{lIgKZe^afW3!lY1*BZWltYrWi%BlZ4gZ-WYX)!Tb(mgJ|2aOJ_M5hDdaH8N`1I?$A*xvPCCf5@Ze8rO-!>qS(PC#w zkb@2FkZ;nM09UCHj0_9XbR;J9boA=_c87(Lv5BxS7863z$V^y@hlhoDRM;}MKRgkW zjpMt+5dmsP3I{>ipc9j>bt0^6u<`Q zbrkOcI7!or;i)lz@brHyuIZTMitP?cVOKmHiwy_kLDxwxt(gc$MyG?L;j|`4gP$rE zpM~j6-pOwPl^{X6kUR5N2_0SCVo&ew-Cm)yr@O1K+gT_*gu^|I@Gin15P}FlMZlAB z!$K}VH?19(g8Kq^SR@y!8xa148r^6(9+(>2KN*}#&zc~a@GJ%5(Om~%YSJTUq3Mi< zrYFOZxb!}Bo%vsVJkzS^bbM+$o=wkRaGZ+>e?|BJL7v~=KuW82jm2?XEk1`Zq6U(b z4+?>8t+BAS4&Bm5Jh>OLCwD2)7d%q;KwTpKmhP6hj(3cfNBj@?AKW>&frUC=bv$M| zVoF&`Q?`m!X?aFX%=XJfX)d^wM@k);64JPw4Ro%ZJ+EoHV75MHKVm;2ChX_T&cmvd z#kOE^&Rd)*t1V@%&G5uplOYPL?yxqaA=UWF48kP*#&Ss~b~XflWhYVIbiP}Y|s@Vi|}#s%-}y2sv7JsW63}#jPh}yl^R*YP#bl! z$)RU%S&qs^0B4@0K)Dv-S)6?xYY(n$XhNMyawEM8AgvnT7nDY0X?|iRI5sRb$lSzC zBsdwC%Gi&B>-6<#i@+6?*2s8b|=K75Gs(iKc$=STxDT}A2cks#NsNWM4n zeMQB`1oWWog>I2j3z6_0}5(oha#(KH#-xlrS~L1jJ(Iu z&Fr?6`j28g{7wjTJh>_UGz`N2H{DcV#W8ju?1EUZ-vQP>W7VC(=m@9S z(9HCbUNCU-(jFkXySpX1OZ%L(+Zkq=ndek;+N|vKz8YG>e!owmX4bvG*j@l`>4L8Q zKxcbjSD*vH3pY%cC>7&?7WU2k4-KBhkzPf>zg%g4WFi`j%de^Qyi8q-+b79~Q*T8`hA7@s$kXYxe$_`%Xe&!U^c7K`kSgQcC>u(uMk zEv3WhxL~foLQBzXEl|d9q@9MOkY+It|-H0O(s~)Y?Ha3x>7o0MgiXoCUb_b zXOpwVZs@_SV>Bl$iO|v3#GZMx181g1UPnykD^SC(%n)6<=nY@ldnI~TL+=Z{kM~73nkrSUU`wn1vvaPK zEZufwfA8;p&hLEB@+pJ#$s?h>zN_%k*pAXHA2oQtJQivqnaw0qF-TAj!#*(| zd}28sfIe|Aegc+7Hy(r^iY53Y{7S4kH6%}xQ88mA7xI&t8J;YwUXZ5@k73zQ=5Gr5 ziR-jMLh?+gm##=Q?KP21PM$URDLHPqG!ZgqkP&Dxm*7ztGrL@4vPMSS&k*E!BSV1< z=Yk)b^K!!SIXbtzXz+;sJM-QOuMmmGwK+9nr6+Z-<&owm0j>dOEj?(s^qx5) zng5~X52!=lEmy)T`xVm@hu8`1tpZowqA1mPj0NhK>mJ7<-=WNC(T;xj`+-L6gyQ-+ zc)tF+bc2{&B%a!Jh&D`mCYcOEF=mclCYc1in9nfVEW-=i0^-eZrXiI3f0Oi)gO#}P zw&eJ;?zJ5+;f z!O_^*VJF}V%}>_#RBLg-g@04AkLZnDdq*|8QGTS5+}!lzKrVN)c@K%-sg?px_MJL+ zz{cLP@g%+cL8uvg=V9g;mF&@;_ zhmZw%P|(!pg3oM4t!A0qOI(KYi5M-&Q&-tsMKmb>50@yHEe~_ZY01Sr*kk$M>lN)D z6u@0M_hu0wacn1?Zn43oW4W-};wE_yXX_ueY+>w$_m5WL-^1ObuP)iu?~n=9_o=*0 zT^^+B$DRi3FUM&jbmx<+=LQIbCX2XkBzPbIj4EePGZkjTJ~h$rXE!8 z(z?1#WEpK&AdB--jlQcbi5Ybhc68U{O*qzFT<{4=tR}C5M$cs#r7VSN4>kA@mb%Xz z_Tn@-p%8I@@(#m26 z_Jn_l%V0BnP~vGHmFM)z$T5b{xu{PPDW&oY{U^~+S)qj97r}b0jX2+lH4uxusb*XZ z!KpWPRnpGgM7%lb=k%C76ue2JQt(bU`l@K3UG-61OC&8!pHtM+TAH6ZtHj3Da)ZsB zjVrTJ)d`oTTP$TXm&&ghJw5#vW2vTq%0C$Wh4>1V$Wsusc{QrcLRG>irNy?n$ii$i zJ`Y0)g~>4b+liMjF3?BS^GsDc9UY-7Y9o=xk%$&awl bytes: return b"\x00\x66\x99" diff --git a/index.html b/index.html index 2051433..e2724f3 100644 --- a/index.html +++ b/index.html @@ -134,11 +134,10 @@

+ />

diff --git a/static/app.js b/static/app.js index ceea83f..f654998 100644 --- a/static/app.js +++ b/static/app.js @@ -276,8 +276,8 @@ function submitPdf(eve) { formdata.append("fileId", doc.fID); //formdata.append("filename", doc.filename); formdata.append("ftype", doc.filetype); - if (!formdata.has("censor")) { - formdata.append("censor", "False"); + if (!formdata.has("ocr")) { + formdata.append("ocr", "False"); } console.log(formdata); submitForm(formdata); From 5c6a8dfba29a57cbf149794e05211a992c414750 Mon Sep 17 00:00:00 2001 From: Marcel Gansfusz Date: Thu, 23 Oct 2025 15:40:45 +0200 Subject: [PATCH 3/3] fixed bug in js, that blocked showing prof suggestions when nothing is entered in the field --- static/autocomplete.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/static/autocomplete.js b/static/autocomplete.js index 6824849..959a694 100644 --- a/static/autocomplete.js +++ b/static/autocomplete.js @@ -21,7 +21,7 @@ function autocomplete(inp, type) { i, apirq, iname, - val = this.value; + val = inp.value; /*close any already open lists of autocompleted values*/ closeAllLists(); if (!val && type === "lva" && pid === null) { @@ -56,7 +56,7 @@ function autocomplete(inp, type) { a.setAttribute("id", this.id + "autocomplete-list"); a.setAttribute("class", "autocomplete-items"); /*append the DIV element as a child of the autocomplete container:*/ - this.parentNode.appendChild(a); + inp.parentNode.appendChild(a); /*for each item in the array...*/ //await response; if (response.ok) {