From 4a6e74aada47c2d9c283d31a29ac535b016de878 Mon Sep 17 00:00:00 2001 From: Marcel Gansfusz Date: Wed, 23 Jul 2025 00:23:58 +0200 Subject: [PATCH] added optional censoring; made pdf file show up once the conversion is finished; fixed bug when pagescales are not set but no rectangles are on page --- app/__pycache__/main.cpython-313.pyc | Bin 26193 -> 28280 bytes app/main.py | 63 +++++++++++++++++++++------ index.html | 2 + static/app.js | 11 ++++- 4 files changed, 62 insertions(+), 14 deletions(-) diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index e678e6ca06d94b98ee0e3ea4c086f999f3986f96..bc800ae7a5a48d4dcca879a63448ccc5b94e3e8d 100644 GIT binary patch delta 8373 zcmb_AZBSg-b?@yr`^m!cB@jG5#Uef>kOV>!--1d4!DVGEkk+f^0V~ihx^IzW8fz7| zu2HPmIJcgl$kWJi$Hwlok%xBbNvEx4IjLLMEl5hm>?o-_Nz=-i#_LLTJa+2zocmZ7 zAvx0@d9df5ch0%zoO|xQ=bn3){!#q&wpe=CW-|-$Y;v3%`_*AV_z8Z*pN%WX-5lfd zrI)QlAjW-JG;1iUua=$cawQLC*NLhK%>iNq5YzM~z#3U^wd-iTMiPc{`gX_FCEhd- z=uL_5I7TI)x5U2^Z<;@3DiG>c2()0Rpl>fbX-~~C4>=12Vjm~Z(JRstcxS?U6TGwFT}n2P>?#d)L%(z2T}C$2 z%_NtWgM~q~o}Cx(K9M#gysM1!3-ed`1w&P}8dA7HIIdgmMRi=RwpDx}qF^biUB)tZPKbsiU2wQgC}TK0g}&7kW+t)TY- zt>x>qZjEa_QIXJ^E2w|u2=_Umy=2u+LqnUBE-*@m_up|$u^|3A8RwR9>(3lxpv*dxec7V z`%AcuoEy|-L>{JjH*z21&z>)#H$9v_w2hBkCTU(5haIE^W>72qc1Iv&_*WmFG(3E5 zK*1?hpO{Rf9YSg*9jJ;nkiF2jln1$!Q+ATBl{OCy-|7T?fCEDm`5oT85yqej;$1~5 z;kOfh_3&$i-*))zf?p}Dkrc9zOYh>+r9(QL^Qw!WYCq@g=2~#J-Oo3W1DwByYsZ;~ z^Kf(I0YNkA=7LS62h1Mirq>!WkY3JjCTIkdfayL?X(42&70kA(%LwTGoZdz}D;?00 zL!8o19^y1IRuu(vCHA896%gC~vrz90v7qbp|rXshy)nw~w7NJezIw zO-@!$ght((sA(iHAqRaY3mDnJ|9r>6eM97q-llJJ7y5s26@AP7F#00yx=e~2c&q$FVwb@69V7srUU0?@cj z5o*yrM2kKyu}(|IiXLLPUk!WDy-6?T3LcSt+~{K8-j&Mw4riAU8_wgP_F8_ zRp#ES8Fa4tY4T|7!q&CDNnPb75i_yWYh;mFgPLZ|h&Uqb(&h;x;&$zb=(e-JF}Ee9 zc0k;vfFiL^%tc8y{?#=sF_-1*CQDQ=QyQY|JC;L{YET>1`lo1AlquR*Fjx{iv=UjS zQ910zL8Kk<*#tl32LSE}=Y*I+Fga!`ZW!}!n{12r!kea|+m7s;ITef5%}Y7WHykaC zrj})+eRk6gV-|?agl?L0Rf#K>rJS8N9F2=?-1^j}>{hdQH8-^-UAS6P4EWpWu2!S= z?Xt{PjrN*G5BN2sw$;3lVLPdF?E>o*_v@%Jpm_Hm>T08V*o)4>-FuPIiJ%L)S}H41 z&Gc#7249K+0CpB>4nVu3y6#X=hDG@6f&s%eRJ@%XDXhpszoKI0onON< zZLGrCp;Wnb)X82hEEb*Y?+aOSBZ{wy8Jj4|Om0GAGlCWbt?Unr3M@wv^&)Vxt3_p5 z7`~`>IOLBSCqn*7UwDlBPurQUc#~Mk%8N7gGaz$gp{w{ct@t#%TiOzd8vJrl2~ocl zIPoVPkl%yf-M1mPsA*$Ost`SbT_75d2vCd5jp#gjfO?MspuwX9X!IBXnmkDWlRRdC zW{(9HrfXe-z=*(x|JzcjV>MMd$pwNZDQ+id9ppsz*KV7|Hi%=%AILX3hM*l%kDa}- z%jF^#kG58P60Jv<--DUk_?hruPGAp1$K5vE}JwPBaTEXV)Zz_ zG^0UTOdLF;B?leNjzmVw7X~xq7L0wnX*#J~Qo@f<%0M&&$`=vfM!@A0 zfLJ2sl6=NH>09zs*%uB3MF#b~BK6b#G3uy;71Ooo&| z7`o*}zuiv045PY0a3mDf9alm@*rh|hVP);YMXOB+ni1eZ4A$-UHs_V@MD#R(s9E*i zJ2^Z;9YAoW(DTUr8iJ>pSe|dC&m;Z~1dQdCXJlh)r0oFI5&_+h>V_d%DJiVC+!a~b ze(bpYB!+{&1TBh0Na$Yxz)k30xu7>Mn;o;-nW@>%x&FD1%cb*m^P8?@y;=9>rnfe{ z+wty>MRIs?@aW=U@1pnk;y7JAt}H5Fi3!44u~h@FC&V^moz&dZYV?MC20@?n+cyrjH~qhiJ5t$8<)l@oZ@lfunIrT1E6Q7* z8;-si-93|_cg&obbIqN8bK6^<@9lY`c=2HWO`Yebx}=$+-`{uJk};>da%RcWw5V(P z^xj7H=hoWn&+jD**7UoA2K2#>o?JFxGheu9ue@ohV!n#hh#nZfQg9o3vL&@W5BBNQ z)*Zsz?sUMf3HG*f;hMb+@g1%6knbI?WaOeL?t=WSq57Me z2xA|WsYyuwY8Sx;+%mAc+{DVh)EWxW;Xn}jsz{*`zFYZ%qqrY}VQ4cs;qybIP;i|W z5!DSTzMm1kub~Rn1=#c;Dhb^quy;MI9~ezo?hVpuh&OaW0=Zx){DAEG?#SRcE z0<40~tg0r>0tJXbVX_Lel=ajUYGJ!&Q#I}V1_(S@qh-LFc1W%5-K3|lt9!51*3;eI z?M|i?+b96ABKL>sEBO5?f)*s|DMVV1nz1bK`a@H}FdhHKmYO4BWSZJm20N<5VS{ZO z^CkTb0=})jjMyI|z-l*Y2u+12r@~PkYD-v(ih-!^^oY+N)r|q9SJ?S2x$IzhdNv;P z=!cFfhx70&g}Dc?SO@$6!5<$P!{ zW%n)Xo)4@kYNI*-W#27p>Brjzdtq#cVAETk-uKmgv!);FvS`J^fm*$|;VN`Yf$j)u zNkI={M-j}jiMlU#Jq4Isq?7p4(I3DostrI1#E&5cbphRfCg#UJ4lc4+x2bmleiQRG zx)xm9U)Ch^NrE$hS8KF~{at;PIKqBeKePD>wBlM*9r5GZvnYE3!E*?nMlj1>Z`f&Y zBKki2c|);-57M{r8^Nm!c{>|Lv0`DU@jZ=L%1ZY%K9)FbT>}Z+J@JUkkE>DrC=E?b z(pF>`aTx^V)6iXBrnMnaH*qR3%saiVv;PcSir`)L&Ym~H;uB3Lwc`8iqt;rngcY@o zL{{xaja24IV$BhmPjACSAUMANAjR&}4*|H1YH`R9&1hdeG@nBZyAO3J*4T;>a7=>m z@aqV=h!P0?6=$gzXX!#xyB8x7=pf=@jHg?_zJA4St9A{P2b+hz6RG!mu>PKa-b5J$ zpRx2#4_KM#{IS;XA4vTvJGg(30k=o`Gxoy%YJOn=gZ1A29pb+;-GO`vPsxGC$S+Xh zzW})HYy83?qW_8D5`sGbqFN|))zJMe()eKM17Z1u!l!3cAD)_oi=*o)43RtKgOU6>XH1+5>LHFa%6@hrzZ9pmVh6wjx(d|D;;`@-88r)4-G77; z{ihy_*tj~n3D4YGHEcY9IC_d!RjO)=p(X^SEUmXX3D2DLNdWAPx^&|`Kwc1X*%uXs z?AhKYNFfv;nnWQ6=kfIyUk+%&pG55?33+g4$IGu#t@=(J8QnV8(zmz%&!EY4(aJ$d z-QO|KtGfbh1-Oag{@);V!zBt&&wmL`Xpp_wSJ}kRHaK;mtLQpUto*oge^-ON34kgM zL}3IFxE5;?4NOlOo3wftFQaR(^j>LQwC}iSs>fR}-v!fcN9yM>Bk(^{(xA$z!agY~Fh*5Fj5R$QZ4bn>aD`hw!Tjp%28@yI=J=JkO?n#eIEbB}ru*d55g>TX{D7S-&SQT&_>p*a;k!fAV#aFpc;NU|9YkS_ zH62Z_UmX|7uxC*n8i#R#L$?|e9egJ&or8M zez(}+7Q5YDEZKWugwNeM)Nu^~#)mc`7(;Ld0A8exj>7bhCc$xfKj4rNOkOXhSg$v# z1#Z-G42n{JYbZDp7>(+>x;uKJ`eFIl)F}NXs@ci@%A2KT`X$upB%YSm=6)~y+V?`K z(@y9Oq^GnGc-w%sma&Wqeu#jNmlk}WRrtv#Q3qnz5#T*6zYpaXkx;&b6ewbjg+dc_ z23arReNI$A0#_7awOl!fLHTc(#xZvNvAXu32=)&Qwp)gb>xPUu(+xxZIqkAw zyd|Vw7gFaQoge+~#A_4Rg`LYL%X3NRlg{avjpnD%eD%yZ-74kZ3^r!<{Yko0+&0&G zSAf@Dt)ST$(*PDLU3lMjMvRmQ6g2D%4l$v;K>Nt|jHK*JiX z7EZOUYKW*RY@BRgr^3Oh$$~utx^%w(nL{&{kF^@RA?D4i zM)Is~wtuep`N5^+{3T04%!qUdhjZ37>z}KeJ-+13=Mh1=8R+*ch=oOi=v;VaxKqq1 q5nXd>F#%qe>*i};YP^SNtdD&tcW1T71e0i=C;WAV{11NdDfE9+PJz?_ delta 6572 zcmaJlTX0*~arfdr@FKt$_y9@sO^6^xiWCV+e3~Rk$$C)|4?$BVIuHbc4~Ycm15htJ zg>1($?YK4FQzvp9C3c(XIL(Yl36eIQCY?z7@i=KCR*ucNRa>V{(kdCZaHPznPU?2| z;6+kWFL?Lv-QBahXV30Ad->a2;@XeI#yb{^Nr0>OB|7)yxFCFpFX^i+h1@xyzTEhF zf!9zgxKbo)_u9u??CSo)5`#CtRv<=V>Jo{WY$vxbz@T2evlz`}mCUjC3v zOu9rW%d*}zr1^DK8Gg0b(WxUfWx^>rGmC9$xns3yxk`adk~*)0)WfZB(F+1;@HP^M zSLc~nm^OTs;>da_uyhp1Obls~J^xmVjX^YmTwARJutky~) z`6I`<&spvD89UoE*v5I>YcYqkY_Zr37LQ|;+gsBX$zx;(M8b?SCvD!APMNfC8P%E| z72BB^)!`K(c9v}ZB;!XqA3%3KfWGSi^xgNP$H2us_hSIP>jCuc2he*SK<|A3y$|TI z`^>nrnDk@3GP$yYJPdg@0Cz9M6@P8%%;GT~1PbO|+op^gLLIc7xM7@wufi~=>?9*8 zd05}r=9J2$@V!eflf8U+2k8KXop5i5y9@5!aCgJ)g1ZN92NY2O*~g_jxwOM8V^JjC zgLHU=^IV`M<{+4ln*~&|p9}2b>M=JlQ}2~Ckpocyw zti=(xI*aAq_3Dv?Yf=3`?_oN{B3j|>*wdDbZO87-|I4WIm^(Zg>zw+{C z`6)T2NEn0dEp;*9qxot`tQ0&V%RgW%Ac{xm6kRE!)kuEBZ(}Ep8oT(85R&fann&7f zsz#XNXAT+}%V9sQweQnS<)&>C)uXct9lG>Xs)eci3{OuC#Mq?^io_Js4eDmZ8KFP3+%uCJwa1SWOBnQ%wwNhMD<=V9381jNojm;iB7- zZ@5r1rVMq8R}Fr zZKz$kW-A@qE?wKMAl@zwb;jqckIS}h&_jCwX!`kx@6g2lVcN?+DyZupLB?JL`;aTq zU_{ehI7}X#TTW|j_To?2^a2X>zoHBK@yY&sC zo0T=NJX}(`AvWy$4W)T~NbE=OFoFU0^M+cp7tudJ;ARDmCOd{nlcvJ~O}`KhEcv5z z++jMzJdSPRcJ>)Zsq#6Hc_;oQ$JZtCSvK4>^hm%Al=Ox8GjQMePbk}DF=wk}Qk-m< zoSr=IF}aM%0g&&RqGl&{+SpRF#cbX}`*O2I%wac7+Y}2Fr{zzEZ8jo$q>Z&n^d3vP zI`_s^J$aMgbEUS0YJr%DxlLqq zW@pTr^`GcG)(2LrL}a4yYE?tlL2I^JVFxHq+tCaDQm|~@Q02xcdz$O9ruTuYrr1VR z$-=)-Fy{|V!@^fbmQOCuMP=}A8f6uw834B4xk3hqW^Tz7xAA7Vuom@3ID=!aD>t@rOU`S z1%jbSnEIBcXPo);IUKiw;8_IEBY1)RQ)^9&rl-L`G~!#*43U5z=HpvRr5ng3w=Zr$ zo`%X{v91!xz+*dXj?2IXjDo?Z2lL!6@oHh=P=#>SQH*$>)tx6?Gv@<-t-|KE zNY~m*-3IBpK>_@_MFRTuJd=B;biJ(5-6CCYQ4rsWbUMrDiZkSMRV$OB2&7UGxPpM8 z$FuWJvBcV@cZf3kw7t66g)F#2ibp0Q=6p!?s8B^tNzb1}>~cq;BV?G$6zJr+>~S__ z;iCa~?*D1m8dCa=cl1` z)VbjN>|B)AgNL*MfrAz8sgvfni>znQ$X=^^WRx5k+dsOmdicob$fz@qE}_)p0PsKw zMClv&{$~hqU)L1s56uQO(~^HS=nI6GLs2@<-rdt4qsTQ3hXc!t!BA9_v3x-trXT$Z zf|n3{6~R*gG&Q^&U0Mb~)ReWNiSwF#cE%sj**Ck6i0?76yO=q;|8gc#DijodvFhcj z*U4(tjeO^dx^63b-{xGiIoFHI){B}Gicr*)5Tqj0iZNjjjJfX{>}v-5m$$uN*7T2M zO;?<2Wxeki`aW(EtaXW2K`}pf@Tr5VhIeKAE!lX(RQ40OoHob3J&IWNJ@6$0ZV1T+ ziDH#=u)v<^{j?8{Gp9&H_>k#Ucxci*Y?b`5RN+)#UWjgPv3L{0!7}@+KB6|mn1F#T zxoz>f{y*2{@#Ss=VscwP%-$Q=Ax<-OaHaVPv|`(^Y4R@OMU;I3!5<-b7Qr90zZvXN z-H5);J{fGt#paHFAq}31w+{7+Vl&G-VT;Gyf2|W8Z1>*Y6WPAt??6;jlbsK5pfqKc zhL@L;o4g)3`B3mIOquWCgD5R8oSC2I6W=o7chD??ud$nZzbiJ!U)=Y&B)-i)Jf#8Owji;`XYXNZ{yH@vW@#g#4rSKX7Tn` z4>(V;Cq4og#Bi`rV;ZyVg-^l5hn$;2h%r;_9 zi7g)ASNL@pISRSOchYac3teCz9dGTyBm`eCBIPLrLjX4Wx`1Q^X7qIe5cp;q5EVA{ zSZ|w7vE4FQR{Sp-ZsivKE}_8tClNe;p00BUf6+SVl&;#$gZ0wYdIfOkxZi*5hojS| zu^xdRK@h+NxfMGnDOm{fNWK8TsiRoznmpsDGy?V^gvkglq6mTxmY5ndV&HIxI>$bk zD31HR-MU!D%I#0#yMbHLbiTIRCJ=QRH{@CNJBr)orMQgh(E}PqafQ1<4X`T7ZVmH< z>ftq_00!%t_6x=N7_py~`|d)D^zV;dN%68EC{sgcQICjnW(lE_lFSMpLii zxKkW;4zTkxFIMqP{!`3n0d^6?S%dhwt-bnDG<2lX~iZ2=uBW=Qg7_HPl zH+mPt)5{CNep(MfgFhki1f*qxJw4}AK2`<2mHmG18^yWeu1oGa0zB?Wg6^S&4zNUh z{QL9g#F(x@EL%4gtaiLAzoxGCtQo5k5|BPH6|TBpwY^rndT`BDolt-T|1QZXS|zX6 zzvft-S~J%q^hh)amg3dGtDUdyT0Om%Q=2d%(Imj|4I0fz%-J&Bg2b40%lKTR<_Xr4 zb%Xi&@y}1Jm_L?uRyC22^x`clN|0Cz#Err^YQm26N+G{&BRyZeQw4MkL2&}7oj58s z42u=(CJRov;sa|w&a(pNnYSvhj$dkcd1@`MX3boSQ?0)JKyNNi)i8~%MXp(-3h@UTHoy8k1HL&!8g8+{|B)o BXu1FZ diff --git a/app/main.py b/app/main.py index 8984ac9..d6c9858 100644 --- a/app/main.py +++ b/app/main.py @@ -18,6 +18,13 @@ import filetype import datetime +import logging + +log = logging.getLogger(__name__) +logging.basicConfig(filename="app.log", level=logging.INFO) +debug = log.debug +info = log.info +error = log.error app = FastAPI() @@ -58,8 +65,10 @@ async def get_index(): async def get_file(file_id: str): """returns the file that cooorosponds with the given ID""" if file_id == "unsupported": + error("File is unsupported") return FileResponse(FILES_IN_PROGRESS + "unsupported.pdf") if file_id == "empty": + error("File Id empty") return FileResponse(FILES_IN_PROGRESS + "empty.pdf") cur = db.cursor() try: @@ -299,9 +308,15 @@ async def get_submission( pagescales: Annotated[ str, Form() ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], + censor: Annotated[str, Form()] | bool = False, ): """handles submission""" - print(lva, prof, fname, stype, subcat, sem, ex_date, rects, pagescales) + print( + f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" + ) + info( + f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" + ) rects_p = json.loads(rects) scales_p = json.loads(pagescales) cur = db.cursor() @@ -316,9 +331,12 @@ async def get_submission( try: dest = make_savepath(lva, prof, stype, subcat, sem, ex_date, fname, ftype) except ValueError as e: + error(f"Error creating savepath: f{e}") raise HTTPException(status_code=400, detail=str(e)) - censor_pdf(filepath, dest, rects_p, scales_p) - return {"done": "ok"} + censor_pdf(filepath, dest, rects_p, scales_p, False if censor is False else True) + # return {"done": "ok"} + print(dest) + return FileResponse(dest, content_disposition_type="inline") def censor_pdf( @@ -326,8 +344,19 @@ def censor_pdf( destpath: str, rects: List[List[List[float]]], scales: List[Dict[str, float]], + secure: bool, ): - """Censors pdf and runs OCR""" + """Censors pdf and runs OCR + If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape + Args: + path: path to the pdf document + destpath: Path where the result is supposed to be saved to + rects: Coordinates of rectangles to be placed on the pdf document + scales: Scales of the rects coordinates for the pdf document + secure: weather or not the pdf document is supposed to be converted into an Image (and back) to make shure, the censoring is irreversible + Returns: + None + """ doc = pymupdf.open(path) output = pymupdf.open() page = doc[0] @@ -336,7 +365,8 @@ def censor_pdf( print(width, height) for i in range(doc.page_count): page = doc[i] - if i < len(rects): + if i < len(rects) and rects[i] != []: + print(i) wfac = page.rect.width / scales[i]["width"] hfac = page.rect.height / scales[i]["height"] for rect in rects[i]: @@ -351,14 +381,20 @@ def censor_pdf( color=(0, 0, 0), fill=(0, 0, 0), ) - bitmap = page.get_pixmap(dpi=300) - pdf_bytes = bitmap.pdfocr_tobytes( - language="deu", - tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files - ) - output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + if secure: + bitmap = page.get_pixmap(dpi=400) + pdf_bytes = bitmap.pdfocr_tobytes( + language="deu", + tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files + ) + output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + print(f" Page {i}/{doc.page_count} CENSORING DONE") + else: + output.insert_pdf(doc, i, i) output.save(destpath) - print("CENSORING DONE") + + +# def save_without_censoring(dest) async def is_LVID(term: str) -> bool: @@ -556,6 +592,7 @@ def convert_to_pdf(file: bytes) -> bytes | None: doc = pymupdf.Document(stream=file) return doc.convert_to_pdf() except (pymupdf.mupdf.FzErrorUnsupported, pymupdf.FileDataError) as e: + error(f"Error converting Image to pdf file: {e}") print(e) return None @@ -612,7 +649,7 @@ async def save_files_to_folder(files: List[UploadFile]) -> str: # reqJson = await request.form() # print(reqJson) # return {"done": "ok"} -def guess_filetype(content: str, filename: str) -> str: +def guess_filetype(content: bytes, filename: str) -> str: """Guesses the filetype of a file based on first the sontent, If that fails the extension in teh filename. If no conclusion can be reached it reutrns an empty string""" ftyp = filetype.guess(content) if ftyp is not None: diff --git a/index.html b/index.html index 07dc5a0..a401287 100644 --- a/index.html +++ b/index.html @@ -74,6 +74,8 @@
+

diff --git a/static/app.js b/static/app.js index 944de8f..9e75402 100644 --- a/static/app.js +++ b/static/app.js @@ -285,7 +285,16 @@ async function submitForm(formData) { //let responseJSON=await response.json(); if (response.ok) { console.log("Submit OK"); - console.log(response); + // console.log(response); + // window.open(response); + // console.log(URL.createObjectURL(response.body)); + // window.open(response); + // window.open(response, (target = "_blank")); + // var newWindow = window.open(); + // newWindow.document.write(response); + // var blob = response.blob(); + const blobURL = URL.createObjectURL(await response.blob()); + window.open(blobURL, "_blank"); } else { console.log("Submit failed"); }