From 56d34688894de040ed0431a095045420dbe2bb0f Mon Sep 17 00:00:00 2001 From: Marcel Gansfusz Date: Wed, 22 Oct 2025 23:26:33 +0200 Subject: [PATCH] changed the censoring mode to built in censoring with pymupdf --- app/__pycache__/main.cpython-313.pyc | Bin 33746 -> 33234 bytes app/main.py | 66 ++++++--------------------- 2 files changed, 13 insertions(+), 53 deletions(-) diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index 5371756fec9970efe85deb984b71113ef50dd3bb..dab1333e555aa09c5b5ab1ade37bfede83d66d3b 100644 GIT binary patch delta 2211 zcmaJ>du$s;8lTyRy~zAZV^FCJ@UVyug6~XyjXyb8%DuJj5DOo_XeIK zw*miT{Jn5Di~nq_lpNylg3(g(C5u;#s;Xlwb{b=T!GZ5LrnVmv@gIzjnj3h$OzyP2 zM;z)on4r&hL8uz?0xJWZVdgAmr%sYf%0KXC3ECtYyAP-{WwAb z>QVNv1317>HgzS_s+Lp|(m*B|OAn7GrSzyYF{~!yQYIBw(yC60^pGm)X-!o|$g`~j zp08j@{S3^#f~L_m;@S5WE+rrC+aZQ+GNw5t?PSOPbpat_ry0-|7VhO)ggSA^1sX@+ z@TxZKAgA}2*S?NWUw|@=Xzw^e$}0#(oK`T6cifAYP}ojx>@Okj?=LQDp?aC2SP$By zX!!ygS9x}gcdpJ{ghGcw*xp-0K93d^xn;X7v@jCt?nTIntjrJ86q8FmPQOFu>OmbTvw{G^)=IKt zE47748F4?bZtE|-F6xa`{5+<&l4QFpc1m!t-350Svh{AfC;G8(S}PVv#n$VmN$$b2 zTlT+j)AzKjp8=Rgw>BeZ;ibBZbu;Pzh#RKcb576s(-(G62Xn51 zOFb8QraPA1h2(h0m8m6fP1akJD=yEKSLb=;+j0}JzQP*?sNA0~MQ-1c%b#`m7hRij zjZO1A{+d}7*3Wikh0SwEvqD|2Y1i8LSXQXdH8!t}sef76Gza3kC7~%RG%X1`Z`2}B zMZOO4!V*`U<%(w>>|Nx_a-99C(37F_eY2Zr!Xs%Sm&kS#H}h zw{Z>z+m^VxELZnmu0ew@0cqDsQ)hGQbue|Epgf%%iYmJl9-G=9@VZ-EMPg3p9YjVPo;p7dS@Ck@2JZ@NM$i(GHP=ZEF>jVrg;uL#BUQ-toE zner4i8&xAw7K`MGR4eWvZ>DPTUgLV|Gp6!4qR!JO3f;yRdI1Ab-((K)VVJ(pI5_qy z#xl7*zOzuID<7fI5842u9VPqDRN-L~Ig{~r0UzMCQ{d_8>Cw8#wKMP9tP>Q;-zK(W zH@P~YmD@mN`sDCK?FWuT;n&f5sJE*_*6c)?EVH@j+zv7|`ARRXX_Gq)rK!w2vdwvP1zm0YA?9mUhsvbGl{(`T#246FeENG~TZpRMO_+7Nab%V$UU zDF-f^({CZ@@?{OUF(8E)ptY|Z6J*6YtYtMhis3-E}{6wkzF zcg>v2`rt9C&O0bC0-twM%Y{6?TW)IgV~p3&$aw_aZ1@)K@`4?g%yvF+eDoyC`~Y~t BF_ZuR delta 2639 zcma)7eM}qY8Gqk9+t}x`jX(HI0tX2VAH_6*&_Eg*3g)9}OoDMr1&ec-15C`B+@1MI z+nFkzl4g@a_Npq=CY4H?mTG@A*=b#?DXTPT(yB~q$e8e)bj`F*9o-a>g{?`XMtj~P z(6ausv-R`y{+{3S@jma%as4!N{S4#0>u_)g9;f%-NZ-qsoj-QrUR`&c!&oIv?)UM_ z#52v4u4PosLpT38S=2< zLyXAp8uw#NK2)lzkr}G&!L$}sLm{S?ebO|6UN;M<&**^iVM7&S$+3OaE_CQ6gsdni zKFJQ@3KRMLz9Hu_Bv7Y7xr}B|njCoQ3bV?7zk3HC-pDOmNK?N+s>*Dstgt17X&sea!nXD9yr>;_j6)RHJNhV^>$7d>m~~yF2O;qyLcNt#Rf8HQVLgvf zpM6c64IARt#R3Xj$(8*T-bN}8F%+voDBFR*zrUh*jb}G_el4vRbsPbegETjlq#SG8 z3EQhd7IuV8R2}9Yt&FV#Yc_IF&t{z?WZCc-F?$}3M0SR)-8m5OKTp^h%7Txzffdl9 z5JD+Cf^C?}AIZcyD8YqH)v)f%U5^bL^}r2ssC@4u;6}now zxy~UWBt?^oB*avqM@dS8HX;c_vA97*6Y_+lXwsmdC4|GVDJiZR;h>QPMT#nt2+n;1 z@N0olqYmiRUMZ@@5>Q2i+@ncV6QT*o6_baJ3Ys*f2?J6*F=-@>iZZMONmOI;*fgXb z1M#3R8Pi4t$v^BDI^`2;R2-MMkY00O8K;m+3>Y%8JHE%N+Yh09vlQiDCPD7kU)nIpHr_6`J zBPejI(W&f4p(PLub#-^zgY80VS1`~MtniYf&on(}8&t%}2#iEiYG`#oK!#N%EyE2< zT4X#nH71T{oG|}HREcPbfoZ6voTn;lYa*H)lVmMpQpE|W!cA^Ivz^OW5=m`5sVPeY zw-lCrN|okuS|DQ?h-oyN4YEayM5-Iyfk}UT~&; z1?j@=>4MUgEaWb}i%f3M9TzGn`A%i7vljUBi@6K@6Y08!jdk7v zzbjq4cVkWcH~F0xLHxuLU$?;5E%NnHI=AqR$c4y7?`-5+PW9QgbbirNe)U2szdBt| zlrE@QF{1+i3StYKXFF-5-X(t90>AA?!%Ic}>qY)cv4x@oi+nRQ?(ob8=1S(oAC%4I z&Exs9`Mhi8d#`yKmiWe7yk{+JE;=tR@#S|qF>)5JjALZBpE>x-!CC8~so>99j&x1! zqIJt$`+}9c+Uj%t(YhPTTb;^bn6Gl1{Ij(V-zFu2qJn<|aTo@y(l;gii?J83X_m-e z27M$R_|`7vPc-HRg_{&U2B=_^zf*ge+N`of?SE0Y3qTs$wwS*Fqep+C?I13^2c`-K zi3Ft`P&u&LNf7sK)+Du?(uyE=zGB5bZvEDav8F|=S&t*{^h2Q8+FLW@#p6T;*9AzndtbfpV zo3)hEburN+txO3f_ak-KMEu8_ut3I+zqhr5ddmSST*?F5?siJNPpuuKS={Tdq&}yz zi_*IRGL{pQqB5*z%<%~^HmE#iaPbLQ9Fr6u`Kfq-E2CNppX+zTUorM7x~AY}^T0BLJ3a0!x5rZLSw6PvJRz z`-upPd2(9bhug>paupuX{~>?E-~hRLatluCpPh6u;QvDHG9QIRGx|a8GR8gR%j9kk zPiurJ3_vb8>E}uFL=nC~j!z`Bp9Q|ctc-#u7z`@UlD|#-)?!SECzmEGv5VZ8RIrr< zr}Bzzpv`y!y^+@Dogfso>)$?fFTsdwt=^~;#(t4X0ZX6of;$gcl&ddysE#ol>!6~WI+ZZF2QY0k02 zQsb7>y<(;YNu+Y>sX_k~-Lt;g=-i&!(FJ$O0$;jfGaMW^R_un!Bd2@SVVF~l{%&e3 F^Iu8YiF*J5 diff --git a/app/main.py b/app/main.py index 41d7fc7..215e881 100644 --- a/app/main.py +++ b/app/main.py @@ -14,6 +14,8 @@ import asyncio # import fastapi from fastapi.staticfiles import StaticFiles import pymupdf + +# import fitz as pymupdf import json import re @@ -487,6 +489,7 @@ async def get_censor_status(file_id: str): async def yield_censor_status(file_id: str): + """Internal function to yield updates to the stream""" while True: await censor_status_update_events[file_id].wait() censor_status_update_events[file_id].clear() @@ -505,9 +508,7 @@ def censor_pdf( secure: bool, file_id: str, ): - """Censors pdf and runs OCR - If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape - Saves the file to the given Destpath. + """Censors pdf and saves the file to the given Destpath. Args: path: path to the pdf document destpath: Path where the result is supposed to be saved to @@ -518,14 +519,8 @@ def censor_pdf( None """ doc = pymupdf.open(path) - output = pymupdf.open() page = doc[0] - # width = page.rect.width - # height = page.rect.height - # print(width, height) npage = doc.page_count - # pages = [] - # tasks = [] for i in range(npage): page = doc[i] if i < len(rects) and rects[i] != []: @@ -539,54 +534,19 @@ def censor_pdf( (rect[0] + rect[2]) * wfac, (rect[1] + rect[3]) * hfac, ) - page.draw_rect( + page.add_redact_annot( prect, - color=(0, 0, 0), fill=(0, 0, 0), ) - if secure: - censor_status_datas[file_id]["page"] = i + 1 - censor_status_datas[file_id]["pages"] = npage - censor_status_datas[file_id]["done"] = False - censor_status_update_events[file_id].set() - - # pages.append(page) - # THis Costs us dearly - bitmap = page.get_pixmap(dpi=400) - pdf_bytes = bitmap.pdfocr_tobytes( - language="deu", - tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files - ) - output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # End of the costly part - # tasks.append(asyncio.create_task(censor_page(page))) - print(f"Page {i + 1}/{npage}: CENSORING DONE") - else: - output.insert_pdf(doc, i, i) - - # if secure: - # pages_bytes: List[bytes] = [] - # censor_page(pages[0]) - # with multiprocessing.Pool(npage) as p: - # pages_bytes = p.map(censor_page, pages) - # pages_bytes = p.map(test_function, [1, 2, 3, 4]) - # for pdf_bytes in pages_bytes: - # output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # with concurrent.futures.ThreadPoolExecutor() as executor: - # futures = [] - # for page in pages: - # futures.append(executor.submit(censor_page, page)) - # for future in futures: - # output.insert_pdf(pymupdf.Document(stream=future.result())) - # - # for task in tasks: - # output.insert_pdf(pymupdf.Document(stream=await task)) - # print("CENSORING DONE") - output.save(destpath) - if secure: - censor_status_datas[file_id]["done"] = True + page.apply_redactions() + censor_status_datas[file_id]["page"] = i + 1 + censor_status_datas[file_id]["pages"] = npage + censor_status_datas[file_id]["done"] = False censor_status_update_events[file_id].set() - # censor_finished_flags[file_id].set() + doc.set_metadata({}) + doc.save(destpath, garbage=4, deflate=True, clean=True) + censor_status_datas[file_id]["done"] = True + censor_status_update_events[file_id].set() def test_function(i: int) -> bytes: