From f9643612fe3ecee591e3f46f15e9e3f73abb173f Mon Sep 17 00:00:00 2001 From: Marcel Gansfusz Date: Thu, 30 Jan 2025 22:18:49 +0100 Subject: [PATCH] Added PDF censoring in python --- app.js | 8 ++-- app/__pycache__/main.cpython-313.pyc | Bin 2430 -> 4503 bytes app/main.py | 67 +++++++++++++++++++++++---- index.html | 2 +- 4 files changed, 63 insertions(+), 14 deletions(-) diff --git a/app.js b/app.js index a2f332c..805e6fd 100644 --- a/app.js +++ b/app.js @@ -163,8 +163,8 @@ class PDFDocument { } get paramRects() { let prects = []; - for (var k = 0; k < this.rects.length; k++) { - prects[k] = []; + for (var k = 1; k < this.rects.length; k++) { + prects[k - 1] = []; //console.log(this.rects[k]); if (this.rects[k] === undefined) { continue; @@ -174,7 +174,7 @@ class PDFDocument { let len = this.rects[k].length; for (var i = 0; i < len; i++) { //console.log(this.rects[k][i]); - prects[k].push([this.rects[k][i].makeTuple()]); + prects[k - 1].push(this.rects[k][i].makeTuple()); //console.log(prects[k][i]); } } @@ -263,7 +263,7 @@ function submitPdf(eve) { var formdata = new FormData(eve.target); console.log(doc.paramRects); formdata.append("rects", JSON.stringify(doc.paramRects)); - formdata.append("pagescales", JSON.stringify(doc.pagescales)); + formdata.append("pagescales", JSON.stringify(doc.pagescales.slice(1))); formdata.append("fname", doc.fname); console.log(formdata); submitForm(formdata); diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index 6e9e9fdc81888abc45a7b5839abc4d9bc3467e95..4fe65ecf577d81b6c04b57ce6938b1f701d61c56 100644 GIT binary patch literal 4503 zcma)9-ESMm5#ReF@AxTFq)gIsM143e6`M2~DS;AOu5HDRq=asDrmH|g1)4lbbj2gH zM_RUm0CMtR$}S)Zg)6x>Pz8OcgTCmok4^ppU**KaML}G&Kw+RLXsAVzq7R)pen?g< zq)Xi1-t5fm?Cx)Wv)pz%Z3N0Yf4?{5auD(t{Ad(ky|I0XCFBl~iOh_W5r#5hY_rg2 zDLcYZZiJ`2DdR?k5ev1LZGO}`B2sZgqSAe0!*8QtTrA_2H>^(mQsf5!B9Gwobabh%Cy|03+LE`v4avpc&cGXtZQUKT~VV z&Pis{KPl~a?_&k%>SJYBh+N^DW47(%$-T>y4Xj$8vpTSkm8WUth4BNEj=kf{t^3G* z`^f$K$lLajAJ|78*he0GvAnNE4m~&HpnPyLAh*L4m}Jdg2-mam$cI42!Hh$-bvD;p zlwU$0^x%vo((x2GCc+rfX;sY?k_9Cl;SBzrj8-uCp-ie^2p5a_IR!NMH1tLU!+I{M z6;A(n*x=7)<`l!0$j{}HX|rt|Q{FEsXnHP3vtVPF!DS|eomzas8q>_r1vVP{6oyP^ zsFEt==yk(V>i{9ICbPVHQzJnddwrbg zA+K}OOoWXaqUkQPfb-zy6g9$7JY5=^g^k|WY%Z(B<}-8YBu&l6=9SB_X{9h#RKcP+ z6HDgvv1~G7ceuFVASwg;KoNV( zWgwIn_gr9PMr9`u4Q7$q1bYlxGS^@C!xEKCaAO$To|0f13vAK{#L}<{lL@h*JZ5jS zcHS~S;cR@#tSkU+SY#{GMuL^adIVCG?I9wG&vlZc3T z)ONlG86;8@dLKZ;gp4n+I=nLc%hAWKj>YqpaL@0YpE&jKK&i0&;jIt&F9ddjE2XGfjw?UB z1@gP1)mu82AJ`#Qt@Hi@r@&o_@Ro!TOKM6KIDVnY8G(z(P_tN7ehOsX^3f{ zcswyc+qJ#jy(<_ekQ3PsKz)ZyK(kg}L{YGJO}&=nY~sR|aE)P3-cw}Byq~hFJ0Xl6+|vo4yJp`0-CmfvTXqcWN_+2C>(K~fcp&za z-3f9BMY6N8>tQlOBCa?^QiBAON#jA9RWdWPg{P3`$W!ciibp8InMykqBrPG8o6Ax2 zUNmQLX{ET}ixo8*(`J)Yi4_!00|HLQFulzs)tMp?upu_YaQlIq79_dsg6r&Carwg7 zaQu9D=tBIhNQ+_3U(XhSrf4Sy2B|PrpeLb82eEn;g#rbaaOFcJj=u{zh`8ot**fOf7Q0NBs6o+!DViq!}!E4E@N~FbP z82Sc|XtLp(=$=dGQidg0EaZ!zh=cD0W-u9ppPx>q41N}BI&8+7H>0pz&J-{L35J?7 z2iIDlyTPH2x5=gSH5?PGK6tbO>;%xKgV?>xM{gZ17dCC}i)Xi-zU9eVled4ic&=i1 zmEK=^Z4sEVWqEjMc=^K8g~hijPS5gtOYc2)9@3qM7SC<*Qt9v`KCtEVmS0^RTp9e* z*;QgHZtv375?`@9exa57Z!g>)th63j9atGyJ+*Rb{n97zJ($-Izpl5wUK*-*JZ|mQ zTf5h%Kgn!}`pW})>i~2G4&4{a{8pgjH;!LBD!#y)XXX4_dL_QQ=?|`TuKaYZuzvdP zyzYPbL67c>R$e6BmV}_UEL0uD<6m{HxGKJ2CDdIBc2%v!)3Hr>kN26I1Ust%;%!^) zSn1gGbd@X>U+A$fqWdBhf2iVrr794AZ@@MlIap<`b5Dc+h44$hF7YvB$AS-xPx=2Jo^(a^m4i6^qTba74@ zq%$DHQ?30L>MF-D%-5v-8`7hbp07#p8*<_ca^h?9%2&ko6>0g3*#B&CewHgyz*A^w6{HSUntZ*}MBT$NLGQKLyFx_jz>&mAWe}8E`tGD-U6Oe3k zg!P+!unBeb=q3}avKHn?%=#PK1mCC?s)?%JhFb3?1D0MURFMF&clmpk@|(KUQRP7N z#MW9qd)IfbtvssR!c_r85RSuNmhbl5d$~NJ+dHdP6p92icNt44wC&NfqtHQI2g=&r zp?l}bAL!2RsuP7S(5#5|rHi*OmF&+r)?ul-QSKw2&~CZMvh4>s&ZEtYx!T*qv{!7- zDhKZ;E>Bf}H-^|!=F1n?4zEn;EuFd@AdPa7I6Y-wIknbbzM^|Nbw}5~B#@h%_&=Q5 Br8EEl delta 803 zcmZ`$&1(}u9G#in&3-1k+0Ew`ZQ7)fMJ#GkYOxkUZ$bqJ58@sht5qA>RIicJLr=10 z6dQ68yy)4xC;tEgVoOe51wSGY#EbMdO|je1%;E9d_j~W{8|G#4pPlH(;~IjqQFy-m zTG~qFFxfhFtLZC~XomA}0@F8dObq5I(wLu^L{z5A9HuJO=7a(QWihH5%v7Jj2b1Ar z-}cqR^eT~HRV6g;q80JrpG_1vBcc+KC=G08#D;>@2$%KWm4pIm-xw}H?I>^Ywu5HW z;S2T*XYvv9Hs#p z3XeIboV-whVT?250AJBl^gDwjZL=8Od{uc<`y^i8HSF-}=E6rY|FUd88>Wk)tS6ZodCcM_s3Oxy^UJM!OkhV(ga(KI=PxNF?DhQMjv&u%r62{Fb_A_1L{hZyZ`_I diff --git a/app/main.py b/app/main.py index 64381dd..4fdcb57 100644 --- a/app/main.py +++ b/app/main.py @@ -3,6 +3,9 @@ from typing import List, Dict, Tuple from datetime import date from fastapi import FastAPI, File, UploadFile, Request, Form from fastapi.staticfiles import StaticFiles +import pymupdf +import pdf2image +import json app = FastAPI() @@ -26,19 +29,65 @@ async def create_upload_file(file: UploadFile): @app.post("/submit/") async def get_submittion( - lva: Annotated[str, Form()], - prof: Annotated[str, Form()], - fname: Annotated[str, Form()], - sem: Annotated[str, Form()], - stype: Annotated[str, Form()], - date: Annotated[str, Form()], - rects: Annotated[str, Form()], # List[List[Tuple[float, float, float, float]]], - pagescales: Annotated[str, Form()], # Annotated[List[Dict[str, float]], Form()], + lva: Annotated[str, Form()], # LVA Name and Number + prof: Annotated[str, Form()], # Vortragender + fname: Annotated[str, Form()], # Path to pdf File + sem: Annotated[str, Form()], # Semester eg. 2024W + stype: Annotated[str, Form()], # Type of File eg. Prüfung + ex_date: Annotated[str, Form()], # Date of Exam only when type is exam + rects: Annotated[ + str, Form() + ], # Rechtangles # List[List[Tuple[float, float, float, float]]], + pagescales: Annotated[ + str, Form() + ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], ): - print(lva, prof, fname, stype, sem, date, rects, pagescales) + print(lva, prof, fname, stype, sem, ex_date, rects, pagescales) + rects_p = json.loads(rects) + scales_p = json.loads(pagescales) + censor_pdf(fname, "./app/files/censored.pdf", rects_p, scales_p) return {"done": "ok"} +def censor_pdf( + path: str, + destpath: str, + rects: List[List[List[float]]], + scales: List[Dict[str, float]], +): + doc = pymupdf.open(path) + output = pymupdf.open() + page = doc[0] + width = page.rect.width + height = page.rect.height + print(width, height) + for i in range(doc.page_count): + page = doc[i] + if i < len(rects): + wfac = page.rect.width / scales[i]["width"] + hfac = page.rect.height / scales[i]["height"] + for rect in rects[i]: + prect = pymupdf.Rect( + rect[0] * wfac, + rect[1] * hfac, + (rect[0] + rect[2]) * wfac, + (rect[1] + rect[3]) * hfac, + ) + page.draw_rect( + prect, + color=(0, 0, 0), + fill=(0, 0, 0), + ) + bitmap = page.get_pixmap() + pdf_bytes = bitmap.pdfocr_tobytes( + language="deu", + tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files + ) + output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + output.save(destpath) + print("CENSORING DONE") + + # async def get_submittion(request: Request): # reqJson = await request.form() # print(reqJson) diff --git a/index.html b/index.html index e4e8eac..85b9f16 100644 --- a/index.html +++ b/index.html @@ -49,7 +49,7 @@
-
+