Added PDF censoring in Python

2025-01-30 22:18:49 +01:00
parent 6275e5cfa2
commit f9643612fe
4 changed files with 63 additions and 14 deletions
--- a/app/main.py
+++ b/app/main.py
@@ -3,6 +3,9 @@ from typing import List, Dict, Tuple
 from datetime import date
 from fastapi import FastAPI, File, UploadFile, Request, Form
 from fastapi.staticfiles import StaticFiles
+import pymupdf
+import pdf2image
+import json

 app = FastAPI()

@@ -26,19 +29,65 @@ async def create_upload_file(file: UploadFile):

@app.post("/submit/")
 async def get_submittion(
-    lva: Annotated[str, Form()],
-    prof: Annotated[str, Form()],
-    fname: Annotated[str, Form()],
-    sem: Annotated[str, Form()],
-    stype: Annotated[str, Form()],
-    date: Annotated[str, Form()],
-    rects: Annotated[str, Form()],  # List[List[Tuple[float, float, float, float]]],
-    pagescales: Annotated[str, Form()],  # Annotated[List[Dict[str, float]], Form()],
+    lva: Annotated[str, Form()],  # Course (LVA) name and number
+    prof: Annotated[str, Form()],  # Lecturer
+    fname: Annotated[str, Form()],  # Path to the PDF file
+    sem: Annotated[str, Form()],  # Semester, e.g. 2024W
+    stype: Annotated[str, Form()],  # Type of file, e.g. exam
+    ex_date: Annotated[str, Form()],  # Exam date; only relevant when the type is an exam
+    rects: Annotated[
+        str, Form()
+    ],  # Rectangles as a JSON string; expected to decode to List[List[Tuple[float, float, float, float]]]
+    pagescales: Annotated[
+        str, Form()
+    ],  # Page scales as a JSON string; expected to decode to List[Dict[str, float]]
 ):
-    print(lva, prof, fname, stype, sem, date, rects, pagescales)
+    # Handle the submit form: decode the rectangle and page-scale JSON and
+    # write a censored copy of the PDF named by `fname`.
+    # NOTE(review): `fname` comes straight from the form and is opened as a
+    # file path without any check here - confirm it is validated elsewhere.
+    print(lva, prof, fname, stype, sem, ex_date, rects, pagescales)
+    # Both fields arrive as JSON strings and are decoded before use.
+    rects_p = json.loads(rects)
+    scales_p = json.loads(pagescales)
+    # NOTE(review): censor_pdf is a synchronous call inside this async
+    # handler, and every request writes to the same output path - confirm
+    # both are acceptable.
+    censor_pdf(fname, "./app/files/censored.pdf", rects_p, scales_p)
    return {"done": "ok"}


+def censor_pdf(
+    path: str,
+    destpath: str,
+    rects: List[List[List[float]]],
+    scales: List[Dict[str, float]],
+    *,
+    language: str = "deu",
+    tessdata: str = "/usr/share/tessdata/",
+):
+    """Black out rectangles on a PDF and save a rasterised, OCR'd copy.
+
+    Each page is painted with filled black rectangles, rendered to a
+    bitmap, and converted back into a one-page PDF via Tesseract OCR.
+    The resulting pages are collected into a new document that is
+    written to ``destpath``.
+
+    Args:
+        path: Location of the PDF to censor.
+        destpath: Where the censored PDF is written.
+        rects: Per page, a list of ``[x, y, width, height]`` rectangles in
+            the coordinate system described by ``scales``. Pages beyond
+            ``len(rects)`` are copied without censoring.
+        scales: Per page, a dict with ``"width"`` and ``"height"`` keys
+            giving the page size the rectangles were measured against
+            (presumably the size rendered in the client - confirm).
+            Only consulted for pages that actually have rectangles.
+        language: Tesseract language code used for OCR.
+        tessdata: Directory holding the Tesseract language data;
+            Tesseract must be installed.
+
+    Raises:
+        IndexError, KeyError: If a page has rectangles but no matching
+            entry (or missing keys) in ``scales``.
+    """
+    # Context managers close both documents even if OCR or saving fails.
+    with pymupdf.open(path) as doc, pymupdf.open() as output:
+        for i, page in enumerate(doc):
+            page_rects = rects[i] if i < len(rects) else []
+            if page_rects:
+                # Convert from the caller's coordinate system to PDF points.
+                wfac = page.rect.width / scales[i]["width"]
+                hfac = page.rect.height / scales[i]["height"]
+                for x, y, w, h in (r[:4] for r in page_rects):
+                    page.draw_rect(
+                        pymupdf.Rect(
+                            x * wfac,
+                            y * hfac,
+                            (x + w) * wfac,
+                            (y + h) * hfac,
+                        ),
+                        color=(0, 0, 0),
+                        fill=(0, 0, 0),
+                    )
+            # Rasterise the page so the covered content is no longer in the
+            # output, then let OCR turn the image back into a one-page PDF.
+            bitmap = page.get_pixmap()
+            pdf_bytes = bitmap.pdfocr_tobytes(
+                language=language,
+                tessdata=tessdata,  # path to the Tesseract language files
+            )
+            with pymupdf.Document(stream=pdf_bytes) as ocr_page:
+                output.insert_pdf(ocr_page)
+        output.save(destpath)
+    print("CENSORING DONE")
+
+
 # async def get_submittion(request: Request):
 #     reqJson = await request.form()
 #     print(reqJson)