Merge pull request 'improve_censoring_speed' (#1) from improve_censoring_speed into main

Reviewed-on: #1
2025-10-23 15:43:40 +02:00
parent 26ea274023 5c6a8dfba2
commit b9eb5e8bd4
5 changed files with 94 additions and 68 deletions
--- a/app/pycache/main.cpython-313.pyc
+++ b/app/pycache/main.cpython-313.pyc
--- a/app/main.py
+++ b/app/main.py
@@ -14,6 +14,8 @@ import asyncio
 # import fastapi
 from fastapi.staticfiles import StaticFiles
 import pymupdf
+
+# import fitz as pymupdf
 import json
 import re

@@ -412,14 +414,14 @@ async def get_submission(
    pagescales: Annotated[
        str, Form()
    ],  # Scales of Pages  # Annotated[List[Dict[str, float]], Form()],
-    censor: Annotated[str, Form()],
+    ocr: Annotated[str, Form()],
 ):
    """handles submission"""
    print(
-        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
    )
    info(
-        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
    )
    rects_p = json.loads(rects)
    scales_p = json.loads(pagescales)
@@ -452,13 +454,22 @@ async def get_submission(
    censor_status_datas[fileId] = {}
    if fileId not in censor_status_update_events:
        censor_status_update_events[fileId] = asyncio.Event()
+    if ocr == "True":
+        await asyncio.to_thread(
+            censor_pdf_ocr,
+            filepath,
+            dest,
+            rects_p,
+            scales_p,
+            fileId,
+        )
+    else:
        await asyncio.to_thread(
            censor_pdf,
            filepath,
            dest,
            rects_p,
            scales_p,
-        False if censor == "False" else True,
            fileId,
        )

@@ -487,6 +498,7 @@ async def get_censor_status(file_id: str):


 async def yield_censor_status(file_id: str):
+    """Internal function to yield updates to the stream"""
    while True:
        await censor_status_update_events[file_id].wait()
        censor_status_update_events[file_id].clear()
@@ -502,11 +514,58 @@ def censor_pdf(
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
-    secure: bool,
+    file_id: str,
+):
+    """Censors pdf and saves the file to the given Destpath.
+    Args:
+        path: path to the pdf document
+        destpath: Path where the result is supposed to be saved to
+        rects: Coordinates of rectangles to be placed on the pdf document
+        scales: Scales of the rects coordinates for the pdf document
+        file_id: key into censor_status_datas and censor_status_update_events under which per-page progress and completion are reported
+    Returns:
+        None
+    """
+    doc = pymupdf.open(path)
+    page = doc[0]
+    npage = doc.page_count
+    for i in range(npage):
+        page = doc[i]
+        if i < len(rects) and rects[i] != []:
+            print(i)
+            wfac = page.rect.width / scales[i]["width"]
+            hfac = page.rect.height / scales[i]["height"]
+            for rect in rects[i]:
+                prect = pymupdf.Rect(
+                    rect[0] * wfac,
+                    rect[1] * hfac,
+                    (rect[0] + rect[2]) * wfac,
+                    (rect[1] + rect[3]) * hfac,
+                )
+                page.add_redact_annot(
+                    prect,
+                    fill=(0, 0, 0),
+                )
+            page.apply_redactions()
+        censor_status_datas[file_id]["page"] = i + 1
+        censor_status_datas[file_id]["pages"] = npage
+        censor_status_datas[file_id]["done"] = False
+        censor_status_update_events[file_id].set()
+    doc.set_metadata({})
+    doc.save(destpath, garbage=4, deflate=True, clean=True)
+    censor_status_datas[file_id]["done"] = True
+    censor_status_update_events[file_id].set()
+
+
+def censor_pdf_ocr(
+    path: str,
+    destpath: str,
+    rects: List[List[List[float]]],
+    scales: List[Dict[str, float]],
    file_id: str,
 ):
    """Censors pdf and runs OCR
-    If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape
+    The file is converted to Pixels and then recreated.
    Saves the file to the given Destpath.
    Args:
        path: path to the pdf document
@@ -520,12 +579,7 @@ def censor_pdf(
    doc = pymupdf.open(path)
    output = pymupdf.open()
    page = doc[0]
-    # width = page.rect.width
-    # height = page.rect.height
-    # print(width, height)
    npage = doc.page_count
-    # pages = []
-    # tasks = []
    for i in range(npage):
        page = doc[i]
        if i < len(rects) and rects[i] != []:
@@ -544,13 +598,10 @@ def censor_pdf(
                    color=(0, 0, 0),
                    fill=(0, 0, 0),
                )
-        if secure:
        censor_status_datas[file_id]["page"] = i + 1
        censor_status_datas[file_id]["pages"] = npage
        censor_status_datas[file_id]["done"] = False
        censor_status_update_events[file_id].set()
-
-            # pages.append(page)
        # THis Costs us dearly
        bitmap = page.get_pixmap(dpi=400)
        pdf_bytes = bitmap.pdfocr_tobytes(
@@ -559,34 +610,10 @@ def censor_pdf(
        )
        output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
        # End of the costly part
-            # tasks.append(asyncio.create_task(censor_page(page)))
        print(f"Page {i + 1}/{npage}: CENSORING DONE")
-        else:
-            output.insert_pdf(doc, i, i)
-
-    # if secure:
-    # pages_bytes: List[bytes] = []
-    # censor_page(pages[0])
-    # with multiprocessing.Pool(npage) as p:
-    # pages_bytes = p.map(censor_page, pages)
-    # pages_bytes = p.map(test_function, [1, 2, 3, 4])
-    # for pdf_bytes in pages_bytes:
-    # output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
-    # with concurrent.futures.ThreadPoolExecutor() as executor:
-    #     futures = []
-    #     for page in pages:
-    #         futures.append(executor.submit(censor_page, page))
-    #     for future in futures:
-    #         output.insert_pdf(pymupdf.Document(stream=future.result()))
-    #
-    # for task in tasks:
-    # output.insert_pdf(pymupdf.Document(stream=await task))
-    # print("CENSORING DONE")
    output.save(destpath)
-    if secure:
    censor_status_datas[file_id]["done"] = True
    censor_status_update_events[file_id].set()
-    # censor_finished_flags[file_id].set()


 def test_function(i: int) -> bytes:
--- a/index.html
+++ b/index.html
@@ -134,11 +134,10 @@
            </div>
            <input
              type="checkbox"
-              name="censor"
+              name="ocr"
              id="sec_censor"
              value="True"
-              checked
-            /><label for="sec_censor">Zensieren</label><br /><br />
+            /><label for="sec_censor">OCR</label><br /><br />
            <button type="submit" id="send">Senden</button>
          </form>
        </div>
--- a/static/app.js
+++ b/static/app.js
@@ -276,8 +276,8 @@ function submitPdf(eve) {
  formdata.append("fileId", doc.fID);
  //formdata.append("filename", doc.filename);
  formdata.append("ftype", doc.filetype);
-  if (!formdata.has("censor")) {
-    formdata.append("censor", "False");
+  if (!formdata.has("ocr")) {
+    formdata.append("ocr", "False");
  }
  console.log(formdata);
  submitForm(formdata);
--- a/static/autocomplete.js
+++ b/static/autocomplete.js
@@ -21,7 +21,7 @@ function autocomplete(inp, type) {
      i,
      apirq,
      iname,
-      val = this.value;
+      val = inp.value;
    /*close any already open lists of autocompleted values*/
    closeAllLists();
    if (!val && type === "lva" && pid === null) {
@@ -56,7 +56,7 @@ function autocomplete(inp, type) {
    a.setAttribute("id", this.id + "autocomplete-list");
    a.setAttribute("class", "autocomplete-items");
    /*append the DIV element as a child of the autocomplete container:*/
-    this.parentNode.appendChild(a);
+    inp.parentNode.appendChild(a);
    /*for each item in the array...*/
    //await response;
    if (response.ok) {