fixed bug in js, that blocked showing prof suggestions when nothing is entered in the field

added back option to run OCR
changed the censoring mode to built in censoring with pymupdf
2025-10-23 15:40:45 +02:00 · 2025-10-23 00:06:25 +02:00 · 2025-10-22 23:26:33 +02:00
5 changed files with 94 additions and 68 deletions
--- a/app/pycache/main.cpython-313.pyc
+++ b/app/pycache/main.cpython-313.pyc
--- a/app/main.py
+++ b/app/main.py
@@ -14,6 +14,8 @@ import asyncio
 # import fastapi
 from fastapi.staticfiles import StaticFiles
 import pymupdf
 # import fitz as pymupdf
 import json
 import re
@@ -412,14 +414,14 @@ async def get_submission(
    pagescales: Annotated[
        str, Form()
    ],  # Scales of Pages  # Annotated[List[Dict[str, float]], Form()],
-    censor: Annotated[str, Form()],
+    ocr: Annotated[str, Form()],
 ):
    """handles submission"""
    print(
-        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
    )
    info(
-        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
    )
    rects_p = json.loads(rects)
    scales_p = json.loads(pagescales)
@@ -452,15 +454,24 @@ async def get_submission(
    censor_status_datas[fileId] = {}
    if fileId not in censor_status_update_events:
        censor_status_update_events[fileId] = asyncio.Event()
-    await asyncio.to_thread(
+    if ocr == "True":
-        censor_pdf,
+        await asyncio.to_thread(
-        filepath,
+            censor_pdf_ocr,
-        dest,
+            filepath,
-        rects_p,
+            dest,
-        scales_p,
+            rects_p,
-        False if censor == "False" else True,
+            scales_p,
-        fileId,
+            fileId,
-    )
+        )
    else:
        await asyncio.to_thread(
            censor_pdf,
            filepath,
            dest,
            rects_p,
            scales_p,
            fileId,
        )
    # return {"done": "ok"}
    # print(dest)
@@ -487,6 +498,7 @@ async def get_censor_status(file_id: str):
 async def yield_censor_status(file_id: str):
    """Internal function to yield updates to the stream"""
    while True:
        await censor_status_update_events[file_id].wait()
        censor_status_update_events[file_id].clear()
@@ -502,11 +514,58 @@ def censor_pdf(
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
-    secure: bool,
+    file_id: str,
 ):
    """Censors pdf and saves the file to the given Destpath.
    Args:
        path: path to the pdf document
        destpath: Path where the result is supposed to be saved to
        rects: Coordinates of rectangles to be placed on the pdf document
        scales: Scales of the rects coordinates for the pdf document
        secure: whether or not the pdf document is supposed to be converted into an Image (and back) to make sure the censoring is irreversible
    Returns:
        None
    """
    doc = pymupdf.open(path)
    page = doc[0]
    npage = doc.page_count
    for i in range(npage):
        page = doc[i]
        if i < len(rects) and rects[i] != []:
            print(i)
            wfac = page.rect.width / scales[i]["width"]
            hfac = page.rect.height / scales[i]["height"]
            for rect in rects[i]:
                prect = pymupdf.Rect(
                    rect[0] * wfac,
                    rect[1] * hfac,
                    (rect[0] + rect[2]) * wfac,
                    (rect[1] + rect[3]) * hfac,
                )
                page.add_redact_annot(
                    prect,
                    fill=(0, 0, 0),
                )
            page.apply_redactions()
        censor_status_datas[file_id]["page"] = i + 1
        censor_status_datas[file_id]["pages"] = npage
        censor_status_datas[file_id]["done"] = False
        censor_status_update_events[file_id].set()
    doc.set_metadata({})
    doc.save(destpath, garbage=4, deflate=True, clean=True)
    censor_status_datas[file_id]["done"] = True
    censor_status_update_events[file_id].set()
 def censor_pdf_ocr(
    path: str,
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
    file_id: str,
 ):
    """Censors pdf and runs OCR
-    If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape
+    The file is converted to Pixels and then recreated.
    Saves the file to the given Destpath.
    Args:
        path: path to the pdf document
@@ -520,12 +579,7 @@ def censor_pdf(
    doc = pymupdf.open(path)
    output = pymupdf.open()
    page = doc[0]
    # width = page.rect.width
    # height = page.rect.height
    # print(width, height)
    npage = doc.page_count
    # pages = []
    # tasks = []
    for i in range(npage):
        page = doc[i]
        if i < len(rects) and rects[i] != []:
@@ -544,49 +598,22 @@ def censor_pdf(
                    color=(0, 0, 0),
                    fill=(0, 0, 0),
                )
-        if secure:
+        censor_status_datas[file_id]["page"] = i + 1
-            censor_status_datas[file_id]["page"] = i + 1
+        censor_status_datas[file_id]["pages"] = npage
-            censor_status_datas[file_id]["pages"] = npage
+        censor_status_datas[file_id]["done"] = False
            censor_status_datas[file_id]["done"] = False
            censor_status_update_events[file_id].set()
            # pages.append(page)
            # This costs us dearly
            bitmap = page.get_pixmap(dpi=400)
            pdf_bytes = bitmap.pdfocr_tobytes(
                language="deu",
                tessdata="/usr/share/tessdata/",  # tesseract needs to be installed; this is the path to the tesseract files
            )
            output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
            # End of the costly part
            # tasks.append(asyncio.create_task(censor_page(page)))
            print(f"Page {i + 1}/{npage}: CENSORING DONE")
        else:
            output.insert_pdf(doc, i, i)
    # if secure:
    # pages_bytes: List[bytes] = []
    # censor_page(pages[0])
    # with multiprocessing.Pool(npage) as p:
    # pages_bytes = p.map(censor_page, pages)
    # pages_bytes = p.map(test_function, [1, 2, 3, 4])
    # for pdf_bytes in pages_bytes:
    # output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
    # with concurrent.futures.ThreadPoolExecutor() as executor:
    #     futures = []
    #     for page in pages:
    #         futures.append(executor.submit(censor_page, page))
    #     for future in futures:
    #         output.insert_pdf(pymupdf.Document(stream=future.result()))
    #
    # for task in tasks:
    # output.insert_pdf(pymupdf.Document(stream=await task))
    # print("CENSORING DONE")
    output.save(destpath)
    if secure:
        censor_status_datas[file_id]["done"] = True
        censor_status_update_events[file_id].set()
-    # censor_finished_flags[file_id].set()
+        # This costs us dearly
        bitmap = page.get_pixmap(dpi=400)
        pdf_bytes = bitmap.pdfocr_tobytes(
            language="deu",
            tessdata="/usr/share/tessdata/",  # tesseract needs to be installed; this is the path to the tesseract files
        )
        output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
        # End of the costly part
        print(f"Page {i + 1}/{npage}: CENSORING DONE")
    output.save(destpath)
    censor_status_datas[file_id]["done"] = True
    censor_status_update_events[file_id].set()
 def test_function(i: int) -> bytes:
--- a/index.html
+++ b/index.html
@@ -134,11 +134,10 @@
            </div>
            <input
              type="checkbox"
-              name="censor"
+              name="ocr"
              id="sec_censor"
              value="True"
-              checked
+            /><label for="sec_censor">OCR</label><br /><br />
            /><label for="sec_censor">Zensieren</label><br /><br />
            <button type="submit" id="send">Senden</button>
          </form>
        </div>
--- a/static/app.js
+++ b/static/app.js
@@ -276,8 +276,8 @@ function submitPdf(eve) {
  formdata.append("fileId", doc.fID);
  //formdata.append("filename", doc.filename);
  formdata.append("ftype", doc.filetype);
-  if (!formdata.has("censor")) {
+  if (!formdata.has("ocr")) {
-    formdata.append("censor", "False");
+    formdata.append("ocr", "False");
  }
  console.log(formdata);
  submitForm(formdata);
--- a/static/autocomplete.js
+++ b/static/autocomplete.js
@@ -21,7 +21,7 @@ function autocomplete(inp, type) {
      i,
      apirq,
      iname,
-      val = this.value;
+      val = inp.value;
    /*close any already open lists of autocompleted values*/
    closeAllLists();
    if (!val && type === "lva" && pid === null) {
@@ -56,7 +56,7 @@ function autocomplete(inp, type) {
    a.setAttribute("id", this.id + "autocomplete-list");
    a.setAttribute("class", "autocomplete-items");
    /*append the DIV element as a child of the autocomplete container:*/
-    this.parentNode.appendChild(a);
+    inp.parentNode.appendChild(a);
    /*for each item in the array...*/
    //await response;
    if (response.ok) {
Author	SHA1	Message	Date
Marcel Gansfusz	5c6a8dfba2	fixed bug in js, that blocked showing prof suggestions when nothing is entered in the field	2025-10-23 15:40:45 +02:00
Marcel Gansfusz	c30d69d205	added back option to run OCR	2025-10-23 00:06:25 +02:00
Marcel Gansfusz	56d3468889	changed the censoring mode to built in censoring with pymupdf	2025-10-22 23:26:33 +02:00