added back option to run OCR

2025-10-23 00:06:25 +02:00
parent 56d3468889
commit c30d69d205
4 changed files with 84 additions and 18 deletions
--- a/app/pycache/main.cpython-313.pyc
+++ b/app/pycache/main.cpython-313.pyc
--- a/app/main.py
+++ b/app/main.py
@@ -414,14 +414,14 @@ async def get_submission(
    pagescales: Annotated[
        str, Form()
    ],  # Scales of Pages  # Annotated[List[Dict[str, float]], Form()],
-    censor: Annotated[str, Form()],
+    ocr: Annotated[str, Form()],
 ):
    """handles submission"""
    print(
-        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
    )
    info(
-        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+        f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
    )
    rects_p = json.loads(rects)
    scales_p = json.loads(pagescales)
@@ -454,15 +454,24 @@ async def get_submission(
    censor_status_datas[fileId] = {}
    if fileId not in censor_status_update_events:
        censor_status_update_events[fileId] = asyncio.Event()
-    await asyncio.to_thread(
+    if ocr == "True":
-        censor_pdf,
+        await asyncio.to_thread(
-        filepath,
+            censor_pdf_ocr,
-        dest,
+            filepath,
-        rects_p,
+            dest,
-        scales_p,
+            rects_p,
-        False if censor == "False" else True,
+            scales_p,
-        fileId,
+            fileId,
-    )
+        )
    else:
        await asyncio.to_thread(
            censor_pdf,
            filepath,
            dest,
            rects_p,
            scales_p,
            fileId,
        )
    # return {"done": "ok"}
    # print(dest)
@@ -505,7 +514,6 @@ def censor_pdf(
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
    secure: bool,
    file_id: str,
 ):
    """Censors pdf and saves the file to the given Destpath.
@@ -549,6 +557,65 @@ def censor_pdf(
    censor_status_update_events[file_id].set()
 def censor_pdf_ocr(
    path: str,
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
    file_id: str,
 ):
    """Censor a PDF, rasterise every page and rebuild it with an OCR text layer.

    For each page, the rectangles given for that page are drawn as filled
    black boxes. The page is then rendered to a 400 dpi pixmap and converted
    back to a one-page PDF via ``Pixmap.pdfocr_tobytes`` (language ``deu``),
    which is appended to the output document. The output is saved to
    ``destpath`` once all pages are processed.

    Progress is published through the module-level ``censor_status_datas``
    and ``censor_status_update_events`` mappings, both keyed by ``file_id``:
    the keys ``"page"``, ``"pages"`` and ``"done"`` are written and the event
    is set before each page is rasterised and once more after saving.

    Args:
        path: Path to the source pdf document.
        destpath: Path where the result is supposed to be saved to.
        rects: Per page, a list of rectangles ``[x, y, width, height]`` to be
            blacked out. Pages without an entry (or with an empty list) are
            not censored but are still rasterised and OCR'd.
        scales: Per page, a dict with ``"width"`` and ``"height"`` giving the
            size of the coordinate system the rects were drawn in; used to
            scale them to the real page size.
        file_id: Key into ``censor_status_datas`` and
            ``censor_status_update_events`` used to report progress. Both
            entries must already exist.

    Returns:
        None
    """
    # Context managers make sure both documents are closed even when OCR or
    # saving fails; previously the handles were never released.
    with pymupdf.open(path) as doc, pymupdf.open() as output:
        npage = doc.page_count
        for i in range(npage):
            page = doc[i]
            if i < len(rects) and rects[i]:
                # Convert from the coordinate system the rects were drawn in
                # to the page's own coordinate system.
                wfac = page.rect.width / scales[i]["width"]
                hfac = page.rect.height / scales[i]["height"]
                for rect in rects[i]:
                    # rect is [x, y, width, height]; pymupdf.Rect wants corners.
                    prect = pymupdf.Rect(
                        rect[0] * wfac,
                        rect[1] * hfac,
                        (rect[0] + rect[2]) * wfac,
                        (rect[1] + rect[3]) * hfac,
                    )
                    page.draw_rect(
                        prect,
                        color=(0, 0, 0),
                        fill=(0, 0, 0),
                    )
            # Publish progress before the slow part so listeners see the page
            # that is currently being worked on.
            censor_status_datas[file_id]["page"] = i + 1
            censor_status_datas[file_id]["pages"] = npage
            censor_status_datas[file_id]["done"] = False
            censor_status_update_events[file_id].set()
            # This is the expensive part: rasterise at 400 dpi and run OCR.
            bitmap = page.get_pixmap(dpi=400)
            pdf_bytes = bitmap.pdfocr_tobytes(
                language="deu",
                # Tesseract needs to be installed; this is the path to the
                # tesseract language data files.
                tessdata="/usr/share/tessdata/",
            )
            # Close the temporary one-page document once it has been copied.
            with pymupdf.Document(stream=pdf_bytes) as ocr_page:
                output.insert_pdf(ocr_page)
            # End of the costly part
            print(f"Page {i + 1}/{npage}: CENSORING DONE")
        output.save(destpath)
    censor_status_datas[file_id]["done"] = True
    censor_status_update_events[file_id].set()
 def test_function(i: int) -> bytes:
    return b"\x00\x66\x99"
--- a/index.html
+++ b/index.html
@@ -134,11 +134,10 @@
            </div>
            <input
              type="checkbox"
-              name="censor"
+              name="ocr"
              id="sec_censor"
              value="True"
-              checked
+            /><label for="sec_censor">OCR</label><br /><br />
            /><label for="sec_censor">Zensieren</label><br /><br />
            <button type="submit" id="send">Senden</button>
          </form>
        </div>
--- a/static/app.js
+++ b/static/app.js
@@ -276,8 +276,8 @@ function submitPdf(eve) {
  formdata.append("fileId", doc.fID);
  //formdata.append("filename", doc.filename);
  formdata.append("ftype", doc.filetype);
-  if (!formdata.has("censor")) {
+  if (!formdata.has("ocr")) {
-    formdata.append("censor", "False");
+    formdata.append("ocr", "False");
  }
  console.log(formdata);
  submitForm(formdata);