diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index dab1333..098bd05 100644 Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ diff --git a/app/main.py b/app/main.py index 215e881..19ed9ad 100644 --- a/app/main.py +++ b/app/main.py @@ -414,14 +414,14 @@ async def get_submission( pagescales: Annotated[ str, Form() ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], - censor: Annotated[str, Form()], + ocr: Annotated[str, Form()], ): """handles submission""" print( - f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" + f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}" ) info( - f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" + f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}" ) rects_p = json.loads(rects) scales_p = json.loads(pagescales) @@ -454,15 +454,24 @@ async def get_submission( censor_status_datas[fileId] = {} if fileId not in censor_status_update_events: censor_status_update_events[fileId] = asyncio.Event() - await asyncio.to_thread( - censor_pdf, - filepath, - dest, - rects_p, - scales_p, - False if censor == "False" else True, - fileId, - ) + if ocr == "True": + await asyncio.to_thread( + censor_pdf_ocr, + filepath, + dest, + rects_p, + scales_p, + fileId, + ) + else: + await asyncio.to_thread( + censor_pdf, + filepath, + dest, + rects_p, + scales_p, + fileId, + ) # return {"done": "ok"} # print(dest) @@ -505,7 +514,6 @@ def censor_pdf( destpath: str, rects: List[List[List[float]]], scales: 
List[Dict[str, float]], - secure: bool, file_id: str, ): """Censors pdf and saves the file to the given Destpath. @@ -549,6 +557,65 @@ def censor_pdf( censor_status_update_events[file_id].set() +def censor_pdf_ocr( + path: str, + destpath: str, + rects: List[List[List[float]]], + scales: List[Dict[str, float]], + file_id: str, +): + """Censors pdf and runs OCR + The file is converted to Pixels and then recreated. + Saves the file to the given Destpath. + Args: + path: path to the pdf document + destpath: Path where the result is supposed to be saved to + rects: Coordinates of rectangles to be placed on the pdf document + scales: Scales of the rects coordinates for the pdf document + secure: whether or not the pdf document is supposed to be converted into an Image (and back) to make sure the censoring is irreversible + Returns: + None + """ + doc = pymupdf.open(path) + output = pymupdf.open() + page = doc[0] + npage = doc.page_count + for i in range(npage): + page = doc[i] + if i < len(rects) and rects[i] != []: + print(i) + wfac = page.rect.width / scales[i]["width"] + hfac = page.rect.height / scales[i]["height"] + for rect in rects[i]: + prect = pymupdf.Rect( + rect[0] * wfac, + rect[1] * hfac, + (rect[0] + rect[2]) * wfac, + (rect[1] + rect[3]) * hfac, + ) + page.draw_rect( + prect, + color=(0, 0, 0), + fill=(0, 0, 0), + ) + censor_status_datas[file_id]["page"] = i + 1 + censor_status_datas[file_id]["pages"] = npage + censor_status_datas[file_id]["done"] = False + censor_status_update_events[file_id].set() + # This costs us dearly + bitmap = page.get_pixmap(dpi=400) + pdf_bytes = bitmap.pdfocr_tobytes( + language="deu", + tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to the tesseract files + ) + output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + # End of the costly part + print(f"Page {i + 1}/{npage}: CENSORING DONE") + output.save(destpath) + censor_status_datas[file_id]["done"] = True + 
censor_status_update_events[file_id].set() + + def test_function(i: int) -> bytes: return b"\x00\x66\x99" diff --git a/index.html b/index.html index 2051433..e2724f3 100644 --- a/index.html +++ b/index.html @@ -134,11 +134,10 @@

+ />

diff --git a/static/app.js b/static/app.js index ceea83f..f654998 100644 --- a/static/app.js +++ b/static/app.js @@ -276,8 +276,8 @@ function submitPdf(eve) { formdata.append("fileId", doc.fID); //formdata.append("filename", doc.filename); formdata.append("ftype", doc.filetype); - if (!formdata.has("censor")) { - formdata.append("censor", "False"); + if (!formdata.has("ocr")) { + formdata.append("ocr", "False"); } console.log(formdata); submitForm(formdata);