added back option to run OCR

This commit is contained in:
Marcel Gansfusz
2025-10-23 00:06:25 +02:00
parent 56d3468889
commit c30d69d205
4 changed files with 84 additions and 18 deletions

Binary file not shown.

View File

@@ -414,14 +414,14 @@ async def get_submission(
pagescales: Annotated[ pagescales: Annotated[
str, Form() str, Form()
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
censor: Annotated[str, Form()], ocr: Annotated[str, Form()],
): ):
"""handles submission""" """handles submission"""
print( print(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
) )
info( info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
) )
rects_p = json.loads(rects) rects_p = json.loads(rects)
scales_p = json.loads(pagescales) scales_p = json.loads(pagescales)
@@ -454,15 +454,24 @@ async def get_submission(
censor_status_datas[fileId] = {} censor_status_datas[fileId] = {}
if fileId not in censor_status_update_events: if fileId not in censor_status_update_events:
censor_status_update_events[fileId] = asyncio.Event() censor_status_update_events[fileId] = asyncio.Event()
await asyncio.to_thread( if ocr == "True":
censor_pdf, await asyncio.to_thread(
filepath, censor_pdf_ocr,
dest, filepath,
rects_p, dest,
scales_p, rects_p,
False if censor == "False" else True, scales_p,
fileId, fileId,
) )
else:
await asyncio.to_thread(
censor_pdf,
filepath,
dest,
rects_p,
scales_p,
fileId,
)
# return {"done": "ok"} # return {"done": "ok"}
# print(dest) # print(dest)
@@ -505,7 +514,6 @@ def censor_pdf(
destpath: str, destpath: str,
rects: List[List[List[float]]], rects: List[List[List[float]]],
scales: List[Dict[str, float]], scales: List[Dict[str, float]],
secure: bool,
file_id: str, file_id: str,
): ):
"""Censors pdf and saves the file to the given Destpath. """Censors pdf and saves the file to the given Destpath.
@@ -549,6 +557,65 @@ def censor_pdf(
censor_status_update_events[file_id].set() censor_status_update_events[file_id].set()
def censor_pdf_ocr(
    path: str,
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
    file_id: str,
    *,
    dpi: int = 400,
    language: str = "deu",
    tessdata: str = "/usr/share/tessdata/",
) -> None:
    """Censor a PDF, rasterize every page and rebuild it with OCR.

    Black rectangles are drawn over the requested areas, then each page is
    rendered to a pixmap and converted back into a one-page PDF via
    Tesseract OCR. The resulting pages are collected into a new document
    that is saved to ``destpath``.

    Progress is published through the module-level ``censor_status_datas``
    and ``censor_status_update_events`` entries for ``file_id``; both entries
    must already exist when this function is called.

    Args:
        path: Path to the source PDF document.
        destpath: Path the resulting document is saved to.
        rects: Per page, a list of rectangles ``[x, y, width, height]`` given
            in the coordinate system described by ``scales``.
        scales: Per page, a mapping with the ``"width"`` and ``"height"`` the
            rectangle coordinates of that page refer to.
        file_id: Key of the status dict and update event used to report
            progress.
        dpi: Resolution used when rendering a page to pixels.
        language: Tesseract language code used for OCR.
        tessdata: Directory containing the Tesseract language data;
            Tesseract must be installed.

    Returns:
        None
    """
    # Context managers make sure both documents are closed again, even when
    # rendering or OCR fails halfway through.
    with pymupdf.open(path) as doc, pymupdf.open() as output:
        npage = doc.page_count
        for i in range(npage):
            page = doc[i]
            if i < len(rects) and rects[i]:
                print(i)
                # Factors translating the submitted coordinates into the
                # page's own coordinate system.
                wfac = page.rect.width / scales[i]["width"]
                hfac = page.rect.height / scales[i]["height"]
                for rect in rects[i]:
                    prect = pymupdf.Rect(
                        rect[0] * wfac,
                        rect[1] * hfac,
                        (rect[0] + rect[2]) * wfac,
                        (rect[1] + rect[3]) * hfac,
                    )
                    page.draw_rect(
                        prect,
                        color=(0, 0, 0),
                        fill=(0, 0, 0),
                    )
            # Report progress before the expensive step so that listeners see
            # which page is currently being processed.
            censor_status_datas[file_id]["page"] = i + 1
            censor_status_datas[file_id]["pages"] = npage
            censor_status_datas[file_id]["done"] = False
            censor_status_update_events[file_id].set()
            # Rasterizing and running OCR is by far the most expensive part.
            bitmap = page.get_pixmap(dpi=dpi)
            pdf_bytes = bitmap.pdfocr_tobytes(
                language=language,
                tessdata=tessdata,
            )
            # The intermediate one-page document is only needed until its
            # page has been copied into the output.
            with pymupdf.Document(stream=pdf_bytes) as ocr_page:
                output.insert_pdf(ocr_page)
            print(f"Page {i + 1}/{npage}: CENSORING DONE")
        output.save(destpath)
    censor_status_datas[file_id]["done"] = True
    censor_status_update_events[file_id].set()
def test_function(i: int) -> bytes: def test_function(i: int) -> bytes:
return b"\x00\x66\x99" return b"\x00\x66\x99"

View File

@@ -134,11 +134,10 @@
</div> </div>
<input <input
type="checkbox" type="checkbox"
name="censor" name="ocr"
id="sec_censor" id="sec_censor"
value="True" value="True"
checked /><label for="sec_censor">OCR</label><br /><br />
/><label for="sec_censor">Zensieren</label><br /><br />
<button type="submit" id="send">Senden</button> <button type="submit" id="send">Senden</button>
</form> </form>
</div> </div>

View File

@@ -276,8 +276,8 @@ function submitPdf(eve) {
formdata.append("fileId", doc.fID); formdata.append("fileId", doc.fID);
//formdata.append("filename", doc.filename); //formdata.append("filename", doc.filename);
formdata.append("ftype", doc.filetype); formdata.append("ftype", doc.filetype);
if (!formdata.has("censor")) { if (!formdata.has("ocr")) {
formdata.append("censor", "False"); formdata.append("ocr", "False");
} }
console.log(formdata); console.log(formdata);
submitForm(formdata); submitForm(formdata);