added back option to run OCR
This commit is contained in:
93
app/main.py
93
app/main.py
@@ -414,14 +414,14 @@ async def get_submission(
|
||||
pagescales: Annotated[
|
||||
str, Form()
|
||||
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
|
||||
censor: Annotated[str, Form()],
|
||||
ocr: Annotated[str, Form()],
|
||||
):
|
||||
"""handles submission"""
|
||||
print(
|
||||
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
|
||||
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
|
||||
)
|
||||
info(
|
||||
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
|
||||
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
|
||||
)
|
||||
rects_p = json.loads(rects)
|
||||
scales_p = json.loads(pagescales)
|
||||
@@ -454,15 +454,24 @@ async def get_submission(
|
||||
censor_status_datas[fileId] = {}
|
||||
if fileId not in censor_status_update_events:
|
||||
censor_status_update_events[fileId] = asyncio.Event()
|
||||
await asyncio.to_thread(
|
||||
censor_pdf,
|
||||
filepath,
|
||||
dest,
|
||||
rects_p,
|
||||
scales_p,
|
||||
False if censor == "False" else True,
|
||||
fileId,
|
||||
)
|
||||
if ocr == "True":
|
||||
await asyncio.to_thread(
|
||||
censor_pdf_ocr,
|
||||
filepath,
|
||||
dest,
|
||||
rects_p,
|
||||
scales_p,
|
||||
fileId,
|
||||
)
|
||||
else:
|
||||
await asyncio.to_thread(
|
||||
censor_pdf,
|
||||
filepath,
|
||||
dest,
|
||||
rects_p,
|
||||
scales_p,
|
||||
fileId,
|
||||
)
|
||||
|
||||
# return {"done": "ok"}
|
||||
# print(dest)
|
||||
@@ -505,7 +514,6 @@ def censor_pdf(
|
||||
destpath: str,
|
||||
rects: List[List[List[float]]],
|
||||
scales: List[Dict[str, float]],
|
||||
secure: bool,
|
||||
file_id: str,
|
||||
):
|
||||
"""Censors pdf and saves the file to the given Destpath.
|
||||
@@ -549,6 +557,65 @@ def censor_pdf(
|
||||
censor_status_update_events[file_id].set()
|
||||
|
||||
|
||||
def censor_pdf_ocr(
    path: str,
    destpath: str,
    rects: List[List[List[float]]],
    scales: List[Dict[str, float]],
    file_id: str,
):
    """Censor a PDF, rasterize every page and rebuild it through OCR.

    For each page the requested rectangles are drawn in solid black, the page
    is rendered to a 400 dpi pixmap, and that pixmap is passed to
    ``Pixmap.pdfocr_tobytes`` (Tesseract, language ``deu``). The PDF bytes
    returned for each page are appended to a new document, which is finally
    saved to ``destpath``.

    Progress is published through the module-level ``censor_status_datas``
    and ``censor_status_update_events`` mappings: before each page is
    rendered the keys ``"page"``, ``"pages"`` and ``"done"`` are written for
    ``file_id`` and the matching event is set; after saving, ``"done"`` is
    set to ``True`` and the event is set once more. Both mappings must
    already contain an entry for ``file_id``.

    Args:
        path: Path to the source PDF document.
        destpath: Path the resulting PDF is saved to.
        rects: Per-page lists of rectangles, each given as
            ``[x, y, width, height]``. Pages without an entry, or with an
            empty list, are not censored but still rendered and OCRed.
        scales: Per-page dicts with ``"width"`` and ``"height"`` keys giving
            the size of the coordinate space the rectangles refer to; the
            rectangles are rescaled to the real page size.
        file_id: Key under which progress for this file is reported.

    Returns:
        None
    """
    # Context managers make sure both documents are closed even if rendering
    # or OCR fails half-way through.
    with pymupdf.open(path) as doc, pymupdf.open() as output:
        npage = doc.page_count
        for i, page in enumerate(doc):
            if i < len(rects) and rects[i]:
                # Convert from the client's coordinate space to PDF points.
                wfac = page.rect.width / scales[i]["width"]
                hfac = page.rect.height / scales[i]["height"]
                for rect in rects[i]:
                    prect = pymupdf.Rect(
                        rect[0] * wfac,
                        rect[1] * hfac,
                        (rect[0] + rect[2]) * wfac,
                        (rect[1] + rect[3]) * hfac,
                    )
                    page.draw_rect(
                        prect,
                        color=(0, 0, 0),
                        fill=(0, 0, 0),
                    )

            # Publish progress before the slow part so listeners see which
            # page is currently being worked on.
            censor_status_datas[file_id]["page"] = i + 1
            censor_status_datas[file_id]["pages"] = npage
            censor_status_datas[file_id]["done"] = False
            censor_status_update_events[file_id].set()

            # Rendering at 400 dpi plus OCR is by far the most expensive step.
            bitmap = page.get_pixmap(dpi=400)
            pdf_bytes = bitmap.pdfocr_tobytes(
                language="deu",
                # Tesseract must be installed; this is where its language
                # data files are expected.
                tessdata="/usr/share/tessdata/",
            )
            # Close the temporary per-page document as soon as it is merged.
            with pymupdf.Document(stream=pdf_bytes) as ocr_page:
                output.insert_pdf(ocr_page)

            print(f"Page {i + 1}/{npage}: CENSORING DONE")

        output.save(destpath)

    censor_status_datas[file_id]["done"] = True
    censor_status_update_events[file_id].set()
|
||||
|
||||
|
||||
def test_function(i: int) -> bytes:
|
||||
return b"\x00\x66\x99"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user