Merge pull request 'improve_censoring_speed' (#1) from improve_censoring_speed into main

Reviewed-on: #1
This commit was merged in pull request #1.
This commit is contained in:
2025-10-23 15:43:40 +02:00
5 changed files with 94 additions and 68 deletions

Binary file not shown.

View File

@@ -14,6 +14,8 @@ import asyncio
# import fastapi
from fastapi.staticfiles import StaticFiles
import pymupdf
# import fitz as pymupdf
import json
import re
@@ -412,14 +414,14 @@ async def get_submission(
pagescales: Annotated[
str, Form()
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
censor: Annotated[str, Form()],
ocr: Annotated[str, Form()],
):
"""handles submission"""
print(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
rects_p = json.loads(rects)
scales_p = json.loads(pagescales)
@@ -452,13 +454,22 @@ async def get_submission(
censor_status_datas[fileId] = {}
if fileId not in censor_status_update_events:
censor_status_update_events[fileId] = asyncio.Event()
if ocr == "True":
await asyncio.to_thread(
censor_pdf_ocr,
filepath,
dest,
rects_p,
scales_p,
fileId,
)
else:
await asyncio.to_thread(
censor_pdf,
filepath,
dest,
rects_p,
scales_p,
False if censor == "False" else True,
fileId,
)
@@ -487,6 +498,7 @@ async def get_censor_status(file_id: str):
async def yield_censor_status(file_id: str):
"""Internal function to yield updates to the stream"""
while True:
await censor_status_update_events[file_id].wait()
censor_status_update_events[file_id].clear()
@@ -502,11 +514,58 @@ def censor_pdf(
destpath: str,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
secure: bool,
file_id: str,
):
"""Censors pdf and saves the file to the given Destpath.
Args:
path: path to the pdf document
destpath: Path where the result is supposed to be saved to
rects: Coordinates of rectangles to be placed on the pdf document
scales: Scales of the rects coordinates for the pdf document
secure: whether or not the pdf document is supposed to be converted into an image (and back) to make sure the censoring is irreversible
Returns:
None
"""
doc = pymupdf.open(path)
page = doc[0]
npage = doc.page_count
for i in range(npage):
page = doc[i]
if i < len(rects) and rects[i] != []:
print(i)
wfac = page.rect.width / scales[i]["width"]
hfac = page.rect.height / scales[i]["height"]
for rect in rects[i]:
prect = pymupdf.Rect(
rect[0] * wfac,
rect[1] * hfac,
(rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac,
)
page.add_redact_annot(
prect,
fill=(0, 0, 0),
)
page.apply_redactions()
censor_status_datas[file_id]["page"] = i + 1
censor_status_datas[file_id]["pages"] = npage
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
doc.set_metadata({})
doc.save(destpath, garbage=4, deflate=True, clean=True)
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set()
def censor_pdf_ocr(
path: str,
destpath: str,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
file_id: str,
):
"""Censors pdf and runs OCR
If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easily removed with e.g. Inkscape
The file is converted to Pixels and then recreated.
Saves the file to the given Destpath.
Args:
path: path to the pdf document
@@ -520,12 +579,7 @@ def censor_pdf(
doc = pymupdf.open(path)
output = pymupdf.open()
page = doc[0]
# width = page.rect.width
# height = page.rect.height
# print(width, height)
npage = doc.page_count
# pages = []
# tasks = []
for i in range(npage):
page = doc[i]
if i < len(rects) and rects[i] != []:
@@ -544,13 +598,10 @@ def censor_pdf(
color=(0, 0, 0),
fill=(0, 0, 0),
)
if secure:
censor_status_datas[file_id]["page"] = i + 1
censor_status_datas[file_id]["pages"] = npage
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
# pages.append(page)
# This costs us dearly
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
@@ -559,34 +610,10 @@ def censor_pdf(
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# End of the costly part
# tasks.append(asyncio.create_task(censor_page(page)))
print(f"Page {i + 1}/{npage}: CENSORING DONE")
else:
output.insert_pdf(doc, i, i)
# if secure:
# pages_bytes: List[bytes] = []
# censor_page(pages[0])
# with multiprocessing.Pool(npage) as p:
# pages_bytes = p.map(censor_page, pages)
# pages_bytes = p.map(test_function, [1, 2, 3, 4])
# for pdf_bytes in pages_bytes:
# output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# with concurrent.futures.ThreadPoolExecutor() as executor:
# futures = []
# for page in pages:
# futures.append(executor.submit(censor_page, page))
# for future in futures:
# output.insert_pdf(pymupdf.Document(stream=future.result()))
#
# for task in tasks:
# output.insert_pdf(pymupdf.Document(stream=await task))
# print("CENSORING DONE")
output.save(destpath)
if secure:
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set()
# censor_finished_flags[file_id].set()
def test_function(i: int) -> bytes:

View File

@@ -134,11 +134,10 @@
</div>
<input
type="checkbox"
name="censor"
name="ocr"
id="sec_censor"
value="True"
checked
/><label for="sec_censor">Zensieren</label><br /><br />
/><label for="sec_censor">OCR</label><br /><br />
<button type="submit" id="send">Senden</button>
</form>
</div>

View File

@@ -276,8 +276,8 @@ function submitPdf(eve) {
formdata.append("fileId", doc.fID);
//formdata.append("filename", doc.filename);
formdata.append("ftype", doc.filetype);
if (!formdata.has("censor")) {
formdata.append("censor", "False");
if (!formdata.has("ocr")) {
formdata.append("ocr", "False");
}
console.log(formdata);
submitForm(formdata);

View File

@@ -21,7 +21,7 @@ function autocomplete(inp, type) {
i,
apirq,
iname,
val = this.value;
val = inp.value;
/*close any already open lists of autocompleted values*/
closeAllLists();
if (!val && type === "lva" && pid === null) {
@@ -56,7 +56,7 @@ function autocomplete(inp, type) {
a.setAttribute("id", this.id + "autocomplete-list");
a.setAttribute("class", "autocomplete-items");
/*append the DIV element as a child of the autocomplete container:*/
this.parentNode.appendChild(a);
inp.parentNode.appendChild(a);
/*for each item in the array...*/
//await response;
if (response.ok) {