Compare commits

..

3 Commits

Author SHA1 Message Date
Marcel Gansfusz
5c6a8dfba2 fixed bug in js, that blocked showing prof suggestions when nothing is entered in the field 2025-10-23 15:40:45 +02:00
Marcel Gansfusz
c30d69d205 added back option to run OCR 2025-10-23 00:06:25 +02:00
Marcel Gansfusz
56d3468889 changed the censoring mode to built in censoring with pymupdf 2025-10-22 23:26:33 +02:00
5 changed files with 94 additions and 68 deletions

Binary file not shown.

View File

@@ -14,6 +14,8 @@ import asyncio
# import fastapi # import fastapi
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
import pymupdf import pymupdf
# import fitz as pymupdf
import json import json
import re import re
@@ -412,14 +414,14 @@ async def get_submission(
pagescales: Annotated[ pagescales: Annotated[
str, Form() str, Form()
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
censor: Annotated[str, Form()], ocr: Annotated[str, Form()],
): ):
"""handles submission""" """handles submission"""
print( print(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
) )
info( info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
) )
rects_p = json.loads(rects) rects_p = json.loads(rects)
scales_p = json.loads(pagescales) scales_p = json.loads(pagescales)
@@ -452,15 +454,24 @@ async def get_submission(
censor_status_datas[fileId] = {} censor_status_datas[fileId] = {}
if fileId not in censor_status_update_events: if fileId not in censor_status_update_events:
censor_status_update_events[fileId] = asyncio.Event() censor_status_update_events[fileId] = asyncio.Event()
await asyncio.to_thread( if ocr == "True":
censor_pdf, await asyncio.to_thread(
filepath, censor_pdf_ocr,
dest, filepath,
rects_p, dest,
scales_p, rects_p,
False if censor == "False" else True, scales_p,
fileId, fileId,
) )
else:
await asyncio.to_thread(
censor_pdf,
filepath,
dest,
rects_p,
scales_p,
fileId,
)
# return {"done": "ok"} # return {"done": "ok"}
# print(dest) # print(dest)
@@ -487,6 +498,7 @@ async def get_censor_status(file_id: str):
async def yield_censor_status(file_id: str): async def yield_censor_status(file_id: str):
"""Internal function to yield updates to the stream"""
while True: while True:
await censor_status_update_events[file_id].wait() await censor_status_update_events[file_id].wait()
censor_status_update_events[file_id].clear() censor_status_update_events[file_id].clear()
@@ -502,11 +514,58 @@ def censor_pdf(
destpath: str, destpath: str,
rects: List[List[List[float]]], rects: List[List[List[float]]],
scales: List[Dict[str, float]], scales: List[Dict[str, float]],
secure: bool, file_id: str,
):
"""Censors pdf and saves the file to the given Destpath.
Args:
path: path to the pdf document
destpath: Path where the result is supposed to be saved to
rects: Coordinates of rectangles to be placed on the pdf document
scales: Scales of the rects coordinates for the pdf document
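file_id: key under which progress ("page", "pages", "done") is written to censor_status_datas and whose entry in censor_status_update_events is set after each update (replaces the former `secure` flag, which this function no longer takes)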
Returns:
None
"""
doc = pymupdf.open(path)
page = doc[0]
npage = doc.page_count
for i in range(npage):
page = doc[i]
if i < len(rects) and rects[i] != []:
print(i)
wfac = page.rect.width / scales[i]["width"]
hfac = page.rect.height / scales[i]["height"]
for rect in rects[i]:
prect = pymupdf.Rect(
rect[0] * wfac,
rect[1] * hfac,
(rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac,
)
page.add_redact_annot(
prect,
fill=(0, 0, 0),
)
page.apply_redactions()
censor_status_datas[file_id]["page"] = i + 1
censor_status_datas[file_id]["pages"] = npage
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
doc.set_metadata({})
doc.save(destpath, garbage=4, deflate=True, clean=True)
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set()
def censor_pdf_ocr(
path: str,
destpath: str,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
file_id: str, file_id: str,
): ):
"""Censors pdf and runs OCR """Censors pdf and runs OCR
If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape The file is converted to Pixels and then recreated.
Saves the file to the given Destpath. Saves the file to the given Destpath.
Args: Args:
path: path to the pdf document path: path to the pdf document
@@ -520,12 +579,7 @@ def censor_pdf(
doc = pymupdf.open(path) doc = pymupdf.open(path)
output = pymupdf.open() output = pymupdf.open()
page = doc[0] page = doc[0]
# width = page.rect.width
# height = page.rect.height
# print(width, height)
npage = doc.page_count npage = doc.page_count
# pages = []
# tasks = []
for i in range(npage): for i in range(npage):
page = doc[i] page = doc[i]
if i < len(rects) and rects[i] != []: if i < len(rects) and rects[i] != []:
@@ -544,49 +598,22 @@ def censor_pdf(
color=(0, 0, 0), color=(0, 0, 0),
fill=(0, 0, 0), fill=(0, 0, 0),
) )
if secure: censor_status_datas[file_id]["page"] = i + 1
censor_status_datas[file_id]["page"] = i + 1 censor_status_datas[file_id]["pages"] = npage
censor_status_datas[file_id]["pages"] = npage censor_status_datas[file_id]["done"] = False
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
# pages.append(page)
# THis Costs us dearly
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# End of the costly part
# tasks.append(asyncio.create_task(censor_page(page)))
print(f"Page {i + 1}/{npage}: CENSORING DONE")
else:
output.insert_pdf(doc, i, i)
# if secure:
# pages_bytes: List[bytes] = []
# censor_page(pages[0])
# with multiprocessing.Pool(npage) as p:
# pages_bytes = p.map(censor_page, pages)
# pages_bytes = p.map(test_function, [1, 2, 3, 4])
# for pdf_bytes in pages_bytes:
# output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# with concurrent.futures.ThreadPoolExecutor() as executor:
# futures = []
# for page in pages:
# futures.append(executor.submit(censor_page, page))
# for future in futures:
# output.insert_pdf(pymupdf.Document(stream=future.result()))
#
# for task in tasks:
# output.insert_pdf(pymupdf.Document(stream=await task))
# print("CENSORING DONE")
output.save(destpath)
if secure:
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set() censor_status_update_events[file_id].set()
# censor_finished_flags[file_id].set() # THis Costs us dearly
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# End of the costly part
print(f"Page {i + 1}/{npage}: CENSORING DONE")
output.save(destpath)
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set()
def test_function(i: int) -> bytes: def test_function(i: int) -> bytes:

View File

@@ -134,11 +134,10 @@
</div> </div>
<input <input
type="checkbox" type="checkbox"
name="censor" name="ocr"
id="sec_censor" id="sec_censor"
value="True" value="True"
checked /><label for="sec_censor">OCR</label><br /><br />
/><label for="sec_censor">Zensieren</label><br /><br />
<button type="submit" id="send">Senden</button> <button type="submit" id="send">Senden</button>
</form> </form>
</div> </div>

View File

@@ -276,8 +276,8 @@ function submitPdf(eve) {
formdata.append("fileId", doc.fID); formdata.append("fileId", doc.fID);
//formdata.append("filename", doc.filename); //formdata.append("filename", doc.filename);
formdata.append("ftype", doc.filetype); formdata.append("ftype", doc.filetype);
if (!formdata.has("censor")) { if (!formdata.has("ocr")) {
formdata.append("censor", "False"); formdata.append("ocr", "False");
} }
console.log(formdata); console.log(formdata);
submitForm(formdata); submitForm(formdata);

View File

@@ -21,7 +21,7 @@ function autocomplete(inp, type) {
i, i,
apirq, apirq,
iname, iname,
val = this.value; val = inp.value;
/*close any already open lists of autocompleted values*/ /*close any already open lists of autocompleted values*/
closeAllLists(); closeAllLists();
if (!val && type === "lva" && pid === null) { if (!val && type === "lva" && pid === null) {
@@ -56,7 +56,7 @@ function autocomplete(inp, type) {
a.setAttribute("id", this.id + "autocomplete-list"); a.setAttribute("id", this.id + "autocomplete-list");
a.setAttribute("class", "autocomplete-items"); a.setAttribute("class", "autocomplete-items");
/*append the DIV element as a child of the autocomplete container:*/ /*append the DIV element as a child of the autocomplete container:*/
this.parentNode.appendChild(a); inp.parentNode.appendChild(a);
/*for each item in the array...*/ /*for each item in the array...*/
//await response; //await response;
if (response.ok) { if (response.ok) {