diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index 5371756..098bd05 100644 Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ diff --git a/app/main.py b/app/main.py index 41d7fc7..19ed9ad 100644 --- a/app/main.py +++ b/app/main.py @@ -14,6 +14,8 @@ import asyncio # import fastapi from fastapi.staticfiles import StaticFiles import pymupdf + +# import fitz as pymupdf import json import re @@ -412,14 +414,14 @@ async def get_submission( pagescales: Annotated[ str, Form() ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], - censor: Annotated[str, Form()], + ocr: Annotated[str, Form()], ): """handles submission""" print( - f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" + f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}" ) info( - f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}" + f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}" ) rects_p = json.loads(rects) scales_p = json.loads(pagescales) @@ -452,15 +454,24 @@ async def get_submission( censor_status_datas[fileId] = {} if fileId not in censor_status_update_events: censor_status_update_events[fileId] = asyncio.Event() - await asyncio.to_thread( - censor_pdf, - filepath, - dest, - rects_p, - scales_p, - False if censor == "False" else True, - fileId, - ) + if ocr == "True": + await asyncio.to_thread( + censor_pdf_ocr, + filepath, + dest, + rects_p, + scales_p, + fileId, + ) + else: + await asyncio.to_thread( + censor_pdf, + filepath, + dest, + rects_p, 
+ scales_p, + fileId, + ) # return {"done": "ok"} # print(dest) @@ -487,6 +498,7 @@ async def get_censor_status(file_id: str): async def yield_censor_status(file_id: str): + """Internal function to yield updates to the stream""" while True: await censor_status_update_events[file_id].wait() censor_status_update_events[file_id].clear() @@ -502,11 +514,58 @@ def censor_pdf( destpath: str, rects: List[List[List[float]]], scales: List[Dict[str, float]], - secure: bool, + file_id: str, +): + """Censors pdf and saves the file to the given Destpath. + Args: + path: path to the pdf document + destpath: Path where the result is supposed to be saved to + rects: Coordinates of rectangles to be placed on the pdf document + scales: Scales of the rects coordinates for the pdf document + file_id: key under which the censoring progress (page, pages, done) is stored in censor_status_datas and whose update event is set after each page + Returns: + None + """ + doc = pymupdf.open(path) + page = doc[0] + npage = doc.page_count + for i in range(npage): + page = doc[i] + if i < len(rects) and rects[i] != []: + print(i) + wfac = page.rect.width / scales[i]["width"] + hfac = page.rect.height / scales[i]["height"] + for rect in rects[i]: + prect = pymupdf.Rect( + rect[0] * wfac, + rect[1] * hfac, + (rect[0] + rect[2]) * wfac, + (rect[1] + rect[3]) * hfac, + ) + page.add_redact_annot( + prect, + fill=(0, 0, 0), + ) + page.apply_redactions() + censor_status_datas[file_id]["page"] = i + 1 + censor_status_datas[file_id]["pages"] = npage + censor_status_datas[file_id]["done"] = False + censor_status_update_events[file_id].set() + doc.set_metadata({}) + doc.save(destpath, garbage=4, deflate=True, clean=True) + censor_status_datas[file_id]["done"] = True + censor_status_update_events[file_id].set() + + +def censor_pdf_ocr( + path: str, + destpath: str, + rects: List[List[List[float]]], + scales: List[Dict[str, float]], file_id: str, ): """Censors pdf and runs OCR - If Secure is True the file is converted
to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape + The file is converted to Pixels and then recreated. Saves the file to the given Destpath. Args: path: path to the pdf document @@ -520,12 +579,7 @@ def censor_pdf( doc = pymupdf.open(path) output = pymupdf.open() page = doc[0] - # width = page.rect.width - # height = page.rect.height - # print(width, height) npage = doc.page_count - # pages = [] - # tasks = [] for i in range(npage): page = doc[i] if i < len(rects) and rects[i] != []: @@ -544,49 +598,22 @@ def censor_pdf( color=(0, 0, 0), fill=(0, 0, 0), ) - if secure: - censor_status_datas[file_id]["page"] = i + 1 - censor_status_datas[file_id]["pages"] = npage - censor_status_datas[file_id]["done"] = False - censor_status_update_events[file_id].set() - - # pages.append(page) - # THis Costs us dearly - bitmap = page.get_pixmap(dpi=400) - pdf_bytes = bitmap.pdfocr_tobytes( - language="deu", - tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files - ) - output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # End of the costly part - # tasks.append(asyncio.create_task(censor_page(page))) - print(f"Page {i + 1}/{npage}: CENSORING DONE") - else: - output.insert_pdf(doc, i, i) - - # if secure: - # pages_bytes: List[bytes] = [] - # censor_page(pages[0]) - # with multiprocessing.Pool(npage) as p: - # pages_bytes = p.map(censor_page, pages) - # pages_bytes = p.map(test_function, [1, 2, 3, 4]) - # for pdf_bytes in pages_bytes: - # output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # with concurrent.futures.ThreadPoolExecutor() as executor: - # futures = [] - # for page in pages: - # futures.append(executor.submit(censor_page, page)) - # for future in futures: - # output.insert_pdf(pymupdf.Document(stream=future.result())) - # - # for task in tasks: - # output.insert_pdf(pymupdf.Document(stream=await task)) - # print("CENSORING 
DONE") - output.save(destpath) - if secure: - censor_status_datas[file_id]["done"] = True + censor_status_datas[file_id]["page"] = i + 1 + censor_status_datas[file_id]["pages"] = npage + censor_status_datas[file_id]["done"] = False censor_status_update_events[file_id].set() - # censor_finished_flags[file_id].set() + # This costs us dearly + bitmap = page.get_pixmap(dpi=400) + pdf_bytes = bitmap.pdfocr_tobytes( + language="deu", + tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to the tesseract files + ) + output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + # End of the costly part + print(f"Page {i + 1}/{npage}: CENSORING DONE") + output.save(destpath) + censor_status_datas[file_id]["done"] = True + censor_status_update_events[file_id].set() def test_function(i: int) -> bytes: diff --git a/index.html b/index.html index 2051433..e2724f3 100644 --- a/index.html +++ b/index.html @@ -134,11 +134,10 @@

+ />

diff --git a/static/app.js b/static/app.js index ceea83f..f654998 100644 --- a/static/app.js +++ b/static/app.js @@ -276,8 +276,8 @@ function submitPdf(eve) { formdata.append("fileId", doc.fID); //formdata.append("filename", doc.filename); formdata.append("ftype", doc.filetype); - if (!formdata.has("censor")) { - formdata.append("censor", "False"); + if (!formdata.has("ocr")) { + formdata.append("ocr", "False"); } console.log(formdata); submitForm(formdata); diff --git a/static/autocomplete.js b/static/autocomplete.js index 6824849..959a694 100644 --- a/static/autocomplete.js +++ b/static/autocomplete.js @@ -21,7 +21,7 @@ function autocomplete(inp, type) { i, apirq, iname, - val = this.value; + val = inp.value; /*close any already open lists of autocompleted values*/ closeAllLists(); if (!val && type === "lva" && pid === null) { @@ -56,7 +56,7 @@ function autocomplete(inp, type) { a.setAttribute("id", this.id + "autocomplete-list"); a.setAttribute("class", "autocomplete-items"); /*append the DIV element as a child of the autocomplete container:*/ - this.parentNode.appendChild(a); + inp.parentNode.appendChild(a); /*for each item in the array...*/ //await response; if (response.ok) {