diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc index 5371756..dab1333 100644 Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ diff --git a/app/main.py b/app/main.py index 41d7fc7..215e881 100644 --- a/app/main.py +++ b/app/main.py @@ -14,6 +14,8 @@ import asyncio # import fastapi from fastapi.staticfiles import StaticFiles import pymupdf + +# import fitz as pymupdf import json import re @@ -487,6 +489,7 @@ async def get_censor_status(file_id: str): async def yield_censor_status(file_id: str): + """Internal function to yield updates to the stream""" while True: await censor_status_update_events[file_id].wait() censor_status_update_events[file_id].clear() @@ -505,9 +508,7 @@ def censor_pdf( secure: bool, file_id: str, ): - """Censors pdf and runs OCR - If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape - Saves the file to the given Destpath. + """Censors pdf and saves the file to the given Destpath. Args: path: path to the pdf document destpath: Path where the result is supposed to be saved to @@ -518,14 +519,8 @@ def censor_pdf( None """ doc = pymupdf.open(path) - output = pymupdf.open() page = doc[0] - # width = page.rect.width - # height = page.rect.height - # print(width, height) npage = doc.page_count - # pages = [] - # tasks = [] for i in range(npage): page = doc[i] if i < len(rects) and rects[i] != []: @@ -539,54 +534,19 @@ def censor_pdf( (rect[0] + rect[2]) * wfac, (rect[1] + rect[3]) * hfac, ) - page.draw_rect( + page.add_redact_annot( prect, - color=(0, 0, 0), fill=(0, 0, 0), ) - if secure: - censor_status_datas[file_id]["page"] = i + 1 - censor_status_datas[file_id]["pages"] = npage - censor_status_datas[file_id]["done"] = False - censor_status_update_events[file_id].set() - - # pages.append(page) - # THis Costs us dearly - bitmap = page.get_pixmap(dpi=400) - pdf_bytes = bitmap.pdfocr_tobytes( - language="deu", - tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files - ) - output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # End of the costly part - # tasks.append(asyncio.create_task(censor_page(page))) - print(f"Page {i + 1}/{npage}: CENSORING DONE") - else: - output.insert_pdf(doc, i, i) - - # if secure: - # pages_bytes: List[bytes] = [] - # censor_page(pages[0]) - # with multiprocessing.Pool(npage) as p: - # pages_bytes = p.map(censor_page, pages) - # pages_bytes = p.map(test_function, [1, 2, 3, 4]) - # for pdf_bytes in pages_bytes: - # output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) - # with concurrent.futures.ThreadPoolExecutor() as executor: - # futures = [] - # for page in pages: - # futures.append(executor.submit(censor_page, page)) - # for future in futures: - # output.insert_pdf(pymupdf.Document(stream=future.result())) - # - # for task in tasks: - # output.insert_pdf(pymupdf.Document(stream=await task)) - # print("CENSORING DONE") - output.save(destpath) - if secure: - censor_status_datas[file_id]["done"] = True + page.apply_redactions() + censor_status_datas[file_id]["page"] = i + 1 + censor_status_datas[file_id]["pages"] = npage + censor_status_datas[file_id]["done"] = False censor_status_update_events[file_id].set() - # censor_finished_flags[file_id].set() + doc.set_metadata({}) + doc.save(destpath, garbage=4, deflate=True, clean=True) + censor_status_datas[file_id]["done"] = True + censor_status_update_events[file_id].set() def test_function(i: int) -> bytes: