Changed the censoring mode to built-in censoring with pymupdf

This commit is contained in:
Marcel Gansfusz
2025-10-22 23:26:33 +02:00
parent 26ea274023
commit 56d3468889
2 changed files with 13 additions and 53 deletions

Binary file not shown.

View File

@@ -14,6 +14,8 @@ import asyncio
# import fastapi # import fastapi
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
import pymupdf import pymupdf
# import fitz as pymupdf
import json import json
import re import re
@@ -487,6 +489,7 @@ async def get_censor_status(file_id: str):
async def yield_censor_status(file_id: str): async def yield_censor_status(file_id: str):
"""Internal function to yield updates to the stream"""
while True: while True:
await censor_status_update_events[file_id].wait() await censor_status_update_events[file_id].wait()
censor_status_update_events[file_id].clear() censor_status_update_events[file_id].clear()
@@ -505,9 +508,7 @@ def censor_pdf(
secure: bool, secure: bool,
file_id: str, file_id: str,
): ):
"""Censors pdf and runs OCR """Censors pdf and saves the file to the given Destpath.
If Secure is True, the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easily removed with e.g. Inkscape.
Saves the file to the given Destpath.
Args: Args:
path: path to the pdf document path: path to the pdf document
destpath: Path where the result is supposed to be saved to destpath: Path where the result is supposed to be saved to
@@ -518,14 +519,8 @@ def censor_pdf(
None None
""" """
doc = pymupdf.open(path) doc = pymupdf.open(path)
output = pymupdf.open()
page = doc[0] page = doc[0]
# width = page.rect.width
# height = page.rect.height
# print(width, height)
npage = doc.page_count npage = doc.page_count
# pages = []
# tasks = []
for i in range(npage): for i in range(npage):
page = doc[i] page = doc[i]
if i < len(rects) and rects[i] != []: if i < len(rects) and rects[i] != []:
@@ -539,54 +534,19 @@ def censor_pdf(
(rect[0] + rect[2]) * wfac, (rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac, (rect[1] + rect[3]) * hfac,
) )
page.draw_rect( page.add_redact_annot(
prect, prect,
color=(0, 0, 0),
fill=(0, 0, 0), fill=(0, 0, 0),
) )
if secure: page.apply_redactions()
censor_status_datas[file_id]["page"] = i + 1 censor_status_datas[file_id]["page"] = i + 1
censor_status_datas[file_id]["pages"] = npage censor_status_datas[file_id]["pages"] = npage
censor_status_datas[file_id]["done"] = False censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
# pages.append(page)
# THis Costs us dearly
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# End of the costly part
# tasks.append(asyncio.create_task(censor_page(page)))
print(f"Page {i + 1}/{npage}: CENSORING DONE")
else:
output.insert_pdf(doc, i, i)
# if secure:
# pages_bytes: List[bytes] = []
# censor_page(pages[0])
# with multiprocessing.Pool(npage) as p:
# pages_bytes = p.map(censor_page, pages)
# pages_bytes = p.map(test_function, [1, 2, 3, 4])
# for pdf_bytes in pages_bytes:
# output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# with concurrent.futures.ThreadPoolExecutor() as executor:
# futures = []
# for page in pages:
# futures.append(executor.submit(censor_page, page))
# for future in futures:
# output.insert_pdf(pymupdf.Document(stream=future.result()))
#
# for task in tasks:
# output.insert_pdf(pymupdf.Document(stream=await task))
# print("CENSORING DONE")
output.save(destpath)
if secure:
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set() censor_status_update_events[file_id].set()
# censor_finished_flags[file_id].set() doc.set_metadata({})
doc.save(destpath, garbage=4, deflate=True, clean=True)
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set()
def test_function(i: int) -> bytes: def test_function(i: int) -> bytes: