Changed the censoring mode to PyMuPDF's built-in redaction

This commit is contained in:
Marcel Gansfusz
2025-10-22 23:26:33 +02:00
parent 26ea274023
commit 56d3468889
2 changed files with 13 additions and 53 deletions

Binary file not shown.

View File

@@ -14,6 +14,8 @@ import asyncio
# import fastapi
from fastapi.staticfiles import StaticFiles
import pymupdf
# import fitz as pymupdf
import json
import re
@@ -487,6 +489,7 @@ async def get_censor_status(file_id: str):
async def yield_censor_status(file_id: str):
"""Internal function to yield updates to the stream"""
while True:
await censor_status_update_events[file_id].wait()
censor_status_update_events[file_id].clear()
@@ -505,9 +508,7 @@ def censor_pdf(
secure: bool,
file_id: str,
):
"""Censors pdf and runs OCR
If secure is True, the file is converted to pixels and then recreated; otherwise the censored sections merely cover the text below and can easily be removed with e.g. Inkscape.
Saves the file to the given Destpath.
"""Censors pdf and saves the file to the given Destpath.
Args:
path: path to the pdf document
destpath: Path where the result is supposed to be saved to
@@ -518,14 +519,8 @@ def censor_pdf(
None
"""
doc = pymupdf.open(path)
output = pymupdf.open()
page = doc[0]
# width = page.rect.width
# height = page.rect.height
# print(width, height)
npage = doc.page_count
# pages = []
# tasks = []
for i in range(npage):
page = doc[i]
if i < len(rects) and rects[i] != []:
@@ -539,54 +534,19 @@ def censor_pdf(
(rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac,
)
page.draw_rect(
page.add_redact_annot(
prect,
color=(0, 0, 0),
fill=(0, 0, 0),
)
if secure:
page.apply_redactions()
censor_status_datas[file_id]["page"] = i + 1
censor_status_datas[file_id]["pages"] = npage
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
# pages.append(page)
# This costs us dearly
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# End of the costly part
# tasks.append(asyncio.create_task(censor_page(page)))
print(f"Page {i + 1}/{npage}: CENSORING DONE")
else:
output.insert_pdf(doc, i, i)
# if secure:
# pages_bytes: List[bytes] = []
# censor_page(pages[0])
# with multiprocessing.Pool(npage) as p:
# pages_bytes = p.map(censor_page, pages)
# pages_bytes = p.map(test_function, [1, 2, 3, 4])
# for pdf_bytes in pages_bytes:
# output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
# with concurrent.futures.ThreadPoolExecutor() as executor:
# futures = []
# for page in pages:
# futures.append(executor.submit(censor_page, page))
# for future in futures:
# output.insert_pdf(pymupdf.Document(stream=future.result()))
#
# for task in tasks:
# output.insert_pdf(pymupdf.Document(stream=await task))
# print("CENSORING DONE")
output.save(destpath)
if secure:
doc.set_metadata({})
doc.save(destpath, garbage=4, deflate=True, clean=True)
censor_status_datas[file_id]["done"] = True
censor_status_update_events[file_id].set()
# censor_finished_flags[file_id].set()
def test_function(i: int) -> bytes: