Changed the censoring mode to built-in censoring with PyMuPDF
This commit is contained in:
Binary file not shown.
56
app/main.py
56
app/main.py
@@ -14,6 +14,8 @@ import asyncio
|
||||
# import fastapi
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
import pymupdf
|
||||
|
||||
# import fitz as pymupdf
|
||||
import json
|
||||
import re
|
||||
|
||||
@@ -487,6 +489,7 @@ async def get_censor_status(file_id: str):
|
||||
|
||||
|
||||
async def yield_censor_status(file_id: str):
|
||||
"""Internal function to yield updates to the stream"""
|
||||
while True:
|
||||
await censor_status_update_events[file_id].wait()
|
||||
censor_status_update_events[file_id].clear()
|
||||
@@ -505,9 +508,7 @@ def censor_pdf(
|
||||
secure: bool,
|
||||
file_id: str,
|
||||
):
|
||||
"""Censors pdf and runs OCR
|
||||
If secure is True, the file is converted to pixels and then recreated; otherwise the censored sections merely cover the text below and can easily be removed with e.g. Inkscape.
|
||||
Saves the file to the given Destpath.
|
||||
"""Censors pdf and saves the file to the given Destpath.
|
||||
Args:
|
||||
path: path to the pdf document
|
||||
destpath: Path where the result is supposed to be saved to
|
||||
@@ -518,14 +519,8 @@ def censor_pdf(
|
||||
None
|
||||
"""
|
||||
doc = pymupdf.open(path)
|
||||
output = pymupdf.open()
|
||||
page = doc[0]
|
||||
# width = page.rect.width
|
||||
# height = page.rect.height
|
||||
# print(width, height)
|
||||
npage = doc.page_count
|
||||
# pages = []
|
||||
# tasks = []
|
||||
for i in range(npage):
|
||||
page = doc[i]
|
||||
if i < len(rects) and rects[i] != []:
|
||||
@@ -539,54 +534,19 @@ def censor_pdf(
|
||||
(rect[0] + rect[2]) * wfac,
|
||||
(rect[1] + rect[3]) * hfac,
|
||||
)
|
||||
page.draw_rect(
|
||||
page.add_redact_annot(
|
||||
prect,
|
||||
color=(0, 0, 0),
|
||||
fill=(0, 0, 0),
|
||||
)
|
||||
if secure:
|
||||
page.apply_redactions()
|
||||
censor_status_datas[file_id]["page"] = i + 1
|
||||
censor_status_datas[file_id]["pages"] = npage
|
||||
censor_status_datas[file_id]["done"] = False
|
||||
censor_status_update_events[file_id].set()
|
||||
|
||||
# pages.append(page)
|
||||
# This costs us dearly
|
||||
bitmap = page.get_pixmap(dpi=400)
|
||||
pdf_bytes = bitmap.pdfocr_tobytes(
|
||||
language="deu",
|
||||
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to the tesseract files
|
||||
)
|
||||
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
|
||||
# End of the costly part
|
||||
# tasks.append(asyncio.create_task(censor_page(page)))
|
||||
print(f"Page {i + 1}/{npage}: CENSORING DONE")
|
||||
else:
|
||||
output.insert_pdf(doc, i, i)
|
||||
|
||||
# if secure:
|
||||
# pages_bytes: List[bytes] = []
|
||||
# censor_page(pages[0])
|
||||
# with multiprocessing.Pool(npage) as p:
|
||||
# pages_bytes = p.map(censor_page, pages)
|
||||
# pages_bytes = p.map(test_function, [1, 2, 3, 4])
|
||||
# for pdf_bytes in pages_bytes:
|
||||
# output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
|
||||
# with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
# futures = []
|
||||
# for page in pages:
|
||||
# futures.append(executor.submit(censor_page, page))
|
||||
# for future in futures:
|
||||
# output.insert_pdf(pymupdf.Document(stream=future.result()))
|
||||
#
|
||||
# for task in tasks:
|
||||
# output.insert_pdf(pymupdf.Document(stream=await task))
|
||||
# print("CENSORING DONE")
|
||||
output.save(destpath)
|
||||
if secure:
|
||||
doc.set_metadata({})
|
||||
doc.save(destpath, garbage=4, deflate=True, clean=True)
|
||||
censor_status_datas[file_id]["done"] = True
|
||||
censor_status_update_events[file_id].set()
|
||||
# censor_finished_flags[file_id].set()
|
||||
|
||||
|
||||
def test_function(i: int) -> bytes:
|
||||
|
||||
Reference in New Issue
Block a user