diff --git a/app/main.py b/app/main.py index e421051..69d6301 100644 --- a/app/main.py +++ b/app/main.py @@ -671,12 +671,31 @@ def censor_pdf_ocr( censor_status_datas[file_id]["done"] = False censor_status_update_events[file_id].set() # THis Costs us dearly - bitmap = page.get_pixmap(dpi=400) - pdf_bytes = bitmap.pdfocr_tobytes( - language="deu", - tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files - ) - output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + try: + bitmap = page.get_pixmap(dpi=400) + pdf_bytes = bitmap.pdfocr_tobytes( + language="deu", + tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files + ) + output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) + except RuntimeError as e: + error( + f"Error in OCR for document: {destpath}. Error: {e}. Falling back to standard mode." + ) + if i < len(rects) and rects[i] != []: + for rect in rects[i]: + prect = pymupdf.Rect( + rect[0] * wfac, + rect[1] * hfac, + (rect[0] + rect[2]) * wfac, + (rect[1] + rect[3]) * hfac, + ) + page.add_redact_annot( + prect, + fill=(0, 0, 0), + ) + page.apply_redactions() + output.insert_pdf(page.parent, from_page=page.number, to_page=page.number) # End of the costly part print(f"Page {i + 1}/{npage}: CENSORING DONE") output.save(destpath)