Added optional censoring; made the PDF file show up once the conversion is finished; fixed a bug that occurred when pagescales are not set for a page that has no rectangles

This commit is contained in:
Marcel Gansfusz
2025-07-23 00:23:58 +02:00
parent f13e8711a7
commit 4a6e74aada
4 changed files with 62 additions and 14 deletions

Binary file not shown.

View File

@@ -18,6 +18,13 @@ import filetype
import datetime import datetime
import logging
log = logging.getLogger(__name__)
logging.basicConfig(filename="app.log", level=logging.INFO)
debug = log.debug
info = log.info
error = log.error
app = FastAPI() app = FastAPI()
@@ -58,8 +65,10 @@ async def get_index():
async def get_file(file_id: str): async def get_file(file_id: str):
"""returns the file that cooorosponds with the given ID""" """returns the file that cooorosponds with the given ID"""
if file_id == "unsupported": if file_id == "unsupported":
error("File is unsupported")
return FileResponse(FILES_IN_PROGRESS + "unsupported.pdf") return FileResponse(FILES_IN_PROGRESS + "unsupported.pdf")
if file_id == "empty": if file_id == "empty":
error("File Id empty")
return FileResponse(FILES_IN_PROGRESS + "empty.pdf") return FileResponse(FILES_IN_PROGRESS + "empty.pdf")
cur = db.cursor() cur = db.cursor()
try: try:
@@ -299,9 +308,15 @@ async def get_submission(
pagescales: Annotated[ pagescales: Annotated[
str, Form() str, Form()
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()], ], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
censor: Annotated[str, Form()] | bool = False,
): ):
"""handles submission""" """handles submission"""
print(lva, prof, fname, stype, subcat, sem, ex_date, rects, pagescales) print(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
)
info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
)
rects_p = json.loads(rects) rects_p = json.loads(rects)
scales_p = json.loads(pagescales) scales_p = json.loads(pagescales)
cur = db.cursor() cur = db.cursor()
@@ -316,9 +331,12 @@ async def get_submission(
try: try:
dest = make_savepath(lva, prof, stype, subcat, sem, ex_date, fname, ftype) dest = make_savepath(lva, prof, stype, subcat, sem, ex_date, fname, ftype)
except ValueError as e: except ValueError as e:
error(f"Error creating savepath: f{e}")
raise HTTPException(status_code=400, detail=str(e)) raise HTTPException(status_code=400, detail=str(e))
censor_pdf(filepath, dest, rects_p, scales_p) censor_pdf(filepath, dest, rects_p, scales_p, False if censor is False else True)
return {"done": "ok"} # return {"done": "ok"}
print(dest)
return FileResponse(dest, content_disposition_type="inline")
def censor_pdf( def censor_pdf(
@@ -326,8 +344,19 @@ def censor_pdf(
destpath: str, destpath: str,
rects: List[List[List[float]]], rects: List[List[List[float]]],
scales: List[Dict[str, float]], scales: List[Dict[str, float]],
secure: bool,
): ):
"""Censors pdf and runs OCR""" """Censors pdf and runs OCR
If secure is True, the file is converted to pixels and then recreated; otherwise the censored sections merely cover the text below and can easily be removed with e.g. Inkscape
Args:
path: path to the pdf document
destpath: Path where the result is supposed to be saved to
rects: Coordinates of rectangles to be placed on the pdf document
scales: Scales of the rects coordinates for the pdf document
secure: whether or not the pdf document is supposed to be converted into an image (and back) to make sure the censoring is irreversible
Returns:
None
"""
doc = pymupdf.open(path) doc = pymupdf.open(path)
output = pymupdf.open() output = pymupdf.open()
page = doc[0] page = doc[0]
@@ -336,7 +365,8 @@ def censor_pdf(
print(width, height) print(width, height)
for i in range(doc.page_count): for i in range(doc.page_count):
page = doc[i] page = doc[i]
if i < len(rects): if i < len(rects) and rects[i] != []:
print(i)
wfac = page.rect.width / scales[i]["width"] wfac = page.rect.width / scales[i]["width"]
hfac = page.rect.height / scales[i]["height"] hfac = page.rect.height / scales[i]["height"]
for rect in rects[i]: for rect in rects[i]:
@@ -351,14 +381,20 @@ def censor_pdf(
color=(0, 0, 0), color=(0, 0, 0),
fill=(0, 0, 0), fill=(0, 0, 0),
) )
bitmap = page.get_pixmap(dpi=300) if secure:
pdf_bytes = bitmap.pdfocr_tobytes( bitmap = page.get_pixmap(dpi=400)
language="deu", pdf_bytes = bitmap.pdfocr_tobytes(
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files language="deu",
) tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) )
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
print(f" Page {i}/{doc.page_count} CENSORING DONE")
else:
output.insert_pdf(doc, i, i)
output.save(destpath) output.save(destpath)
print("CENSORING DONE")
# def save_without_censoring(dest)
async def is_LVID(term: str) -> bool: async def is_LVID(term: str) -> bool:
@@ -556,6 +592,7 @@ def convert_to_pdf(file: bytes) -> bytes | None:
doc = pymupdf.Document(stream=file) doc = pymupdf.Document(stream=file)
return doc.convert_to_pdf() return doc.convert_to_pdf()
except (pymupdf.mupdf.FzErrorUnsupported, pymupdf.FileDataError) as e: except (pymupdf.mupdf.FzErrorUnsupported, pymupdf.FileDataError) as e:
error(f"Error converting Image to pdf file: {e}")
print(e) print(e)
return None return None
@@ -612,7 +649,7 @@ async def save_files_to_folder(files: List[UploadFile]) -> str:
# reqJson = await request.form() # reqJson = await request.form()
# print(reqJson) # print(reqJson)
# return {"done": "ok"} # return {"done": "ok"}
def guess_filetype(content: str, filename: str) -> str: def guess_filetype(content: bytes, filename: str) -> str:
"""Guesses the filetype of a file based on first the sontent, If that fails the extension in teh filename. If no conclusion can be reached it reutrns an empty string""" """Guesses the filetype of a file based on first the sontent, If that fails the extension in teh filename. If no conclusion can be reached it reutrns an empty string"""
ftyp = filetype.guess(content) ftyp = filetype.guess(content)
if ftyp is not None: if ftyp is not None:

View File

@@ -74,6 +74,8 @@
<label for="date">Datum</label> <label for="date">Datum</label>
<input type="date" id="date" name="ex_date" placeholder="Drop File" /><br /> <input type="date" id="date" name="ex_date" placeholder="Drop File" /><br />
</div> </div>
<input type="checkbox" name="censor" id="sec_censor" value="True" checked /><label
for="sec_censor">Zensieren</label><br /><br />
<button type="submit" id="send">Senden</button> <button type="submit" id="send">Senden</button>
</form> </form>
</div> </div>

View File

@@ -285,7 +285,16 @@ async function submitForm(formData) {
//let responseJSON=await response.json(); //let responseJSON=await response.json();
if (response.ok) { if (response.ok) {
console.log("Submit OK"); console.log("Submit OK");
console.log(response); // console.log(response);
// window.open(response);
// console.log(URL.createObjectURL(response.body));
// window.open(response);
// window.open(response, (target = "_blank"));
// var newWindow = window.open();
// newWindow.document.write(response);
// var blob = response.blob();
const blobURL = URL.createObjectURL(await response.blob());
window.open(blobURL, "_blank");
} else { } else {
console.log("Submit failed"); console.log("Submit failed");
} }