diff --git a/app/__pycache__/main.cpython-313.pyc b/app/__pycache__/main.cpython-313.pyc
index 5371756..098bd05 100644
Binary files a/app/__pycache__/main.cpython-313.pyc and b/app/__pycache__/main.cpython-313.pyc differ
diff --git a/app/main.py b/app/main.py
index 41d7fc7..19ed9ad 100644
--- a/app/main.py
+++ b/app/main.py
@@ -14,6 +14,8 @@ import asyncio
# import fastapi
from fastapi.staticfiles import StaticFiles
import pymupdf
+
+# import fitz as pymupdf
import json
import re
@@ -412,14 +414,14 @@ async def get_submission(
pagescales: Annotated[
str, Form()
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
- censor: Annotated[str, Form()],
+ ocr: Annotated[str, Form()],
):
"""handles submission"""
print(
- f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+ f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
info(
- f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
+ f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
rects_p = json.loads(rects)
scales_p = json.loads(pagescales)
@@ -452,15 +454,24 @@ async def get_submission(
censor_status_datas[fileId] = {}
if fileId not in censor_status_update_events:
censor_status_update_events[fileId] = asyncio.Event()
- await asyncio.to_thread(
- censor_pdf,
- filepath,
- dest,
- rects_p,
- scales_p,
- False if censor == "False" else True,
- fileId,
- )
+ if ocr == "True":
+ await asyncio.to_thread(
+ censor_pdf_ocr,
+ filepath,
+ dest,
+ rects_p,
+ scales_p,
+ fileId,
+ )
+ else:
+ await asyncio.to_thread(
+ censor_pdf,
+ filepath,
+ dest,
+ rects_p,
+ scales_p,
+ fileId,
+ )
# return {"done": "ok"}
# print(dest)
@@ -487,6 +498,7 @@ async def get_censor_status(file_id: str):
async def yield_censor_status(file_id: str):
+ """Internal function to yield updates to the stream"""
while True:
await censor_status_update_events[file_id].wait()
censor_status_update_events[file_id].clear()
@@ -502,11 +514,58 @@ def censor_pdf(
destpath: str,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
- secure: bool,
+ file_id: str,
+):
+ """Censors pdf and saves the file to the given Destpath.
+ Args:
+ path: path to the pdf document
+ destpath: Path where the result is supposed to be saved to
+ rects: Coordinates of rectangles to be placed on the pdf document
+ scales: Scales of the rects coordinates for the pdf document
+ file_id: key into censor_status_datas and censor_status_update_events under which per-page progress and the final done flag are published
+ Returns:
+ None
+ """
+ doc = pymupdf.open(path)
+ page = doc[0]
+ npage = doc.page_count
+ for i in range(npage):
+ page = doc[i]
+ if i < len(rects) and rects[i] != []:
+ print(i)
+ wfac = page.rect.width / scales[i]["width"]
+ hfac = page.rect.height / scales[i]["height"]
+ for rect in rects[i]:
+ prect = pymupdf.Rect(
+ rect[0] * wfac,
+ rect[1] * hfac,
+ (rect[0] + rect[2]) * wfac,
+ (rect[1] + rect[3]) * hfac,
+ )
+ page.add_redact_annot(
+ prect,
+ fill=(0, 0, 0),
+ )
+ page.apply_redactions()
+ censor_status_datas[file_id]["page"] = i + 1
+ censor_status_datas[file_id]["pages"] = npage
+ censor_status_datas[file_id]["done"] = False
+ censor_status_update_events[file_id].set()
+ doc.set_metadata({})
+ doc.save(destpath, garbage=4, deflate=True, clean=True)
+ censor_status_datas[file_id]["done"] = True
+ censor_status_update_events[file_id].set()
+
+
+def censor_pdf_ocr(
+ path: str,
+ destpath: str,
+ rects: List[List[List[float]]],
+ scales: List[Dict[str, float]],
file_id: str,
):
"""Censors pdf and runs OCR
- If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easiliy removed with e.g. Inkscape
+ The file is converted to Pixels and then recreated.
Saves the file to the given Destpath.
Args:
path: path to the pdf document
@@ -520,12 +579,7 @@ def censor_pdf(
doc = pymupdf.open(path)
output = pymupdf.open()
page = doc[0]
- # width = page.rect.width
- # height = page.rect.height
- # print(width, height)
npage = doc.page_count
- # pages = []
- # tasks = []
for i in range(npage):
page = doc[i]
if i < len(rects) and rects[i] != []:
@@ -544,49 +598,22 @@ def censor_pdf(
color=(0, 0, 0),
fill=(0, 0, 0),
)
- if secure:
- censor_status_datas[file_id]["page"] = i + 1
- censor_status_datas[file_id]["pages"] = npage
- censor_status_datas[file_id]["done"] = False
- censor_status_update_events[file_id].set()
-
- # pages.append(page)
- # THis Costs us dearly
- bitmap = page.get_pixmap(dpi=400)
- pdf_bytes = bitmap.pdfocr_tobytes(
- language="deu",
- tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
- )
- output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
- # End of the costly part
- # tasks.append(asyncio.create_task(censor_page(page)))
- print(f"Page {i + 1}/{npage}: CENSORING DONE")
- else:
- output.insert_pdf(doc, i, i)
-
- # if secure:
- # pages_bytes: List[bytes] = []
- # censor_page(pages[0])
- # with multiprocessing.Pool(npage) as p:
- # pages_bytes = p.map(censor_page, pages)
- # pages_bytes = p.map(test_function, [1, 2, 3, 4])
- # for pdf_bytes in pages_bytes:
- # output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
- # with concurrent.futures.ThreadPoolExecutor() as executor:
- # futures = []
- # for page in pages:
- # futures.append(executor.submit(censor_page, page))
- # for future in futures:
- # output.insert_pdf(pymupdf.Document(stream=future.result()))
- #
- # for task in tasks:
- # output.insert_pdf(pymupdf.Document(stream=await task))
- # print("CENSORING DONE")
- output.save(destpath)
- if secure:
- censor_status_datas[file_id]["done"] = True
+ censor_status_datas[file_id]["page"] = i + 1
+ censor_status_datas[file_id]["pages"] = npage
+ censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
- # censor_finished_flags[file_id].set()
+ # Expensive step: rasterising the page at 400 dpi and running OCR on it
+ bitmap = page.get_pixmap(dpi=400)
+ pdf_bytes = bitmap.pdfocr_tobytes(
+ language="deu",
+ tessdata="/usr/share/tessdata/", # Tesseract must be installed; this is the path to the Tesseract data files
+ )
+ output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
+ # End of the costly part
+ print(f"Page {i + 1}/{npage}: CENSORING DONE")
+ output.save(destpath)
+ censor_status_datas[file_id]["done"] = True
+ censor_status_update_events[file_id].set()
def test_function(i: int) -> bytes:
diff --git a/index.html b/index.html
index 2051433..e2724f3 100644
--- a/index.html
+++ b/index.html
@@ -134,11 +134,10 @@
+ />
diff --git a/static/app.js b/static/app.js
index ceea83f..f654998 100644
--- a/static/app.js
+++ b/static/app.js
@@ -276,8 +276,8 @@ function submitPdf(eve) {
formdata.append("fileId", doc.fID);
//formdata.append("filename", doc.filename);
formdata.append("ftype", doc.filetype);
- if (!formdata.has("censor")) {
- formdata.append("censor", "False");
+ if (!formdata.has("ocr")) {
+ formdata.append("ocr", "False");
}
console.log(formdata);
submitForm(formdata);
diff --git a/static/autocomplete.js b/static/autocomplete.js
index 6824849..959a694 100644
--- a/static/autocomplete.js
+++ b/static/autocomplete.js
@@ -21,7 +21,7 @@ function autocomplete(inp, type) {
i,
apirq,
iname,
- val = this.value;
+ val = inp.value;
/*close any already open lists of autocompleted values*/
closeAllLists();
if (!val && type === "lva" && pid === null) {
@@ -56,7 +56,7 @@ function autocomplete(inp, type) {
a.setAttribute("id", this.id + "autocomplete-list");
a.setAttribute("class", "autocomplete-items");
/*append the DIV element as a child of the autocomplete container:*/
- this.parentNode.appendChild(a);
+ inp.parentNode.appendChild(a);
/*for each item in the array...*/
//await response;
if (response.ok) {