Compare commits
3 Commits
26ea274023
...
improve_ce
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5c6a8dfba2 | ||
|
|
c30d69d205 | ||
|
|
56d3468889 |
Binary file not shown.
149
app/main.py
149
app/main.py
@@ -14,6 +14,8 @@ import asyncio
|
|||||||
# import fastapi
|
# import fastapi
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
import pymupdf
|
import pymupdf
|
||||||
|
|
||||||
|
# import fitz as pymupdf
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -412,14 +414,14 @@ async def get_submission(
|
|||||||
pagescales: Annotated[
|
pagescales: Annotated[
|
||||||
str, Form()
|
str, Form()
|
||||||
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
|
], # Scales of Pages # Annotated[List[Dict[str, float]], Form()],
|
||||||
censor: Annotated[str, Form()],
|
ocr: Annotated[str, Form()],
|
||||||
):
|
):
|
||||||
"""handles submission"""
|
"""handles submission"""
|
||||||
print(
|
print(
|
||||||
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
|
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
|
||||||
)
|
)
|
||||||
info(
|
info(
|
||||||
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, censor: {censor}"
|
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
|
||||||
)
|
)
|
||||||
rects_p = json.loads(rects)
|
rects_p = json.loads(rects)
|
||||||
scales_p = json.loads(pagescales)
|
scales_p = json.loads(pagescales)
|
||||||
@@ -452,15 +454,24 @@ async def get_submission(
|
|||||||
censor_status_datas[fileId] = {}
|
censor_status_datas[fileId] = {}
|
||||||
if fileId not in censor_status_update_events:
|
if fileId not in censor_status_update_events:
|
||||||
censor_status_update_events[fileId] = asyncio.Event()
|
censor_status_update_events[fileId] = asyncio.Event()
|
||||||
await asyncio.to_thread(
|
if ocr == "True":
|
||||||
censor_pdf,
|
await asyncio.to_thread(
|
||||||
filepath,
|
censor_pdf_ocr,
|
||||||
dest,
|
filepath,
|
||||||
rects_p,
|
dest,
|
||||||
scales_p,
|
rects_p,
|
||||||
False if censor == "False" else True,
|
scales_p,
|
||||||
fileId,
|
fileId,
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
await asyncio.to_thread(
|
||||||
|
censor_pdf,
|
||||||
|
filepath,
|
||||||
|
dest,
|
||||||
|
rects_p,
|
||||||
|
scales_p,
|
||||||
|
fileId,
|
||||||
|
)
|
||||||
|
|
||||||
# return {"done": "ok"}
|
# return {"done": "ok"}
|
||||||
# print(dest)
|
# print(dest)
|
||||||
@@ -487,6 +498,7 @@ async def get_censor_status(file_id: str):
|
|||||||
|
|
||||||
|
|
||||||
async def yield_censor_status(file_id: str):
|
async def yield_censor_status(file_id: str):
|
||||||
|
"""Internal function to yield updates to the stream"""
|
||||||
while True:
|
while True:
|
||||||
await censor_status_update_events[file_id].wait()
|
await censor_status_update_events[file_id].wait()
|
||||||
censor_status_update_events[file_id].clear()
|
censor_status_update_events[file_id].clear()
|
||||||
@@ -502,11 +514,58 @@ def censor_pdf(
|
|||||||
destpath: str,
|
destpath: str,
|
||||||
rects: List[List[List[float]]],
|
rects: List[List[List[float]]],
|
||||||
scales: List[Dict[str, float]],
|
scales: List[Dict[str, float]],
|
||||||
secure: bool,
|
file_id: str,
|
||||||
|
):
|
||||||
|
"""Censors pdf and saves the file to the given Destpath.
|
||||||
|
Args:
|
||||||
|
path: path to the pdf document
|
||||||
|
destpath: Path where the result is supposed to be saved to
|
||||||
|
rects: Coordinates of rectangles to be placed on the pdf document
|
||||||
|
scales: Scales of the rects coordinates for the pdf document
|
||||||
|
secure: whether or not the pdf document is supposed to be converted into an image (and back) to make sure the censoring is irreversible
|
||||||
|
Returns:
|
||||||
|
None
|
||||||
|
"""
|
||||||
|
doc = pymupdf.open(path)
|
||||||
|
page = doc[0]
|
||||||
|
npage = doc.page_count
|
||||||
|
for i in range(npage):
|
||||||
|
page = doc[i]
|
||||||
|
if i < len(rects) and rects[i] != []:
|
||||||
|
print(i)
|
||||||
|
wfac = page.rect.width / scales[i]["width"]
|
||||||
|
hfac = page.rect.height / scales[i]["height"]
|
||||||
|
for rect in rects[i]:
|
||||||
|
prect = pymupdf.Rect(
|
||||||
|
rect[0] * wfac,
|
||||||
|
rect[1] * hfac,
|
||||||
|
(rect[0] + rect[2]) * wfac,
|
||||||
|
(rect[1] + rect[3]) * hfac,
|
||||||
|
)
|
||||||
|
page.add_redact_annot(
|
||||||
|
prect,
|
||||||
|
fill=(0, 0, 0),
|
||||||
|
)
|
||||||
|
page.apply_redactions()
|
||||||
|
censor_status_datas[file_id]["page"] = i + 1
|
||||||
|
censor_status_datas[file_id]["pages"] = npage
|
||||||
|
censor_status_datas[file_id]["done"] = False
|
||||||
|
censor_status_update_events[file_id].set()
|
||||||
|
doc.set_metadata({})
|
||||||
|
doc.save(destpath, garbage=4, deflate=True, clean=True)
|
||||||
|
censor_status_datas[file_id]["done"] = True
|
||||||
|
censor_status_update_events[file_id].set()
|
||||||
|
|
||||||
|
|
||||||
|
def censor_pdf_ocr(
|
||||||
|
path: str,
|
||||||
|
destpath: str,
|
||||||
|
rects: List[List[List[float]]],
|
||||||
|
scales: List[Dict[str, float]],
|
||||||
file_id: str,
|
file_id: str,
|
||||||
):
|
):
|
||||||
"""Censors pdf and runs OCR
|
"""Censors pdf and runs OCR
|
||||||
If Secure is True the file is converted to Pixels and then recreated; else the censored sections are just covering the text below and can be easily removed with e.g. Inkscape
|
The file is converted to Pixels and then recreated.
|
||||||
Saves the file to the given Destpath.
|
Saves the file to the given Destpath.
|
||||||
Args:
|
Args:
|
||||||
path: path to the pdf document
|
path: path to the pdf document
|
||||||
@@ -520,12 +579,7 @@ def censor_pdf(
|
|||||||
doc = pymupdf.open(path)
|
doc = pymupdf.open(path)
|
||||||
output = pymupdf.open()
|
output = pymupdf.open()
|
||||||
page = doc[0]
|
page = doc[0]
|
||||||
# width = page.rect.width
|
|
||||||
# height = page.rect.height
|
|
||||||
# print(width, height)
|
|
||||||
npage = doc.page_count
|
npage = doc.page_count
|
||||||
# pages = []
|
|
||||||
# tasks = []
|
|
||||||
for i in range(npage):
|
for i in range(npage):
|
||||||
page = doc[i]
|
page = doc[i]
|
||||||
if i < len(rects) and rects[i] != []:
|
if i < len(rects) and rects[i] != []:
|
||||||
@@ -544,49 +598,22 @@ def censor_pdf(
|
|||||||
color=(0, 0, 0),
|
color=(0, 0, 0),
|
||||||
fill=(0, 0, 0),
|
fill=(0, 0, 0),
|
||||||
)
|
)
|
||||||
if secure:
|
censor_status_datas[file_id]["page"] = i + 1
|
||||||
censor_status_datas[file_id]["page"] = i + 1
|
censor_status_datas[file_id]["pages"] = npage
|
||||||
censor_status_datas[file_id]["pages"] = npage
|
censor_status_datas[file_id]["done"] = False
|
||||||
censor_status_datas[file_id]["done"] = False
|
|
||||||
censor_status_update_events[file_id].set()
|
|
||||||
|
|
||||||
# pages.append(page)
|
|
||||||
# This costs us dearly
|
|
||||||
bitmap = page.get_pixmap(dpi=400)
|
|
||||||
pdf_bytes = bitmap.pdfocr_tobytes(
|
|
||||||
language="deu",
|
|
||||||
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
|
|
||||||
)
|
|
||||||
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
|
|
||||||
# End of the costly part
|
|
||||||
# tasks.append(asyncio.create_task(censor_page(page)))
|
|
||||||
print(f"Page {i + 1}/{npage}: CENSORING DONE")
|
|
||||||
else:
|
|
||||||
output.insert_pdf(doc, i, i)
|
|
||||||
|
|
||||||
# if secure:
|
|
||||||
# pages_bytes: List[bytes] = []
|
|
||||||
# censor_page(pages[0])
|
|
||||||
# with multiprocessing.Pool(npage) as p:
|
|
||||||
# pages_bytes = p.map(censor_page, pages)
|
|
||||||
# pages_bytes = p.map(test_function, [1, 2, 3, 4])
|
|
||||||
# for pdf_bytes in pages_bytes:
|
|
||||||
# output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
|
|
||||||
# with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
||||||
# futures = []
|
|
||||||
# for page in pages:
|
|
||||||
# futures.append(executor.submit(censor_page, page))
|
|
||||||
# for future in futures:
|
|
||||||
# output.insert_pdf(pymupdf.Document(stream=future.result()))
|
|
||||||
#
|
|
||||||
# for task in tasks:
|
|
||||||
# output.insert_pdf(pymupdf.Document(stream=await task))
|
|
||||||
# print("CENSORING DONE")
|
|
||||||
output.save(destpath)
|
|
||||||
if secure:
|
|
||||||
censor_status_datas[file_id]["done"] = True
|
|
||||||
censor_status_update_events[file_id].set()
|
censor_status_update_events[file_id].set()
|
||||||
# censor_finished_flags[file_id].set()
|
# This costs us dearly
|
||||||
|
bitmap = page.get_pixmap(dpi=400)
|
||||||
|
pdf_bytes = bitmap.pdfocr_tobytes(
|
||||||
|
language="deu",
|
||||||
|
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
|
||||||
|
)
|
||||||
|
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
|
||||||
|
# End of the costly part
|
||||||
|
print(f"Page {i + 1}/{npage}: CENSORING DONE")
|
||||||
|
output.save(destpath)
|
||||||
|
censor_status_datas[file_id]["done"] = True
|
||||||
|
censor_status_update_events[file_id].set()
|
||||||
|
|
||||||
|
|
||||||
def test_function(i: int) -> bytes:
|
def test_function(i: int) -> bytes:
|
||||||
|
|||||||
@@ -134,11 +134,10 @@
|
|||||||
</div>
|
</div>
|
||||||
<input
|
<input
|
||||||
type="checkbox"
|
type="checkbox"
|
||||||
name="censor"
|
name="ocr"
|
||||||
id="sec_censor"
|
id="sec_censor"
|
||||||
value="True"
|
value="True"
|
||||||
checked
|
/><label for="sec_censor">OCR</label><br /><br />
|
||||||
/><label for="sec_censor">Zensieren</label><br /><br />
|
|
||||||
<button type="submit" id="send">Senden</button>
|
<button type="submit" id="send">Senden</button>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -276,8 +276,8 @@ function submitPdf(eve) {
|
|||||||
formdata.append("fileId", doc.fID);
|
formdata.append("fileId", doc.fID);
|
||||||
//formdata.append("filename", doc.filename);
|
//formdata.append("filename", doc.filename);
|
||||||
formdata.append("ftype", doc.filetype);
|
formdata.append("ftype", doc.filetype);
|
||||||
if (!formdata.has("censor")) {
|
if (!formdata.has("ocr")) {
|
||||||
formdata.append("censor", "False");
|
formdata.append("ocr", "False");
|
||||||
}
|
}
|
||||||
console.log(formdata);
|
console.log(formdata);
|
||||||
submitForm(formdata);
|
submitForm(formdata);
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ function autocomplete(inp, type) {
|
|||||||
i,
|
i,
|
||||||
apirq,
|
apirq,
|
||||||
iname,
|
iname,
|
||||||
val = this.value;
|
val = inp.value;
|
||||||
/*close any already open lists of autocompleted values*/
|
/*close any already open lists of autocompleted values*/
|
||||||
closeAllLists();
|
closeAllLists();
|
||||||
if (!val && type === "lva" && pid === null) {
|
if (!val && type === "lva" && pid === null) {
|
||||||
@@ -56,7 +56,7 @@ function autocomplete(inp, type) {
|
|||||||
a.setAttribute("id", this.id + "autocomplete-list");
|
a.setAttribute("id", this.id + "autocomplete-list");
|
||||||
a.setAttribute("class", "autocomplete-items");
|
a.setAttribute("class", "autocomplete-items");
|
||||||
/*append the DIV element as a child of the autocomplete container:*/
|
/*append the DIV element as a child of the autocomplete container:*/
|
||||||
this.parentNode.appendChild(a);
|
inp.parentNode.appendChild(a);
|
||||||
/*for each item in the array...*/
|
/*for each item in the array...*/
|
||||||
//await response;
|
//await response;
|
||||||
if (response.ok) {
|
if (response.ok) {
|
||||||
|
|||||||
Reference in New Issue
Block a user