Compare commits

18 Commits

Author SHA1 Message Date
Marcel Gansfusz
6d5c390350 fixed error in compose.yml 2025-11-04 21:24:38 +01:00
Marcel Gansfusz
e13d06d4a7 fixed regular deletions of files 2025-11-04 19:16:59 +01:00
Marcel Gansfusz
547411ba03 made to kill fip entrys when 1d passes 2025-11-04 19:04:41 +01:00
Marcel Gansfusz
cdd26e0bc3 caught exception in trying to censor 2025-11-04 17:54:24 +01:00
Marcel Gansfusz
f7c73a0c5a fixed js 2025-11-04 17:17:13 +01:00
Marcel Gansfusz
8e74848397 fixed js 2025-11-04 17:13:18 +01:00
Marcel Gansfusz
8704aee82e fixed tesseract in Dockerfile 2025-11-04 16:56:24 +01:00
Marcel Gansfusz
594ac1fa00 updated censoring status logic 2025-11-04 16:55:11 +01:00
Marcel Gansfusz
2ee90cd0d7 added tesseract to Dockerfile 2025-11-04 15:45:37 +01:00
Marcel Gansfusz
d42bab5b19 changed the fetch in js to be relative (no explicite url; just a path); removed version from docker compose 2025-11-04 14:55:04 +01:00
Marcel Gansfusz
c3a87ceee6 changed style of greeting file 2025-10-31 17:48:40 +01:00
Marcel Gansfusz
6f2d373292 updated greeting file to represent new censoring mechanism 2025-10-31 16:18:43 +01:00
Marcel Gansfusz
a37206d6a4 added logging statement 2025-10-30 15:48:51 +01:00
Marcel Gansfusz
6bd75bf93f removed .nvim; added log statements 2025-10-30 15:31:00 +01:00
Marcel Gansfusz
5bc24a32d5 removed __pycache__ 2025-10-30 15:09:16 +01:00
Marcel Gansfusz
a9233926e5 added logging statements 2025-10-30 14:45:53 +01:00
Marcel Gansfusz
90235d2788 Made the database reconnect when connection is broken 2025-10-30 13:03:02 +01:00
Marcel Gansfusz
da316a9351 changed from string paths to pathlib 2025-10-29 12:14:32 +01:00
12 changed files with 596 additions and 116 deletions

2
.gitignore vendored
View File

@@ -7,3 +7,5 @@ init.log
app/__pycache__/ app/__pycache__/
mariadb/* mariadb/*
unizeug unizeug
.mypy_cache
.nvim

View File

@@ -1,2 +0,0 @@
# remote_path="/srv/http/"
# remote_path="dev@10.0.0.25:/var/www/html/"

View File

@@ -20,7 +20,9 @@ RUN apk add --no-cache \
tiff-dev \ tiff-dev \
tk-dev \ tk-dev \
tcl-dev \ tcl-dev \
libwebp-dev libwebp-dev \
tesseract-ocr \
tesseract-ocr-data-deu
RUN python -m ensurepip --upgrade RUN python -m ensurepip --upgrade
RUN pip install setuptools wheel RUN pip install setuptools wheel
RUN pip install -r requirements.txt RUN pip install -r requirements.txt

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 89 KiB

After

Width:  |  Height:  |  Size: 103 KiB

View File

@@ -8,7 +8,7 @@ import os
import json import json
import mariadb import mariadb
import logging import logging
from pathlib import Path
import schedule import schedule
import time import time
import pytz import pytz
@@ -24,7 +24,8 @@ CATEGORIES = [
] ]
SUBCAT_CATEGORIES = ["Klausuren", "Übungen", "Labore"] SUBCAT_CATEGORIES = ["Klausuren", "Übungen", "Labore"]
unizeug_path = os.environ.get("UNIZEUG_PATH", "./unizeug") unizeug_path = os.environ.get("UNIZEUG_PATH", "./unizeug")
APP_ROOT_PATH = Path(os.environ.get("APP_ROOT_PATH", "./app"))
FILES_IN_PROGRESS = APP_ROOT_PATH / "files/"
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
logging.basicConfig( logging.basicConfig(
filename="init.log", filename="init.log",
@@ -79,6 +80,19 @@ except mariadb.OperationalError:
db.commit() db.commit()
def remove_old_FIP_entrys():
    """Purge stale rows from the FIP table together with their files.

    Selects every FIP row for which
    ``HOUR(TIMEDIFF(NOW(), initTimeStamp)) > 24`` holds, deletes the row and
    removes the file of the same name below ``FILES_IN_PROGRESS``. All row
    deletions are committed once, after the loop.

    A file that is already missing on disk is logged and skipped, so one
    orphaned row cannot abort the run before the commit and block every
    later cleanup.
    """
    cur = db.cursor(dictionary=True)
    cur.execute(
        "SELECT id,filename FROM FIP WHERE HOUR(TIMEDIFF(NOW(),initTimeStamp)) > 24 "
    )
    files = cur.fetchall()
    info(f"Remove Files: {files}")
    for file in files:
        # Reuse the cursor opened above; the result set has already been
        # fetched completely, so executing another statement on it is safe.
        cur.execute("DELETE FROM FIP WHERE id=?", (file["id"],))
        try:
            os.remove(FILES_IN_PROGRESS / file["filename"])
        except FileNotFoundError:
            # The row is stale either way; keep going so it still gets removed.
            info(f"File {file['filename']} was already gone; removing only its FIP row")
    db.commit()
def get_dirstruct(): def get_dirstruct():
# with open("app/pwfile.json", "r") as f: # with open("app/pwfile.json", "r") as f:
# cred = json.load(f) # cred = json.load(f)
@@ -149,6 +163,7 @@ def get_dirstruct():
(lid, pid, idx, subcat.name), (lid, pid, idx, subcat.name),
) )
db.commit() db.commit()
remove_old_FIP_entrys()
def link_prof(firstname, lastname, lid): def link_prof(firstname, lastname, lid):

View File

@@ -3,7 +3,7 @@ from typing import List, Dict, Tuple, Sequence
from starlette.responses import StreamingResponse from starlette.responses import StreamingResponse
from annotated_types import IsDigit from annotated_types import IsDigit
from fastapi import FastAPI, File, HTTPException, Path, UploadFile, Request, Form from fastapi import FastAPI, File, HTTPException, UploadFile, Request, Form
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
# import multiprocessing # import multiprocessing
@@ -21,7 +21,9 @@ import re
import os import os
import signal
import mariadb import mariadb
import sys
import filetype import filetype
@@ -36,11 +38,19 @@ log = logging.getLogger(__name__)
logging.basicConfig( logging.basicConfig(
filename=os.environ.get("APP_LOG_PATH"), filename=os.environ.get("APP_LOG_PATH"),
level=logging.INFO, level=logging.INFO,
format="[%(asctime)s, %(filename)s:%(lineno)s -> %(funcName)10s() ]%(levelname)s: %(message)s", format="[%(asctime)s, %(filename)s:%(lineno)s -> %(funcName)10s()] %(levelname)s: %(message)s",
) )
debug = log.debug debug = log.debug
info = log.info info = log.info
error = log.error error = log.error
critical = log.critical
def exception_handler(etype, value, tb):
    """Log an exception that was not handled anywhere else.

    Takes the ``(type, value, traceback)`` triple of an exception, which is
    the argument shape ``sys.excepthook`` is called with.

    Args:
        etype: Class of the exception.
        value: The exception instance.
        tb: Traceback object belonging to the exception.
    """
    # No exception is being handled while this runs, so the logger cannot
    # discover one by itself; hand the triple over explicitly to keep the
    # traceback in the log.
    log.exception(f"Uncought Exception: {value}", exc_info=(etype, value, tb))
sys.excepthook = exception_handler
db = mariadb.connect( db = mariadb.connect(
host=os.environ.get("DB_HOST", "db"), host=os.environ.get("DB_HOST", "db"),
@@ -117,12 +127,40 @@ def _sql_quarry(
) )
def sql_connector_is_active(connector: mariadb.Connection) -> bool:
    """Report whether ``connector`` still answers a ping.

    Args:
        connector: The connection to probe.

    Returns:
        ``True`` when ``connector.ping()`` completes, ``False`` when it raises
        ``mariadb.Error``.
    """
    try:
        connector.ping()
    except mariadb.Error:
        return False
    else:
        return True
def sql_connect(connector: mariadb.Connection) -> mariadb.Connection:
    """Open a new database connection from the ``DB_*`` environment variables.

    Host, user, password and database name are read from ``DB_HOST``,
    ``DB_USER``, ``DB_PASSWORD`` and ``DB_DATABASE``, falling back to the
    defaults spelled out below.

    Args:
        connector: Not read; the value passed in is simply superseded by the
            newly created connection.

    Returns:
        The freshly opened connection.

    Raises:
        HTTPException: With status 500 when connecting fails. Before raising,
            the failure is logged as critical and SIGTERM is sent to the
            current process.
    """
    settings = {
        "host": os.environ.get("DB_HOST", "db"),
        "user": os.environ.get("DB_USER", "user"),
        "password": os.environ.get("DB_PASSWORD", "DBPASSWORD"),
        "database": os.environ.get("DB_DATABASE", "Unizeug"),
    }
    try:
        return mariadb.connect(**settings)
    except mariadb.Error as e:
        critical(
            f"Cannot reconnect to Database {os.environ.get('DB_DATABASE', 'Unizeug')} on {os.environ.get('DB_HOST', 'db')}. Got Mariadb Error: {e}"
        )
        # Ask our own process to terminate, then still fail the current request.
        os.kill(os.getpid(), signal.SIGTERM)
        raise HTTPException(500, detail="Database failed")
def sql( def sql(
querry: str, querry: str,
data: Tuple[str | int, ...] | str | int = (), data: Tuple[str | int, ...] | str | int = (),
return_result: bool = True, return_result: bool = True,
commit: bool = False, commit: bool = False,
) -> List[Tuple]: ) -> List[Tuple]:
global db
if not sql_connector_is_active(db):
db = sql_connect(db)
cur = db.cursor(dictionary=False) cur = db.cursor(dictionary=False)
return _sql_quarry(cur, querry, data, return_result, commit) return _sql_quarry(cur, querry, data, return_result, commit)
@@ -133,6 +171,10 @@ def sqlT(
return_result: bool = True, return_result: bool = True,
commit: bool = False, commit: bool = False,
) -> List[Dict]: ) -> List[Dict]:
global db
if not sql_connector_is_active(db):
db = sql_connect(db)
cur = db.cursor(dictionary=True) cur = db.cursor(dictionary=True)
return _sql_quarry(cur, querry, data, return_result, commit) return _sql_quarry(cur, querry, data, return_result, commit)
@@ -193,7 +235,7 @@ async def get_file(file_id: str):
# status_code=500, detail="Somethings wrong with the database" # status_code=500, detail="Somethings wrong with the database"
# ) # )
# filename = cur.fetchone()[0] # filename = cur.fetchone()[0]
return FileResponse(FILES_IN_PROGRESS + filename) return FileResponse(FILES_IN_PROGRESS / filename)
@app.get("/search/lva") @app.get("/search/lva")
@@ -445,7 +487,7 @@ async def get_submission(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}" f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
) )
info( info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}" f"Got Submission: lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
) )
rects_p = json.loads(rects) rects_p = json.loads(rects)
scales_p = json.loads(pagescales) scales_p = json.loads(pagescales)
@@ -534,8 +576,8 @@ async def yield_censor_status(file_id: str):
def censor_pdf( def censor_pdf(
path: str, path: os.PathLike,
destpath: str, destpath: os.PathLike,
rects: List[List[List[float]]], rects: List[List[List[float]]],
scales: List[Dict[str, float]], scales: List[Dict[str, float]],
file_id: str, file_id: str,
@@ -550,6 +592,7 @@ def censor_pdf(
Returns: Returns:
None None
""" """
info(f"started Censoring for file {path} to be saved to {destpath}")
doc = pymupdf.open(path) doc = pymupdf.open(path)
page = doc[0] page = doc[0]
npage = doc.page_count npage = doc.page_count
@@ -582,8 +625,8 @@ def censor_pdf(
def censor_pdf_ocr( def censor_pdf_ocr(
path: str, path: os.PathLike,
destpath: str, destpath: os.PathLike,
rects: List[List[List[float]]], rects: List[List[List[float]]],
scales: List[Dict[str, float]], scales: List[Dict[str, float]],
file_id: str, file_id: str,
@@ -600,6 +643,7 @@ def censor_pdf_ocr(
Returns: Returns:
None None
""" """
info(f"started Censoring in OCR Mode for file {path} to be saved to {destpath}")
doc = pymupdf.open(path) doc = pymupdf.open(path)
output = pymupdf.open() output = pymupdf.open()
page = doc[0] page = doc[0]
@@ -627,12 +671,31 @@ def censor_pdf_ocr(
censor_status_datas[file_id]["done"] = False censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set() censor_status_update_events[file_id].set()
# THis Costs us dearly # THis Costs us dearly
try:
bitmap = page.get_pixmap(dpi=400) bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes( pdf_bytes = bitmap.pdfocr_tobytes(
language="deu", language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
) )
output.insert_pdf(pymupdf.Document(stream=pdf_bytes)) output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
except RuntimeError as e:
error(
f"Error in OCR for document: {destpath}. Error: {e}. Falling back to standard mode."
)
if i < len(rects) and rects[i] != []:
for rect in rects[i]:
prect = pymupdf.Rect(
rect[0] * wfac,
rect[1] * hfac,
(rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac,
)
page.add_redact_annot(
prect,
fill=(0, 0, 0),
)
page.apply_redactions()
output.insert_pdf(page.parent, from_page=page.number, to_page=page.number)
# End of the costly part # End of the costly part
print(f"Page {i + 1}/{npage}: CENSORING DONE") print(f"Page {i + 1}/{npage}: CENSORING DONE")
output.save(destpath) output.save(destpath)
@@ -691,15 +754,16 @@ def make_savepath(
ftype: str, ftype: str,
) -> os.PathLike: ) -> os.PathLike:
"""Generates the path, the file is saved to after the upload process is finished. It creates all nessecery directories.""" """Generates the path, the file is saved to after the upload process is finished. It creates all nessecery directories."""
info(f"Started to make Savepath for '{fname}' in '{lva}' with prof '{prof}'.")
lv = get_lvpath(lva) lv = get_lvpath(lva)
lvpath = lv[1] + "/" lvpath = Path(lv[1])
pf = get_profpath(prof, lv[0]) pf = get_profpath(prof, lv[0])
pfpath = pf[1] + "/" pfpath = Path(pf[1])
catpath = CATEGORIES[int(cat)] + "/" catpath = Path(CATEGORIES[int(cat)])
scpath = "" scpath: str | os.PathLike = ""
if int(cat) in SUBCAT_CATEGORIES_I and subcat != "": if int(cat) in SUBCAT_CATEGORIES_I and subcat != "":
sc = get_subcatpath(subcat, int(cat), pf[0], lv[0]) sc = get_subcatpath(subcat, int(cat), pf[0], lv[0])
scpath = sc[1] + "/" scpath = Path(sc[1])
if int(cat) == 6: if int(cat) == 6:
savepath = UNIZEUG_PATH / (lv[1] + "_Multimedia_only/") / pfpath savepath = UNIZEUG_PATH / (lv[1] + "_Multimedia_only/") / pfpath
else: else:
@@ -724,10 +788,12 @@ def make_savepath(
destpath = savepath / file destpath = savepath / file
i = 0 i = 0
while destpath.is_file(): while destpath.is_file():
info(f"{destpath} already exists.")
file = filename + f"_{i}." + ftype file = filename + f"_{i}." + ftype
i += 1 i += 1
destpath = savepath / file destpath = savepath / file
destpath.touch() destpath.touch()
info(f"Path for file to be saved generated as: {savepath / file}")
return savepath / file return savepath / file
@@ -958,7 +1024,7 @@ async def remove_old_FIP_entrys():
# return_result=False, # return_result=False,
# ) # )
db.commit() db.commit()
return FileResponse(APP_ROOT_PATH / "/index.html") return FileResponse(APP_ROOT_PATH / "index.html")
def delete_from_FIP(uuid: str): def delete_from_FIP(uuid: str):
@@ -966,4 +1032,4 @@ def delete_from_FIP(uuid: str):
if len(res) < 1: if len(res) < 1:
raise HTTPException(500, "I am trying to delete a file that dose not exist") raise HTTPException(500, "I am trying to delete a file that dose not exist")
sql("DELETE FROM FIP WHERE id=?", (uuid,), return_result=False, commit=True) sql("DELETE FROM FIP WHERE id=?", (uuid,), return_result=False, commit=True)
os.remove(FILES_IN_PROGRESS + res[0]["filename"]) os.remove(FILES_IN_PROGRESS / res[0]["filename"])

View File

@@ -283,10 +283,10 @@ function submitPdf(eve) {
submitForm(formdata); submitForm(formdata);
} }
async function submitForm(formData) { async function submitForm(formData) {
var updateEventSource = null;
try { try {
const updateEventSource = new EventSource( updateEventSource = new EventSource("/get_censor_status/" + doc.fID);
window.location + "get_censor_status/" + doc.fID,
);
modal.style.display = "flex"; modal.style.display = "flex";
// console.log("http://127.0.0.1:8000/get_censor_status/" + doc.fID); // console.log("http://127.0.0.1:8000/get_censor_status/" + doc.fID);
updateEventSource.addEventListener("censorUpdate", function(eve) { updateEventSource.addEventListener("censorUpdate", function(eve) {
@@ -295,11 +295,19 @@ async function submitForm(formData) {
upload_status.innerText = upload_status.innerText =
"Censoring Page " + data.page + "/" + data.pages; "Censoring Page " + data.page + "/" + data.pages;
}); });
const response = await fetch(window.location + "submit", { } catch {
console.error(
"Error geting eventsource for updating censoring page count: " + error,
);
}
try {
const response = await fetch("/submit/", {
method: "POST", method: "POST",
body: formData, body: formData,
}); });
if (updateEventSource !== null) {
updateEventSource.close(); updateEventSource.close();
}
modal.style.display = "none"; modal.style.display = "none";
//let responseJSON=await response.json(); //let responseJSON=await response.json();
if (response.ok) { if (response.ok) {
@@ -320,7 +328,7 @@ async function submitForm(formData) {
window.alert("Error: " + (await response.json())["detail"]); window.alert("Error: " + (await response.json())["detail"]);
} }
} catch (error) { } catch (error) {
console.error("Error" + error); console.error("Error submitting: " + error);
} }
} }
function uploadPdf(eve) { function uploadPdf(eve) {
@@ -338,7 +346,7 @@ function uploadPdf(eve) {
} }
async function uploadFile(formData) { async function uploadFile(formData) {
try { try {
const response = await fetch(window.location + "uploadfile", { const response = await fetch("/uploadfile/", {
method: "POST", method: "POST",
body: formData, body: formData,
}); });

View File

@@ -1,4 +1,4 @@
var url = window.location + "search/"; var url = "/search/";
var lid = null; var lid = null;
var pid = null; var pid = null;
var activeAutocompletion = null; var activeAutocompletion = null;

View File

@@ -1,4 +1,3 @@
version: "3"
services: services:
app: app:
container_name: python-app container_name: python-app
@@ -58,6 +57,7 @@ services:
environment: environment:
ENTRY_COMMAND: python /python/init.py ENTRY_COMMAND: python /python/init.py
UNIZEUG_PATH: /unizeug UNIZEUG_PATH: /unizeug
APP_ROOT_PATH: /python
DB_HOST: db DB_HOST: db
DB_USER: app DB_USER: app
DB_PASSWORD: DBPassword DB_PASSWORD: DBPassword