Compare commits

18 Commits

Author SHA1 Message Date
Marcel Gansfusz
6d5c390350 fixed error in compose.yml 2025-11-04 21:24:38 +01:00
Marcel Gansfusz
e13d06d4a7 fixed regular deletions of files 2025-11-04 19:16:59 +01:00
Marcel Gansfusz
547411ba03 made FIP entries be deleted when 1 day passes 2025-11-04 19:04:41 +01:00
Marcel Gansfusz
cdd26e0bc3 caught exception in trying to censor 2025-11-04 17:54:24 +01:00
Marcel Gansfusz
f7c73a0c5a fixed js 2025-11-04 17:17:13 +01:00
Marcel Gansfusz
8e74848397 fixed js 2025-11-04 17:13:18 +01:00
Marcel Gansfusz
8704aee82e fixed tesseract in Dockerfile 2025-11-04 16:56:24 +01:00
Marcel Gansfusz
594ac1fa00 updated censoring status logic 2025-11-04 16:55:11 +01:00
Marcel Gansfusz
2ee90cd0d7 added tesseract to Dockerfile 2025-11-04 15:45:37 +01:00
Marcel Gansfusz
d42bab5b19 changed the fetch in js to be relative (no explicit url; just a path); removed version from docker compose 2025-11-04 14:55:04 +01:00
Marcel Gansfusz
c3a87ceee6 changed style of greeting file 2025-10-31 17:48:40 +01:00
Marcel Gansfusz
6f2d373292 updated greeting file to represent new censoring mechanism 2025-10-31 16:18:43 +01:00
Marcel Gansfusz
a37206d6a4 added logging statement 2025-10-30 15:48:51 +01:00
Marcel Gansfusz
6bd75bf93f removed .nvim; added log statements 2025-10-30 15:31:00 +01:00
Marcel Gansfusz
5bc24a32d5 removed __pycache__ 2025-10-30 15:09:16 +01:00
Marcel Gansfusz
a9233926e5 added logging statements 2025-10-30 14:45:53 +01:00
Marcel Gansfusz
90235d2788 Made the database reconnect when connection is broken 2025-10-30 13:03:02 +01:00
Marcel Gansfusz
da316a9351 changed from string paths to pathlib 2025-10-29 12:14:32 +01:00
12 changed files with 596 additions and 116 deletions

2
.gitignore vendored
View File

@@ -7,3 +7,5 @@ init.log
app/__pycache__/
mariadb/*
unizeug
.mypy_cache
.nvim

View File

@@ -1,2 +0,0 @@
# remote_path="/srv/http/"
# remote_path="dev@10.0.0.25:/var/www/html/"

View File

@@ -20,7 +20,9 @@ RUN apk add --no-cache \
tiff-dev \
tk-dev \
tcl-dev \
libwebp-dev
libwebp-dev \
tesseract-ocr \
tesseract-ocr-data-deu
RUN python -m ensurepip --upgrade
RUN pip install setuptools wheel
RUN pip install -r requirements.txt

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 89 KiB

After

Width:  |  Height:  |  Size: 103 KiB

View File

@@ -8,7 +8,7 @@ import os
import json
import mariadb
import logging
from pathlib import Path
import schedule
import time
import pytz
@@ -24,7 +24,8 @@ CATEGORIES = [
]
SUBCAT_CATEGORIES = ["Klausuren", "Übungen", "Labore"]
unizeug_path = os.environ.get("UNIZEUG_PATH", "./unizeug")
APP_ROOT_PATH = Path(os.environ.get("APP_ROOT_PATH", "./app"))
FILES_IN_PROGRESS = APP_ROOT_PATH / "files/"
log = logging.getLogger(__name__)
logging.basicConfig(
filename="init.log",
@@ -79,6 +80,19 @@ except mariadb.OperationalError:
db.commit()
def remove_old_FIP_entrys():
cur = db.cursor(dictionary=True)
cur.execute(
"SELECT id,filename FROM FIP WHERE HOUR(TIMEDIFF(NOW(),initTimeStamp)) > 24 "
)
files = cur.fetchall()
info(f"Remove Files: {files}")
for file in files:
c.execute("DELETE FROM FIP WHERE id=?", (file["id"],))
os.remove(FILES_IN_PROGRESS / file["filename"])
db.commit()
def get_dirstruct():
# with open("app/pwfile.json", "r") as f:
# cred = json.load(f)
@@ -149,6 +163,7 @@ def get_dirstruct():
(lid, pid, idx, subcat.name),
)
db.commit()
remove_old_FIP_entrys()
def link_prof(firstname, lastname, lid):

View File

@@ -3,7 +3,7 @@ from typing import List, Dict, Tuple, Sequence
from starlette.responses import StreamingResponse
from annotated_types import IsDigit
from fastapi import FastAPI, File, HTTPException, Path, UploadFile, Request, Form
from fastapi import FastAPI, File, HTTPException, UploadFile, Request, Form
from fastapi.responses import FileResponse
# import multiprocessing
@@ -21,7 +21,9 @@ import re
import os
import signal
import mariadb
import sys
import filetype
@@ -36,11 +38,19 @@ log = logging.getLogger(__name__)
logging.basicConfig(
filename=os.environ.get("APP_LOG_PATH"),
level=logging.INFO,
format="[%(asctime)s, %(filename)s:%(lineno)s -> %(funcName)10s() ]%(levelname)s: %(message)s",
format="[%(asctime)s, %(filename)s:%(lineno)s -> %(funcName)10s()] %(levelname)s: %(message)s",
)
debug = log.debug
info = log.info
error = log.error
critical = log.critical
def exception_handler(etype, value, tb):
log.exception(f"Uncought Exception: {value}")
sys.excepthook = exception_handler
db = mariadb.connect(
host=os.environ.get("DB_HOST", "db"),
@@ -117,12 +127,40 @@ def _sql_quarry(
)
def sql_connector_is_active(connector: mariadb.Connection) -> bool:
try:
connector.ping()
except mariadb.Error as e:
return False
return True
def sql_connect(connector: mariadb.Connection) -> mariadb.Connection:
try:
connector = mariadb.connect(
host=os.environ.get("DB_HOST", "db"),
user=os.environ.get("DB_USER", "user"),
password=os.environ.get("DB_PASSWORD", "DBPASSWORD"),
database=os.environ.get("DB_DATABASE", "Unizeug"),
)
except mariadb.Error as e:
critical(
f"Cannot reconnect to Database {os.environ.get('DB_DATABASE', 'Unizeug')} on {os.environ.get('DB_HOST', 'db')}. Got Mariadb Error: {e}"
)
os.kill(os.getpid(), signal.SIGTERM)
raise HTTPException(500, detail="Database failed")
return connector
def sql(
querry: str,
data: Tuple[str | int, ...] | str | int = (),
return_result: bool = True,
commit: bool = False,
) -> List[Tuple]:
global db
if not sql_connector_is_active(db):
db = sql_connect(db)
cur = db.cursor(dictionary=False)
return _sql_quarry(cur, querry, data, return_result, commit)
@@ -133,6 +171,10 @@ def sqlT(
return_result: bool = True,
commit: bool = False,
) -> List[Dict]:
global db
if not sql_connector_is_active(db):
db = sql_connect(db)
cur = db.cursor(dictionary=True)
return _sql_quarry(cur, querry, data, return_result, commit)
@@ -193,7 +235,7 @@ async def get_file(file_id: str):
# status_code=500, detail="Somethings wrong with the database"
# )
# filename = cur.fetchone()[0]
return FileResponse(FILES_IN_PROGRESS + filename)
return FileResponse(FILES_IN_PROGRESS / filename)
@app.get("/search/lva")
@@ -445,7 +487,7 @@ async def get_submission(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
f"Got Submission: lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
rects_p = json.loads(rects)
scales_p = json.loads(pagescales)
@@ -534,8 +576,8 @@ async def yield_censor_status(file_id: str):
def censor_pdf(
path: str,
destpath: str,
path: os.PathLike,
destpath: os.PathLike,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
file_id: str,
@@ -550,6 +592,7 @@ def censor_pdf(
Returns:
None
"""
info(f"started Censoring for file {path} to be saved to {destpath}")
doc = pymupdf.open(path)
page = doc[0]
npage = doc.page_count
@@ -582,8 +625,8 @@ def censor_pdf(
def censor_pdf_ocr(
path: str,
destpath: str,
path: os.PathLike,
destpath: os.PathLike,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
file_id: str,
@@ -600,6 +643,7 @@ def censor_pdf_ocr(
Returns:
None
"""
info(f"started Censoring in OCR Mode for file {path} to be saved to {destpath}")
doc = pymupdf.open(path)
output = pymupdf.open()
page = doc[0]
@@ -627,12 +671,31 @@ def censor_pdf_ocr(
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
# THis Costs us dearly
try:
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
except RuntimeError as e:
error(
f"Error in OCR for document: {destpath}. Error: {e}. Falling back to standard mode."
)
if i < len(rects) and rects[i] != []:
for rect in rects[i]:
prect = pymupdf.Rect(
rect[0] * wfac,
rect[1] * hfac,
(rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac,
)
page.add_redact_annot(
prect,
fill=(0, 0, 0),
)
page.apply_redactions()
output.insert_pdf(page.parent, from_page=page.number, to_page=page.number)
# End of the costly part
print(f"Page {i + 1}/{npage}: CENSORING DONE")
output.save(destpath)
@@ -691,15 +754,16 @@ def make_savepath(
ftype: str,
) -> os.PathLike:
"""Generates the path, the file is saved to after the upload process is finished. It creates all nessecery directories."""
info(f"Started to make Savepath for '{fname}' in '{lva}' with prof '{prof}'.")
lv = get_lvpath(lva)
lvpath = lv[1] + "/"
lvpath = Path(lv[1])
pf = get_profpath(prof, lv[0])
pfpath = pf[1] + "/"
catpath = CATEGORIES[int(cat)] + "/"
scpath = ""
pfpath = Path(pf[1])
catpath = Path(CATEGORIES[int(cat)])
scpath: str | os.PathLike = ""
if int(cat) in SUBCAT_CATEGORIES_I and subcat != "":
sc = get_subcatpath(subcat, int(cat), pf[0], lv[0])
scpath = sc[1] + "/"
scpath = Path(sc[1])
if int(cat) == 6:
savepath = UNIZEUG_PATH / (lv[1] + "_Multimedia_only/") / pfpath
else:
@@ -724,10 +788,12 @@ def make_savepath(
destpath = savepath / file
i = 0
while destpath.is_file():
info(f"{destpath} already exists.")
file = filename + f"_{i}." + ftype
i += 1
destpath = savepath / file
destpath.touch()
info(f"Path for file to be saved generated as: {savepath / file}")
return savepath / file
@@ -958,7 +1024,7 @@ async def remove_old_FIP_entrys():
# return_result=False,
# )
db.commit()
return FileResponse(APP_ROOT_PATH / "/index.html")
return FileResponse(APP_ROOT_PATH / "index.html")
def delete_from_FIP(uuid: str):
@@ -966,4 +1032,4 @@ def delete_from_FIP(uuid: str):
if len(res) < 1:
raise HTTPException(500, "I am trying to delete a file that dose not exist")
sql("DELETE FROM FIP WHERE id=?", (uuid,), return_result=False, commit=True)
os.remove(FILES_IN_PROGRESS + res[0]["filename"])
os.remove(FILES_IN_PROGRESS / res[0]["filename"])

View File

@@ -283,10 +283,10 @@ function submitPdf(eve) {
submitForm(formdata);
}
async function submitForm(formData) {
var updateEventSource = null;
try {
const updateEventSource = new EventSource(
window.location + "get_censor_status/" + doc.fID,
);
updateEventSource = new EventSource("/get_censor_status/" + doc.fID);
modal.style.display = "flex";
// console.log("http://127.0.0.1:8000/get_censor_status/" + doc.fID);
updateEventSource.addEventListener("censorUpdate", function(eve) {
@@ -295,11 +295,19 @@ async function submitForm(formData) {
upload_status.innerText =
"Censoring Page " + data.page + "/" + data.pages;
});
const response = await fetch(window.location + "submit", {
} catch {
console.error(
"Error geting eventsource for updating censoring page count: " + error,
);
}
try {
const response = await fetch("/submit/", {
method: "POST",
body: formData,
});
if (updateEventSource !== null) {
updateEventSource.close();
}
modal.style.display = "none";
//let responseJSON=await response.json();
if (response.ok) {
@@ -320,7 +328,7 @@ async function submitForm(formData) {
window.alert("Error: " + (await response.json())["detail"]);
}
} catch (error) {
console.error("Error" + error);
console.error("Error submitting: " + error);
}
}
function uploadPdf(eve) {
@@ -338,7 +346,7 @@ function uploadPdf(eve) {
}
async function uploadFile(formData) {
try {
const response = await fetch(window.location + "uploadfile", {
const response = await fetch("/uploadfile/", {
method: "POST",
body: formData,
});

View File

@@ -1,4 +1,4 @@
var url = window.location + "search/";
var url = "/search/";
var lid = null;
var pid = null;
var activeAutocompletion = null;

View File

@@ -1,4 +1,3 @@
version: "3"
services:
app:
container_name: python-app
@@ -58,6 +57,7 @@ services:
environment:
ENTRY_COMMAND: python /python/init.py
UNIZEUG_PATH: /unizeug
APP_ROOT_PATH: /python
DB_HOST: db
DB_USER: app
DB_PASSWORD: DBPassword