Compare commits

20 Commits

Author           SHA1        Date                        Message
Marcel Gansfusz  6d5c390350  2025-11-04 21:24:38 +01:00  fixed error in compose.yml
Marcel Gansfusz  e13d06d4a7  2025-11-04 19:16:59 +01:00  fixed regular deletions of files
Marcel Gansfusz  547411ba03  2025-11-04 19:04:41 +01:00  delete FIP entries after 1 day has passed
Marcel Gansfusz  cdd26e0bc3  2025-11-04 17:54:24 +01:00  caught exception when trying to censor
Marcel Gansfusz  f7c73a0c5a  2025-11-04 17:17:13 +01:00  fixed js
Marcel Gansfusz  8e74848397  2025-11-04 17:13:18 +01:00  fixed js
Marcel Gansfusz  8704aee82e  2025-11-04 16:56:24 +01:00  fixed tesseract in Dockerfile
Marcel Gansfusz  594ac1fa00  2025-11-04 16:55:11 +01:00  updated censoring status logic
Marcel Gansfusz  2ee90cd0d7  2025-11-04 15:45:37 +01:00  added tesseract to Dockerfile
Marcel Gansfusz  d42bab5b19  2025-11-04 14:55:04 +01:00  changed the fetch in js to be relative (no explicit URL, just a path); removed version from docker compose
Marcel Gansfusz  c3a87ceee6  2025-10-31 17:48:40 +01:00  changed style of greeting file
Marcel Gansfusz  6f2d373292  2025-10-31 16:18:43 +01:00  updated greeting file to represent the new censoring mechanism
Marcel Gansfusz  a37206d6a4  2025-10-30 15:48:51 +01:00  added logging statement
Marcel Gansfusz  6bd75bf93f  2025-10-30 15:31:00 +01:00  removed .nvim; added log statements
Marcel Gansfusz  5bc24a32d5  2025-10-30 15:09:16 +01:00  removed __pycache__
Marcel Gansfusz  a9233926e5  2025-10-30 14:45:53 +01:00  added logging statements
Marcel Gansfusz  90235d2788  2025-10-30 13:03:02 +01:00  made the database reconnect when the connection is broken
Marcel Gansfusz  da316a9351  2025-10-29 12:14:32 +01:00  changed from string paths to pathlib
Marcel Gansfusz  e6727daf8e  2025-10-28 19:32:33 +01:00  i forgor
Marcel Gansfusz  d6508c739d  2025-10-28 19:32:01 +01:00  in-between state before converting to pathlib
14 changed files with 697 additions and 159 deletions

.gitignore vendored

@@ -5,3 +5,7 @@ app/dest
app.log
init.log
app/__pycache__/
mariadb/*
unizeug
.mypy_cache
.nvim


@@ -1,2 +0,0 @@
# remote_path="/srv/http/"
# remote_path="dev@10.0.0.25:/var/www/html/"


@@ -1,4 +1,31 @@
FROM python:3
FROM python:3.13-rc-alpine
WORKDIR /usr/src/
COPY requirements.txt /usr/src/requirements.txt
COPY entrypoint.sh /usr/src/entrypoint.sh
RUN apk add --no-cache \
gcc \
g++ \
musl-dev \
python3-dev \
libffi-dev \
openssl-dev \
cargo \
make \
mariadb-connector-c-dev \
jpeg-dev \
zlib-dev \
freetype-dev \
lcms2-dev \
openjpeg-dev \
tiff-dev \
tk-dev \
tcl-dev \
libwebp-dev \
tesseract-ocr \
tesseract-ocr-data-deu
RUN python -m ensurepip --upgrade
RUN pip install setuptools wheel
RUN pip install -r requirements.txt
WORKDIR /python
CMD /bin/sh /usr/src/entrypoint.sh
# ENTRYPOINT ["/usr/src/entrypoint.sh"]
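
The switch to an Alpine base pulls in a build toolchain plus tesseract-ocr and its German language data, which the OCR censoring code later in this compare expects under /usr/share/tessdata/. A minimal self-check sketch, assuming a container built from this Dockerfile (the deu.traineddata filename is the standard Tesseract name, not something shown in the diff):

# sketch: fail fast if the OCR toolchain the app depends on is missing
import shutil
from pathlib import Path

assert shutil.which("tesseract") is not None, "tesseract binary not on PATH"
# assumption: Alpine's tesseract-ocr-data-deu package installs its data here
assert (Path("/usr/share/tessdata") / "deu.traineddata").is_file(), "German tessdata missing"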

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

Image file changed: 89 KiB before, 103 KiB after (preview not shown).


@@ -8,9 +8,10 @@ import os
import json
import mariadb
import logging
from pathlib import Path
import schedule
import time
import pytz
CATEGORIES = [
"Prüfungen",
@@ -23,7 +24,8 @@ CATEGORIES = [
]
SUBCAT_CATEGORIES = ["Klausuren", "Übungen", "Labore"]
unizeug_path = os.environ.get("UNIZEUG_PATH", "./unizeug")
APP_ROOT_PATH = Path(os.environ.get("APP_ROOT_PATH", "./app"))
FILES_IN_PROGRESS = APP_ROOT_PATH / "files/"
log = logging.getLogger(__name__)
logging.basicConfig(
filename="init.log",
@@ -39,7 +41,6 @@ db = mariadb.connect(
user=os.environ.get("DB_USER", "user"),
password=os.environ.get("DB_PASSWORD", "DBPASSWORD"),
database=os.environ.get("DB_DATABASE", "unizeug"),
)
c = db.cursor()
try:
@@ -71,12 +72,27 @@ c.execute(
"CREATE TABLE SubCats(id BIGINT(20) UNSIGNED NOT NULL AUTO_INCREMENT,LId BIGINT(20),PId BIGINT(20),cat TINYINT UNSIGNED,name VARCHAR(256), PRIMARY KEY(id))"
)
try:
c.execute("CREATE TABLE FIP(id UUID DEFAULT(UUID()), filename VARCHAR(256), filetype VARCHAR(8),initTimeStamp DATETIME, PRIMARY KEY(id))")
c.execute(
"CREATE TABLE FIP(id UUID DEFAULT(UUID()), filename VARCHAR(256), filetype VARCHAR(8),initTimeStamp DATETIME, PRIMARY KEY(id))"
)
except mariadb.OperationalError:
pass
db.commit()
def remove_old_FIP_entrys():
cur = db.cursor(dictionary=True)
cur.execute(
"SELECT id,filename FROM FIP WHERE HOUR(TIMEDIFF(NOW(),initTimeStamp)) > 24 "
)
files = cur.fetchall()
info(f"Remove Files: {files}")
for file in files:
c.execute("DELETE FROM FIP WHERE id=?", (file["id"],))
os.remove(FILES_IN_PROGRESS / file["filename"])
db.commit()
def get_dirstruct():
# with open("app/pwfile.json", "r") as f:
# cred = json.load(f)
@@ -147,6 +163,7 @@ def get_dirstruct():
(lid, pid, idx, subcat.name),
)
db.commit()
remove_old_FIP_entrys()
def link_prof(firstname, lastname, lid):
@@ -173,7 +190,8 @@ def link_prof(firstname, lastname, lid):
if __name__ == "__main__":
get_dirstruct()
schedule.every.day.at("04:00","Europe/Vienna").do(get_dirstruct)
info("Database updated")
schedule.every().day.at("04:00", "Europe/Vienna").do(get_dirstruct)
while True:
schedule.run_pending()
time.sleep(1)
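
The scheduler line was also corrected: schedule.every is a function, so the earlier schedule.every.day spelling fails with AttributeError, while the called form shown above works. A minimal sketch of that daily-job pattern (the job body is a placeholder; the timezone argument needs pytz, which the requirements diff at the end adds):

# sketch: run a placeholder job every day at 04:00 Vienna time
import time
import schedule  # the tz string in .at() requires pytz to be installed

def nightly_job():
    print("running nightly directory scan")  # stands in for get_dirstruct()

schedule.every().day.at("04:00", "Europe/Vienna").do(nightly_job)
while True:
    schedule.run_pending()
    time.sleep(1)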


@@ -21,13 +21,16 @@ import re
import os
import signal
import mariadb
import sys
import filetype
import logging
import pathlib
from pathlib import Path
from starlette.types import HTTPExceptionHandler
@@ -40,6 +43,14 @@ logging.basicConfig(
debug = log.debug
info = log.info
error = log.error
critical = log.critical
def exception_handler(etype, value, tb):
log.exception(f"Uncought Exception: {value}")
sys.excepthook = exception_handler
db = mariadb.connect(
host=os.environ.get("DB_HOST", "db"),
@@ -54,16 +65,6 @@ info("App Started")
# startup()
app = FastAPI()
app.mount(
"/favicon",
StaticFiles(directory=os.environ.get("FAVICON_PATH", ".app/favicon")),
name="favicon",
)
app.mount(
"/static",
StaticFiles(directory=os.environ.get("STATIC_PATH", "./static")),
name="static",
)
CATEGORIES = [
@@ -75,16 +76,18 @@ CATEGORIES = [
"Zusammenfassungen",
"Multimedia",
]
APP_ROOT_PATH = os.environ.get("APP_ROOT_PATH", "./app")
APP_ROOT_PATH = Path(os.environ.get("APP_ROOT_PATH", "./app"))
SUBCAT_CATEGORIES = ["Klausuren", "Übungen", "Labore"]
SUBCAT_CATEGORIES_I = [1, 2, 3]
EX_DATE_CATEGORIES = ["Prüfungen", "Klausuren"]
EX_DATE_CATEGORIES_I = [0, 1]
UNIZEUG_PATH = os.environ.get("UNIZEUG_PATH", "./app/dest")
FILES_IN_PROGRESS = f"{APP_ROOT_PATH}/files/"
EMPTYFILE = f"{APP_ROOT_PATH}/graphics/empty.pdf"
UNSUPPORTEDFILE = f"{APP_ROOT_PATH}/graphics/unsupported.pdf"
GREETINGFILE = f"{APP_ROOT_PATH}/graphics/greeting.pdf"
UNIZEUG_PATH = Path(os.environ.get("UNIZEUG_PATH", "./app/dest"))
FILES_IN_PROGRESS = APP_ROOT_PATH / "files/"
EMPTYFILE = APP_ROOT_PATH / "graphics/empty.pdf"
UNSUPPORTEDFILE = APP_ROOT_PATH / "graphics/unsupported.pdf"
GREETINGFILE = APP_ROOT_PATH / "graphics/greeting.pdf"
FAVICON = APP_ROOT_PATH / "favicon"
STATIC_FILES = APP_ROOT_PATH / "static"
# cur = db.cursor()
@@ -124,12 +127,40 @@ def _sql_quarry(
)
def sql_connector_is_active(connector: mariadb.Connection) -> bool:
try:
connector.ping()
except mariadb.Error as e:
return False
return True
def sql_connect(connector: mariadb.Connection) -> mariadb.Connection:
try:
connector = mariadb.connect(
host=os.environ.get("DB_HOST", "db"),
user=os.environ.get("DB_USER", "user"),
password=os.environ.get("DB_PASSWORD", "DBPASSWORD"),
database=os.environ.get("DB_DATABASE", "Unizeug"),
)
except mariadb.Error as e:
critical(
f"Cannot reconnect to Database {os.environ.get('DB_DATABASE', 'Unizeug')} on {os.environ.get('DB_HOST', 'db')}. Got Mariadb Error: {e}"
)
os.kill(os.getpid(), signal.SIGTERM)
raise HTTPException(500, detail="Database failed")
return connector
def sql(
querry: str,
data: Tuple[str | int, ...] | str | int = (),
return_result: bool = True,
commit: bool = False,
) -> List[Tuple]:
global db
if not sql_connector_is_active(db):
db = sql_connect(db)
cur = db.cursor(dictionary=False)
return _sql_quarry(cur, querry, data, return_result, commit)
@@ -140,6 +171,10 @@ def sqlT(
return_result: bool = True,
commit: bool = False,
) -> List[Dict]:
global db
if not sql_connector_is_active(db):
db = sql_connect(db)
cur = db.cursor(dictionary=True)
return _sql_quarry(cur, querry, data, return_result, commit)
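
With the ping-and-reconnect guard in place, callers keep going through sql() and sqlT(); a dropped connection is re-established on the next call, and only a failed reconnect terminates the process. A short usage sketch, assuming the helper signatures above and the FIP table created in the scanner script:

# sketch: parameterised queries through the reconnect-aware helpers
rows = sqlT("SELECT id, filename FROM FIP WHERE filetype=?", ("pdf",))  # rows as dicts
for row in rows:
    sql("DELETE FROM FIP WHERE id=?", (row["id"],), return_result=False, commit=True)
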
@@ -158,10 +193,22 @@ def sqlT(
# )
app.mount(
"/favicon",
StaticFiles(directory=os.environ.get("FAVICON_PATH", FAVICON)),
name="favicon",
)
app.mount(
"/static",
StaticFiles(directory=os.environ.get("STATIC_PATH", STATIC_FILES)),
name="static",
)
@app.get("/")
async def get_index():
"""gives the Index.html file"""
return FileResponse(f"{APP_ROOT_PATH}/index.html")
return FileResponse(APP_ROOT_PATH / "index.html")
@app.get("/files/{file_id}")
@@ -188,7 +235,7 @@ async def get_file(file_id: str):
# status_code=500, detail="Somethings wrong with the database"
# )
# filename = cur.fetchone()[0]
return FileResponse(FILES_IN_PROGRESS + filename)
return FileResponse(FILES_IN_PROGRESS / filename)
@app.get("/search/lva")
@@ -234,6 +281,9 @@ async def search_lva(
)
# res += cur.fetchall()
res = remove_duplicates(res + zw)
info(
f"LVA Search: {searchterm}; Result: {res[: (searchlim if searchlim != 0 else -1)]}"
)
if searchlim == 0:
return res
else:
@@ -268,6 +318,9 @@ async def search_profs(
)
# res += cur.fetchall()
res = remove_duplicates(res + zw)
info(
f"Prof Search: {searchterm}; Result: {res[: (searchlim if searchlim != 0 else -1)]}"
)
if searchlim == 0:
return res
else:
@@ -308,6 +361,9 @@ async def search_subcats(
)
# res += cur.fetchall()
res = remove_duplicates(res + rest)
info(
f"Subcatrgory Search: {searchterm}; Result: {res[: (searchlim if searchlim != 0 else -1)]}"
)
if searchlim == 0:
return res
else:
@@ -364,7 +420,7 @@ async def create_upload_file(files: List[UploadFile], c2pdf: bool = True):
content = doc.tobytes()
if ft != "dir":
filename = make_filename_unique(filename)
locpath = FILES_IN_PROGRESS + filename
locpath = FILES_IN_PROGRESS / filename
# locpaths.append(locpath)
# cur = db.cursor()
# try:
@@ -431,7 +487,7 @@ async def get_submission(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
info(
f"lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
f"Got Submission: lva: {lva}, prof: {prof}, fname {fname}, stype: {stype}, subcat: {subcat}, sem: {sem}, ex_date: {ex_date}, rects: {rects}, pagescales: {pagescales}, ocr: {ocr}"
)
rects_p = json.loads(rects)
scales_p = json.loads(pagescales)
@@ -448,7 +504,7 @@ async def get_submission(
error(f"User tried to upload a file without specifying the {th[1]}")
raise HTTPException(400, f"You need to specify a {th[1]}")
filepath = "./app/files/" + res[0][0]
filepath = FILES_IN_PROGRESS / res[0][0]
# except mariadb.Error as e:
# print(f"Mariadb Error: {e}")
# raise HTTPException(
@@ -520,8 +576,8 @@ async def yield_censor_status(file_id: str):
def censor_pdf(
path: str,
destpath: str,
path: os.PathLike,
destpath: os.PathLike,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
file_id: str,
@@ -536,6 +592,7 @@ def censor_pdf(
Returns:
None
"""
info(f"started Censoring for file {path} to be saved to {destpath}")
doc = pymupdf.open(path)
page = doc[0]
npage = doc.page_count
@@ -568,8 +625,8 @@ def censor_pdf(
def censor_pdf_ocr(
path: str,
destpath: str,
path: os.PathLike,
destpath: os.PathLike,
rects: List[List[List[float]]],
scales: List[Dict[str, float]],
file_id: str,
@@ -586,6 +643,7 @@ def censor_pdf_ocr(
Returns:
None
"""
info(f"started Censoring in OCR Mode for file {path} to be saved to {destpath}")
doc = pymupdf.open(path)
output = pymupdf.open()
page = doc[0]
@@ -613,12 +671,31 @@ def censor_pdf_ocr(
censor_status_datas[file_id]["done"] = False
censor_status_update_events[file_id].set()
# THis Costs us dearly
try:
bitmap = page.get_pixmap(dpi=400)
pdf_bytes = bitmap.pdfocr_tobytes(
language="deu",
tessdata="/usr/share/tessdata/", # tesseract needs to be installed; this is the path to thetesseract files
)
output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
except RuntimeError as e:
error(
f"Error in OCR for document: {destpath}. Error: {e}. Falling back to standard mode."
)
if i < len(rects) and rects[i] != []:
for rect in rects[i]:
prect = pymupdf.Rect(
rect[0] * wfac,
rect[1] * hfac,
(rect[0] + rect[2]) * wfac,
(rect[1] + rect[3]) * hfac,
)
page.add_redact_annot(
prect,
fill=(0, 0, 0),
)
page.apply_redactions()
output.insert_pdf(page.parent, from_page=page.number, to_page=page.number)
# End of the costly part
print(f"Page {i + 1}/{npage}: CENSORING DONE")
output.save(destpath)
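
The try/except above rasterises each page at 400 dpi, lets Tesseract rebuild it as a searchable PDF page, and falls back to plain redaction when OCR raises. A condensed sketch of that per-page step, assuming a pymupdf page and the tessdata path from the Dockerfile; unlike the real handler, the fallback here only copies the page through without applying the redaction rectangles:

# sketch: OCR one page into the output document, or copy it unchanged on failure
import pymupdf

def ocr_page(page: pymupdf.Page, output: pymupdf.Document) -> None:
    try:
        pix = page.get_pixmap(dpi=400)  # rasterise the page
        pdf_bytes = pix.pdfocr_tobytes(language="deu", tessdata="/usr/share/tessdata/")
        output.insert_pdf(pymupdf.Document(stream=pdf_bytes))
    except RuntimeError:
        # OCR unavailable or failed: keep the original page
        output.insert_pdf(page.parent, from_page=page.number, to_page=page.number)
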
@@ -675,21 +752,22 @@ def make_savepath(
ex_date: str,
fname: str,
ftype: str,
) -> str:
) -> os.PathLike:
"""Generates the path, the file is saved to after the upload process is finished. It creates all nessecery directories."""
info(f"Started to make Savepath for '{fname}' in '{lva}' with prof '{prof}'.")
lv = get_lvpath(lva)
lvpath = lv[1] + "/"
lvpath = Path(lv[1])
pf = get_profpath(prof, lv[0])
pfpath = pf[1] + "/"
catpath = CATEGORIES[int(cat)] + "/"
scpath = ""
pfpath = Path(pf[1])
catpath = Path(CATEGORIES[int(cat)])
scpath: str | os.PathLike = ""
if int(cat) in SUBCAT_CATEGORIES_I and subcat != "":
sc = get_subcatpath(subcat, int(cat), pf[0], lv[0])
scpath = sc[1] + "/"
scpath = Path(sc[1])
if int(cat) == 6:
savepath = UNIZEUG_PATH + lv[1] + "_Multimedia_only/" + pfpath
savepath = UNIZEUG_PATH / (lv[1] + "_Multimedia_only/") / pfpath
else:
savepath = UNIZEUG_PATH + lvpath + pfpath + catpath + scpath
savepath = UNIZEUG_PATH / lvpath / pfpath / catpath / scpath
os.makedirs(savepath, exist_ok=True)
filename = sem + "_"
if int(cat) in EX_DATE_CATEGORIES_I:
@@ -707,14 +785,16 @@ def make_savepath(
filename += fname
file = filename + "." + ftype
destpath = pathlib.Path(savepath + file)
destpath = savepath / file
i = 0
while destpath.is_file():
info(f"{destpath} already exists.")
file = filename + f"_{i}." + ftype
i += 1
destpath = pathlib.Path(savepath + file)
destpath = savepath / file
destpath.touch()
return savepath + file
info(f"Path for file to be saved generated as: {savepath / file}")
return savepath / file
def get_lvpath(lva: str) -> Tuple[int, str]:
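
make_savepath now builds the destination with pathlib's / operator and loops until it finds an unused name. A compressed sketch of that idiom with placeholder directory, stem, and extension:

# sketch: join with pathlib and append _0, _1, ... until the name is free
from pathlib import Path

savepath = Path("./app/dest") / "ExampleLVA" / "ExampleProf"  # placeholder tree
savepath.mkdir(parents=True, exist_ok=True)
stem, suffix = "2025W_notes", "pdf"
destpath = savepath / f"{stem}.{suffix}"
i = 0
while destpath.exists():
    destpath = savepath / f"{stem}_{i}.{suffix}"
    i += 1
destpath.touch()  # reserve the name, as make_savepath does
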
@@ -907,10 +987,10 @@ async def save_files_to_folder(files: List[UploadFile]) -> str:
if filename == "":
filename = "None"
filename = make_filename_unique(filename)
os.mkdir(FILES_IN_PROGRESS + filename)
os.mkdir(FILES_IN_PROGRESS / filename)
for idx, file in enumerate(files):
fn = file.filename if file.filename is not None else "None" + str(idx)
with open(FILES_IN_PROGRESS + filename + "/" + fn, "wb") as f:
with open(FILES_IN_PROGRESS / filename / fn, "wb") as f:
f.write(await file.read())
return filename
@@ -938,13 +1018,13 @@ async def remove_old_FIP_entrys():
info(f"Remove Files: {files}")
for file in files:
sql("DELETE FROM FIP WHERE id=?", (file["id"]), return_result=False)
os.remove(FILES_IN_PROGRESS + file["filename"])
os.remove(FILES_IN_PROGRESS / file["filename"])
# sql(
# "DELETE FROM FIP WHERE HOUR(TIMEDIFF(NOW(),initTimeStamp)) > 24",
# return_result=False,
# )
db.commit()
return FileResponse("./index.html")
return FileResponse(APP_ROOT_PATH / "index.html")
def delete_from_FIP(uuid: str):
@@ -952,4 +1032,4 @@ def delete_from_FIP(uuid: str):
if len(res) < 1:
raise HTTPException(500, "I am trying to delete a file that dose not exist")
sql("DELETE FROM FIP WHERE id=?", (uuid,), return_result=False, commit=True)
os.remove(FILES_IN_PROGRESS + res[0]["filename"])
os.remove(FILES_IN_PROGRESS / res[0]["filename"])
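
Both FIP cleanup paths above delete the database row and then the file on disk. A hedged sketch of that step written to also tolerate a file that has already vanished, assuming the sql() helper and FILES_IN_PROGRESS constant defined earlier in this module (missing_ok needs Python 3.8+):

# sketch: drop one FIP row and its file, ignoring an already-missing file
def drop_fip_entry(row: dict) -> None:
    sql("DELETE FROM FIP WHERE id=?", (row["id"],), return_result=False, commit=True)
    (FILES_IN_PROGRESS / row["filename"]).unlink(missing_ok=True)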


@@ -283,10 +283,10 @@ function submitPdf(eve) {
submitForm(formdata);
}
async function submitForm(formData) {
var updateEventSource = null;
try {
const updateEventSource = new EventSource(
"http://127.0.0.1:8000/get_censor_status/" + doc.fID,
);
updateEventSource = new EventSource("/get_censor_status/" + doc.fID);
modal.style.display = "flex";
// console.log("http://127.0.0.1:8000/get_censor_status/" + doc.fID);
updateEventSource.addEventListener("censorUpdate", function(eve) {
@@ -295,11 +295,19 @@ async function submitForm(formData) {
upload_status.innerText =
"Censoring Page " + data.page + "/" + data.pages;
});
const response = await fetch("http://127.0.0.1:8000/submit", {
} catch {
console.error(
"Error geting eventsource for updating censoring page count: " + error,
);
}
try {
const response = await fetch("/submit/", {
method: "POST",
body: formData,
});
if (updateEventSource !== null) {
updateEventSource.close();
}
modal.style.display = "none";
//let responseJSON=await response.json();
if (response.ok) {
@@ -320,7 +328,7 @@ async function submitForm(formData) {
window.alert("Error: " + (await response.json())["detail"]);
}
} catch (error) {
console.error("Error" + error);
console.error("Error submitting: " + error);
}
}
function uploadPdf(eve) {
@@ -338,7 +346,7 @@ function uploadPdf(eve) {
}
async function uploadFile(formData) {
try {
const response = await fetch("http://127.0.0.1:8000/uploadfile", {
const response = await fetch("/uploadfile/", {
method: "POST",
body: formData,
});
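
The front end now opens its EventSource against a relative path and listens for censorUpdate events whose data is a JSON payload with page and pages. A generic sketch of a FastAPI endpoint with that shape (not necessarily how yield_censor_status is implemented), built only on StreamingResponse:

# sketch: a server-sent-events endpoint emitting censorUpdate progress events
import asyncio
import json
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.get("/get_censor_status/{file_id}")
async def get_censor_status(file_id: str):
    async def events():
        total = 3  # placeholder page count
        for page in range(1, total + 1):
            payload = json.dumps({"page": page, "pages": total})
            yield f"event: censorUpdate\ndata: {payload}\n\n"
            await asyncio.sleep(1)
    return StreamingResponse(events(), media_type="text/event-stream")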


@@ -1,4 +1,4 @@
var url = "http://127.0.0.1:8000/search/";
var url = "/search/";
var lid = null;
var pid = null;
var activeAutocompletion = null;


@@ -1,18 +1,18 @@
version: "3"
services:
app:
container_name: python-app
command: python -m uvicorn app.main:app --host 0.0.0.0 --port 80
biuld:
# command: python -m uvicorn app.main:app --host 0.0.0.0 --port 80
build:
context: .
dockerfile: DOCKERFILE
dockerfile: Dockerfile
volumes:
- ./app:/python
- ./unizeug:/unizeug
- ./unizeug:/unizeug:source
ports:
- 80:80
restart: unless-stopped
environment:
ENTRY_COMMAND: python -m uvicorn main:app --host 0.0.0.0 --port 80
APP_LOG_PATH: /python/app.log
APP_ROOT_PATH: /python
UNIZEUG_PATH: /unizeug
@@ -30,25 +30,34 @@ services:
image: mariadb
restart: unless-stopped
environment:
MARAIDB_ROOT_PASSWORD: DBPassword
MARIADB_ROOT_PASSWORD: DBPassword
MARIADB_USER: app
UNIZEUG_PATH: /unizeug
MARIADB_PASSWORD: DBPassword
MARIADB_DATABASE: Unizeug
TZ: "Europe/Vienna"
healthcheck:
test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"]
start_period: 10s
interval: 10s
timeout: 5s
retries: 3
volumes:
- ./mariadb:/var/lib/mysql
scaner:
container_name: python-scaner
command: python /python/init.py
biuld:
# command: python /python/init.py
build:
context: .
dockerfile: DOCKERFILE
dockerfile: Dockerfile
volumes:
- ./app:/python
- ./unizeug
- ./unizeug:/unizeug:source
restart: unless-stopped
environment:
ENTRY_COMMAND: python /python/init.py
UNIZEUG_PATH: /unizeug
APP_ROOT_PATH: /python
DB_HOST: db
DB_USER: app
DB_PASSWORD: DBPassword
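
The compose change also adds a MariaDB healthcheck, but the app container can still race the database at startup. A hedged sketch of a first-connect retry loop using the same environment variables this compose file sets (attempt count and delay are arbitrary):

# sketch: retry the initial database connection instead of failing immediately
import os
import time

import mariadb

def connect_with_retry(attempts: int = 10, delay: float = 3.0) -> mariadb.Connection:
    for _ in range(attempts):
        try:
            return mariadb.connect(
                host=os.environ.get("DB_HOST", "db"),
                user=os.environ.get("DB_USER", "app"),
                password=os.environ.get("DB_PASSWORD", "DBPassword"),
                database=os.environ.get("DB_DATABASE", "Unizeug"),
            )
        except mariadb.Error:
            time.sleep(delay)
    raise RuntimeError("database never became reachable")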

entrypoint.sh Executable file

@@ -0,0 +1,4 @@
#!/bin/sh
if [[ -n "$ENTRY_COMMAND" ]]; then
/bin/sh -c "$ENTRY_COMMAND"
fi


@@ -44,6 +44,7 @@ pypdf==5.2.0
pytesseract==0.3.13
python-dotenv==1.0.1
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
requests==2.32.3
rich==13.9.4