diff --git a/classifier/classifier.py b/classifier/classifier.py index 51c87fc..b332771 100644 --- a/classifier/classifier.py +++ b/classifier/classifier.py @@ -41,7 +41,7 @@ def store_training_data(i, d,key=b"answered"): # Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft) -def get_training_threads(key="answered", filter=[]): +def get_training_threads(key="answered", filters=[]): if not data_types.has_key(key): raise ValueError("Key "+str(key)+" unknown") #------------------------------------ @@ -49,12 +49,23 @@ def get_training_threads(key="answered", filter=[]): d_a=[] d_a2=[] #------------------------------------ - for i in train: - if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein - t=db_session.query(MailThread).filter(MailThread.firstmail==i).first() - if not t is None: # Thread muss in der Datenbank sein - t_a.append(t) - d_a.append(train[i][key]) + if "db" in filters: + tt=db_session.query(MailThread).filter(MailThread.istrained==True).all() + for t in tt: + t_a.append(t) + if key =="answered": + d_a.append(t.answered) + elif key=="maintopic": + d_a.append(t.maintopic) + + + else: + for i in train: + if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein + t=db_session.query(MailThread).filter(MailThread.firstmail==i).first() + if not t is None: # Thread muss in der Datenbank sein + t_a.append(t) + d_a.append(train[i][key]) le=LabelEncoder() d_a2=le.fit_transform(d_a) return (t_a,d_a2,le) @@ -91,9 +102,9 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin): def transform(self, X,y=None): return [t.text() for t in X] -def get_pipe(p=b"pipe1",k=b"answered"): +def get_pipe(p=b"pipe1",k=b"answered",filters=[]): p=build_pipe(p) - tt= get_training_threads(k) + tt= get_training_threads(k,filters) if len(tt[0]) > 0: p.fit(tt[0],tt[1]) return p,tt[2] diff --git a/classifier/training.py b/classifier/training.py index cf68507..f340d3d 100644 --- a/classifier/training.py +++ b/classifier/training.py @@ -60,7 +60,7 @@ def train_single_thread(tid,p,le,key="answered"): l=le.inverse_transform([ca])[0] if type(l) is numpy.bool_: l=bool(l) - if type(l) is numpy.string_: + if type(l) is numpy.string_ or type(l) is numpy.unicode_: l=str(l) store_training_data(tid,l, key) elif not ca.strip() == "": diff --git a/create_migration b/create_migration new file mode 100755 index 0000000..5d39afb --- /dev/null +++ b/create_migration @@ -0,0 +1,27 @@ +#!/bin/bash +if [ $# -eq 0 ] +then +echo "No Arguments supplied" +exit +fi + +echo "creating a new migration" +./migration.py compare_model_to_db storage.metadata + +echo "Dump current database state to file" +./migration.py create_model > oldmodel.py + +ls db_repository/versions +echo "Choose a filename for the new migration" +read filename + +./migration.py make_update_script_for_model --oldmodel=oldmodel:meta --model=storage:metadata > db_repository/versions/$filename.py + +cp test.sqlite test.sqlite.bak +./migration.py test +rm test.sqlite +mv test.sqlite.bak test.sqlite + + + +rm oldmodel.py \ No newline at end of file diff --git a/data.yml b/data.yml index 0a3f95b..9334d62 100644 --- a/data.yml +++ b/data.yml @@ -1,38 +1,38 @@ -{26808: {maintopic: jobausschreibung}, 27008: {lang: de}, 27017: {lang: de, maintopic: jobausschreibung}, - 27061: {lang: de}, 27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, - 27086: {maintopic: information}, 27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, - 27102: {lang: en, maintopic: studium}, 27118: {maintopic: information}, 27127: { - maintopic: studium}, 27130: {maintopic: information}, 27133: {maintopic: information}, - 27141: {maintopic: information}, 27146: {maintopic: information}, 27166: {maintopic: umfragen}, - 27171: {maintopic: ausleihen}, 27178: {maintopic: studium}, 27182: {maintopic: studium}, - 27197: {maintopic: information}, 27201: {maintopic: information}, 27218: {maintopic: information}, - 27219: {maintopic: studium}, 27222: {maintopic: information}, 27226: {maintopic: ausleihen}, - 27263: {maintopic: ausleihen}, 27267: {maintopic: ausleihen}, 27420: {answered: true, - maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false, - maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: { - answered: false, lang: de, maintopic: information}, 27435: {answered: false}, - 27438: {answered: false, maintopic: information}, 27439: {answered: true, maintopic: studium}, - 27441: {answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen}, - 27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information}, - 27456: {answered: false, lang: de, maintopic: studium}, 27457: {answered: false, - maintopic: jobausschreibung}, 27468: {answered: true, maintopic: studium}, 27489: { - answered: false, lang: en, maintopic: information}, 27490: {answered: false, maintopic: fachschaftenzeugs}, - 27491: {answered: false, maintopic: jobausschreibung}, 27492: {answered: false, - maintopic: information}, 27495: {answered: false, maintopic: information}, 27496: { - answered: true, maintopic: ausleihen}, 27497: {answered: false, maintopic: information}, - 27500: {answered: true, lang: en, maintopic: studium}, 27501: {answered: false, - lang: en, maintopic: information}, 27514: {answered: true, maintopic: studium}, - 27515: {answered: true, lang: en, maintopic: studium}, 27518: {answered: true, maintopic: studium}, - 27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false, - maintopic: studium}, 27536: {answered: true, lang: de, maintopic: studium}, 27541: { - answered: true, maintopic: studium}, 27542: {answered: false, maintopic: studium}, - 27543: {answered: false, maintopic: information}, 27544: {answered: true, maintopic: studium}, - 27545: {answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information}, - 27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: { - answered: false, maintopic: information}, 27553: {answered: false, maintopic: information}, - 27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: { - answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, 27565: { - answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information}, +{26808: {maintopic: jobausschreibung}, 26992: {maintopic: jobausschreibung}, 27008: { + lang: de}, 27017: {lang: de, maintopic: jobausschreibung}, 27061: {lang: de}, + 27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information}, + 27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: { + lang: en, maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium}, + 27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information}, + 27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen}, + 27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information}, + 27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium}, + 27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27263: {maintopic: ausleihen}, + 27267: {maintopic: ausleihen}, 27420: {answered: true, maintopic: studium}, 27422: { + answered: true, maintopic: studium}, 27425: {answered: false, maintopic: studium}, + 27431: {answered: false, maintopic: information}, 27434: {answered: false, lang: de, + maintopic: information}, 27435: {answered: false}, 27438: {answered: false, maintopic: information}, + 27439: {answered: true, maintopic: studium}, 27441: {answered: false, maintopic: studium}, + 27444: {answered: true, maintopic: ausleihen}, 27454: {answered: false, maintopic: information}, + 27455: {answered: false, maintopic: information}, 27456: {answered: false, lang: de, + maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung}, 27468: { + answered: true, maintopic: studium}, 27489: {answered: false, lang: en, maintopic: information}, + 27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false, + maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information}, + 27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen}, + 27497: {answered: false, maintopic: information}, 27500: {answered: true, lang: en, + maintopic: studium}, 27501: {answered: false, lang: en, maintopic: information}, + 27514: {answered: true, maintopic: studium}, 27515: {answered: true, lang: en, maintopic: studium}, + 27518: {answered: true, maintopic: studium}, 27523: {answered: false, maintopic: jobausschreibung}, + 27526: {answered: false, maintopic: studium}, 27536: {answered: true, lang: de, + maintopic: studium}, 27541: {answered: true, maintopic: studium}, 27542: {answered: false, + maintopic: studium}, 27543: {answered: false, maintopic: information}, 27544: { + answered: true, maintopic: studium}, 27545: {answered: false, maintopic: umfragen}, + 27546: {answered: false, maintopic: information}, 27547: {answered: false, maintopic: studium}, + 27549: {answered: false}, 27550: {answered: false, maintopic: information}, 27553: { + answered: false, maintopic: information}, 27558: {answered: false}, 27560: {answered: false, + maintopic: ausleihen}, 27562: {answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, + 27565: {answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information}, 27567: {answered: false, maintopic: information}, 27568: {answered: false}, 27575: { answered: false, maintopic: information}, 27577: {answered: false, maintopic: information}, 27579: {answered: true, maintopic: diplomarbeit}, 27582: {answered: false, maintopic: studium}, diff --git a/db_repository/versions/005_answered_lang.py b/db_repository/versions/005_answered_lang.py new file mode 100644 index 0000000..84e096b --- /dev/null +++ b/db_repository/versions/005_answered_lang.py @@ -0,0 +1,39 @@ +from sqlalchemy import * +from migrate import * + + +from migrate.changeset import schema +pre_meta = MetaData() +post_meta = MetaData() +threads = Table('threads', post_meta, + Column('created_at', TIMESTAMP, nullable=False), + Column('updated_at', TIMESTAMP, nullable=False), + Column('id', Integer, primary_key=True, nullable=False), + Column('firstmail', Integer), + Column('date', DateTime), + Column('islabeled', Boolean), + Column('istrained', Boolean), + Column('opened', Boolean), + Column('body', Text), + Column('maintopic', String), + Column('lang', String), + Column('answered', String), +) + + +def upgrade(migrate_engine): + # Upgrade operations go here. Don't create your own engine; bind + # migrate_engine to your metadata + pre_meta.bind = migrate_engine + post_meta.bind = migrate_engine + post_meta.tables['threads'].columns['answered'].create() + post_meta.tables['threads'].columns['lang'].create() + + +def downgrade(migrate_engine): + # Operations to reverse the above upgrade go here. + pre_meta.bind = migrate_engine + post_meta.bind = migrate_engine + post_meta.tables['threads'].columns['answered'].drop() + post_meta.tables['threads'].columns['lang'].drop() + diff --git a/flaskapp/__init__.py b/flaskapp/__init__.py index 5899ea6..2a43be7 100644 --- a/flaskapp/__init__.py +++ b/flaskapp/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import flask -from flask import Flask,jsonify,send_from_directory, render_template +from flask import Flask,jsonify,send_from_directory, render_template, request,redirect,url_for from config import Config import yaml import os @@ -14,12 +14,12 @@ package_directory = os.path.dirname(os.path.abspath(__file__)) cfg = Config(file(os.path.join(package_directory, 'config.cfg'))) -def render_index(mths,code=200): +def render_index(mths,opened=None,code=200): return render_template("index.html",mths=mths, - title=cfg.title.decode("utf8"), + title=cfg.title.decode("utf8"),opened=opened ), code from classifier import get_pipe -mail_threads=db_session.query(MailThread).all() +#mail_threads=db_session.query(MailThread).all() #pipe1,le=get_pipe("pipe1",b"answered") #pipe2,le2=get_pipe("pipe2b", b"maintopic") #pipe3,le3=get_pipe("pipe2b", b"lang") @@ -33,28 +33,62 @@ mail_threads=db_session.query(MailThread).all() # t.maintopic=maintopic[i] # t.lang=lang[i] +maintopic_values=["studium", "information","ausleihen"] + @app.route("/") def hello(): mth=db_session.query(MailThread).order_by(desc(MailThread.date)).all() return render_index(mth) -@app.route("/answered//") -def store_answered(id, value): +def store_value(id,key,value): mth=db_session.query(MailThread).filter(MailThread.firstmail==int(id)).first() - value= value in ["true", "True", "1", "t"] - mth.answered=bool(value) - mth.opened=bool(value) - return render_index([mth]) + + if key =="answered": + value = value in ["true", "True", "1", "t"] + mth.answered=bool(value) + mth.opened=bool(value) + if key=="maintopic" and value in maintopic_values: + mth.maintopic=str(value) + if key =="trained": + value = value in ["true", "True", "1", "t"] + mth.istrained=bool(value) + +@app.route("/") +def store_answered(id): + + key = request.args.get('key') + value = request.args.get('value') + if not key is None and not value is None: + store_value(id,key,value) + + return render_index([mth], opened=id) -@app.route("/studium") +@app.route("/studium/") def studium(): mth=db_session.query(MailThread).filter(MailThread.maintopic=="studium").order_by(desc(MailThread.date)).all() return render_index(mth) -@app.route("/") +@app.route("//") def maintopic(maintopic): mth=db_session.query(MailThread).filter(MailThread.maintopic=="%s" % maintopic).order_by(desc(MailThread.date)).all() return render_index(mth) + +@app.route("//") +def maintopic_store(maintopic,id): + if maintopic == "trained": + mth=db_session.query(MailThread).filter(MailThread.istrained==True).order_by(desc(MailThread.date)).all() + else: + mth=db_session.query(MailThread).filter(MailThread.maintopic=="%s" % maintopic).order_by(desc(MailThread.date)).all() + + key = request.args.get('key') + value = request.args.get('value') + + + if not key is None and not value is None: + store_value(id,key,value) + return redirect(url_for('maintopic_store', id=id, maintopic=maintopic), 302) + else: + return render_index(mth,opened=id) diff --git a/flaskapp/templates/index.html b/flaskapp/templates/index.html index dc96d23..172aa41 100644 --- a/flaskapp/templates/index.html +++ b/flaskapp/templates/index.html @@ -1,13 +1,13 @@ {{title}} - - - + + + @@ -19,19 +19,20 @@
{% for m in mths %} -
+
-
+
-
- {{m.maintopic}} +
+ answered:{{(not m.is_answered())}} + {{m.maintopic}}, {{ m.istrained }} trained:{{(not m.istrained)}}
{{ m.print_text() }}
diff --git a/run.py b/run.py index 9544bf4..4406645 100644 --- a/run.py +++ b/run.py @@ -34,18 +34,18 @@ if len(sys.argv)>1: pipe1,le=get_pipe("pipe1",b"answered") pipe2,le2=get_pipe("pipe2b", b"maintopic") pipe3,le3=get_pipe("pipe2b", b"lang") - mail_threads=db_session.query(MailThread).all() + mail_threads=db_session.query(MailThread).filter(MailThread.istrained==False).all() answered=le.inverse_transform(pipe1.predict(mail_threads)) maintopic=le2.inverse_transform(pipe2.predict(mail_threads)) lang=le3.inverse_transform(pipe3.predict(mail_threads)) for i, t in enumerate(mail_threads): - t.answered=answered[i] - t.opened=answered[i] + t.answered=bool(answered[i]) + t.opened=bool(answered[i]) - t.maintopic=maintopic[i] - t.lang=lang[i] + t.maintopic=str(maintopic[i]) + t.lang=str(lang[i]) db_session.add(t) db_session.commit() @@ -74,8 +74,15 @@ if len(sys.argv)>1: mth=db_session.query(MailThread).all() for t in mth: t.compile() - - + + if sys.argv[1] == "trained_threads_from_yml": + from classifier.classifier import train + for k in train: + print k + t=db_session.query(MailThread).filter(MailThread.firstmail==k).first() + t.istrained=True + db_session.add(t) + db_session.commit() if sys.argv[1] == "print_threads2": mth=db_session.query(MailThread).all() for t in mth: @@ -83,8 +90,8 @@ if len(sys.argv)>1: print "---------------\n" if sys.argv[1] == "train_thrd2": - p, le=get_pipe("pipe2", "maintopic") - pb, lb =get_pipe("pipe2b", "maintopic") + p, le=get_pipe("pipe2", "maintopic",["db"]) + pb, lb =get_pipe("pipe2b", "maintopic",["db"]) train_single_thread(int(sys.argv[2]),p,le,b"maintopic") @@ -120,7 +127,8 @@ if len(sys.argv)>1: t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first() print t.to_text() print le.inverse_transform(pipe2.predict([t])) - + + if sys.argv[1] == "train_thrd": pipe1, labelencoder=train_fit_pipe() diff --git a/storage/thread_model.py b/storage/thread_model.py index 79c3e63..69b96fa 100644 --- a/storage/thread_model.py +++ b/storage/thread_model.py @@ -29,16 +29,19 @@ class MailThread(Base): opened = Column(Boolean) body = Column(Text) maintopic=Column(String) + lang=Column(String) + answered=Column(String) __schema__=FullThreadSchema __jsonid__='thread' __whiteattrs__= ["body"] __jsonattrs__=None - answered=False +# answered=False # maintopic="information" - lang="" +# lang="" def bdy(self): return yaml.load(self.body) - + def is_answered(self): + return self.answered in ["1", "true", "True", "t","T"] def to_text(self): mmm=self.mails() txt="" diff --git a/test.sqlite b/test.sqlite index 21a1c0a..5ef2874 100644 Binary files a/test.sqlite and b/test.sqlite differ