diff --git a/classifier/classifier.py b/classifier/classifier.py index c51cc4b..51c87fc 100644 --- a/classifier/classifier.py +++ b/classifier/classifier.py @@ -102,7 +102,7 @@ def get_pipe(p=b"pipe1",k=b"answered"): def test_pipe(pp,k): tt= get_training_threads(k) - X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.2) + X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.4) if type(pp) is list: for p in pp: print "pipe: %s" % p diff --git a/data.yml b/data.yml index 0589963..0a3f95b 100644 --- a/data.yml +++ b/data.yml @@ -7,9 +7,10 @@ 27171: {maintopic: ausleihen}, 27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information}, 27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium}, 27222: {maintopic: information}, 27226: {maintopic: ausleihen}, - 27420: {answered: true, maintopic: studium}, 27422: {answered: true, maintopic: studium}, - 27425: {answered: false, maintopic: studium}, 27431: {answered: false, maintopic: information}, - 27434: {answered: false, lang: de, maintopic: information}, 27435: {answered: false}, + 27263: {maintopic: ausleihen}, 27267: {maintopic: ausleihen}, 27420: {answered: true, + maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false, + maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: { + answered: false, lang: de, maintopic: information}, 27435: {answered: false}, 27438: {answered: false, maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen}, 27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information}, diff --git a/db_model_version_1.py b/db_model_version_1.py new file mode 100644 index 0000000..a493443 --- /dev/null +++ b/db_model_version_1.py @@ -0,0 +1,35 @@ + +## File autogenerated by genmodel.py + +from sqlalchemy import * + + +meta = MetaData() + +mails = Table('mails', meta, + Column('created_at', TIMESTAMP, nullable=False), + Column('updated_at', TIMESTAMP, nullable=False), + Column('id', INTEGER, primary_key=True, nullable=False), + Column('date', DATETIME), + Column('envelope', TEXT), + Column('body', TEXT), + Column('text', TEXT), + Column('from_', TEXT), + Column('from_mailbox', VARCHAR), + Column('from_host', VARCHAR), + Column('to_', TEXT), + Column('to_mailbox', TEXT), + Column('to_host', VARCHAR), + Column('subject', TEXT), +) + +threads = Table('threads', meta, + Column('created_at', TIMESTAMP, nullable=False), + Column('updated_at', TIMESTAMP, nullable=False), + Column('id', INTEGER, primary_key=True, nullable=False), + Column('firstmail', INTEGER), + Column('islabeled', BOOLEAN), + Column('opened', BOOLEAN), + Column('body', TEXT), +) + diff --git a/db_repository/README b/db_repository/README new file mode 100644 index 0000000..6218f8c --- /dev/null +++ b/db_repository/README @@ -0,0 +1,4 @@ +This is a database migration repository. + +More information at +http://code.google.com/p/sqlalchemy-migrate/ diff --git a/db_repository/__init__.py b/db_repository/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/db_repository/manage.py b/db_repository/manage.py new file mode 100644 index 0000000..d29890d --- /dev/null +++ b/db_repository/manage.py @@ -0,0 +1,6 @@ +#!.env/bin/python +from migrate.versioning.shell import main +import storage + +if __name__ == '__main__': + main(debug='False') diff --git a/db_repository/migrate.cfg b/db_repository/migrate.cfg new file mode 100644 index 0000000..0dd8945 --- /dev/null +++ b/db_repository/migrate.cfg @@ -0,0 +1,25 @@ +[db_settings] +# Used to identify which repository this database is versioned under. +# You can use the name of your project. +repository_id=service mail db repository + +# The name of the database table used to track the schema version. +# This name shouldn't already be used by your project. +# If this is changed once a database is under version control, you'll need to +# change the table name in each database too. +version_table=migrate_version + +# When committing a change script, Migrate will attempt to generate the +# sql for all supported databases; normally, if one of them fails - probably +# because you don't have that database installed - it is ignored and the +# commit continues, perhaps ending successfully. +# Databases in this list MUST compile successfully during a commit, or the +# entire commit will fail. List the databases your application will actually +# be using to ensure your updates to that database work properly. +# This must be a list; example: ['postgres','sqlite'] +required_dbs=[] + +# When creating new change scripts, Migrate will stamp the new script with +# a version number. By default this is latest_version + 1. You can set this +# to 'true' to tell Migrate to use the UTC timestamp instead. +use_timestamp_numbering=False diff --git a/db_repository/storage b/db_repository/storage new file mode 120000 index 0000000..5d252d7 --- /dev/null +++ b/db_repository/storage @@ -0,0 +1 @@ +storage \ No newline at end of file diff --git a/db_repository/versions/002_add_date_to_threads.py b/db_repository/versions/002_add_date_to_threads.py new file mode 100644 index 0000000..f837fd9 --- /dev/null +++ b/db_repository/versions/002_add_date_to_threads.py @@ -0,0 +1,33 @@ +from sqlalchemy import * +from migrate import * + + +from migrate.changeset import schema +pre_meta = MetaData() +post_meta = MetaData() +threads = Table('threads', post_meta, + Column('created_at', TIMESTAMP, nullable=False), + Column('updated_at', TIMESTAMP, nullable=False), + Column('id', Integer, primary_key=True, nullable=False), + Column('firstmail', Integer), + Column('date', DateTime), + Column('islabeled', Boolean), + Column('opened', Boolean), + Column('body', Text), +) + + +def upgrade(migrate_engine): + # Upgrade operations go here. Don't create your own engine; bind + # migrate_engine to your metadata + pre_meta.bind = migrate_engine + post_meta.bind = migrate_engine + post_meta.tables['threads'].columns['date'].create() + + +def downgrade(migrate_engine): + # Operations to reverse the above upgrade go here. + pre_meta.bind = migrate_engine + post_meta.bind = migrate_engine + post_meta.tables['threads'].columns['date'].drop() + diff --git a/db_repository/versions/003_maintopic.py b/db_repository/versions/003_maintopic.py new file mode 100644 index 0000000..62395c0 --- /dev/null +++ b/db_repository/versions/003_maintopic.py @@ -0,0 +1,34 @@ +from sqlalchemy import * +from migrate import * + + +from migrate.changeset import schema +pre_meta = MetaData() +post_meta = MetaData() +threads = Table('threads', post_meta, + Column('created_at', TIMESTAMP, nullable=False), + Column('updated_at', TIMESTAMP, nullable=False), + Column('id', Integer, primary_key=True, nullable=False), + Column('firstmail', Integer), + Column('date', DateTime), + Column('islabeled', Boolean), + Column('opened', Boolean), + Column('body', Text), + Column('maintopic', String), +) + + +def upgrade(migrate_engine): + # Upgrade operations go here. Don't create your own engine; bind + # migrate_engine to your metadata + pre_meta.bind = migrate_engine + post_meta.bind = migrate_engine + post_meta.tables['threads'].columns['maintopic'].create() + + +def downgrade(migrate_engine): + # Operations to reverse the above upgrade go here. + pre_meta.bind = migrate_engine + post_meta.bind = migrate_engine + post_meta.tables['threads'].columns['maintopic'].drop() + diff --git a/db_repository/versions/__init__.py b/db_repository/versions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flaskapp/__init__.py b/flaskapp/__init__.py index 64e0b1b..b14876d 100644 --- a/flaskapp/__init__.py +++ b/flaskapp/__init__.py @@ -5,7 +5,7 @@ from flask import Flask,jsonify,send_from_directory, render_template from config import Config import yaml import os - +from sqlalchemy import desc from storage import MailThread,db_session app = Flask(__name__, template_folder="templates", static_folder="static") @@ -20,20 +20,35 @@ def render_index(mths,code=200): ), code from classifier import get_pipe mail_threads=db_session.query(MailThread).all() -pipe1,le=get_pipe("pipe1",b"answered") -pipe2,le2=get_pipe("pipe2b", b"maintopic") -pipe3,le3=get_pipe("pipe2b", b"lang") +#pipe1,le=get_pipe("pipe1",b"answered") +#pipe2,le2=get_pipe("pipe2b", b"maintopic") +#pipe3,le3=get_pipe("pipe2b", b"lang") -answered=le.inverse_transform(pipe1.predict(mail_threads)) -maintopic=le2.inverse_transform(pipe2.predict(mail_threads)) -lang=le3.inverse_transform(pipe3.predict(mail_threads)) +#answered=le.inverse_transform(pipe1.predict(mail_threads)) +#maintopic=le2.inverse_transform(pipe2.predict(mail_threads)) +#lang=le3.inverse_transform(pipe3.predict(mail_threads)) + +#for i, t in enumerate(mail_threads): +# t.answered=answered[i] +# t.maintopic=maintopic[i] +# t.lang=lang[i] -for i, t in enumerate(mail_threads): - t.answered=answered[i] - t.maintopic=maintopic[i] - t.lang=lang[i] @app.route("/") def hello(): - mth=mail_threads + mth=db_session.query(MailThread).order_by(desc(MailThread.date)).all() return render_index(mth) +@app.route("/answered//") +def store_answered(id, value): + mth=db_session.query(MailThread).filter(MailThread.firstmail==int(id)).first() + value= value in ["true", "True", "1", "t"] + mth.answered=bool(value) + mth.opened=bool(value) + return render_index([mth]) + + + +@app.route("/studium") +def studium(): + mth=db_session.query(MailThread).filter(MailThread.maintopic=="studium").order_by(desc(MailThread.date)).all() + return render_index(mth) diff --git a/flaskapp/templates/index.html b/flaskapp/templates/index.html index 112375e..d127712 100644 --- a/flaskapp/templates/index.html +++ b/flaskapp/templates/index.html @@ -6,27 +6,32 @@ +
-

{{title}}

- -
+

{{title}}

+ +
{% for m in mths %} -
+
-
{{ m.text() }}
+
{{ m.print_text() }}
@@ -36,7 +41,7 @@ {% for m in mths %} ID: {{m.tstr()}} - {{ m.text() }} + {{ m.print_text() }} {% endfor %}
diff --git a/migration.py b/migration.py new file mode 100755 index 0000000..9add6ea --- /dev/null +++ b/migration.py @@ -0,0 +1,6 @@ +#!.env/bin/python +from migrate.versioning.shell import main +import storage + +if __name__ == '__main__': + main(debug='False',repository="db_repository", url="sqlite:///test.sqlite") diff --git a/run.py b/run.py index 6c94f7c..9544bf4 100644 --- a/run.py +++ b/run.py @@ -16,7 +16,6 @@ from storage import Mail, MailThread, db_session #import yaml #import email from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe, test_pipe, train_single_thread # , pipe2, pipe2b -from flaskapp import app def predict_thread(p,l,t): @@ -27,10 +26,31 @@ def predict_thread(p,l,t): #print "arg1:"+sys.argv[1] if len(sys.argv)>1: + if sys.argv[1] == "fetch_threads": print flatten_threads(fetch_threads()) + if sys.argv[1] == "predict_threads": + pipe1,le=get_pipe("pipe1",b"answered") + pipe2,le2=get_pipe("pipe2b", b"maintopic") + pipe3,le3=get_pipe("pipe2b", b"lang") + mail_threads=db_session.query(MailThread).all() + + answered=le.inverse_transform(pipe1.predict(mail_threads)) + maintopic=le2.inverse_transform(pipe2.predict(mail_threads)) + lang=le3.inverse_transform(pipe3.predict(mail_threads)) + + for i, t in enumerate(mail_threads): + t.answered=answered[i] + t.opened=answered[i] + + t.maintopic=maintopic[i] + t.lang=lang[i] + db_session.add(t) + db_session.commit() + if sys.argv[1] == "run_server": + from flaskapp import app app.run(port=3000,debug=True) if sys.argv[1] == "print_threads": diff --git a/storage/__init__.py b/storage/__init__.py index 0185283..2a9def7 100644 --- a/storage/__init__.py +++ b/storage/__init__.py @@ -1,3 +1,5 @@ from database import db_session, init_db from mail_model import Mail from thread_model import MailThread +from database import Base +metadata=Base.metadata diff --git a/storage/mail_model.py b/storage/mail_model.py index 5157c4d..37848f1 100644 --- a/storage/mail_model.py +++ b/storage/mail_model.py @@ -75,6 +75,8 @@ class Mail(Base): from_array=[] # print "Status" # print env + if not env.date is None: + self.date=env.date if not env.to is None: for t in env.to: a={"host": t.host, "mail": t.mailbox} diff --git a/storage/thread_model.py b/storage/thread_model.py index 0bfab63..35641c4 100644 --- a/storage/thread_model.py +++ b/storage/thread_model.py @@ -23,15 +23,17 @@ class MailThread(Base): __tablename__ = 'threads' id = Column(Integer, primary_key=True) firstmail = Column(Integer) + date = Column(DateTime) islabeled = Column(Boolean) opened = Column(Boolean) body = Column(Text) + maintopic=Column(String) __schema__=FullThreadSchema __jsonid__='thread' __whiteattrs__= ["body"] __jsonattrs__=None answered=False - maintopic="information" +# maintopic="information" lang="" def bdy(self): return yaml.load(self.body) @@ -51,7 +53,7 @@ class MailThread(Base): def tstr(self): fr=yaml.load(self.mails()[0].from_) - return "(" + str(self.answered)+ ", "+ str(self.maintopic)+ ", "+ str(self.lang) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject)) + return "(" + str(self.opened)+ ", "+ str(self.maintopic)+ ", "+ str(self.lang) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject)) def mails(self): a=[] @@ -94,7 +96,34 @@ class MailThread(Base): m.compile_text() db_session.add(m) db_session.commit() - + self.date=self.mails()[0].date + + def print_text(self,filter="all"): + a=u"" + def mail_txt(m): + #txt ="Gesendet von: "+ str(m.from_mailbox)+"@"+str(m.from_host) +"\n" + txt="" + fr=yaml.load(m.from_) + txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(m.date) + "\n" + t=yaml.load(m.text) + if type(t) is unicode: + #txt=txt + txt=txt+t + else: + t=t.decode("ISO-8859-1") + txt=txt+t + return txt + + if filter=="all": + mm=self.mails() + for m in mm: + a=a+mail_txt(m)+"\n****........................................***\n" + elif filter=="first": + a=mail_txt(m[0]) + a=re.sub(r'\n\s*\n',r'\n',a) + a=re.sub(r'',r'',a,flags=re.MULTILINE|re.DOTALL) + a=re.sub(r'\s*>+ .*\n',r'',a) + return a def text(self,filter="all"): a=u"" def mail_txt(m): @@ -108,7 +137,7 @@ class MailThread(Base): mm=self.mails() if filter=="all": for m in mm: - a=a+mail_txt(m)+"****........................................***\n" + a=a+mail_txt(m)+"\n****........................................***\n" elif filter=="first": a=mail_txt(m[0]) a=re.sub(r'\n\s*\n',r'\n',a) diff --git a/test.sqlite b/test.sqlite index b747096..21a1c0a 100644 Binary files a/test.sqlite and b/test.sqlite differ