diff --git a/classifier/__init__.py b/classifier/__init__.py
index 7303a39..cf5c18a 100644
--- a/classifier/__init__.py
+++ b/classifier/__init__.py
@@ -2,3 +2,4 @@ from classifier import in_training, print_answers
 from classifier import get_pipe, test_pipe, get_training_threads
 from training import train_single_thread
 from classifier import store_training_data
+from prediction import predict_threads
diff --git a/classifier/classifier.py b/classifier/classifier.py
index b332771..66ef5ab 100644
--- a/classifier/classifier.py
+++ b/classifier/classifier.py
@@ -9,14 +9,14 @@ from sklearn.model_selection import train_test_split
 import numpy as np
 import yaml
 from storage import MailThread,db_session
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, confusion_matrix
 
-with open("data.yml", 'r') as stream:
-    try:
-        train=yaml.load(stream)
-    except yaml.YAMLError as exc:
-        print(exc)
+#with open("data.yml", 'r') as stream:
+#    try:
+#        train=yaml.load(stream)
+#    except yaml.YAMLError as exc:
+#        print(exc)
 
 data_types= { "answered": bool, "maintopic": str, "lang": str}
@@ -50,25 +50,34 @@ def get_training_threads(key="answered", filters=[]):
     d_a2=[]
     #------------------------------------
     if "db" in filters:
-        tt=db_session.query(MailThread).filter(MailThread.istrained==True).all()
+        q=db_session.query(MailThread).filter(MailThread.istrained.is_(True))
+        if "de" in filters:
+            q=q.filter(MailThread.lang=="de")
+        elif "en" in filters:
+            q=q.filter(MailThread.lang=="en")
+        tt=q.all()
         for t in tt:
            t_a.append(t)
            if key =="answered":
-               d_a.append(t.answered)
+               d_a.append(t.is_answered())
            elif key=="maintopic":
                d_a.append(t.maintopic)
-
-
+           elif key=="lang":
+               d_a.append(t.lang)
     else:
-        for i in train:
-            if train[i].has_key(key): # the relevant key must be present in the training data
-                t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
-                if not t is None: # the thread must exist in the database
-                    t_a.append(t)
-                    d_a.append(train[i][key])
+        raise ValueError("database filter is now required")
     le=LabelEncoder()
     d_a2=le.fit_transform(d_a)
     return (t_a,d_a2,le)
+
+
+    # else:
+    #     for i in train:
+    #         if train[i].has_key(key): # the relevant key must be present in the training data
+    #             t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
+    #             if not t is None: # the thread must exist in the database
+    #                 t_a.append(t)
+    #                 d_a.append(train[i][key])
 
 
 def in_training(i, key="answered"):
@@ -102,17 +111,24 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin):
     def transform(self, X,y=None):
         return [t.text() for t in X]
 
+class ThreadFirstTextExtractor(BaseEstimator, TransformerMixin):
+    def fit(self, x, y=None):
+        return self
+    def transform(self, X,y=None):
+        return [t.text("first") for t in X]
+
 def get_pipe(p=b"pipe1",k=b"answered",filters=[]):
     p=build_pipe(p)
     tt= get_training_threads(k,filters)
+    #print tt
     if len(tt[0]) > 0:
         p.fit(tt[0],tt[1])
         return p,tt[2]
     else:
         return None, None
 
-def test_pipe(pp,k):
-    tt= get_training_threads(k)
+def test_pipe(pp,k,f=[]):
+    tt= get_training_threads(k,f)
     X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.4)
     if type(pp) is list:
         for p in pp:
@@ -120,8 +136,9 @@
             p=build_pipe(p)
             p.fit(X_train,y_train)
             ypred=p.predict(X_test)
+            print tt[2].classes_
             print accuracy_score(y_test,ypred)
-
+            print confusion_matrix(y_test,ypred)
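
The reworked test_pipe above now prints the encoder's class names together with the accuracy and a confusion matrix, which makes per-label errors visible. The following sketch is illustration only: it reproduces the same evaluation steps (60/40 hold-out split, accuracy, confusion matrix keyed to the LabelEncoder classes) on invented toy texts instead of MailThread objects and the project's pipe2* pipelines.

# Minimal sketch (not project code): the evaluation steps the revised
# test_pipe performs, on toy data standing in for MailThread objects.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

texts = ["exam registration", "library loan", "exam dates", "loan extension"] * 10
labels = ["studium", "ausleihen", "studium", "ausleihen"] * 10

le = LabelEncoder()
y = le.fit_transform(labels)

pipe = Pipeline([("cv", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("mlc", MLPClassifier(max_iter=500))])

X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=0.4)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print(le.classes_)                  # row/column order of the matrix below
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
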
@@ -173,6 +190,137 @@ def build_pipe(p=b"pipe1"):
             })),
         ('mlc', MLPClassifier())
     ])
+    elif p=="pipe2d":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                                      ('cv', CountVectorizer()),
+                                      ('tfidf', TfidfTransformer())
+                                      ])),
+                ('text', Pipeline([('tte', ThreadTextExtractor()),
+                                   ('cv', CountVectorizer()),
+                                   ('tfidf', TfidfTransformer())
+                                   ])),
+                ('firsttext', Pipeline([('tte', ThreadFirstTextExtractor()),
+                                        ('cv', CountVectorizer()),
+                                        ('tfidf', TfidfTransformer())
+                                        ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                                       ('dv', DictVectorizer())
+                                       ]))
+            ], transformer_weights={
+                'subject': 1.3,
+                'text': 1,
+                'firsttext': 0.9,
+                'envelope': 0.2
+            })),
+            ('mlc', MLPClassifier())
+        ])
+
+    elif p=="pipe2e":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                                      ('cv', CountVectorizer()),
+                                      ('tfidf', TfidfTransformer())
+                                      ])),
+                ('text', Pipeline([('tte', ThreadTextExtractor()),
+                                   ('cv', CountVectorizer()),
+                                   ('tfidf', TfidfTransformer())
+                                   ])),
+                ('firsttext', Pipeline([('tte', ThreadFirstTextExtractor()),
+                                        ('cv', CountVectorizer()),
+                                        ('tfidf', TfidfTransformer())
+                                        ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                                       ('dv', DictVectorizer())
+                                       ]))
+            ], transformer_weights={
+                'subject': 1.3,
+                'text': 1,
+                'firsttext': 0.9,
+                'envelope': 0.2
+            })),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
+        ])
+    elif p=="pipe2e1":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                                      ('cv', CountVectorizer()),
+                                      ('tfidf', TfidfTransformer())
+                                      ])),
+                ('text', Pipeline([('tte', ThreadTextExtractor()),
+                                   ('cv', CountVectorizer()),
+                                   ('tfidf', TfidfTransformer())
+                                   ])),
+                ('firsttext', Pipeline([('tte', ThreadFirstTextExtractor()),
+                                        ('cv', CountVectorizer()),
+                                        ('tfidf', TfidfTransformer())
+                                        ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                                       ('dv', DictVectorizer())
+                                       ]))
+            ], transformer_weights={
+                'subject': 1.3,
+                'text': 1,
+                'firsttext': 0.9,
+                'envelope': 0.2
+            })),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,50)))
+        ])
+    elif p=="pipe2f":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                                      ('cv', CountVectorizer(ngram_range=(1,1))),
+                                      ('tfidf', TfidfTransformer())
+                                      ])),
+                ('text', Pipeline([('tte', ThreadTextExtractor()),
+                                   ('cv', CountVectorizer(ngram_range=(1,1))),
+                                   ('tfidf', TfidfTransformer())
+                                   ])),
+                ('firsttext', Pipeline([('tte', ThreadFirstTextExtractor()),
+                                        ('cv', CountVectorizer(ngram_range=(1,2))),
+                                        ('tfidf', TfidfTransformer())
+                                        ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                                       ('dv', DictVectorizer())
+                                       ]))
+            ], transformer_weights={
+                'subject': 1.3,
+                'text': 1,
+                'firsttext': 0.9,
+                'envelope': 0.2
+            })),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
+        ])
+    elif p=="pipe2g":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                                      ('cv', CountVectorizer(ngram_range=(1,1))),
+                                      ('tfidf', TfidfTransformer())
+                                      ])),
+                ('text', Pipeline([('tte', ThreadTextExtractor()),
+                                   ('cv', CountVectorizer(ngram_range=(1,1))),
+                                   ('tfidf', TfidfTransformer())
+                                   ])),
+                ('firsttext', Pipeline([('tte', ThreadFirstTextExtractor()),
+                                        ('cv', CountVectorizer(ngram_range=(1,2))),
+                                        ('tfidf', TfidfTransformer())
+                                        ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                                       ('dv', DictVectorizer())
+                                       ]))
+            ], transformer_weights={
+                'subject': 1.3,
+                'text': 1,
+                'firsttext': 0.9,
+                'envelope': 0.2
+            })),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,100)))
+        ])
     elif p=="pipe2c":
         p = Pipeline([
             ('union', FeatureUnion(transformer_list=[
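
All five new build_pipe variants (pipe2d through pipe2g) share one pattern: per-field extractor transformers feed separate CountVectorizer/TfidfTransformer branches, the envelope dictionary goes through a DictVectorizer, and a FeatureUnion with transformer_weights merges everything in front of an MLPClassifier; the variants differ only in ngram_range and hidden_layer_sizes. A condensed, self-contained sketch of that pattern follows. The Thread class and its fields are hypothetical stand-ins for MailThread, and only two of the four branches are shown.

# Sketch of the feature-union pattern used by pipe2d..pipe2g (illustration
# only; Thread is a hypothetical stand-in for MailThread).
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.neural_network import MLPClassifier

class Thread(object):                       # hypothetical stand-in for MailThread
    def __init__(self, subject, text, sender):
        self.subject, self.text, self.sender = subject, text, sender

class SubjectExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X): return [t.subject for t in X]

class EnvelopeExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X): return [{"from_host": t.sender} for t in X]

pipe = Pipeline([
    ("union", FeatureUnion(
        transformer_list=[
            ("subject", Pipeline([("ext", SubjectExtractor()),
                                  ("cv", CountVectorizer(ngram_range=(1, 1))),
                                  ("tfidf", TfidfTransformer())])),
            ("envelope", Pipeline([("ext", EnvelopeExtractor()),
                                   ("dv", DictVectorizer())])),
        ],
        # weights bias the classifier toward the subject line, as in pipe2d..g
        transformer_weights={"subject": 1.3, "envelope": 0.2})),
    ("mlc", MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500)),
])

threads = [Thread("library loan", "...", "uni.example"),
           Thread("exam dates", "...", "uni.example")] * 10
labels = ["ausleihen", "studium"] * 10
pipe.fit(threads, labels)
print(pipe.predict([Thread("loan question", "...", "uni.example")]))
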
diff --git a/classifier/prediction.py b/classifier/prediction.py
new file mode 100644
index 0000000..35eac79
--- /dev/null
+++ b/classifier/prediction.py
@@ -0,0 +1,22 @@
+from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe
+from storage import db_session, MailThread
+
+def predict_threads():
+    pipe1,le=get_pipe("pipe1",b"answered",["db"])
+    pipe2,le2=get_pipe("pipe2g", b"maintopic",["db"])
+    pipe3,le3=get_pipe("pipe2b", b"lang",["db"])
+    q=db_session.query(MailThread).filter(MailThread.istrained.op("IS NOT")(True))
+    mail_threads=q.all()
+    if len(mail_threads) ==0:
+        raise ValueError("no untrained threads found")
+    answered=le.inverse_transform(pipe1.predict(mail_threads))
+    maintopic=le2.inverse_transform(pipe2.predict(mail_threads))
+    lang=le3.inverse_transform(pipe3.predict(mail_threads))
+
+    for i, t in enumerate(mail_threads):
+        t.answered=bool(answered[i])
+        t.opened=bool(answered[i])
+        t.maintopic=str(maintopic[i])
+        t.lang=str(lang[i])
+        db_session.add(t)
+    db_session.commit()
diff --git a/flaskapp/__init__.py b/flaskapp/__init__.py
index 2a43be7..a1c1388 100644
--- a/flaskapp/__init__.py
+++ b/flaskapp/__init__.py
@@ -13,10 +13,11 @@
 package_directory = os.path.dirname(os.path.abspath(__file__))
 cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
 
+maintopic_values=["studium", "information","ausleihen","jobausschreibung", "umfragen"]
 def render_index(mths,opened=None,code=200):
     return render_template("index.html",mths=mths,
-                           title=cfg.title.decode("utf8"),opened=opened
+                           title=cfg.title.decode("utf8"),opened=opened,maintopics=maintopic_values
     ), code
 from classifier import get_pipe
 #mail_threads=db_session.query(MailThread).all()
@@ -33,7 +34,6 @@ from classifier import get_pipe
 #        t.maintopic=maintopic[i]
 #        t.lang=lang[i]
 
-maintopic_values=["studium", "information","ausleihen"]
 
 @app.route("/")
 def hello():
@@ -49,10 +49,13 @@ def store_value(id,key,value):
         mth.opened=bool(value)
     if key=="maintopic" and value in maintopic_values:
         mth.maintopic=str(value)
+    if key=="lang" and value in ["de", "en"]:
+        mth.lang=str(value)
     if key =="trained":
         value = value in ["true", "True", "1", "t"]
         mth.istrained=bool(value)
-
+    db_session.add(mth)
+    db_session.commit()
 
 @app.route("/")
 def store_answered(id):
@@ -60,6 +63,7 @@ def store_answered(id):
     value = request.args.get('value')
     if not key is None and not value is None:
         store_value(id,key,value)
+        mth=db_session.query(MailThread).filter(MailThread.firstmail==int(id)).first()
     return render_index([mth], opened=id)
 
@@ -73,7 +77,10 @@ def studium():
 
 @app.route("//")
 def maintopic(maintopic):
-    mth=db_session.query(MailThread).filter(MailThread.maintopic=="%s" % maintopic).order_by(desc(MailThread.date)).all()
+    if maintopic == "trained":
+        mth=db_session.query(MailThread).filter(MailThread.istrained==True).order_by(desc(MailThread.date)).all()
+    else:
+        mth=db_session.query(MailThread).filter(MailThread.maintopic=="%s" % maintopic).order_by(desc(MailThread.date)).all()
     return render_index(mth)
 
 @app.route("//")
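
Both prediction.py and the predict_threads branch in run.py (further below) select unlabelled threads with MailThread.istrained.op("IS NOT")(True) rather than istrained == False, so rows whose istrained column is still NULL are included. SQLAlchemy's built-in isnot() operator produces the same SQL; a minimal sketch, assuming the storage module exposes db_session and MailThread as in the diff above:

# Equivalent ways to select "not yet trained" threads, including rows where
# istrained is NULL (which `istrained == False` would miss).  Sketch only.
from storage import db_session, MailThread

untrained = (db_session.query(MailThread)
             .filter(MailThread.istrained.isnot(True))   # same SQL as .op("IS NOT")(True)
             .all())
print(len(untrained))
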
diff --git a/flaskapp/templates/index.html b/flaskapp/templates/index.html
index 172aa41..755065e 100644
--- a/flaskapp/templates/index.html
+++ b/flaskapp/templates/index.html
@@ -1,55 +1,104 @@
 [The markup of this hunk did not survive extraction. The recoverable fragments show that the rewritten index.html still renders {{title}}, now iterates the threads with {% for m in mths %} ... {% endfor %}, and prints each thread's mails via {% for txt in m.print_text() %} {{ txt }} {% endfor %}.]
diff --git a/run.py b/run.py
index 4406645..768745a 100644
--- a/run.py
+++ b/run.py
@@ -16,7 +16,8 @@ from storage import Mail, MailThread, db_session
 #import yaml
 #import email
 from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe, test_pipe, train_single_thread # , pipe2, pipe2b
-
+from classifier import predict_threads
+maintopic_values=["studium", "information","ausleihen","jobausschreibung", "umfragen"]
 
 def predict_thread(p,l,t):
     pre=p.predict([t])
@@ -29,13 +30,19 @@ if len(sys.argv)>1:
     if sys.argv[1] == "fetch_threads":
         print flatten_threads(fetch_threads())
-
+    if sys.argv[1] == "predict_threads2":
+        predict_threads()
     if sys.argv[1] == "predict_threads":
-        pipe1,le=get_pipe("pipe1",b"answered")
-        pipe2,le2=get_pipe("pipe2b", b"maintopic")
-        pipe3,le3=get_pipe("pipe2b", b"lang")
-        mail_threads=db_session.query(MailThread).filter(MailThread.istrained==False).all()
-
+        print "predicting threads"
+        pipe1,le=get_pipe("pipe1",b"answered",["db"])
+        pipe2,le2=get_pipe("pipe2g", b"maintopic",["db"])
+        pipe3,le3=get_pipe("pipe2b", b"lang",["db"])
+        q=db_session.query(MailThread).filter(MailThread.istrained.op("IS NOT")(True))
+
+        mail_threads=q.all()
+
+        if len(mail_threads) ==0:
+            raise ValueError("no untrained threads found")
         answered=le.inverse_transform(pipe1.predict(mail_threads))
         maintopic=le2.inverse_transform(pipe2.predict(mail_threads))
         lang=le3.inverse_transform(pipe3.predict(mail_threads))
@@ -48,7 +55,17 @@ if len(sys.argv)>1:
             t.lang=str(lang[i])
             db_session.add(t)
         db_session.commit()
+    if sys.argv[1]=="stats":
+        for topic in maintopic_values:
+            print topic
+            n_answ=db_session.query(MailThread).filter(MailThread.maintopic==topic).filter(MailThread.answered.op("IS")(True)).count()
+            n_nansw=db_session.query(MailThread).filter(MailThread.maintopic==topic).filter(MailThread.answered.op("IS NOT")(True)).count()
+            n_ges=db_session.query(MailThread).filter(MailThread.maintopic==topic).count()
+            print "%d answered and %d not answered of %d(%d) that are %d percent answered" % (n_answ,n_nansw, n_ges,n_answ+n_nansw, float(n_answ)/float(n_ges)*100.0)
+
+
+
     if sys.argv[1] == "run_server":
         from flaskapp import app
         app.run(port=3000,debug=True)
@@ -72,7 +89,9 @@ if len(sys.argv)>1:
         print t.text()
     if sys.argv[1] == "compile_threads":
         mth=db_session.query(MailThread).all()
-        for t in mth:
+        l=len(mth)
+        for i,t in enumerate(mth):
+            print "%d/%d" % (i,l)
             t.compile()
 
     if sys.argv[1] == "trained_threads_from_yml":
@@ -115,9 +134,16 @@ if len(sys.argv)>1:
             print t.text()
             predict_thread(pb,lb,t)
             train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
+    if sys.argv[1] == "benchpipe3":
+        test_pipe(["pipe2d","pipe2e","pipe2e1","pipe2f","pipe2g"],"maintopic",["db","de"])
     if sys.argv[1] == "benchpipe2":
-        test_pipe(["pipe2","pipe2b","pipe2c"],"maintopic")
+        test_pipe(["pipe2","pipe2b","pipe2c","pipe2d"],"maintopic",["db","de"])
+#        print "testing with db training data:"
+#        test_pipe(["pipe2b"],"maintopic",["db"])
+#        test_pipe(["pipe2b"],"maintopic",["db"])
+#        print "testing only with german data"
+#        test_pipe(["pipe2b"],"maintopic",["db","de"])
 
     if sys.argv[1] == "testpipe2":
         from classifier import ThreadSubjectExtractor, ThreadTextExtractor
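
The new stats command prints per-topic answered counts and a percentage. A compact sketch of the same aggregation, with a guard for topics that have no threads yet (the command above would divide by zero there); it assumes the project's storage module as used elsewhere in run.py:

# Sketch: per-topic answered ratio as computed by the "stats" command, with a
# guard for empty topics.  Assumes storage exposes db_session and MailThread.
from storage import db_session, MailThread

maintopic_values = ["studium", "information", "ausleihen", "jobausschreibung", "umfragen"]

for topic in maintopic_values:
    base = db_session.query(MailThread).filter(MailThread.maintopic == topic)
    n_total = base.count()
    n_answered = base.filter(MailThread.answered.is_(True)).count()
    if n_total == 0:
        print("%s: no threads" % topic)
        continue
    print("%s: %d/%d answered (%.1f%%)" % (topic, n_answered, n_total,
                                           100.0 * n_answered / n_total))
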
diff --git a/storage/fetch_mail.py b/storage/fetch_mail.py
index 2a10136..b8d275f 100644
--- a/storage/fetch_mail.py
+++ b/storage/fetch_mail.py
@@ -4,12 +4,17 @@ from datetime import date
 from config import Config
 f=file('config.cfg')
 cfg=Config(f)
-server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
-server.login(cfg.user, cfg.password)
-server.select_folder('INBOX')
-
+try:
+    server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
+    server.login(cfg.user, cfg.password)
+    server.select_folder('INBOX')
+except Exception:
+    print "error initializing server"
+    server=None
 
 def fetch_mail(myid):
+    if server is None:
+        raise ValueError("Server is None")
     m=server.fetch([myid],['ENVELOPE','RFC822'])
     m=m[myid]
     m["id"]=myid
@@ -19,7 +24,9 @@ def fetch_thread(tp):
     return tp
 
 def fetch_threads():
-    src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,05,01)])
+    if server is None:
+        raise ValueError("Server is None")
+    src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,02,01)])
     #, b'BEFORE', date(2017,08,01)
     return src
 
@@ -44,12 +51,17 @@ def store_threads(thrds):
         if th == None:
             th=MailThread()
             th.firstmail=t[0]
+            th.body=yaml.dump(t) # body e.g. (27422,27506), (27450,)
+            th.islabeled=False
+            th.opened=True
+            th.istrained=False
         elif not th.body == yaml.dump(t): # otherwise compare the body
             th.body=yaml.dump(t) # body e.g. (27422,27506), (27450,)
             th.islabeled=False
             th.opened=True
-        else:
-            th.body=yaml.dump(t)
+            th.istrained=False
+#        else:
+#            th.body=yaml.dump(t)
         db_session.add(th)
         db_session.commit()
diff --git a/storage/mail_model.py b/storage/mail_model.py
index 37848f1..23023ef 100644
--- a/storage/mail_model.py
+++ b/storage/mail_model.py
@@ -9,6 +9,7 @@ import yaml
 import email
 from fetch_mail import fetch_mail
 import bs4
+import re
 class FullMailSchema(Schema):
     id=fields.Integer()
     text=fields.String()
@@ -56,6 +57,8 @@ class Mail(Base):
         return mm
 
     def get_email(self):
+        if self.body is None:
+            raise ValueError("body not yet loaded")
         em=email.message_from_string(yaml.load(self.body))
         return em
 
@@ -91,11 +94,10 @@ class Mail(Base):
     def dict_envelope(self):
         d={}
-        i=0
-        for p in yaml.load(self.subject):
-            if p is not None:
-                d["subject_"+str(i)]=p
-                i=i+1
+        if self.to_ is None:
+            self.compile_envelope()
+        if self.to_ is None:
+            raise ValueError("Self.to_ of mail not yet compiled")
         i=0
         for p in yaml.load(self.to_):
             if p["host"] is not None:
@@ -126,3 +128,19 @@ class Mail(Base):
             self.text= yaml.dump(b4.get_text())
         else:
             self.text =yaml.dump( pl)
+    def print_text(self):
+        txt=""
+        fr=yaml.load(self.from_)
+        txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(self.date) + "\n"
+        t=yaml.load(self.text)
+        if type(t) is unicode:
+            #txt=txt
+            txt=txt+t
+        else:
+            t=t.decode("ISO-8859-1")
+            txt=txt+t
+        txt=re.sub(r'\n\s*\n',r'\n',txt)
+        txt=re.sub(r'',r'',txt,flags=re.MULTILINE|re.DOTALL)
+        txt=re.sub(r'\s*>+ .*\n',r'',txt)
+
+        return txt
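
Mail.print_text() above prepends the sender and date, then collapses blank lines and strips quoted reply lines before returning the text. The cleanup is plain re usage and can be tried on its own; the sample mail body below is invented for illustration.

# Stand-alone demo of the cleanup applied in Mail.print_text(): collapse runs
# of blank lines and drop quoted reply lines starting with ">".
import re

raw = ("Hello,\n\n\nis the book available?\n"
       "> On Monday you wrote:\n"
       ">> please return the book\n"
       "Thanks\n")

cleaned = re.sub(r'\n\s*\n', r'\n', raw)       # collapse blank lines
cleaned = re.sub(r'\s*>+ .*\n', r'', cleaned)  # drop quoted ("> ...") lines
print(cleaned)
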
diff --git a/storage/thread_model.py b/storage/thread_model.py
index 69b96fa..c6ff1ab 100644
--- a/storage/thread_model.py
+++ b/storage/thread_model.py
@@ -73,7 +73,7 @@ class MailThread(Base):
         a=[]
         # print "maildicts: "+ str(self.mails())
         for m in self.mails():
-            m.compile_envelope()
+#            m.compile_envelope()
             a.append(m.dict_envelope())
         return a
 
@@ -85,6 +85,9 @@ class MailThread(Base):
         for i in range(0,len(dc)):
             for k, v in dc[i].iteritems():
                 d["mail_"+str(i)+"_"+k]=v
+        for k, v in dc[-1].iteritems():
+            d["mail_last_"+k]=v
+
         return d
 
     def subject(self):
@@ -103,30 +106,13 @@ class MailThread(Base):
         self.date=self.mails()[0].date
 
     def print_text(self,filter="all"):
-        a=u""
-        def mail_txt(m):
-            #txt ="Gesendet von: "+ str(m.from_mailbox)+"@"+str(m.from_host) +"\n"
-            txt=""
-            fr=yaml.load(m.from_)
-            txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(m.date) + "\n"
-            t=yaml.load(m.text)
-            if type(t) is unicode:
-                #txt=txt
-                txt=txt+t
-            else:
-                t=t.decode("ISO-8859-1")
-                txt=txt+t
-            return txt
-
+        a=[]
         if filter=="all":
             mm=self.mails()
             for m in mm:
-                a=a+mail_txt(m)+"\n****........................................***\n"
+                a.append(m.print_text())
         elif filter=="first":
-            a=mail_txt(m[0])
-        a=re.sub(r'\n\s*\n',r'\n',a)
-        a=re.sub(r'',r'',a,flags=re.MULTILINE|re.DOTALL)
-        a=re.sub(r'\s*>+ .*\n',r'',a)
+            a.append(self.mails()[0].print_text())
         return a
 
     def text(self,filter="all"):
         a=u""
@@ -143,7 +129,7 @@ class MailThread(Base):
             for m in mm:
                 a=a+mail_txt(m)+"\n****........................................***\n"
         elif filter=="first":
-            a=mail_txt(m[0])
+            a=mail_txt(mm[0])
         a=re.sub(r'\n\s*\n',r'\n',a)
         a=re.sub(r'',r'',a,flags=re.MULTILINE|re.DOTALL)
         a=re.sub(r'\s*>+ .*\n',r'',a)
diff --git a/test.sqlite b/test.sqlite
index 5ef2874..cc6bf43 100644
Binary files a/test.sqlite and b/test.sqlite differ