diff --git a/classifier/__init__.py b/classifier/__init__.py new file mode 100644 index 0000000..7303a39 --- /dev/null +++ b/classifier/__init__.py @@ -0,0 +1,4 @@ +from classifier import in_training, print_answers +from classifier import get_pipe, test_pipe, get_training_threads +from training import train_single_thread +from classifier import store_training_data diff --git a/classifier/classifier.py b/classifier/classifier.py new file mode 100644 index 0000000..c51cc4b --- /dev/null +++ b/classifier/classifier.py @@ -0,0 +1,191 @@ +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer +from sklearn.preprocessing import LabelEncoder +from sklearn.pipeline import Pipeline, FeatureUnion +from sklearn.naive_bayes import MultinomialNB +from sklearn.neural_network import MLPClassifier +from sklearn.model_selection import train_test_split +import numpy as np +import yaml +from storage import MailThread,db_session +from sklearn.metrics import accuracy_score + + +with open("data.yml", 'r') as stream: + try: + train=yaml.load(stream) + except yaml.YAMLError as exc: + print(exc) + +data_types= { "answered": bool, "maintopic": str, "lang": str} + +def set_train_data(i,d,key=b"answered"): + global train + #------------------------------------ + if not data_types.has_key(key): + raise ValueError("Key "+str(key)+" unknown") + if not train.has_key(i) or train[i] is None: + train[i]={} + if not type(d) is data_types[key]: + raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d))) + #------------------------------------ + train[i][key]=d + + +def store_training_data(i, d,key=b"answered"): + set_train_data(i,d,key) + with open("data.yml","w") as file: + file.write(yaml.dump(train,default_flow_style=True)) + file.close() + + +# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft) +def get_training_threads(key="answered", filter=[]): + if not data_types.has_key(key): + raise ValueError("Key "+str(key)+" unknown") + #------------------------------------ + t_a=[] + d_a=[] + d_a2=[] + #------------------------------------ + for i in train: + if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein + t=db_session.query(MailThread).filter(MailThread.firstmail==i).first() + if not t is None: # Thread muss in der Datenbank sein + t_a.append(t) + d_a.append(train[i][key]) + le=LabelEncoder() + d_a2=le.fit_transform(d_a) + return (t_a,d_a2,le) + + +def in_training(i, key="answered"): + return train.has_key(i) and train[i].has_key(key) + + +def print_answers(l): + + cc=l.classes_ + c_id=l.transform(cc) + for i,c in enumerate(cc): + print str(i) + ": " + str(c) + return None + + +class ThreadDictExtractor(BaseEstimator, TransformerMixin): + def fit(self, x, y=None): + return self + def transform(self, X,y=None): + return [t.mail_flat_dict() for t in X] + +class ThreadSubjectExtractor(BaseEstimator, TransformerMixin): + def fit(self, x, y=None): + return self + def transform(self, X,y=None): + return [t.subject() for t in X] + +class ThreadTextExtractor(BaseEstimator, TransformerMixin): + def fit(self, x, y=None): + return self + def transform(self, X,y=None): + return [t.text() for t in X] + +def get_pipe(p=b"pipe1",k=b"answered"): + p=build_pipe(p) + tt= get_training_threads(k) + if len(tt[0]) > 0: + p.fit(tt[0],tt[1]) + return p,tt[2] + else: + return None, None + +def test_pipe(pp,k): + tt= get_training_threads(k) + X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.2) + if type(pp) is list: + for p in pp: + print "pipe: %s" % p + p=build_pipe(p) + p.fit(X_train,y_train) + ypred=p.predict(X_test) + print accuracy_score(y_test,ypred) + + + + +def build_pipe(p=b"pipe1"): + if p == "pipe1": + p=Pipeline([('tde', ThreadDictExtractor()), + ('dv',DictVectorizer()), + ('clf', MultinomialNB()) + ]) + elif p=="pipe2": + p = Pipeline([ + ('union', FeatureUnion(transformer_list=[ + ('subject', Pipeline([('tse', ThreadSubjectExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('text', Pipeline([('tte',ThreadTextExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('envelope', Pipeline([('tde', ThreadDictExtractor()), + ('dv',DictVectorizer()) + ])) + ], transformer_weights={ + 'subject': 1, + 'text': 0.7, + 'envelope': 0.7 + } )), + ('clf', MultinomialNB()) + ]) + elif p=="pipe2b": + p = Pipeline([ + ('union', FeatureUnion(transformer_list=[ + ('subject', Pipeline([('tse', ThreadSubjectExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('text', Pipeline([('tte',ThreadTextExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('envelope', Pipeline([('tde', ThreadDictExtractor()), + ('dv',DictVectorizer()) + ])) + ], transformer_weights={ + 'subject': 1, + 'text': 0.7, + 'envelope': 0.7 + } )), + ('mlc', MLPClassifier()) + ]) + elif p=="pipe2c": + p = Pipeline([ + ('union', FeatureUnion(transformer_list=[ + ('subject', Pipeline([('tse', ThreadSubjectExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('text', Pipeline([('tte',ThreadTextExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('envelope', Pipeline([('tde', ThreadDictExtractor()), + ('dv',DictVectorizer()) + ])) + ], transformer_weights={ + 'subject': 1, + 'text': 1, + 'envelope': 0.4 + } )), + ('mlc', MLPClassifier()) + ]) + else: + raise ValueError("The pipe %s is not a valid pipe") + return p + + + diff --git a/classifier/classify_mail.py b/classifier/classify_mail.py new file mode 100644 index 0000000..00a87a7 --- /dev/null +++ b/classifier/classify_mail.py @@ -0,0 +1,25 @@ +from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer +from sklearn.feature_extraction import DictVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline, FeatureUnion +import sys +import yaml +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import LabelEncoder + + +text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())]) + +text_ohc = Pipeline([('ohc', OneHotEncoder()),('clf', MultinomialNB())]) + +combined_features = FeatureUnion([('vect1', CountVectorizer()),('vect2', CountVectorizer())]) + + +enc=OneHotEncoder() +with open("example_1.yaml", 'r') as stream: + try: + train=yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + +tc=text_clf.fit(train["data"],train["target"]) diff --git a/classifier/classify_text.py b/classifier/classify_text.py new file mode 100644 index 0000000..f95aac4 --- /dev/null +++ b/classifier/classify_text.py @@ -0,0 +1,42 @@ +from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline +text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())]) +import sys +import yaml + + + +with open("example_1.yaml", 'r') as stream: + try: + train=yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + +tc=text_clf.fit(train["data"],train["target"]) +print(sys.argv[1]) + +answ=(tc.predict([sys.argv[1]]))[0] +print train["target_names"][answ] + +for i in range(0, (len(train["target_names"]))): + print (str(i)+" "+ train["target_names"][i]) + +ca=int(raw_input("Correct answer..")) + + +if ca == answ: + print ("Yes I got it right") +else: + print("should I remember this?") + a=raw_input("shoudIrememberthis?") + if a == "y": + train["data"].append(sys.argv[1]) + train["target"].append(ca) + print yaml.dump(train,default_flow_style=False) + file=open("example_1.yaml","w") + file.write(yaml.dump(train,default_flow_style=False)) + file.close() + else: + print ("Ok, I already forgot") + diff --git a/classifier/training.py b/classifier/training.py new file mode 100644 index 0000000..cf68507 --- /dev/null +++ b/classifier/training.py @@ -0,0 +1,70 @@ +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder +import numpy +from storage import Mail, MailThread, db_session +from classifier import store_training_data, print_answers + + + +def train_fit_pipe(): + tt= get_training_threads(b"answered") + pipe1.fit(tt[0],tt[1]) + return pipe1,tt[2] +def train_fit_pipe2(): + tt= get_training_threads(b"maintopic") + pipe2.fit(tt[0],tt[1]) + return pipe2,tt[2] + +def train_fit_pipe2b(): + tt= get_training_threads(b"maintopic") + pipe2b.fit(tt[0],tt[1]) + return pipe2b,tt[2] + + +def predict_thread(mth,p,le,key): + #------------------------------------------------------- + if not type(p) is Pipeline: raise TypeError("Second Argument needs to be type Pipeline") + if not type(le) is LabelEncoder: raise TypeError("Second Argument needs to be type LabelEncoder") + #------------------------------------------------------- + pre=p.predict([mth]) + answ=pre[0] + print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]) + return answ + +def train_single_thread(tid,p,le,key="answered"): + if (not type(tid) is int): raise TypeError("ID must be of type int") + + mth=db_session.query(MailThread).filter(MailThread.firstmail==tid).first() + if mth is None: raise ValueError("Thread with firstmail %d not in Database" %tid) + print mth.firstmail + print mth.subject() + print mth.text() + + if not p is None and not le is None: + answ=predict_thread(mth,p,le,key) + else: answ=None + if not le is None: + print_answers(le) + + ca=raw_input("Correct answer..") + try: + ca=int(ca) + except ValueError: + print "String Data" + + if type(ca)==int: + if ca == answ: + print ("Yes I got it right") + else: + print("Oh no...!") + l=le.inverse_transform([ca])[0] + if type(l) is numpy.bool_: + l=bool(l) + if type(l) is numpy.string_: + l=str(l) + store_training_data(tid,l, key) + elif not ca.strip() == "": + store_training_data(tid, ca, key) + else: + print "couldn't handle %s" % ca + diff --git a/data.yml b/data.yml index b390106..0589963 100644 --- a/data.yml +++ b/data.yml @@ -1,31 +1,32 @@ -{26808: {maintopic: jobausschreibung}, 27017: {maintopic: jobausschreibung}, 27070: { - maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information}, - 27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: { - maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium}, - 27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information}, - 27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen}, - 27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information}, - 27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium}, - 27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true, - maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false, - maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: { - answered: false, maintopic: information}, 27435: {answered: false}, 27438: {answered: false, - maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: { - answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen}, +{26808: {maintopic: jobausschreibung}, 27008: {lang: de}, 27017: {lang: de, maintopic: jobausschreibung}, + 27061: {lang: de}, 27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, + 27086: {maintopic: information}, 27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, + 27102: {lang: en, maintopic: studium}, 27118: {maintopic: information}, 27127: { + maintopic: studium}, 27130: {maintopic: information}, 27133: {maintopic: information}, + 27141: {maintopic: information}, 27146: {maintopic: information}, 27166: {maintopic: umfragen}, + 27171: {maintopic: ausleihen}, 27178: {maintopic: studium}, 27182: {maintopic: studium}, + 27197: {maintopic: information}, 27201: {maintopic: information}, 27218: {maintopic: information}, + 27219: {maintopic: studium}, 27222: {maintopic: information}, 27226: {maintopic: ausleihen}, + 27420: {answered: true, maintopic: studium}, 27422: {answered: true, maintopic: studium}, + 27425: {answered: false, maintopic: studium}, 27431: {answered: false, maintopic: information}, + 27434: {answered: false, lang: de, maintopic: information}, 27435: {answered: false}, + 27438: {answered: false, maintopic: information}, 27439: {answered: true, maintopic: studium}, + 27441: {answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen}, 27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information}, - 27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung}, - 27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information}, - 27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false, - maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information}, - 27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen}, - 27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium}, - 27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium}, - 27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium}, + 27456: {answered: false, lang: de, maintopic: studium}, 27457: {answered: false, + maintopic: jobausschreibung}, 27468: {answered: true, maintopic: studium}, 27489: { + answered: false, lang: en, maintopic: information}, 27490: {answered: false, maintopic: fachschaftenzeugs}, + 27491: {answered: false, maintopic: jobausschreibung}, 27492: {answered: false, + maintopic: information}, 27495: {answered: false, maintopic: information}, 27496: { + answered: true, maintopic: ausleihen}, 27497: {answered: false, maintopic: information}, + 27500: {answered: true, lang: en, maintopic: studium}, 27501: {answered: false, + lang: en, maintopic: information}, 27514: {answered: true, maintopic: studium}, + 27515: {answered: true, lang: en, maintopic: studium}, 27518: {answered: true, maintopic: studium}, 27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false, - maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true, - maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false, - maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: { - answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information}, + maintopic: studium}, 27536: {answered: true, lang: de, maintopic: studium}, 27541: { + answered: true, maintopic: studium}, 27542: {answered: false, maintopic: studium}, + 27543: {answered: false, maintopic: information}, 27544: {answered: true, maintopic: studium}, + 27545: {answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information}, 27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: { answered: false, maintopic: information}, 27553: {answered: false, maintopic: information}, 27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: { diff --git a/flaskapp/__init__.py b/flaskapp/__init__.py index 894d7c3..64e0b1b 100644 --- a/flaskapp/__init__.py +++ b/flaskapp/__init__.py @@ -22,11 +22,16 @@ from classifier import get_pipe mail_threads=db_session.query(MailThread).all() pipe1,le=get_pipe("pipe1",b"answered") pipe2,le2=get_pipe("pipe2b", b"maintopic") +pipe3,le3=get_pipe("pipe2b", b"lang") + answered=le.inverse_transform(pipe1.predict(mail_threads)) maintopic=le2.inverse_transform(pipe2.predict(mail_threads)) +lang=le3.inverse_transform(pipe3.predict(mail_threads)) + for i, t in enumerate(mail_threads): t.answered=answered[i] t.maintopic=maintopic[i] + t.lang=lang[i] @app.route("/") def hello(): mth=mail_threads diff --git a/run.py b/run.py index 4c88ac7..fde12e9 100644 --- a/run.py +++ b/run.py @@ -1,87 +1,30 @@ from __future__ import unicode_literals -import imapclient +#import imapclient from config import Config import sys -from email.header import decode_header -import email +#from email.header import decode_header +#import email import codecs -import sys -import bs4 + +#import sys +#import bs4 + #sys.stdout = codecs.getwriter('utf8')(sys.stdout) from storage.fetch_mail import fetch_mail from storage.fetch_mail import fetch_threads, flatten_threads - from storage import Mail, MailThread, db_session -import yaml -import email -from classifier import get_training_threads, ThreadDictExtractor, print_answers, in_training, store_training_data, get_pipe, test_pipe # , pipe2, pipe2b -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import LabelEncoder -import numpy +#import yaml +#import email +from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe, test_pipe, train_single_thread # , pipe2, pipe2b +from flaskapp import app - -def train_fit_pipe(): - tt= get_training_threads(b"answered") -# print tt[1] -# print tt[0] - pipe1.fit(tt[0],tt[1]) - return pipe1,tt[2] -def train_fit_pipe2(): - tt= get_training_threads(b"maintopic") - pipe2.fit(tt[0],tt[1]) - return pipe2,tt[2] - -def train_fit_pipe2b(): - tt= get_training_threads(b"maintopic") - pipe2b.fit(tt[0],tt[1]) - return pipe2b,tt[2] - def predict_thread(p,l,t): pre=p.predict([t]) print "Status is answered is estimated to be: " + str(l.inverse_transform(pre)[0]) return pre - - -def train_single_thread(tid,p,le,key="answered"): - if (not type(tid) is int): raise TypeError("ID must be of type int") - if not type(p) is Pipeline: raise TypeError("Second Argument needs to be type Pipeline") - if not type(le) is LabelEncoder: raise TypeError("Second Argument needs to be type LabelEncoder") - mth=db_session.query(MailThread).filter(MailThread.firstmail==tid).first() - if mth is None: raise ValueError("Thread with firstmail %d not in Database" %tid) - # Predict the value - pre=p.predict([mth]) - answ=pre[0] - # -# print mth.to_text() -# print mth.text() - print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]) - print_answers(le) - - ca=raw_input("Correct answer..") - try: - ca=int(ca) - - except ValueError: - print "String Data" - if type(ca)==int: - if ca == answ: - print ("Yes I got it right") - else: - print("Oh no...!") - l=le.inverse_transform([ca])[0] - if type(l) is numpy.bool_: - l=bool(l) - if type(l) is numpy.string_: - l=str(l) - store_training_data(tid,l, key) - elif not ca.strip() == "": - store_training_data(tid, ca, key) - else: - print "couldn't handle %s" % ca - -from flaskapp import app + #print "arg1:"+sys.argv[1] if len(sys.argv)>1: if sys.argv[1] == "fetch_threads": @@ -89,6 +32,7 @@ if len(sys.argv)>1: if sys.argv[1] == "run_server": app.run(port=3000,debug=True) + if sys.argv[1] == "print_threads": mth=db_session.query(MailThread).all() for t in mth: @@ -122,7 +66,15 @@ if len(sys.argv)>1: p, le=get_pipe("pipe2", "maintopic") pb, lb =get_pipe("pipe2b", "maintopic") - train_single_thread(int(sys.argv[2]),p,le,b"maintopic") + train_single_thread(int(sys.argv[2]),p,le,b"maintopic") + + if sys.argv[1] == "train_thrd3": +# p, le=get_pipe("pipe2", "maintopic") + pb, lb =get_pipe("pipe2b", "lang") + + train_single_thread(int(sys.argv[2]),pb,lb,b"lang") + + if sys.argv[1] == "train_all2": p, labelencoder=train_fit_pipe2() pb, lb=train_fit_pipe2b() diff --git a/run_server b/run_server new file mode 100755 index 0000000..9926b50 --- /dev/null +++ b/run_server @@ -0,0 +1,3 @@ +#!/bin/bash +. .env/bin/activate +python run.py run_server \ No newline at end of file diff --git a/storage/fetch_mail.py b/storage/fetch_mail.py index 6dd71be..2a10136 100644 --- a/storage/fetch_mail.py +++ b/storage/fetch_mail.py @@ -35,3 +35,22 @@ def flatten_threads(thrds, array=[], level=0): for t in thrds: array.append(flatten_threads(t,[],1)) return array + +def store_threads(thrds): + for t in thrds: + if type(t[0]) is int: + th=db_session.query(MailThread).filter(MailThread.firstmail==t[0]).first() + # Wenn nicht gefunden neuen anlegen + if th == None: + th=MailThread() + th.firstmail=t[0] + elif not th.body == yaml.dump(t): # Ansonsten body vergleichen + th.body=yaml.dump(t) # body zb (27422,27506), (27450,) + th.islabeled=False + th.opened=True + else: + th.body=yaml.dump(t) + db_session.add(th) + db_session.commit() + + diff --git a/storage/thread_model.py b/storage/thread_model.py index 24505a9..2264b7b 100644 --- a/storage/thread_model.py +++ b/storage/thread_model.py @@ -32,6 +32,7 @@ class MailThread(Base): __jsonattrs__=None answered=False maintopic="information" + lang="" def bdy(self): return yaml.load(self.body) @@ -50,7 +51,7 @@ class MailThread(Base): def tstr(self): fr=yaml.load(self.mails()[0].from_) - return "(" + str(self.answered)+ ", "+ str(self.maintopic) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject)) + return "(" + str(self.answered)+ ", "+ str(self.maintopic)+ ", "+ str(self.lang) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject)) def mails(self): a=[] @@ -111,7 +112,7 @@ class MailThread(Base): elif filter=="first": a=mail_txt(m[0]) a=re.sub(r'\n\s*\n',r'\n',a) - a=re.sub(r'',r'',a,flags=re.MULTILINE|re.DOTALL) +# a=re.sub(r'',r'',a,flags=re.MULTILINE|re.DOTALL) a=re.sub(r'\s*>+ .*\n',r'',a)