diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..60b747a --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +env/* +*~ +config.cfg +*.yml3 +*.yaml +*.sqlite +*.sqlite-journal +.env +*.pyc +*# diff --git a/classifier.py b/classifier.py index e0a3081..52e76bd 100644 --- a/classifier.py +++ b/classifier.py @@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer from sklearn.preprocessing import LabelEncoder from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.naive_bayes import MultinomialNB +from sklearn.neural_network import MLPClassifier import numpy as np import yaml @@ -100,7 +101,28 @@ pipe2 = Pipeline([ ], transformer_weights={ 'subject': 1, 'text': 0.7, - 'envelope': 0.5 + 'envelope': 0.7 } )), ('clf', MultinomialNB()) ]) + +pipe2b = Pipeline([ + ('union', FeatureUnion(transformer_list=[ + ('subject', Pipeline([('tse', ThreadSubjectExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('text', Pipeline([('tte',ThreadTextExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('envelope', Pipeline([('tde', ThreadDictExtractor()), + ('dv',DictVectorizer()) + ])) + ], transformer_weights={ + 'subject': 1, + 'text': 0.7, + 'envelope': 0.7 + } )), + ('mlc', MLPClassifier()) +]) diff --git a/data.yml b/data.yml new file mode 100644 index 0000000..7defbd8 --- /dev/null +++ b/data.yml @@ -0,0 +1,41 @@ +{27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information}, + 27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: { + maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium}, + 27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information}, + 27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen}, + 27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information}, + 27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium}, + 27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true, + maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false, + maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: { + answered: false, maintopic: studium}, 27435: {answered: false}, 27438: {answered: false, + maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: { + answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen}, + 27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information}, + 27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung}, + 27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information}, + 27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false, + maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information}, + 27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen}, + 27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium}, + 27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium}, + 27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium}, + 27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false, + maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true, + maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false, + maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: { + answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information}, + 27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: { + answered: false, maintopic: information}, 27553: {answered: false, maintopic: information}, + 27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: { + answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, 27565: { + answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information}, + 27567: {answered: false, maintopic: information}, 27568: {answered: false}, 27575: { + answered: false, maintopic: information}, 27577: {answered: false, maintopic: information}, + 27579: {answered: true, maintopic: diplomarbeit}, 27582: {answered: false, maintopic: studium}, + 27583: {answered: true, maintopic: studium}, 27584: {answered: false, maintopic: studium}, + 27585: {answered: false, maintopic: information}, 27586: {answered: false, maintopic: ausleihen}, + 27587: {answered: false, maintopic: information}, 27588: {answered: false, maintopic: ausleihen}, + 27592: {answered: false, maintopic: studium}, 27597: {answered: false, maintopic: jobausschreibung}, + 27598: {answered: false, maintopic: umfragen}, 27604: {answered: false, maintopic: umfragen}, + 27607: {answered: false, maintopic: information}} diff --git a/run.py b/run.py index 3ccde40..991d973 100644 --- a/run.py +++ b/run.py @@ -14,7 +14,7 @@ from storage.fetch_mail import fetch_threads, flatten_threads from storage import Mail, MailThread, db_session import yaml import email -from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2 +from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2, pipe2b from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder import numpy @@ -32,6 +32,16 @@ def train_fit_pipe2(): pipe2.fit(tt[0],tt[1]) return pipe2,tt[2] +def train_fit_pipe2b(): + tt= get_training_threads(b"maintopic") + pipe2b.fit(tt[0],tt[1]) + return pipe2b,tt[2] + +def predict_thread(p,l,t): + pre=p.predict([t]) + print "Status is answered is estimated to be: " + str(l.inverse_transform(pre)[0]) + return pre + def train_single_thread(tid,p,le,key="answered"): if (not type(tid) is int): raise TypeError("ID must be of type int") @@ -43,8 +53,8 @@ def train_single_thread(tid,p,le,key="answered"): pre=p.predict([mth]) answ=pre[0] # - print mth.to_text() - print mth.text() +# print mth.to_text() +# print mth.text() print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]) print_answers(le) @@ -104,9 +114,11 @@ if len(sys.argv)>1: if sys.argv[1] == "train_thrd2": p, le=train_fit_pipe2() + pb, lb =train_fit_pipe2b() train_single_thread(int(sys.argv[2]),p,le,b"maintopic") if sys.argv[1] == "train_all2": p, labelencoder=train_fit_pipe2() + pb, lb=train_fit_pipe2b() mth=db_session.query(MailThread).all() print mth for t in mth: @@ -115,6 +127,7 @@ if len(sys.argv)>1: print "---------------------------------------------------" print t.firstmail print t.text() + predict_thread(pb,lb,t) train_single_thread(t.firstmail, p, labelencoder, b"maintopic") diff --git a/storage/fetch_mail.py b/storage/fetch_mail.py index a8891ab..6dd71be 100644 --- a/storage/fetch_mail.py +++ b/storage/fetch_mail.py @@ -19,7 +19,7 @@ def fetch_thread(tp): return tp def fetch_threads(): - src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,07,01)]) + src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,05,01)]) #, b'BEFORE', date(2017,08,01) return src diff --git a/storage/thread_model.py b/storage/thread_model.py index b3f83b2..757cdbb 100644 --- a/storage/thread_model.py +++ b/storage/thread_model.py @@ -88,6 +88,6 @@ class MailThread(Base): # print "withintm:"+str(type(t)) t=t.decode("ISO-8859-1") txt=t - a=a+txt+"\n\n" + a=a+txt+"***........................................***\n" return a diff --git a/test.sqlite b/test.sqlite new file mode 100644 index 0000000..bbc718f Binary files /dev/null and b/test.sqlite differ