rewrite classifier

2017-08-07 10:20:28 +02:00
parent ff0bdc6d3b
commit 94d8d26187
11 changed files with 411 additions and 98 deletions
--- a/classifier/init.py
+++ b/classifier/init.py
@@ -0,0 +1,4 @@
 from classifier import in_training, print_answers
 from classifier import get_pipe, test_pipe, get_training_threads
 from training import train_single_thread
 from classifier import store_training_data
--- a/classifier/classifier.py
+++ b/classifier/classifier.py
@@ -0,0 +1,191 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
 from sklearn.preprocessing import LabelEncoder
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.neural_network import MLPClassifier
 from sklearn.model_selection import train_test_split
 import numpy as np
 import yaml
 from storage import MailThread,db_session
 from sklearn.metrics import accuracy_score
 # Load the persisted training labels once, at import time.
 # ``train`` maps a thread id to a dict of labels, e.g. {27420: {"answered": True}}.
 with open("data.yml", 'r') as stream:
     try:
         # safe_load is enough for this file (ints, bools and plain strings) and,
         # unlike yaml.load without a Loader, is accepted by current PyYAML.
         train = yaml.safe_load(stream)
     except yaml.YAMLError as exc:
         print(exc)
         # Do not continue with an undefined ``train``: every function below
         # would fail later with a confusing NameError.
         raise
 if train is None:
     # An empty data.yml parses to None; treat it as "no labels yet".
     train = {}
 # Allowed label keys and the exact Python type each label value must have.
 data_types= { "answered": bool, "maintopic": str, "lang": str}
 def set_train_data(i,d,key=b"answered"):
     """Record one training label in the in-memory ``train`` mapping.

     i   -- id under which the label is stored (key of ``train``)
     d   -- label value; its type must be exactly ``data_types[key]``
     key -- label name; must be one of the keys of ``data_types``

     Raises ValueError for an unknown key and TypeError for a value of the
     wrong type.  Nothing is written to disk here.
     """
     if key not in data_types:
         raise ValueError("Key "+str(key)+" unknown")
     expected = data_types[key]
     # Exact type comparison is kept on purpose (no isinstance): subclasses
     # are rejected exactly as before.
     if type(d) is not expected:
         # Tuple-wrapped arguments so that a tuple-valued ``d`` cannot break
         # the formatting itself.
         raise TypeError("Data - %s - for key %s must be %s but it is %s"
                         % (d, key, expected, type(d)))
     # Validate first, mutate afterwards: a rejected value no longer leaves an
     # empty entry behind in ``train``.
     if i not in train or train[i] is None:
         train[i] = {}
     train[i][key] = d
 def store_training_data(i, d,key=b"answered"):
     """Record a training label and persist all training data to data.yml.

     The arguments are passed unchanged to set_train_data, whose ValueError /
     TypeError propagate before anything is written.
     """
     set_train_data(i,d,key)
     # Serialise before opening the file: mode "w" truncates data.yml at once,
     # so a failing dump inside the with-block would leave it empty.
     serialized = yaml.dump(train,default_flow_style=True)
     with open("data.yml","w") as out:
         out.write(serialized)
 def get_training_threads(key="answered", filter=[]):
     """Load the training data for the given key (label / property).

     key    -- label name; must be one of the keys of ``data_types``
     filter -- accepted for compatibility; not used by this function

     Returns a tuple ``(threads, encoded_labels, label_encoder)``: the
     MailThread rows that carry a label for ``key``, their labels encoded by a
     freshly fitted LabelEncoder, and that encoder.

     Raises ValueError for an unknown key.
     """
     if key not in data_types:
         raise ValueError("Key "+str(key)+" unknown")
     threads = []
     labels = []
     for i in train:
         entry = train[i]
         # The training data must contain the relevant key; entries may also
         # be None (set_train_data checks for that case as well).
         if entry is None or key not in entry:
             continue
         t = db_session.query(MailThread).filter(MailThread.firstmail==i).first()
         # The thread must exist in the database.
         if t is None:
             continue
         threads.append(t)
         labels.append(entry[key])
     le = LabelEncoder()
     encoded = le.fit_transform(labels)
     return (threads, encoded, le)
 def in_training(i, key="answered"):
     """Return True if thread id ``i`` already has a training label for ``key``."""
     # .get() covers a missing id; the None test covers ids whose entry is
     # stored as None (set_train_data checks for that case too).
     entry = train.get(i)
     return entry is not None and key in entry
 def print_answers(l):
     """Print the classes of a fitted label encoder, one per line.

     l -- object exposing ``classes_`` (e.g. a fitted LabelEncoder)

     Each line is ``<index>:  <class>``; the index is the position of the
     class in ``classes_``.  Returns None.
     """
     # print(...) with a single argument behaves the same on Python 2 and 3.
     for i, c in enumerate(l.classes_):
         print(str(i) + ":  " + str(c))
     return None
 class ThreadDictExtractor(BaseEstimator, TransformerMixin):
     """Stateless transformer: maps each thread to its ``mail_flat_dict()``."""

     def fit(self, x, y=None):
         # Nothing is learned from the data.
         return self

     def transform(self, X,y=None):
         flat_dicts = []
         for thread in X:
             flat_dicts.append(thread.mail_flat_dict())
         return flat_dicts
 class ThreadSubjectExtractor(BaseEstimator, TransformerMixin):
     """Stateless transformer: maps each thread to its ``subject()``."""

     def fit(self, x, y=None):
         # Nothing is learned from the data.
         return self

     def transform(self, X,y=None):
         subjects = []
         for thread in X:
             subjects.append(thread.subject())
         return subjects
 class ThreadTextExtractor(BaseEstimator, TransformerMixin):
     """Stateless transformer: maps each thread to its ``text()``."""

     def fit(self, x, y=None):
         # Nothing is learned from the data.
         return self

     def transform(self, X,y=None):
         texts = []
         for thread in X:
             texts.append(thread.text())
         return texts
 def get_pipe(p=b"pipe1",k=b"answered"):
     """Build the pipeline named ``p`` and fit it on the training data for ``k``.

     Returns ``(fitted_pipeline, label_encoder)``, or ``(None, None)`` when no
     training thread is available for ``k``.
     """
     pipeline = build_pipe(p)
     threads, labels, encoder = get_training_threads(k)
     if len(threads) == 0:
         # Nothing to fit on.
         return None, None
     pipeline.fit(threads, labels)
     return pipeline, encoder
 def test_pipe(pp,k):
     """Fit the named pipelines on a train split and print their accuracy.

     pp -- a list/tuple of pipe names understood by build_pipe, or one name
     k  -- label key whose training data is used

     The training data for ``k`` is split 80/20; every pipeline is fitted on
     the larger part and its accuracy on the smaller part is printed.
     """
     tt = get_training_threads(k)
     X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.2)
     # A single name used to be ignored silently; treat it as a one-item list.
     if isinstance(pp, (str, bytes)):
         pp = [pp]
     if not isinstance(pp, (list, tuple)):
         return
     for name in pp:
         print("pipe: %s" % (name,))
         pipe = build_pipe(name)
         pipe.fit(X_train,y_train)
         ypred = pipe.predict(X_test)
         print(accuracy_score(y_test,ypred))
 def _weighted_union(subject_weight, text_weight, envelope_weight):
     """Return the subject/text/envelope FeatureUnion with the given weights."""
     return FeatureUnion(transformer_list=[
         ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                               ('cv', CountVectorizer()),
                               ('tfidf', TfidfTransformer())])),
         ('text', Pipeline([('tte', ThreadTextExtractor()),
                            ('cv', CountVectorizer()),
                            ('tfidf', TfidfTransformer())])),
         ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                                ('dv', DictVectorizer())])),
     ], transformer_weights={
         'subject': subject_weight,
         'text': text_weight,
         'envelope': envelope_weight,
     })


 def build_pipe(p=b"pipe1"):
     """Return a new, unfitted pipeline for the given pipe name.

     Known names:
       "pipe1"  -- thread dict features -> DictVectorizer -> MultinomialNB
       "pipe2"  -- subject/text/envelope union (weights 1/0.7/0.7) -> MultinomialNB
       "pipe2b" -- same union (weights 1/0.7/0.7) -> MLPClassifier
       "pipe2c" -- same union (weights 1/1/0.4) -> MLPClassifier

     Raises ValueError for any other name.
     """
     if p == "pipe1":
         return Pipeline([('tde', ThreadDictExtractor()),
                          ('dv', DictVectorizer()),
                          ('clf', MultinomialNB())])
     if p == "pipe2":
         return Pipeline([('union', _weighted_union(1, 0.7, 0.7)),
                          ('clf', MultinomialNB())])
     if p == "pipe2b":
         return Pipeline([('union', _weighted_union(1, 0.7, 0.7)),
                          ('mlc', MLPClassifier())])
     if p == "pipe2c":
         return Pipeline([('union', _weighted_union(1, 1, 0.4)),
                          ('mlc', MLPClassifier())])
     # The message used to contain a literal "%s"; the name is now filled in.
     raise ValueError("The pipe %s is not a valid pipe" % (p,))
--- a/classifier/classify_mail.py
+++ b/classifier/classify_mail.py
@@ -0,0 +1,25 @@
 from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline, FeatureUnion
 import sys
 import yaml
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import LabelEncoder
 # Bag-of-words -> tf-idf -> naive Bayes text classifier (the one fitted below).
 text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
 # The following objects are only constructed here; nothing below uses them.
 text_ohc = Pipeline([('ohc', OneHotEncoder()),('clf', MultinomialNB())])
 combined_features = FeatureUnion([('vect1', CountVectorizer()),('vect2', CountVectorizer())])
 enc=OneHotEncoder()
 with open("example_1.yaml", 'r') as stream:
     try:
         train=yaml.safe_load(stream)
     except yaml.YAMLError as exc:
         print(exc)
         # Without data the fit below cannot run; re-raise instead of failing
         # one line later with a NameError on ``train``.
         raise
 # Fit on the "data" (texts) and "target" (labels) entries of the example file.
 tc=text_clf.fit(train["data"],train["target"])
--- a/classifier/classify_text.py
+++ b/classifier/classify_text.py
@@ -0,0 +1,42 @@
 from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
 # Command-line helper: classify the text given as first argument with a
 # classifier trained on example_1.yaml, ask for the correct class and, on
 # request, append the corrected example to example_1.yaml.
 text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
 import sys
 import yaml
 if len(sys.argv) < 2:
     # The text to classify is mandatory; fail with a hint, not an IndexError.
     sys.exit("usage: classify_text.py TEXT")
 with open("example_1.yaml", 'r') as stream:
     try:
         train=yaml.safe_load(stream)
     except yaml.YAMLError as exc:
         print(exc)
         # Nothing can be trained without the data; do not continue into a
         # NameError on ``train``.
         raise
 tc=text_clf.fit(train["data"],train["target"])
 print(sys.argv[1])
 answ=(tc.predict([sys.argv[1]]))[0]
 print(train["target_names"][answ])
 # List every known class with the number the user has to type.
 for i, target_name in enumerate(train["target_names"]):
     print(str(i)+"  "+ target_name)
 try:
     ca=int(raw_input("Correct answer.."))
 except ValueError:
     sys.exit("The correct answer must be one of the numbers listed above")
 if ca == answ:
     print("Yes I got it right")
 else:
     print("should I remember this?")
     a=raw_input("shoudIrememberthis?")
     if a == "y":
         train["data"].append(sys.argv[1])
         train["target"].append(ca)
         # Dump once, show it, then write it; the with-block closes the file
         # even if the write fails.
         dumped = yaml.dump(train,default_flow_style=False)
         print(dumped)
         with open("example_1.yaml","w") as out:
             out.write(dumped)
     else:
         print("Ok, I already forgot")
--- a/classifier/training.py
+++ b/classifier/training.py
@@ -0,0 +1,70 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 import numpy
 from storage import Mail, MailThread, db_session
 from classifier import store_training_data, print_answers
 def train_fit_pipe():
     """Return ``(pipeline, label_encoder)`` for "pipe1" fitted on the "answered" labels.

     Both values are None when there is no training data for "answered".
     """
     # The names used before (get_training_threads, pipe1) are not imported
     # into this module; get_pipe performs the same build + fit.  Imported
     # locally because the module's import block does not provide it.
     from classifier import get_pipe
     return get_pipe("pipe1", b"answered")
 def train_fit_pipe2():
     """Return ``(pipeline, label_encoder)`` for "pipe2" fitted on the "maintopic" labels.

     Both values are None when there is no training data for "maintopic".
     """
     # The names used before (get_training_threads, pipe2) are not imported
     # into this module; get_pipe performs the same build + fit.  Imported
     # locally because the module's import block does not provide it.
     from classifier import get_pipe
     return get_pipe("pipe2", b"maintopic")
 def train_fit_pipe2b():
     """Return ``(pipeline, label_encoder)`` for "pipe2b" fitted on the "maintopic" labels.

     Both values are None when there is no training data for "maintopic".
     """
     # The names used before (get_training_threads, pipe2b) are not imported
     # into this module; get_pipe performs the same build + fit.  Imported
     # locally because the module's import block does not provide it.
     from classifier import get_pipe
     return get_pipe("pipe2b", b"maintopic")
 def predict_thread(mth,p,le,key):
     """Predict the label of one thread, print it and return the encoded value.

     mth -- the thread to classify
     p   -- fitted Pipeline
     le  -- LabelEncoder used to translate the prediction back to a label
     key -- label name; currently not used by this function

     Raises TypeError when ``p`` or ``le`` has the wrong type.
     """
     if not isinstance(p, Pipeline):
         raise TypeError("Second Argument needs to be type Pipeline")
     # ``le`` is the third parameter; the message used to say "Second".
     if not isinstance(le, LabelEncoder):
         raise TypeError("Third Argument needs to be type LabelEncoder")
     pre = p.predict([mth])
     # NOTE(review): the text says "answered" whatever ``key`` is.
     print("Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]))
     return pre[0]
 def train_single_thread(tid,p,le,key="answered"):
     """Show one thread, ask the user for its label and store the answer.

     tid -- firstmail id of the thread (must be an int)
     p   -- fitted Pipeline used for a suggestion, or None
     le  -- LabelEncoder belonging to ``p``, or None
     key -- label name under which the answer is stored

     A numeric answer is read as an index into the encoder's classes; any
     other non-empty answer is stored as a new string label.

     Raises TypeError when ``tid`` is not an int and ValueError when no
     thread with that firstmail exists.
     """
     if type(tid) is not int:
         raise TypeError("ID must be of type int")
     mth = db_session.query(MailThread).filter(MailThread.firstmail==tid).first()
     if mth is None:
         raise ValueError("Thread with firstmail %d not in Database" %tid)
     print(mth.firstmail)
     print(mth.subject())
     print(mth.text())
     answ = None
     if p is not None and le is not None:
         answ = predict_thread(mth,p,le,key)
     if le is not None:
         print_answers(le)
     ca = raw_input("Correct answer..")
     try:
         ca = int(ca)
     except ValueError:
         print("String Data")
     if isinstance(ca, int):
         if le is None:
             # Without an encoder a class index cannot be mapped to a label
             # (this used to end in an AttributeError on None).
             print("couldn't handle %s" % ca)
             return
         if ca == answ:
             print("Yes I got it right")
         else:
             print("Oh no...!")
         l = le.inverse_transform([ca])[0]
         # Convert NumPy scalars to plain Python values before storing.
         # numpy.generic covers numpy.bool_ and the NumPy string scalars
         # without naming numpy.string_, which NumPy 2 no longer provides.
         if isinstance(l, numpy.generic):
             l = l.item()
         store_training_data(tid,l, key)
     elif not ca.strip() == "":
         store_training_data(tid, ca, key)
     else:
         print("couldn't handle %s" % ca)
--- a/data.yml
+++ b/data.yml
@@ -1,31 +1,32 @@
-{26808: {maintopic: jobausschreibung}, 27017: {maintopic: jobausschreibung}, 27070: {
+{26808: {maintopic: jobausschreibung}, 27008: {lang: de}, 27017: {lang: de, maintopic: jobausschreibung},
-    maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
+  27061: {lang: de}, 27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen},
-  27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: {
+  27086: {maintopic: information}, 27094: {maintopic: information}, 27096: {maintopic: jobausschreibung},
-    maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium},
+  27102: {lang: en, maintopic: studium}, 27118: {maintopic: information}, 27127: {
-  27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information},
+    maintopic: studium}, 27130: {maintopic: information}, 27133: {maintopic: information},
-  27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen},
+  27141: {maintopic: information}, 27146: {maintopic: information}, 27166: {maintopic: umfragen},
-  27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information},
+  27171: {maintopic: ausleihen}, 27178: {maintopic: studium}, 27182: {maintopic: studium},
-  27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium},
+  27197: {maintopic: information}, 27201: {maintopic: information}, 27218: {maintopic: information},
-  27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true,
+  27219: {maintopic: studium}, 27222: {maintopic: information}, 27226: {maintopic: ausleihen},
-    maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false,
+  27420: {answered: true, maintopic: studium}, 27422: {answered: true, maintopic: studium},
-    maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: {
+  27425: {answered: false, maintopic: studium}, 27431: {answered: false, maintopic: information},
-    answered: false, maintopic: information}, 27435: {answered: false}, 27438: {answered: false,
+  27434: {answered: false, lang: de, maintopic: information}, 27435: {answered: false},
-    maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {
+  27438: {answered: false, maintopic: information}, 27439: {answered: true, maintopic: studium},
-    answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
+  27441: {answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
  27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information},
-  27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung},
+  27456: {answered: false, lang: de, maintopic: studium}, 27457: {answered: false,
-  27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information},
+    maintopic: jobausschreibung}, 27468: {answered: true, maintopic: studium}, 27489: {
-  27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false,
+    answered: false, lang: en, maintopic: information}, 27490: {answered: false, maintopic: fachschaftenzeugs},
-    maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information},
+  27491: {answered: false, maintopic: jobausschreibung}, 27492: {answered: false,
-  27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen},
+    maintopic: information}, 27495: {answered: false, maintopic: information}, 27496: {
-  27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium},
+    answered: true, maintopic: ausleihen}, 27497: {answered: false, maintopic: information},
-  27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium},
+  27500: {answered: true, lang: en, maintopic: studium}, 27501: {answered: false,
-  27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium},
+    lang: en, maintopic: information}, 27514: {answered: true, maintopic: studium},
  27515: {answered: true, lang: en, maintopic: studium}, 27518: {answered: true, maintopic: studium},
  27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false,
-    maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true,
+    maintopic: studium}, 27536: {answered: true, lang: de, maintopic: studium}, 27541: {
-    maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false,
+    answered: true, maintopic: studium}, 27542: {answered: false, maintopic: studium},
-    maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: {
+  27543: {answered: false, maintopic: information}, 27544: {answered: true, maintopic: studium},
-    answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information},
+  27545: {answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information},
  27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: {
    answered: false, maintopic: information}, 27553: {answered: false, maintopic: information},
  27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: {
--- a/flaskapp/init.py
+++ b/flaskapp/init.py
@@ -22,11 +22,16 @@ from classifier import get_pipe
 # Classify every stored thread once at start-up and attach the predicted
 # labels to the thread objects.
 mail_threads=db_session.query(MailThread).all()
 pipe1,le=get_pipe("pipe1",b"answered")
 pipe2,le2=get_pipe("pipe2b", b"maintopic")
 pipe3,le3=get_pipe("pipe2b", b"lang")


 def _predict_labels(pipe, encoder, threads):
     """Return the decoded predictions for ``threads``, or None without a pipe."""
     # get_pipe returns (None, None) when a key has no training data.
     if pipe is None or encoder is None:
         return None
     return encoder.inverse_transform(pipe.predict(threads))


 answered=_predict_labels(pipe1, le, mail_threads)
 maintopic=_predict_labels(pipe2, le2, mail_threads)
 lang=_predict_labels(pipe3, le3, mail_threads)
 for i, t in enumerate(mail_threads):
     # Attributes without a prediction keep the value they already have.
     if answered is not None:
         t.answered=answered[i]
     if maintopic is not None:
         t.maintopic=maintopic[i]
     if lang is not None:
         t.lang=lang[i]
@app.route("/")
 def hello():
    mth=mail_threads
--- a/run.py
+++ b/run.py
@@ -1,87 +1,30 @@
 from __future__ import unicode_literals
-import imapclient
+#import imapclient
 from config import Config
 import sys
-from email.header import decode_header
+#from email.header import decode_header
-import email
+#import email
 import codecs
-import sys
+
-import bs4
+#import sys
 #import bs4
 #sys.stdout = codecs.getwriter('utf8')(sys.stdout)
 from storage.fetch_mail import fetch_mail
 from storage.fetch_mail import fetch_threads, flatten_threads
 from storage import Mail, MailThread, db_session
-import yaml
+#import yaml
-import email
+#import email
-from classifier import get_training_threads, ThreadDictExtractor,  print_answers, in_training, store_training_data, get_pipe, test_pipe # , pipe2, pipe2b
+from classifier import get_training_threads,  print_answers, in_training, store_training_data, get_pipe, test_pipe, train_single_thread # , pipe2, pipe2b
-from sklearn.pipeline import Pipeline
+from flaskapp import app
 from sklearn.preprocessing import LabelEncoder
 import numpy
 def train_fit_pipe():
     """Return ``(pipeline, label_encoder)`` for "pipe1" fitted on the "answered" labels.

     Both values are None when there is no training data for "answered".
     """
     # ``pipe1`` is not defined in this module; get_pipe (imported from
     # classifier) builds and fits the pipeline of that name.
     return get_pipe("pipe1", b"answered")
 def train_fit_pipe2():
     """Return ``(pipeline, label_encoder)`` for "pipe2" fitted on the "maintopic" labels.

     Both values are None when there is no training data for "maintopic".
     """
     # ``pipe2`` is not defined in this module; get_pipe (imported from
     # classifier) builds and fits the pipeline of that name.
     return get_pipe("pipe2", b"maintopic")
 def train_fit_pipe2b():
     """Return ``(pipeline, label_encoder)`` for "pipe2b" fitted on the "maintopic" labels.

     Both values are None when there is no training data for "maintopic".
     """
     # ``pipe2b`` is not defined in this module; get_pipe (imported from
     # classifier) builds and fits the pipeline of that name.
     return get_pipe("pipe2b", b"maintopic")
 def predict_thread(p,l,t):
     """Predict the label of thread ``t``, print it and return the raw prediction.

     p -- fitted pipeline (anything with ``predict``)
     l -- label encoder (anything with ``inverse_transform``)
     t -- the thread to classify

     Returns what ``p.predict([t])`` returned.
     """
     pre = p.predict([t])
     # print(...) with a single argument behaves the same on Python 2 and 3.
     print("Status is answered is estimated to be: " + str(l.inverse_transform(pre)[0]))
     return pre
 def train_single_thread(tid,p,le,key="answered"):
     """Predict one thread's label, ask the user for the right one and store it.

     tid -- firstmail id of the thread (must be an int)
     p   -- fitted Pipeline
     le  -- LabelEncoder belonging to ``p``
     key -- label name under which the answer is stored

     A numeric answer is read as an index into the encoder's classes; any
     other non-empty answer is stored as a new string label.

     Raises TypeError for arguments of the wrong type and ValueError when no
     thread with that firstmail exists.
     """
     if type(tid) is not int:
         raise TypeError("ID must be of type int")
     if not isinstance(p, Pipeline):
         raise TypeError("Second Argument needs to be type Pipeline")
     # ``le`` is the third parameter; the message used to say "Second".
     if not isinstance(le, LabelEncoder):
         raise TypeError("Third Argument needs to be type LabelEncoder")
     mth = db_session.query(MailThread).filter(MailThread.firstmail==tid).first()
     if mth is None:
         raise ValueError("Thread with firstmail %d not in Database" %tid)
     # Predict the value.
     pre = p.predict([mth])
     answ = pre[0]
     print("Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]))
     print_answers(le)
     ca = raw_input("Correct answer..")
     try:
         ca = int(ca)
     except ValueError:
         print("String Data")
     if isinstance(ca, int):
         if ca == answ:
             print("Yes I got it right")
         else:
             print("Oh no...!")
         l = le.inverse_transform([ca])[0]
         # Convert NumPy scalars to plain Python values before storing.
         # numpy.generic covers numpy.bool_ and the NumPy string scalars
         # without naming numpy.string_, which NumPy 2 no longer provides.
         if isinstance(l, numpy.generic):
             l = l.item()
         store_training_data(tid,l, key)
     elif not ca.strip() == "":
         store_training_data(tid, ca, key)
     else:
         print("couldn't handle %s" % ca)
 from flaskapp import app
 #print "arg1:"+sys.argv[1]
 if len(sys.argv)>1:
    if sys.argv[1] == "fetch_threads":
@@ -89,6 +32,7 @@ if len(sys.argv)>1:
    if sys.argv[1] == "run_server":
            app.run(port=3000,debug=True)
    if sys.argv[1] == "print_threads":
        mth=db_session.query(MailThread).all()
        for t in mth:
@@ -122,7 +66,15 @@ if len(sys.argv)>1:
        p, le=get_pipe("pipe2", "maintopic")
        pb, lb =get_pipe("pipe2b", "maintopic")
-        train_single_thread(int(sys.argv[2]),p,le,b"maintopic")        
+        train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
    if sys.argv[1] == "train_thrd3":
 #        p, le=get_pipe("pipe2", "maintopic")
        pb, lb =get_pipe("pipe2b", "lang")
        train_single_thread(int(sys.argv[2]),pb,lb,b"lang")
    if sys.argv[1] == "train_all2":
        p, labelencoder=train_fit_pipe2()
        pb, lb=train_fit_pipe2b()
--- a/3
+++ b/3
@@ -0,0 +1,3 @@
 #!/bin/bash
 # Start the application: activate the virtualenv in .env, then run run.py
 # in "run_server" mode.
 . .env/bin/activate
 python run.py run_server
--- a/storage/fetch_mail.py
+++ b/storage/fetch_mail.py
@@ -35,3 +35,22 @@ def flatten_threads(thrds, array=[], level=0):
        for t in thrds: 
            array.append(flatten_threads(t,[],1))
    return array
 def store_threads(thrds):
     """Create or update one MailThread row per fetched thread.

     thrds -- iterable of threads; a thread is only stored when its first
              element is an int, which is used as ``firstmail``.

     The thread is stored as YAML text in ``body``.  When the stored body of
     an existing row differs, it is replaced and the row is flagged
     ``islabeled=False`` / ``opened=True``.  Every processed thread is
     committed on its own.
     """
     for t in thrds:
         if type(t[0]) is not int:
             continue
         # Serialise once; the result is needed for comparison and storage.
         dumped = yaml.dump(t)  # body e.g. (27422,27506), (27450,)
         th = db_session.query(MailThread).filter(MailThread.firstmail==t[0]).first()
         if th is None:
             # Not found: create a new row.
             th = MailThread()
             th.firstmail = t[0]
             # A new row used to be stored without any body, although
             # MailThread.bdy() parses ``body``.
             # NOTE(review): islabeled/opened are left at their defaults here; confirm.
             th.body = dumped
         elif not th.body == dumped:
             # Otherwise compare the body; the thread has changed.
             th.body = dumped
             th.islabeled = False
             th.opened = True
         db_session.add(th)
         db_session.commit()
--- a/storage/thread_model.py
+++ b/storage/thread_model.py
@@ -32,6 +32,7 @@ class MailThread(Base):
    __jsonattrs__=None
    answered=False
    maintopic="information"
    lang=""
    def bdy(self):
        # Parse the YAML text kept in ``body`` back into Python objects.
        # NOTE(review): yaml.load is kept as is; bodies presumably contain
        # tuples, which yaml.safe_load would refuse. Confirm before changing.
        raw_body = self.body
        return yaml.load(raw_body)
@@ -50,7 +51,7 @@ class MailThread(Base):
    def tstr(self):
        fr=yaml.load(self.mails()[0].from_)
-        return "(" + str(self.answered)+ ", "+ str(self.maintopic) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject))
+        return "(" + str(self.answered)+ ", "+ str(self.maintopic)+ ", "+ str(self.lang) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject))
    def mails(self):
        a=[]
@@ -111,7 +112,7 @@ class MailThread(Base):
        elif filter=="first":
            a=mail_txt(m[0])
        a=re.sub(r'\n\s*\n',r'\n',a)
-        a=re.sub(r'<!--.*-->',r'',a,flags=re.MULTILINE|re.DOTALL)
+#        a=re.sub(r'<!--.*-->',r'',a,flags=re.MULTILINE|re.DOTALL)
        a=re.sub(r'\s*>+ .*\n',r'',a)