refactor1

2017-08-28 09:08:47 +02:00
parent 699f4f6546
commit 630b982502
14 changed files with 274 additions and 230 deletions
--- a/classifier/init.py
+++ b/classifier/init.py
@@ -1,6 +1,7 @@
-from classifier import in_training, print_answers
+from classifier import  print_answers
 from classifier import get_pipe, test_pipe, get_training_threads
-from classifier import store_training_data
+#from classifier import store_training_data
+#in_training,

 from training import train_single_thread

--- a/classifier/classifier.py
+++ b/classifier/classifier.py
@@ -8,88 +8,18 @@ from sklearn.neural_network import MLPClassifier
 from sklearn.model_selection import train_test_split
 import numpy as np
 import yaml
-from storage import MailThread,db_session
 from sklearn.metrics import accuracy_score, confusion_matrix
+from collections import namedtuple

+from storage import get_training_threads #MailThread,db_session

-#with open("data.yml", 'r') as stream:
-#    try:
-#        train=yaml.load(stream)
-#    except yaml.YAMLError as exc:
-#        print(exc)
-
-data_types= { "answered": bool, "maintopic": str, "lang": str}
-
-def set_train_data(i,d,key=b"answered"):
-    global train
-    #------------------------------------    
-    if not data_types.has_key(key):
-        raise ValueError("Key "+str(key)+" unknown")
-    if not train.has_key(i) or train[i] is None:
-        train[i]={}
-    if not type(d)  is data_types[key]:
-        raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
-    #------------------------------------
-    train[i][key]=d
-
-        
-def store_training_data(i, d,key=b"answered"):
-    set_train_data(i,d,key)
-    with open("data.yml","w") as file:
-        file.write(yaml.dump(train,default_flow_style=True))
-        file.close()
-
-
-# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft) 
-def get_training_threads(key="answered", filters=[]):
-    if not data_types.has_key(key):
-        raise ValueError("Key "+str(key)+" unknown")
-    #------------------------------------
-    t_a=[]
-    d_a=[]
-    d_a2=[]
-    #------------------------------------
-    if "db" in filters:
-        q=db_session.query(MailThread).filter(MailThread.istrained.is_(True))
-        if "de" in filters:
-            q=q.filter(MailThread.lang=="de")
-        elif "en" in filters:
-            q=q.filter(MailThread.lang=="en")
-        tt=q.all()
-        for t in tt:
-               t_a.append(t)
-               if key =="answered":
-                   d_a.append(t.is_answered())
-               elif key=="maintopic":
-                   d_a.append(t.maintopic)
-               elif key=="lang":
-                   d_a.append(t.lang)
-    else:
-        raise ValueError("Database Filter now required")
-    le=LabelEncoder()
-    d_a2=le.fit_transform(d_a)
-    return (t_a,d_a2,le)
-                   
-               
- #   else:
- #       for i in train:
- #           if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
-#                t=db_session.query(MailThread).filter(MailThread.firstmail==i).first#()
-#                if not t is None:   # Thread muss in der Datenbank sein
-#                    t_a.append(t)
-#                    d_a.append(train[i][key])
-
-
-def in_training(i, key="answered"):
-    return train.has_key(i) and train[i].has_key(key)
-
+PredictTool=namedtuple("PredictTool",["Pipeline","LabelEncoder"])

 def print_answers(l):
-    
-    cc=l.classes_
-    c_id=l.transform(cc)
-    for i,c in enumerate(cc):
-        print str(i) + ":  " + str(c)
+    classes=l.classes_
+    classes_encoded=l.transform(classes)
+    for i, c in zip(classes_encoded,classes):
+        print str(i) + ":  " + str(c)
    return None


@@ -117,15 +47,14 @@ class ThreadFirstTextExtractor(BaseEstimator, TransformerMixin):
    def transform(self, X,y=None):
        return [t.text("first") for t in X]

-def get_pipe(p=b"pipe1",k=b"answered",filters=[]):
+def get_pipe(p=b"pipe1",key=b"answered",filters=["db"]):
    p=build_pipe(p)
-    tt= get_training_threads(k,filters)
-    #print tt
-    if len(tt[0]) > 0:
-        p.fit(tt[0],tt[1])
-        return p,tt[2]
+    threads, labels, labelenc= get_training_threads(key,filters)
+    if len(threads) > 0:
+        p.fit(threads,labels)
+        return PredictTool(p,labelenc)
    else:
-        return None, None
+        return PredictTool(None, None)

 def test_pipe(pp,k,f=[]):
    tt= get_training_threads(k,f)
@@ -142,13 +71,13 @@ def test_pipe(pp,k,f=[]):



-def build_pipe(p=b"pipe1"):
-    if p == "pipe1":
+def build_pipe(pipe=b"pipe1"):
+    if pipe == "pipe1":
        p=Pipeline([('tde', ThreadDictExtractor()),
                    ('dv',DictVectorizer()),
                    ('clf', MultinomialNB())
        ])
-    elif p=="pipe2":
+    elif pipe=="pipe2":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -169,7 +98,7 @@ def build_pipe(p=b"pipe1"):
            } )),
            ('clf', MultinomialNB())
        ])
-    elif p=="pipe2b":
+    elif pipe=="pipe2b":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -190,7 +119,7 @@ def build_pipe(p=b"pipe1"):
            } )),
            ('mlc', MLPClassifier())
        ])
-    elif p=="pipe2d":
+    elif pipe=="pipe2d":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -217,7 +146,7 @@ def build_pipe(p=b"pipe1"):
            ('mlc', MLPClassifier())
        ])
        
-    elif p=="pipe2e":
+    elif pipe=="pipe2e":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -243,7 +172,7 @@ def build_pipe(p=b"pipe1"):
            } )),
            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
        ])
-    elif p=="pipe2e1":
+    elif pipe=="pipe2e1":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -269,7 +198,7 @@ def build_pipe(p=b"pipe1"):
            } )),
            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,50)))
        ])
-    elif p=="pipe2f":
+    elif pipe=="pipe2f":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -295,7 +224,7 @@ def build_pipe(p=b"pipe1"):
            } )),
            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
        ])        
-    elif p=="pipe2g":
+    elif pipe=="pipe2g":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
@@ -321,7 +250,7 @@ def build_pipe(p=b"pipe1"):
            } )),
            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,100)))
        ])
-    elif p=="pipe2c":
+    elif pipe=="pipe2c":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
--- a/classifier/oldstuff.py
+++ b/classifier/oldstuff.py
@@ -0,0 +1,23 @@
+def set_train_data(i,d,key=b"answered"):
+    global train
+    #------------------------------------    
+    if not data_types.has_key(key):
+        raise ValueError("Key "+str(key)+" unknown")
+    if not train.has_key(i) or train[i] is None:
+        train[i]={}
+    if not type(d)  is data_types[key]:
+        raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
+    #------------------------------------
+    train[i][key]=d
+
+        
+def store_training_data(i, d,key=b"answered"):
+    set_train_data(i,d,key)
+    with open("data.yml","w") as file:
+        file.write(yaml.dump(train,default_flow_style=True))
+        file.close()
+                  
+
+def in_training(i, key="answered"):
+    return train.has_key(i) and train[i].has_key(key)
+data_types= { "answered": bool, "maintopic": str, "lang": str}
--- a/classifier/prediction.py
+++ b/classifier/prediction.py
@@ -1,22 +1,31 @@
-from classifier import get_training_threads,  print_answers, in_training, store_training_data, get_pipe
+from classifier import  get_pipe
 from storage import db_session, MailThread

 def predict_threads():
-    pipe1,le=get_pipe("pipe1",b"answered",["db"])
+    """
+    Predicts the language, the topic and whether a thread is answered, and writes that to the database. This function doesn't have a return value.
+    """
+    # Loading pipes for the prediction of each thread
+    pipe1,le=get_pipe("pipe1",key=b"answered",filters=["db"])
    pipe2,le2=get_pipe("pipe2g", b"maintopic",["db"])
    pipe3,le3=get_pipe("pipe2b", b"lang",["db"])
+
+    # Loading untrained MailThreads:
    q=db_session.query(MailThread).filter(MailThread.istrained.op("IS NOT")(True))
    mail_threads=q.all()
+    
    if len(mail_threads) ==0:
-        raise ValueError("no untrained threads found")
+        raise StandardError("no untrained threads found in database")
+    
    answered=le.inverse_transform(pipe1.predict(mail_threads))
    maintopic=le2.inverse_transform(pipe2.predict(mail_threads))
    lang=le3.inverse_transform(pipe3.predict(mail_threads))

    for i, t in enumerate(mail_threads):
-        t.answered=bool(answered[i])
-        t.opened=bool(answered[i])
-        t.maintopic=str(maintopic[i])
-        t.lang=str(lang[i])
+        t.answered, t.opened, t.maintopic, t.lang = ( bool(answered[i]),
+                                                      bool(answered[i]),
+                                                      str(maintopic[i]),
+                                                      str(lang[i])
+        )
        db_session.add(t)
        db_session.commit()
--- a/classifier/training.py
+++ b/classifier/training.py
@@ -2,7 +2,7 @@ from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 import numpy
 from storage import Mail, MailThread, db_session
-from classifier import store_training_data, print_answers
+from classifier import  print_answers