updates + sqlite db

2017-08-04 08:48:54 +02:00
parent 941cbc3d45
commit 4060a77c48
7 changed files with 92 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,10 @@
 env/*
 *~
 config.cfg
 *.yml3
 *.yaml
 *.sqlite
 *.sqlite-journal
 .env
 *.pyc
 *#
--- a/classifier.py
+++ b/classifier.py
@@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
 from sklearn.preprocessing import LabelEncoder
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.neural_network import MLPClassifier
 import numpy as np
 import yaml
@@ -100,7 +101,28 @@ pipe2 = Pipeline([
    ], transformer_weights={
        'subject': 1,
        'text': 0.7,
-        'envelope': 0.5
+        'envelope': 0.7
    } )),
    ('clf', MultinomialNB())
 ])
 # pipe2b: a FeatureUnion of three per-thread feature branches ('subject',
 # 'text', 'envelope') feeding an MLPClassifier as the final 'mlc' step.
 pipe2b = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        # Subject branch: extractor -> token counts -> TF-IDF weighting.
        # (presumably the extractor yields the thread's subject line; confirm
        # against ThreadSubjectExtractor)
        ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                              ('cv',CountVectorizer()),
                              ('tfidf', TfidfTransformer())
        ])),
        # Text branch: same counting/TF-IDF chain, fed by ThreadTextExtractor.
        ('text',    Pipeline([('tte',ThreadTextExtractor()),
                              ('cv',CountVectorizer()),
                              ('tfidf', TfidfTransformer())
        ])),
        # Envelope branch: ThreadDictExtractor output vectorized by DictVectorizer.
        ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                               ('dv',DictVectorizer())
        ]))
    # Per-branch weights handed to FeatureUnion; 'subject' gets the largest one.
    ], transformer_weights={
        'subject': 1,
        'text': 0.7,
        'envelope': 0.7
    } )),
    # Final estimator: scikit-learn's MLPClassifier with its default settings.
    ('mlc', MLPClassifier())
 ])
--- a/data.yml
+++ b/data.yml
@@ -0,0 +1,41 @@
 {27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
  27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: {
    maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium},
  27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information},
  27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen},
  27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information},
  27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium},
  27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true,
    maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false,
    maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: {
    answered: false, maintopic: studium}, 27435: {answered: false}, 27438: {answered: false,
    maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {
    answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
  27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information},
  27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung},
  27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information},
  27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false,
    maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information},
  27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen},
  27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium},
  27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium},
  27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium},
  27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false,
    maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true,
    maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false,
    maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: {
    answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information},
  27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: {
    answered: false, maintopic: information}, 27553: {answered: false, maintopic: information},
  27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: {
    answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, 27565: {
    answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information},
  27567: {answered: false, maintopic: information}, 27568: {answered: false}, 27575: {
    answered: false, maintopic: information}, 27577: {answered: false, maintopic: information},
  27579: {answered: true, maintopic: diplomarbeit}, 27582: {answered: false, maintopic: studium},
  27583: {answered: true, maintopic: studium}, 27584: {answered: false, maintopic: studium},
  27585: {answered: false, maintopic: information}, 27586: {answered: false, maintopic: ausleihen},
  27587: {answered: false, maintopic: information}, 27588: {answered: false, maintopic: ausleihen},
  27592: {answered: false, maintopic: studium}, 27597: {answered: false, maintopic: jobausschreibung},
  27598: {answered: false, maintopic: umfragen}, 27604: {answered: false, maintopic: umfragen},
  27607: {answered: false, maintopic: information}}
--- a/run.py
+++ b/run.py
@@ -14,7 +14,7 @@ from storage.fetch_mail import fetch_threads, flatten_threads
 from storage import Mail, MailThread, db_session
 import yaml
 import email
-from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2
+from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2, pipe2b
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 import numpy
@@ -32,6 +32,16 @@ def train_fit_pipe2():
        pipe2.fit(tt[0],tt[1])
        return pipe2,tt[2]
 def train_fit_pipe2b():
        """Fit ``pipe2b`` on the training data selected by the "maintopic" key.

        Returns a 2-tuple: the fitted ``pipe2b`` pipeline and the third item
        of what ``get_training_threads`` returned (presumably the label
        encoder -- confirm against ``get_training_threads``).
        """
        training = get_training_threads(b"maintopic")
        samples = training[0]
        targets = training[1]
        pipe2b.fit(samples, targets)
        return (pipe2b, training[2])
 def predict_thread(p,l,t):
        """Predict the label of a single thread, print it, and return the prediction.

        p -- fitted pipeline; only its ``predict`` method is used
        l -- label encoder; only its ``inverse_transform`` method is used
        t -- the thread to classify (passed to ``predict`` as a one-item list)

        Returns whatever ``p.predict`` returned for the one-item batch.
        """
        pre = p.predict([t])
        # Parenthesised single-argument print: identical output with the
        # Python 2 print statement, and also valid as a Python 3 call.
        print("Status is answered is estimated to be: " + str(l.inverse_transform(pre)[0]))
        return pre
 def train_single_thread(tid,p,le,key="answered"):
        if (not type(tid) is int): raise TypeError("ID must be of type int") 
@@ -43,8 +53,8 @@ def train_single_thread(tid,p,le,key="answered"):
        pre=p.predict([mth])
        answ=pre[0]
        #
-        print mth.to_text()
+#        print mth.to_text()
-        print mth.text()
+#        print mth.text()
        print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
        print_answers(le)
@@ -104,9 +114,11 @@ if len(sys.argv)>1:
    if sys.argv[1] == "train_thrd2":
        p, le=train_fit_pipe2()
        pb, lb =train_fit_pipe2b()
        train_single_thread(int(sys.argv[2]),p,le,b"maintopic")        
    if sys.argv[1] == "train_all2":
        p, labelencoder=train_fit_pipe2()
        pb, lb=train_fit_pipe2b()
        mth=db_session.query(MailThread).all()
        print mth
        for t in mth:
@@ -115,6 +127,7 @@ if len(sys.argv)>1:
                print "---------------------------------------------------"
                print t.firstmail
                print t.text()
                predict_thread(pb,lb,t)
                train_single_thread(t.firstmail, p, labelencoder, b"maintopic") 
--- a/storage/fetch_mail.py
+++ b/storage/fetch_mail.py
@@ -19,7 +19,7 @@ def fetch_thread(tp):
    return tp
 def fetch_threads():
-    src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,07,01)])
+    src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,05,01)])
    #, b'BEFORE', date(2017,08,01)
    return src
--- a/storage/thread_model.py
+++ b/storage/thread_model.py
@@ -88,6 +88,6 @@ class MailThread(Base):
 #                print "withintm:"+str(type(t))
                t=t.decode("ISO-8859-1")
                txt=t
-            a=a+txt+"\n\n"
+            a=a+txt+"***........................................***\n"
        return a
--- a/test.sqlite
+++ b/test.sqlite