updates + sqlite db

This commit is contained in:
Andreas Stephanides
2017-08-04 08:48:54 +02:00
parent 941cbc3d45
commit 4060a77c48
7 changed files with 92 additions and 6 deletions

10
.gitignore vendored Normal file
View File

@@ -0,0 +1,10 @@
env/*
*~
config.cfg
*.yml3
*.yaml
*.sqlite
*.sqlite-journal
.env
*.pyc
*#

View File

@@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
import numpy as np import numpy as np
import yaml import yaml
@@ -100,7 +101,28 @@ pipe2 = Pipeline([
], transformer_weights={ ], transformer_weights={
'subject': 1, 'subject': 1,
'text': 0.7, 'text': 0.7,
'envelope': 0.5 'envelope': 0.7
} )), } )),
('clf', MultinomialNB()) ('clf', MultinomialNB())
]) ])
# Thread classification pipeline that ends in a multi-layer perceptron.
# Three feature branches are computed per thread and concatenated by the
# FeatureUnion, each scaled by its entry in transformer_weights.
pipe2b = Pipeline(steps=[
    ('union', FeatureUnion(
        transformer_list=[
            # Subject branch: extractor -> token counts -> tf-idf.
            ('subject', Pipeline(steps=[
                ('tse', ThreadSubjectExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Text branch: extractor -> token counts -> tf-idf.
            ('text', Pipeline(steps=[
                ('tte', ThreadTextExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Envelope branch: dict features turned into a vector.
            ('envelope', Pipeline(steps=[
                ('tde', ThreadDictExtractor()),
                ('dv', DictVectorizer()),
            ])),
        ],
        transformer_weights={
            'subject': 1,
            'text': 0.7,
            'envelope': 0.7,
        },
    )),
    # Final estimator, default hyper-parameters.
    ('mlc', MLPClassifier()),
])

41
data.yml Normal file
View File

@@ -0,0 +1,41 @@
{27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: {
maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium},
27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information},
27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen},
27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information},
27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium},
27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true,
maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false,
maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: {
answered: false, maintopic: studium}, 27435: {answered: false}, 27438: {answered: false,
maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {
answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information},
27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung},
27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information},
27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false,
maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information},
27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen},
27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium},
27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium},
27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium},
27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false,
maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true,
maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false,
maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: {
answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information},
27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: {
answered: false, maintopic: information}, 27553: {answered: false, maintopic: information},
27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: {
answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, 27565: {
answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information},
27567: {answered: false, maintopic: information}, 27568: {answered: false}, 27575: {
answered: false, maintopic: information}, 27577: {answered: false, maintopic: information},
27579: {answered: true, maintopic: diplomarbeit}, 27582: {answered: false, maintopic: studium},
27583: {answered: true, maintopic: studium}, 27584: {answered: false, maintopic: studium},
27585: {answered: false, maintopic: information}, 27586: {answered: false, maintopic: ausleihen},
27587: {answered: false, maintopic: information}, 27588: {answered: false, maintopic: ausleihen},
27592: {answered: false, maintopic: studium}, 27597: {answered: false, maintopic: jobausschreibung},
27598: {answered: false, maintopic: umfragen}, 27604: {answered: false, maintopic: umfragen},
27607: {answered: false, maintopic: information}}

19
run.py
View File

@@ -14,7 +14,7 @@ from storage.fetch_mail import fetch_threads, flatten_threads
from storage import Mail, MailThread, db_session from storage import Mail, MailThread, db_session
import yaml import yaml
import email import email
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2 from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2, pipe2b
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
import numpy import numpy
@@ -32,6 +32,16 @@ def train_fit_pipe2():
pipe2.fit(tt[0],tt[1]) pipe2.fit(tt[0],tt[1])
return pipe2,tt[2] return pipe2,tt[2]
def train_fit_pipe2b():
    """Fit ``pipe2b`` on the stored "maintopic" training threads.

    Returns a 2-tuple: the fitted ``pipe2b`` and the third element of the
    result of ``get_training_threads`` (callers in this file use it as the
    label encoder -- its exact type is not visible here).
    """
    training = get_training_threads(b"maintopic")
    samples, targets = training[0], training[1]
    pipe2b.fit(samples, targets)
    return pipe2b, training[2]
def predict_thread(p,l,t):
    """Predict the label of a single thread and print it in decoded form.

    p -- estimator or pipeline exposing ``predict``
    l -- label encoder exposing ``inverse_transform``
    t -- the one thread to classify

    Returns the raw result of ``p.predict`` (not the decoded label).
    """
    # predict() receives a one-element batch containing just this thread.
    pre = p.predict([t])
    # NOTE(review): the message text says "answered", but the label shown is
    # whatever `l` decodes; this file calls it with the "maintopic" encoder.
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Status is answered is estimated to be: " + str(l.inverse_transform(pre)[0]))
    return pre
def train_single_thread(tid,p,le,key="answered"): def train_single_thread(tid,p,le,key="answered"):
if (not type(tid) is int): raise TypeError("ID must be of type int") if (not type(tid) is int): raise TypeError("ID must be of type int")
@@ -43,8 +53,8 @@ def train_single_thread(tid,p,le,key="answered"):
pre=p.predict([mth]) pre=p.predict([mth])
answ=pre[0] answ=pre[0]
# #
print mth.to_text() # print mth.to_text()
print mth.text() # print mth.text()
print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]) print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
print_answers(le) print_answers(le)
@@ -104,9 +114,11 @@ if len(sys.argv)>1:
if sys.argv[1] == "train_thrd2": if sys.argv[1] == "train_thrd2":
p, le=train_fit_pipe2() p, le=train_fit_pipe2()
pb, lb =train_fit_pipe2b()
train_single_thread(int(sys.argv[2]),p,le,b"maintopic") train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
if sys.argv[1] == "train_all2": if sys.argv[1] == "train_all2":
p, labelencoder=train_fit_pipe2() p, labelencoder=train_fit_pipe2()
pb, lb=train_fit_pipe2b()
mth=db_session.query(MailThread).all() mth=db_session.query(MailThread).all()
print mth print mth
for t in mth: for t in mth:
@@ -115,6 +127,7 @@ if len(sys.argv)>1:
print "---------------------------------------------------" print "---------------------------------------------------"
print t.firstmail print t.firstmail
print t.text() print t.text()
predict_thread(pb,lb,t)
train_single_thread(t.firstmail, p, labelencoder, b"maintopic") train_single_thread(t.firstmail, p, labelencoder, b"maintopic")

View File

@@ -19,7 +19,7 @@ def fetch_thread(tp):
return tp return tp
def fetch_threads():
    """Query ``server`` for threads of mails with 'service' in the subject.

    The search is limited to mails since 2017-05-01. Returns the result of
    ``server.thread`` unchanged (presumably an IMAP THREAD response --
    confirm against the client library in use).
    """
    # Plain decimal literals on purpose: zero-padded forms such as 05 are
    # octal in Python 2 and a SyntaxError in Python 3, and 08/09 are invalid
    # in both.
    since = date(2017, 5, 1)
    src = server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', since])
    #, b'BEFORE', date(2017, 8, 1)
    return src

View File

@@ -88,6 +88,6 @@ class MailThread(Base):
# print "withintm:"+str(type(t)) # print "withintm:"+str(type(t))
t=t.decode("ISO-8859-1") t=t.decode("ISO-8859-1")
txt=t txt=t
a=a+txt+"\n\n" a=a+txt+"***........................................***\n"
return a return a

BIN
test.sqlite Normal file

Binary file not shown.