updates + sqlite db
This commit is contained in:
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
env/*
|
||||||
|
*~
|
||||||
|
config.cfg
|
||||||
|
*.yml3
|
||||||
|
*.yaml
|
||||||
|
*.sqlite
|
||||||
|
*.sqlite-journal
|
||||||
|
.env
|
||||||
|
*.pyc
|
||||||
|
*#
|
||||||
@@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
|
|||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
from sklearn.pipeline import Pipeline, FeatureUnion
|
from sklearn.pipeline import Pipeline, FeatureUnion
|
||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.neural_network import MLPClassifier
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import yaml
|
import yaml
|
||||||
@@ -100,7 +101,28 @@ pipe2 = Pipeline([
|
|||||||
], transformer_weights={
|
], transformer_weights={
|
||||||
'subject': 1,
|
'subject': 1,
|
||||||
'text': 0.7,
|
'text': 0.7,
|
||||||
'envelope': 0.5
|
'envelope': 0.7
|
||||||
} )),
|
} )),
|
||||||
('clf', MultinomialNB())
|
('clf', MultinomialNB())
|
||||||
])
|
])
|
||||||
|
|
||||||
|
# pipe2b: a weighted union of three feature branches ('subject', 'text',
# 'envelope') whose combined output is handed to a default-configured
# MLPClassifier registered under the step name 'mlc'.
pipe2b = Pipeline(steps=[
    ('union', FeatureUnion(
        transformer_list=[
            # Subject branch: extractor -> token counts -> tf-idf weighting.
            ('subject', Pipeline(steps=[
                ('tse', ThreadSubjectExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Text branch: same vectorising chain, fed by the text extractor.
            ('text', Pipeline(steps=[
                ('tte', ThreadTextExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Envelope branch: dict features vectorised by DictVectorizer.
            ('envelope', Pipeline(steps=[
                ('tde', ThreadDictExtractor()),
                ('dv', DictVectorizer()),
            ])),
        ],
        # Per-branch multipliers applied by FeatureUnion to each branch output.
        transformer_weights={
            'subject': 1,
            'text': 0.7,
            'envelope': 0.7,
        },
    )),
    ('mlc', MLPClassifier()),
])
|
||||||
|
|||||||
41
data.yml
Normal file
41
data.yml
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
{27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
|
||||||
|
27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: {
|
||||||
|
maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium},
|
||||||
|
27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information},
|
||||||
|
27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen},
|
||||||
|
27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information},
|
||||||
|
27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium},
|
||||||
|
27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true,
|
||||||
|
maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false,
|
||||||
|
maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: {
|
||||||
|
answered: false, maintopic: studium}, 27435: {answered: false}, 27438: {answered: false,
|
||||||
|
maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {
|
||||||
|
answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
|
||||||
|
27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information},
|
||||||
|
27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung},
|
||||||
|
27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information},
|
||||||
|
27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false,
|
||||||
|
maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information},
|
||||||
|
27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen},
|
||||||
|
27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium},
|
||||||
|
27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium},
|
||||||
|
27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium},
|
||||||
|
27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false,
|
||||||
|
maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true,
|
||||||
|
maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false,
|
||||||
|
maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: {
|
||||||
|
answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information},
|
||||||
|
27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: {
|
||||||
|
answered: false, maintopic: information}, 27553: {answered: false, maintopic: information},
|
||||||
|
27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: {
|
||||||
|
answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, 27565: {
|
||||||
|
answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information},
|
||||||
|
27567: {answered: false, maintopic: information}, 27568: {answered: false}, 27575: {
|
||||||
|
answered: false, maintopic: information}, 27577: {answered: false, maintopic: information},
|
||||||
|
27579: {answered: true, maintopic: diplomarbeit}, 27582: {answered: false, maintopic: studium},
|
||||||
|
27583: {answered: true, maintopic: studium}, 27584: {answered: false, maintopic: studium},
|
||||||
|
27585: {answered: false, maintopic: information}, 27586: {answered: false, maintopic: ausleihen},
|
||||||
|
27587: {answered: false, maintopic: information}, 27588: {answered: false, maintopic: ausleihen},
|
||||||
|
27592: {answered: false, maintopic: studium}, 27597: {answered: false, maintopic: jobausschreibung},
|
||||||
|
27598: {answered: false, maintopic: umfragen}, 27604: {answered: false, maintopic: umfragen},
|
||||||
|
27607: {answered: false, maintopic: information}}
|
||||||
19
run.py
19
run.py
@@ -14,7 +14,7 @@ from storage.fetch_mail import fetch_threads, flatten_threads
|
|||||||
from storage import Mail, MailThread, db_session
|
from storage import Mail, MailThread, db_session
|
||||||
import yaml
|
import yaml
|
||||||
import email
|
import email
|
||||||
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2
|
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2, pipe2b
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
import numpy
|
import numpy
|
||||||
@@ -32,6 +32,16 @@ def train_fit_pipe2():
|
|||||||
pipe2.fit(tt[0],tt[1])
|
pipe2.fit(tt[0],tt[1])
|
||||||
return pipe2,tt[2]
|
return pipe2,tt[2]
|
||||||
|
|
||||||
|
def train_fit_pipe2b():
    """Fit the module-level ``pipe2b`` on the "maintopic" training data.

    Calls ``get_training_threads(b"maintopic")`` and uses the first two
    items of its result as the samples and targets for ``pipe2b.fit``.

    Returns:
        A 2-tuple ``(pipe2b, third_item)`` where ``third_item`` is the third
        element returned by ``get_training_threads``. Callers in this file
        invoke ``inverse_transform`` on it, so it is presumably the fitted
        label encoder -- confirm against ``get_training_threads``.
    """
    training = get_training_threads(b"maintopic")
    samples, targets = training[0], training[1]
    pipe2b.fit(samples, targets)
    return pipe2b, training[2]
|
||||||
|
|
||||||
|
def predict_thread(p, l, t):
    """Predict the label of a single thread and print it in decoded form.

    Args:
        p: Fitted estimator; ``p.predict`` is called with the one-element
            list ``[t]``.
        l: Object whose ``inverse_transform`` maps the raw prediction back to
            a readable label; only the first decoded element is printed.
        t: The thread to classify.

    Returns:
        The raw result of ``p.predict([t])``, unchanged.
    """
    pre = p.predict([t])
    decoded = l.inverse_transform(pre)[0]
    # Parenthesised single-argument print: same output under the Python 2
    # print statement and the Python 3 print function.
    # NOTE(review): the text says "answered", but the visible call sites pass
    # the pipeline/encoder trained on b"maintopic" -- confirm the wording.
    print("Status is answered is estimated to be: " + str(decoded))
    return pre
|
||||||
|
|
||||||
|
|
||||||
def train_single_thread(tid,p,le,key="answered"):
|
def train_single_thread(tid,p,le,key="answered"):
|
||||||
if (not type(tid) is int): raise TypeError("ID must be of type int")
|
if (not type(tid) is int): raise TypeError("ID must be of type int")
|
||||||
@@ -43,8 +53,8 @@ def train_single_thread(tid,p,le,key="answered"):
|
|||||||
pre=p.predict([mth])
|
pre=p.predict([mth])
|
||||||
answ=pre[0]
|
answ=pre[0]
|
||||||
#
|
#
|
||||||
print mth.to_text()
|
# print mth.to_text()
|
||||||
print mth.text()
|
# print mth.text()
|
||||||
print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
|
print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
|
||||||
print_answers(le)
|
print_answers(le)
|
||||||
|
|
||||||
@@ -104,9 +114,11 @@ if len(sys.argv)>1:
|
|||||||
|
|
||||||
if sys.argv[1] == "train_thrd2":
|
if sys.argv[1] == "train_thrd2":
|
||||||
p, le=train_fit_pipe2()
|
p, le=train_fit_pipe2()
|
||||||
|
pb, lb =train_fit_pipe2b()
|
||||||
train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
|
train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
|
||||||
if sys.argv[1] == "train_all2":
|
if sys.argv[1] == "train_all2":
|
||||||
p, labelencoder=train_fit_pipe2()
|
p, labelencoder=train_fit_pipe2()
|
||||||
|
pb, lb=train_fit_pipe2b()
|
||||||
mth=db_session.query(MailThread).all()
|
mth=db_session.query(MailThread).all()
|
||||||
print mth
|
print mth
|
||||||
for t in mth:
|
for t in mth:
|
||||||
@@ -115,6 +127,7 @@ if len(sys.argv)>1:
|
|||||||
print "---------------------------------------------------"
|
print "---------------------------------------------------"
|
||||||
print t.firstmail
|
print t.firstmail
|
||||||
print t.text()
|
print t.text()
|
||||||
|
predict_thread(pb,lb,t)
|
||||||
train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
|
train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ def fetch_thread(tp):
|
|||||||
return tp
|
return tp
|
||||||
|
|
||||||
def fetch_threads():
|
def fetch_threads():
|
||||||
src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,07,01)])
|
src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,05,01)])
|
||||||
#, b'BEFORE', date(2017,08,01)
|
#, b'BEFORE', date(2017,08,01)
|
||||||
return src
|
return src
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,6 @@ class MailThread(Base):
|
|||||||
# print "withintm:"+str(type(t))
|
# print "withintm:"+str(type(t))
|
||||||
t=t.decode("ISO-8859-1")
|
t=t.decode("ISO-8859-1")
|
||||||
txt=t
|
txt=t
|
||||||
a=a+txt+"\n\n"
|
a=a+txt+"***........................................***\n"
|
||||||
|
|
||||||
return a
|
return a
|
||||||
|
|||||||
BIN
test.sqlite
Normal file
BIN
test.sqlite
Normal file
Binary file not shown.
Reference in New Issue
Block a user