updates + sqlite db
This commit is contained in:
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
env/*
|
||||
*~
|
||||
config.cfg
|
||||
*.yml3
|
||||
*.yaml
|
||||
*.sqlite
|
||||
*.sqlite-journal
|
||||
.env
|
||||
*.pyc
|
||||
*#
|
||||
@@ -4,6 +4,7 @@ from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.pipeline import Pipeline, FeatureUnion
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
|
||||
import numpy as np
|
||||
import yaml
|
||||
@@ -100,7 +101,28 @@ pipe2 = Pipeline([
|
||||
], transformer_weights={
|
||||
'subject': 1,
|
||||
'text': 0.7,
|
||||
'envelope': 0.5
|
||||
'envelope': 0.7
|
||||
} )),
|
||||
('clf', MultinomialNB())
|
||||
])
|
||||
|
||||
# Thread classifier built from three weighted feature branches (subject, text,
# envelope) whose combined output is handed to an MLPClassifier. The branch
# names and weights match the tail of pipe2 shown just above in this file;
# the final step here is the neural network ('mlc').
pipe2b = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            # Subject branch: extractor -> token counts -> tf-idf.
            ('subject', Pipeline([
                ('tse', ThreadSubjectExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Text branch: same vectorisation, fed by the text extractor.
            ('text', Pipeline([
                ('tte', ThreadTextExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Envelope branch: dict output of the extractor is vectorised
            # directly, without tf-idf.
            ('envelope', Pipeline([
                ('tde', ThreadDictExtractor()),
                ('dv', DictVectorizer()),
            ])),
        ],
        # Relative scaling applied to each branch before concatenation.
        transformer_weights={
            'subject': 1,
            'text': 0.7,
            'envelope': 0.7,
        },
    )),
    ('mlc', MLPClassifier()),
])
|
||||
|
||||
41
data.yml
Normal file
41
data.yml
Normal file
@@ -0,0 +1,41 @@
|
||||
{27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
|
||||
27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: {
|
||||
maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium},
|
||||
27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information},
|
||||
27146: {maintopic: information}, 27166: {maintopic: umfragen}, 27171: {maintopic: ausleihen},
|
||||
27178: {maintopic: studium}, 27182: {maintopic: studium}, 27197: {maintopic: information},
|
||||
27201: {maintopic: information}, 27218: {maintopic: information}, 27219: {maintopic: studium},
|
||||
27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true,
|
||||
maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false,
|
||||
maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: {
|
||||
answered: false, maintopic: studium}, 27435: {answered: false}, 27438: {answered: false,
|
||||
maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {
|
||||
answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
|
||||
27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information},
|
||||
27456: {answered: false, maintopic: studium}, 27457: {answered: false, maintopic: jobausschreibung},
|
||||
27468: {answered: true, maintopic: studium}, 27489: {answered: false, maintopic: information},
|
||||
27490: {answered: false, maintopic: fachschaftenzeugs}, 27491: {answered: false,
|
||||
maintopic: jobausschreibung}, 27492: {answered: false, maintopic: information},
|
||||
27495: {answered: false, maintopic: information}, 27496: {answered: true, maintopic: ausleihen},
|
||||
27497: {answered: false, maintopic: information}, 27500: {answered: true, maintopic: studium},
|
||||
27501: {answered: false, maintopic: information}, 27514: {answered: true, maintopic: studium},
|
||||
27515: {answered: true, maintopic: studium}, 27518: {answered: true, maintopic: studium},
|
||||
27523: {answered: false, maintopic: jobausschreibung}, 27526: {answered: false,
|
||||
maintopic: studium}, 27536: {answered: true, maintopic: studium}, 27541: {answered: true,
|
||||
maintopic: studium}, 27542: {answered: false, maintopic: studium}, 27543: {answered: false,
|
||||
maintopic: information}, 27544: {answered: true, maintopic: studium}, 27545: {
|
||||
answered: false, maintopic: umfragen}, 27546: {answered: false, maintopic: information},
|
||||
27547: {answered: false, maintopic: studium}, 27549: {answered: false}, 27550: {
|
||||
answered: false, maintopic: information}, 27553: {answered: false, maintopic: information},
|
||||
27558: {answered: false}, 27560: {answered: false, maintopic: ausleihen}, 27562: {
|
||||
answered: false}, 27564: {answered: false, maintopic: jobausschreibung}, 27565: {
|
||||
answered: true, maintopic: ausleihen}, 27566: {answered: false, maintopic: information},
|
||||
27567: {answered: false, maintopic: information}, 27568: {answered: false}, 27575: {
|
||||
answered: false, maintopic: information}, 27577: {answered: false, maintopic: information},
|
||||
27579: {answered: true, maintopic: diplomarbeit}, 27582: {answered: false, maintopic: studium},
|
||||
27583: {answered: true, maintopic: studium}, 27584: {answered: false, maintopic: studium},
|
||||
27585: {answered: false, maintopic: information}, 27586: {answered: false, maintopic: ausleihen},
|
||||
27587: {answered: false, maintopic: information}, 27588: {answered: false, maintopic: ausleihen},
|
||||
27592: {answered: false, maintopic: studium}, 27597: {answered: false, maintopic: jobausschreibung},
|
||||
27598: {answered: false, maintopic: umfragen}, 27604: {answered: false, maintopic: umfragen},
|
||||
27607: {answered: false, maintopic: information}}
|
||||
19
run.py
19
run.py
@@ -14,7 +14,7 @@ from storage.fetch_mail import fetch_threads, flatten_threads
|
||||
from storage import Mail, MailThread, db_session
|
||||
import yaml
|
||||
import email
|
||||
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2
|
||||
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2, pipe2b
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import numpy
|
||||
@@ -32,6 +32,16 @@ def train_fit_pipe2():
|
||||
pipe2.fit(tt[0],tt[1])
|
||||
return pipe2,tt[2]
|
||||
|
||||
def train_fit_pipe2b():
    """Fit the module-level ``pipe2b`` on the "maintopic" training data.

    Fetches the training tuple via ``get_training_threads(b"maintopic")``,
    fits ``pipe2b`` on its first two elements and returns the fitted pipeline
    together with the tuple's third element (callers pass that element on as
    the object whose ``inverse_transform`` decodes predictions -- presumably
    a label encoder; confirm against ``get_training_threads``).
    """
    training = get_training_threads(b"maintopic")
    samples = training[0]
    targets = training[1]
    pipe2b.fit(samples, targets)
    # Look the third element up only after fitting, as the original did.
    return pipe2b, training[2]
|
||||
|
||||
def predict_thread(p,l,t):
    """Predict the label of one thread and print it in readable form.

    Parameters:
        p: fitted estimator or pipeline; only ``p.predict`` is called, with a
           one-element list containing ``t``.
        l: object whose ``inverse_transform`` turns the raw prediction back
           into a label (callers in this file pass the second value returned
           by ``train_fit_pipe2b``).
        t: the thread to classify.

    Returns:
        The raw result of ``p.predict([t])``, unchanged.
    """
    pre = p.predict([t])
    label = l.inverse_transform(pre)[0]
    # Single-argument print written with parentheses: identical output under
    # the Python 2 print statement, and valid syntax on Python 3 as well.
    print("Status is answered is estimated to be: " + str(label))
    return pre
|
||||
|
||||
|
||||
def train_single_thread(tid,p,le,key="answered"):
|
||||
if (not type(tid) is int): raise TypeError("ID must be of type int")
|
||||
@@ -43,8 +53,8 @@ def train_single_thread(tid,p,le,key="answered"):
|
||||
pre=p.predict([mth])
|
||||
answ=pre[0]
|
||||
#
|
||||
print mth.to_text()
|
||||
print mth.text()
|
||||
# print mth.to_text()
|
||||
# print mth.text()
|
||||
print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
|
||||
print_answers(le)
|
||||
|
||||
@@ -104,9 +114,11 @@ if len(sys.argv)>1:
|
||||
|
||||
if sys.argv[1] == "train_thrd2":
|
||||
p, le=train_fit_pipe2()
|
||||
pb, lb =train_fit_pipe2b()
|
||||
train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
|
||||
if sys.argv[1] == "train_all2":
|
||||
p, labelencoder=train_fit_pipe2()
|
||||
pb, lb=train_fit_pipe2b()
|
||||
mth=db_session.query(MailThread).all()
|
||||
print mth
|
||||
for t in mth:
|
||||
@@ -115,6 +127,7 @@ if len(sys.argv)>1:
|
||||
print "---------------------------------------------------"
|
||||
print t.firstmail
|
||||
print t.text()
|
||||
predict_thread(pb,lb,t)
|
||||
train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
|
||||
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ def fetch_thread(tp):
|
||||
return tp
|
||||
|
||||
def fetch_threads():
    """Return the server's thread listing for mails with 'service' in the subject.

    Calls ``server.thread`` with the criteria SUBJECT "service" and
    SINCE 2017-05-01 and returns its result unchanged.
    """
    # Date components are plain decimals: a leading zero (05, 01) makes the
    # literal octal on Python 2 and is a SyntaxError on Python 3.
    src = server.thread(criteria=[b'SUBJECT', b'service',
                                  b'SINCE', date(2017, 5, 1)])
    # Optional upper bound, currently disabled: b'BEFORE', date(2017, 8, 1)
    return src
|
||||
|
||||
|
||||
@@ -88,6 +88,6 @@ class MailThread(Base):
|
||||
# print "withintm:"+str(type(t))
|
||||
t=t.decode("ISO-8859-1")
|
||||
txt=t
|
||||
a=a+txt+"\n\n"
|
||||
a=a+txt+"***........................................***\n"
|
||||
|
||||
return a
|
||||
|
||||
BIN
test.sqlite
Normal file
BIN
test.sqlite
Normal file
Binary file not shown.
Reference in New Issue
Block a user