Added Flask interface

This commit is contained in:
Andreas Stephanides
2017-08-06 10:16:30 +02:00
parent 4060a77c48
commit ff0bdc6d3b
23 changed files with 14913 additions and 63 deletions

View File

@@ -5,7 +5,7 @@ from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import yaml
from storage import MailThread,db_session
@@ -26,7 +26,7 @@ def store_training_data(i, d,key=b"answered"):
train[i]={}
if not key is None and type(train[i]) is dict:
if not type(d) is data_types[key]:
# print str(type(d)) + " vs " + str(data_types[key])
# print str(type(d)) + " vs " + str(data_types[key])
raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
train[i][key]=d
@@ -47,8 +47,8 @@ def get_training_threads(key="answered"):
if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
t_a.append(t)
d_a.append(train[i][key])
le=LabelEncoder()
d_a2=le.fit_transform(d_a)
le=LabelEncoder()
d_a2=le.fit_transform(d_a)
return (t_a,d_a2,le)
@@ -83,46 +83,102 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin):
return [t.text() for t in X]
# pipe1: ThreadDictExtractor output -> DictVectorizer -> multinomial naive Bayes.
pipe1 = Pipeline(steps=[
    ('tde', ThreadDictExtractor()),
    ('dv', DictVectorizer()),
    ('clf', MultinomialNB()),
])

# pipe2: three feature branches (subject, text, envelope) are computed side by
# side, scaled by the per-branch weights below, concatenated by the
# FeatureUnion and then classified with multinomial naive Bayes.
pipe2 = Pipeline(steps=[
    ('union', FeatureUnion(
        transformer_list=[
            # Subject line: token counts re-weighted with TF-IDF.
            ('subject', Pipeline(steps=[
                ('tse', ThreadSubjectExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Thread text: token counts re-weighted with TF-IDF.
            ('text', Pipeline(steps=[
                ('tte', ThreadTextExtractor()),
                ('cv', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            # Dict-valued features turned into a feature matrix.
            ('envelope', Pipeline(steps=[
                ('tde', ThreadDictExtractor()),
                ('dv', DictVectorizer()),
            ])),
        ],
        transformer_weights=dict(subject=1, text=0.7, envelope=0.7),
    )),
    ('clf', MultinomialNB()),
])
pipe2b = Pipeline([
('union', FeatureUnion(transformer_list=[
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
def _build_feature_union(weights):
    """Return the subject/text/envelope FeatureUnion scaled by *weights*.

    Fresh estimator instances are created on every call so that pipelines
    built from this helper never share fitted state.
    """
    return FeatureUnion(
        transformer_list=[
            ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                                  ('cv', CountVectorizer()),
                                  ('tfidf', TfidfTransformer())])),
            ('text', Pipeline([('tte', ThreadTextExtractor()),
                               ('cv', CountVectorizer()),
                               ('tfidf', TfidfTransformer())])),
            ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                                   ('dv', DictVectorizer())])),
        ],
        transformer_weights=weights,
    )


def build_pipe(p=b"pipe1"):
    """Build a new, unfitted classification pipeline selected by name.

    Supported names:

    * ``"pipe1"``  - ThreadDictExtractor -> DictVectorizer -> MultinomialNB
    * ``"pipe2"``  - subject/text/envelope union (weights 1 / 0.7 / 0.7)
      followed by MultinomialNB
    * ``"pipe2b"`` - same union and weights as ``pipe2``, followed by
      MLPClassifier
    * ``"pipe2c"`` - same union with weights 1 / 1 / 0.4, followed by
      MLPClassifier

    :param p: name of the pipeline variant to build.
    :returns: an unfitted ``sklearn.pipeline.Pipeline``.
    :raises ValueError: if *p* is not one of the supported names.

    NOTE(review): the default is a bytes literal. It equals ``"pipe1"`` on
    Python 2 only; on Python 3 the default call would raise ValueError.
    """
    if p == "pipe1":
        return Pipeline([('tde', ThreadDictExtractor()),
                         ('dv', DictVectorizer()),
                         ('clf', MultinomialNB())])
    if p == "pipe2":
        union = _build_feature_union({'subject': 1, 'text': 0.7, 'envelope': 0.7})
        return Pipeline([('union', union), ('clf', MultinomialNB())])
    if p == "pipe2b":
        union = _build_feature_union({'subject': 1, 'text': 0.7, 'envelope': 0.7})
        return Pipeline([('union', union), ('mlc', MLPClassifier())])
    if p == "pipe2c":
        union = _build_feature_union({'subject': 1, 'text': 1, 'envelope': 0.4})
        return Pipeline([('union', union), ('mlc', MLPClassifier())])
    # Interpolate the offending name; the 1-tuple keeps %-formatting safe even
    # if a tuple is passed by mistake.
    raise ValueError("The pipe %s is not a valid pipe" % (p,))
def get_pipe(p=b"pipe1", k=b"answered"):
    """Build the pipeline named *p* and fit it on the training data for key *k*.

    :param p: pipeline name, forwarded to ``build_pipe``.
    :param k: training-data key, forwarded to ``get_training_threads``.
    :returns: ``(fitted_pipeline, encoder)`` where *encoder* is the third
        element returned by ``get_training_threads(k)``.
    """
    fitted = build_pipe(p)
    threads, labels, encoder = get_training_threads(k)
    fitted.fit(threads, labels)
    return fitted, encoder
from sklearn.metrics import accuracy_score
def test_pipe(pp, k):
    """Print the hold-out accuracy of one or more pipeline variants.

    The training threads for key *k* are split once (80 % train, 20 % test)
    and every requested pipeline is fitted and scored on that same split, so
    the printed accuracies are directly comparable.

    :param pp: a pipeline name, or a list/tuple of pipeline names accepted by
        ``build_pipe``.
    :param k: training-data key, forwarded to ``get_training_threads``.
    :returns: None; results are written to stdout.
    """
    # A single name used to be ignored silently; treat it as a one-item list.
    if not isinstance(pp, (list, tuple)):
        pp = [pp]
    tt = get_training_threads(k)
    X_train, X_test, y_train, y_test = train_test_split(tt[0], tt[1], test_size=0.2)
    for name in pp:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("pipe: %s" % (name,))
        pipe = build_pipe(name)
        pipe.fit(X_train, y_train)
        ypred = pipe.predict(X_test)
        print(accuracy_score(y_test, ypred))
#pipe1=get_pipe("pipe1", "answered")
#pipe2=get_pipe("pipe2", "maintopic")
#pipe2b=get_pipe("pipe2b", "maintopic")