added flask interface

classifier.py  142
@@ -5,7 +5,7 @@ from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import yaml
from storage import MailThread,db_session

@@ -26,7 +26,7 @@ def store_training_data(i, d,key=b"answered"):
        train[i]={}
    if not key is None and type(train[i]) is dict:
        if not type(d) is data_types[key]:
            # print str(type(d)) + " vs " + str(data_types[key])
            raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
        train[i][key]=d

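For illustration, a minimal sketch of the type check above. data_types is defined elsewhere in classifier.py and its contents are not shown in this hunk, so the mapping below is an assumption and the snippet only illustrates the intended behaviour:

# Hypothetical sketch: assumes data_types maps "answered" to bool (not shown in this hunk).
data_types = {"answered": bool}
train = {}

store_training_data("msg-0001", True, key="answered")     # type matches, value is stored
try:
    store_training_data("msg-0001", "yes", key="answered")
except TypeError as e:
    print e   # roughly: Data - yes - for key answered must be <type 'bool'> but it is <type 'str'>
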
@@ -47,8 +47,8 @@ def get_training_threads(key="answered"):
        if train[i].has_key(key): # the relevant key must be present in the training data
            t_a.append(t)
            d_a.append(train[i][key])
    le=LabelEncoder()
    d_a2=le.fit_transform(d_a)
    return (t_a,d_a2,le)

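Since get_training_threads hands back the fitted LabelEncoder together with the encoded targets, integer predictions can later be mapped back to the original label values. A small standalone illustration of that round trip:

from sklearn.preprocessing import LabelEncoder

labels = ["spam", "support", "spam", "billing"]   # toy stand-ins for the train[i][key] values
le = LabelEncoder()
encoded = le.fit_transform(labels)                # array([1, 2, 1, 0]); le.classes_ is sorted alphabetically
print le.inverse_transform([0, 2])                # ['billing' 'support'] -- back to the readable labels
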
@@ -83,46 +83,102 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin):
        return [t.text() for t in X]


pipe1=Pipeline([('tde', ThreadDictExtractor()),('dv',DictVectorizer()),('clf', MultinomialNB())])

pipe2 = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                              ('cv',CountVectorizer()),
                              ('tfidf', TfidfTransformer())
                              ])),
        ('text', Pipeline([('tte',ThreadTextExtractor()),
                           ('cv',CountVectorizer()),
                           ('tfidf', TfidfTransformer())
                           ])),
        ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                               ('dv',DictVectorizer())
                               ]))
    ], transformer_weights={
        'subject': 1,
        'text': 0.7,
        'envelope': 0.7
    } )),
    ('clf', MultinomialNB())
])

pipe2b = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                              ('cv',CountVectorizer()),
                              ('tfidf', TfidfTransformer())
                              ])),
        ('text', Pipeline([('tte',ThreadTextExtractor()),
                           ('cv',CountVectorizer()),
                           ('tfidf', TfidfTransformer())
                           ])),
        ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                               ('dv',DictVectorizer())
                               ]))
    ], transformer_weights={
        'subject': 1,
        'text': 1,
        'envelope': 0.4
    } )),
    ('mlc', MLPClassifier())
])

def build_pipe(p=b"pipe1"):

    if p == "pipe1":
        p=Pipeline([('tde', ThreadDictExtractor()),
                    ('dv',DictVectorizer()),
                    ('clf', MultinomialNB())
                    ])
    elif p=="pipe2":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                                      ('cv',CountVectorizer()),
                                      ('tfidf', TfidfTransformer())
                                      ])),
                ('text', Pipeline([('tte',ThreadTextExtractor()),
                                   ('cv',CountVectorizer()),
                                   ('tfidf', TfidfTransformer())
                                   ])),
                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                                       ('dv',DictVectorizer())
                                       ]))
            ], transformer_weights={
                'subject': 1,
                'text': 0.7,
                'envelope': 0.7
            } )),
            ('clf', MultinomialNB())
        ])
    elif p=="pipe2b":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                                      ('cv',CountVectorizer()),
                                      ('tfidf', TfidfTransformer())
                                      ])),
                ('text', Pipeline([('tte',ThreadTextExtractor()),
                                   ('cv',CountVectorizer()),
                                   ('tfidf', TfidfTransformer())
                                   ])),
                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                                       ('dv',DictVectorizer())
                                       ]))
            ], transformer_weights={
                'subject': 1,
                'text': 0.7,
                'envelope': 0.7
            } )),
            ('mlc', MLPClassifier())
        ])
    elif p=="pipe2c":
        p = Pipeline([
            ('union', FeatureUnion(transformer_list=[
                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
                                      ('cv',CountVectorizer()),
                                      ('tfidf', TfidfTransformer())
                                      ])),
                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                                       ('dv',DictVectorizer())
                                       ]))
            ], transformer_weights={
                'subject': 1,
                'text': 0.7,
                'envelope': 0.7
            } )),
            ('mlc', MLPClassifier())
        ])
    else:
        raise ValueError("The pipe %s is not a valid pipe" % p)
    return p

def get_pipe(p=b"pipe1",k=b"answered"):
    p=build_pipe(p)
    tt= get_training_threads(k)
    p.fit(tt[0],tt[1])
    return p,tt[2]

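A hedged usage sketch of get_pipe, built only from functions defined above; the "answered" key and the slice size are illustrative choices, not part of the commit:

# Hypothetical usage sketch.
pipe, le = get_pipe("pipe1", "answered")        # fit the chosen pipeline on the stored training data
threads = get_training_threads("answered")[0]   # reuse known threads purely for illustration
pred = pipe.predict(threads[:5])                # integer class labels
print le.inverse_transform(pred)                # map them back to the original label values
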
from sklearn.metrics import accuracy_score

def test_pipe(pp,k):
    tt= get_training_threads(k)
    X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.2)
    if type(pp) is list:
        for p in pp:
            print "pipe: %s" % p
            p=build_pipe(p)
            p.fit(X_train,y_train)
            ypred=p.predict(X_test)
            print accuracy_score(y_test,ypred)
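To compare the four pipeline variants on one key, test_pipe could be called as below; the key is just the module default, and passing a fixed random_state to train_test_split (not done above) would make such comparisons reproducible.

# Hypothetical comparison run over all pipeline variants defined in build_pipe.
test_pipe(["pipe1", "pipe2", "pipe2b", "pipe2c"], "answered")
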

#pipe1=get_pipe("pipe1", "answered")
#pipe2=get_pipe("pipe2", "maintopic")
#pipe2b=get_pipe("pipe2b", "maintopic")
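The commit message mentions a Flask interface, but that file is not part of the excerpt shown here. Purely as an illustration, a minimal sketch of how get_pipe might be exposed over HTTP; the route, parameter names, pipeline choice, and the thread-lookup stub are all assumptions, not the commit's actual API.

# Hypothetical sketch only -- the real Flask interface added by this commit is not shown above.
from flask import Flask, jsonify, request
from classifier import get_pipe

app = Flask(__name__)
pipe, le = get_pipe("pipe2b", "maintopic")   # assumed choice of pipeline and training key

def lookup_thread(thread_id):
    # Placeholder: the real code would fetch a MailThread via the storage module.
    raise NotImplementedError

@app.route("/classify", methods=["POST"])
def classify():
    thread = lookup_thread(request.json["thread_id"])         # assumed request payload
    label = le.inverse_transform(pipe.predict([thread]))[0]
    return jsonify({"label": str(label)})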