280 lines
11 KiB
Python
280 lines
11 KiB
Python
from sklearn.base import BaseEstimator, TransformerMixin
|
|
from sklearn.feature_extraction import DictVectorizer
|
|
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
|
|
from sklearn.preprocessing import LabelEncoder
|
|
from sklearn.pipeline import Pipeline, FeatureUnion
|
|
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.neural_network import MLPClassifier
|
|
from sklearn.model_selection import train_test_split
|
|
import numpy as np
|
|
import yaml
|
|
from sklearn.metrics import accuracy_score, confusion_matrix
|
|
from collections import namedtuple
|
|
|
|
from storage import get_training_threads #MailThread,db_session
|
|
|
|
# Pair returned by get_pipe(): the pipeline object together with the label
# encoder that came back from the training-data loader.
PredictTool = namedtuple("PredictTool", "Pipeline LabelEncoder")
|
|
|
|
def print_answers(l):
    """Print one "<encoded value>: <class>" line per class known to `l`.

    `l` must expose a `classes_` sequence and a `transform()` method that
    accepts that sequence (the interface of the LabelEncoder imported by
    this module).

    Always returns None.
    """
    classes = l.classes_
    # The original passed an undefined name (`cc`) to transform(), which
    # raised NameError on every call; the classes themselves are what has
    # to be encoded so that each one is printed next to its code.
    classes_encoded = l.transform(classes)
    for code, label in zip(classes_encoded, classes):
        # Single-argument parenthesised print behaves the same on Python 2
        # (statement) and Python 3 (function).
        print(str(code) + ": " + str(label))
    return None
|
|
|
|
|
|
class ThreadDictExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each thread to its `mail_flat_dict()`."""

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list holding `mail_flat_dict()` of every item in X, in order."""
        return [thread.mail_flat_dict() for thread in X]
|
|
|
|
class ThreadSubjectExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each thread to its `subject()`."""

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list holding `subject()` of every item in X, in order."""
        return [thread.subject() for thread in X]
|
|
|
|
class ThreadTextExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each thread to its `text()`."""

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list holding `text()` of every item in X, in order."""
        return [thread.text() for thread in X]
|
|
|
|
class ThreadFirstTextExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each thread to its `text("first")`."""

    def fit(self, x, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list holding `text("first")` of every item in X, in order."""
        return [thread.text("first") for thread in X]
|
|
|
|
def get_pipe(p=b"pipe1",key=b"answered",filters=["db"]):
    """Build the pipeline named `p` and fit it on the stored training threads.

    `key` and `filters` are forwarded unchanged to get_training_threads(),
    whose result is unpacked into three values: the threads, their labels and
    a third object that is handed back as the `LabelEncoder` field
    (presumably the fitted label encoder -- confirm against the loader).

    Returns PredictTool(pipeline, encoder) when at least one thread was
    loaded, otherwise PredictTool(None, None).
    """
    # Build first so that an unknown pipe name fails before any data is loaded.
    pipeline = build_pipe(p)
    threads, labels, label_encoder = get_training_threads(key, filters)

    if len(threads) == 0:
        # Nothing to train on: hand back an empty tool instead of fitting.
        return PredictTool(None, None)

    pipeline.fit(threads, labels)
    return PredictTool(pipeline, label_encoder)
|
|
|
|
def test_pipe(pp,k,f=[]):
    """Fit and score each pipeline named in `pp` on a random 60/40 split.

    `k` and `f` are forwarded to get_training_threads(); the first two
    elements of its result are split with train_test_split(test_size=0.4)
    and the third must expose `classes_`.

    For every name in `pp` this prints the pipe name, the classes, the
    accuracy on the held-out part and the confusion matrix.  When `pp` is
    not exactly a `list`, only the data load and split are performed and
    nothing is printed.  Returns None.
    """
    threads, labels, label_encoder = get_training_threads(k, f)
    X_train, X_test, y_train, y_test = train_test_split(
        threads, labels, test_size=0.4)

    if type(pp) is not list:
        return

    # NOTE(review): the three report lines are taken to be per-pipe (inside
    # the loop), since they depend on the prediction made there -- confirm.
    for pipe_name in pp:
        print("pipe: %s" % pipe_name)
        pipeline = build_pipe(pipe_name)
        pipeline.fit(X_train, y_train)
        predicted = pipeline.predict(X_test)
        print(label_encoder.classes_)
        print(accuracy_score(y_test, predicted))
        print(confusion_matrix(y_test, predicted))
|
|
|
|
|
|
|
|
# Branch weights shared by several FeatureUnion-based pipes.
_WEIGHTS_BASE = {'subject': 1, 'text': 0.7, 'envelope': 0.7}
_WEIGHTS_EQUAL_TEXT = {'subject': 1, 'text': 1, 'envelope': 0.4}
_WEIGHTS_FIRSTTEXT = {'subject': 1.3, 'text': 1, 'firsttext': 0.9,
                      'envelope': 0.2}

# pipe name -> (branch weights,
#               n-gram range of the 'firsttext' branch, or None for no such branch,
#               factory returning the final (step name, classifier) pair).
# The factories are lambdas so that a fresh classifier is created per call.
_UNION_PIPES = {
    "pipe2": (_WEIGHTS_BASE, None,
              lambda: ('clf', MultinomialNB())),
    "pipe2b": (_WEIGHTS_BASE, None,
               lambda: ('mlc', MLPClassifier())),
    "pipe2c": (_WEIGHTS_EQUAL_TEXT, None,
               lambda: ('mlc', MLPClassifier())),
    "pipe2d": (_WEIGHTS_FIRSTTEXT, (1, 1),
               lambda: ('mlc', MLPClassifier())),
    "pipe2e": (_WEIGHTS_FIRSTTEXT, (1, 1),
               lambda: ('mlc', MLPClassifier(hidden_layer_sizes=(100, 100)))),
    "pipe2e1": (_WEIGHTS_FIRSTTEXT, (1, 1),
                lambda: ('mlc', MLPClassifier(hidden_layer_sizes=(100, 100, 50)))),
    "pipe2f": (_WEIGHTS_FIRSTTEXT, (1, 2),
               lambda: ('mlc', MLPClassifier(hidden_layer_sizes=(100, 100)))),
    "pipe2g": (_WEIGHTS_FIRSTTEXT, (1, 2),
               lambda: ('mlc', MLPClassifier(hidden_layer_sizes=(100, 100, 100)))),
}


def _text_branch(step_name, extractor, ngram_range=(1, 1)):
    """Return an extractor -> CountVectorizer -> TfidfTransformer sub-pipeline."""
    # (1, 1) is CountVectorizer's documented default, so passing it explicitly
    # matches the variants that were originally written as CountVectorizer().
    return Pipeline([
        (step_name, extractor),
        ('cv', CountVectorizer(ngram_range=ngram_range)),
        ('tfidf', TfidfTransformer()),
    ])


def _thread_union(weights, firsttext_ngrams=None):
    """Return the weighted FeatureUnion of subject/text/[firsttext]/envelope.

    The 'firsttext' branch is only added when `firsttext_ngrams` is given;
    it then sits between 'text' and 'envelope'.
    """
    branches = [
        ('subject', _text_branch('tse', ThreadSubjectExtractor())),
        ('text', _text_branch('tte', ThreadTextExtractor())),
    ]
    if firsttext_ngrams is not None:
        branches.append(
            ('firsttext', _text_branch('tte', ThreadFirstTextExtractor(),
                                       firsttext_ngrams)))
    branches.append(
        ('envelope', Pipeline([('tde', ThreadDictExtractor()),
                               ('dv', DictVectorizer())])))
    # Copy the weights so each pipeline owns its dict, as in the original
    # per-branch literals.
    return FeatureUnion(transformer_list=branches,
                        transformer_weights=dict(weights))


def build_pipe(pipe=b"pipe1"):
    """Return a new, unfitted scikit-learn Pipeline for the given pipe name.

    Known names:
      pipe1    ThreadDictExtractor -> DictVectorizer -> MultinomialNB
      pipe2    subject/text/envelope union (1 / 0.7 / 0.7) -> MultinomialNB
      pipe2b   same union as pipe2 -> MLPClassifier()
      pipe2c   subject/text/envelope union (1 / 1 / 0.4) -> MLPClassifier()
      pipe2d   subject/text/firsttext/envelope union (1.3 / 1 / 0.9 / 0.2)
               -> MLPClassifier()
      pipe2e   as pipe2d, MLP hidden layers (100, 100)
      pipe2e1  as pipe2d, MLP hidden layers (100, 100, 50)
      pipe2f   as pipe2e, with 1-2-grams on the firsttext branch
      pipe2g   as pipe2f, MLP hidden layers (100, 100, 100)

    Raises:
        ValueError: if `pipe` is not one of the names above.  The message
            now contains the offending name (the original left the "%s"
            placeholder unformatted).
    """
    if pipe == "pipe1":
        return Pipeline([
            ('tde', ThreadDictExtractor()),
            ('dv', DictVectorizer()),
            ('clf', MultinomialNB()),
        ])

    try:
        spec = _UNION_PIPES.get(pipe)
    except TypeError:
        # Unhashable values can never name a pipe; report them like any
        # other unknown name instead of leaking a TypeError.
        spec = None
    if spec is None:
        # (pipe,) keeps the formatting correct even if `pipe` is a tuple.
        raise ValueError("The pipe %s is not a valid pipe" % (pipe,))

    weights, firsttext_ngrams, make_classifier_step = spec
    return Pipeline([
        ('union', _thread_union(weights, firsttext_ngrams)),
        make_classifier_step(),
    ])
|
|
|
|
|
|
|