small fixes and improvements
This commit is contained in:
@@ -9,14 +9,14 @@ from sklearn.model_selection import train_test_split
|
||||
import numpy as np
|
||||
import yaml
|
||||
from storage import MailThread,db_session
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix
|
||||
|
||||
|
||||
# Load the YAML training data once at import time into the module-global
# `train` mapping (firstmail-id -> label dict), used by the functions below.
with open("data.yml", 'r') as stream:
    try:
        # yaml.load() without an explicit Loader is deprecated and can
        # execute arbitrary constructors on untrusted input; safe_load
        # restricts parsing to plain YAML tags and is sufficient here.
        train = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Best-effort: report the parse error and leave `train` unset,
        # matching the original behavior.
        print(exc)
|
||||
#with open("data.yml", 'r') as stream:
|
||||
# try:
|
||||
# train=yaml.load(stream)
|
||||
# except yaml.YAMLError as exc:
|
||||
# print(exc)
|
||||
|
||||
# Expected Python type for each supported training-label key; "answered"
# is a boolean flag, "maintopic" and "lang" are free-form strings.
data_types= { "answered": bool, "maintopic": str, "lang": str}
|
||||
|
||||
@@ -50,25 +50,34 @@ def get_training_threads(key="answered", filters=[]):
|
||||
d_a2=[]
|
||||
#------------------------------------
|
||||
if "db" in filters:
|
||||
tt=db_session.query(MailThread).filter(MailThread.istrained==True).all()
|
||||
q=db_session.query(MailThread).filter(MailThread.istrained.is_(True))
|
||||
if "de" in filters:
|
||||
q=q.filter(MailThread.lang=="de")
|
||||
elif "en" in filters:
|
||||
q=q.filter(MailThread.lang=="en")
|
||||
tt=q.all()
|
||||
for t in tt:
|
||||
t_a.append(t)
|
||||
if key =="answered":
|
||||
d_a.append(t.answered)
|
||||
d_a.append(t.is_answered())
|
||||
elif key=="maintopic":
|
||||
d_a.append(t.maintopic)
|
||||
|
||||
|
||||
elif key=="lang":
|
||||
d_a.append(t.lang)
|
||||
else:
|
||||
for i in train:
|
||||
if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
|
||||
t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
|
||||
if not t is None: # Thread muss in der Datenbank sein
|
||||
t_a.append(t)
|
||||
d_a.append(train[i][key])
|
||||
raise ValueError("Database Filter now required")
|
||||
le=LabelEncoder()
|
||||
d_a2=le.fit_transform(d_a)
|
||||
return (t_a,d_a2,le)
|
||||
|
||||
|
||||
# else:
|
||||
# for i in train:
|
||||
# if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
|
||||
# t=db_session.query(MailThread).filter(MailThread.firstmail==i).first#()
|
||||
# if not t is None: # Thread muss in der Datenbank sein
|
||||
# t_a.append(t)
|
||||
# d_a.append(train[i][key])
|
||||
|
||||
|
||||
def in_training(i, key="answered"):
|
||||
@@ -102,17 +111,24 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin):
|
||||
def transform(self, X, y=None):
    """Return the full text of every thread in X (y is ignored)."""
    texts = []
    for thread in X:
        texts.append(thread.text())
    return texts
|
||||
|
||||
class ThreadFirstTextExtractor(BaseEstimator, TransformerMixin):
    """Feature extractor yielding only the first mail's text per thread."""

    def fit(self, x, y=None):
        # Stateless transformer: there is nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        """Return the text of the first message of each thread in X."""
        first_texts = []
        for thread in X:
            first_texts.append(thread.text("first"))
        return first_texts
|
||||
|
||||
def get_pipe(p=b"pipe1", k=b"answered", filters=None):
    """Build pipeline `p` and fit it on the training threads for key `k`.

    p       -- pipeline identifier passed to build_pipe()
    k       -- label key passed to get_training_threads()
    filters -- optional list of filter tags (e.g. "db", "de", "en");
               defaults to no filtering.

    Returns (fitted_pipeline, label_encoder), or (None, None) when no
    training data matched the filters.
    """
    # Avoid the shared mutable default-argument pitfall (`filters=[]`
    # would be one list reused across all calls).
    if filters is None:
        filters = []
    # Use a distinct name for the built pipeline instead of shadowing
    # the identifier parameter `p`.
    pipe = build_pipe(p)
    threads, labels, encoder = get_training_threads(k, filters)
    if len(threads) > 0:
        pipe.fit(threads, labels)
        return pipe, encoder
    else:
        return None, None
|
||||
|
||||
def test_pipe(pp,k):
|
||||
tt= get_training_threads(k)
|
||||
def test_pipe(pp,k,f=[]):
|
||||
tt= get_training_threads(k,f)
|
||||
X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.4)
|
||||
if type(pp) is list:
|
||||
for p in pp:
|
||||
@@ -120,8 +136,9 @@ def test_pipe(pp,k):
|
||||
p=build_pipe(p)
|
||||
p.fit(X_train,y_train)
|
||||
ypred=p.predict(X_test)
|
||||
print tt[2].classes_
|
||||
print accuracy_score(y_test,ypred)
|
||||
|
||||
print confusion_matrix(y_test,ypred)
|
||||
|
||||
|
||||
|
||||
@@ -173,6 +190,137 @@ def build_pipe(p=b"pipe1"):
|
||||
} )),
|
||||
('mlc', MLPClassifier())
|
||||
])
|
||||
elif p=="pipe2d":
|
||||
p = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('text', Pipeline([('tte',ThreadTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('envelope', Pipeline([('tde', ThreadDictExtractor()),
|
||||
('dv',DictVectorizer())
|
||||
]))
|
||||
], transformer_weights={
|
||||
'subject': 1.3,
|
||||
'text': 1,
|
||||
'firsttext': 0.9,
|
||||
'envelope': 0.2
|
||||
} )),
|
||||
('mlc', MLPClassifier())
|
||||
])
|
||||
|
||||
elif p=="pipe2e":
|
||||
p = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('text', Pipeline([('tte',ThreadTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('envelope', Pipeline([('tde', ThreadDictExtractor()),
|
||||
('dv',DictVectorizer())
|
||||
]))
|
||||
], transformer_weights={
|
||||
'subject': 1.3,
|
||||
'text': 1,
|
||||
'firsttext': 0.9,
|
||||
'envelope': 0.2
|
||||
} )),
|
||||
('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
|
||||
])
|
||||
elif p=="pipe2e1":
|
||||
p = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('text', Pipeline([('tte',ThreadTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('envelope', Pipeline([('tde', ThreadDictExtractor()),
|
||||
('dv',DictVectorizer())
|
||||
]))
|
||||
], transformer_weights={
|
||||
'subject': 1.3,
|
||||
'text': 1,
|
||||
'firsttext': 0.9,
|
||||
'envelope': 0.2
|
||||
} )),
|
||||
('mlc', MLPClassifier(hidden_layer_sizes=(100,100,50)))
|
||||
])
|
||||
elif p=="pipe2f":
|
||||
p = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
|
||||
('cv',CountVectorizer(ngram_range=(1,1))),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('text', Pipeline([('tte',ThreadTextExtractor()),
|
||||
('cv',CountVectorizer(ngram_range=(1,1))),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
|
||||
('cv',CountVectorizer(ngram_range=(1,2))),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('envelope', Pipeline([('tde', ThreadDictExtractor()),
|
||||
('dv',DictVectorizer())
|
||||
]))
|
||||
], transformer_weights={
|
||||
'subject': 1.3,
|
||||
'text': 1,
|
||||
'firsttext': 0.9,
|
||||
'envelope': 0.2
|
||||
} )),
|
||||
('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
|
||||
])
|
||||
elif p=="pipe2g":
|
||||
p = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
|
||||
('cv',CountVectorizer(ngram_range=(1,1))),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('text', Pipeline([('tte',ThreadTextExtractor()),
|
||||
('cv',CountVectorizer(ngram_range=(1,1))),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
|
||||
('cv',CountVectorizer(ngram_range=(1,2))),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('envelope', Pipeline([('tde', ThreadDictExtractor()),
|
||||
('dv',DictVectorizer())
|
||||
]))
|
||||
], transformer_weights={
|
||||
'subject': 1.3,
|
||||
'text': 1,
|
||||
'firsttext': 0.9,
|
||||
'envelope': 0.2
|
||||
} )),
|
||||
('mlc', MLPClassifier(hidden_layer_sizes=(100,100,100)))
|
||||
])
|
||||
elif p=="pipe2c":
|
||||
p = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
|
||||
Reference in New Issue
Block a user