small fixes and improvements
@@ -2,3 +2,4 @@ from classifier import in_training, print_answers
 from classifier import get_pipe, test_pipe, get_training_threads
 from training import train_single_thread
 from classifier import store_training_data
+from prediction import predict_threads
@@ -9,14 +9,14 @@ from sklearn.model_selection import train_test_split
 import numpy as np
 import yaml
 from storage import MailThread,db_session
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, confusion_matrix


-with open("data.yml", 'r') as stream:
-    try:
-        train=yaml.load(stream)
-    except yaml.YAMLError as exc:
-        print(exc)
+#with open("data.yml", 'r') as stream:
+    # try:
+        # train=yaml.load(stream)
+    # except yaml.YAMLError as exc:
+        # print(exc)

 data_types= { "answered": bool, "maintopic": str, "lang": str}

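
Note: the confusion_matrix import added above is used further down in test_pipe next to accuracy_score. A minimal, self-contained sketch of what the two metrics report (the label arrays here are made up for illustration):

    from sklearn.metrics import accuracy_score, confusion_matrix

    y_test = [0, 1, 1, 0, 1]   # true labels
    y_pred = [0, 1, 0, 0, 1]   # classifier output
    print(accuracy_score(y_test, y_pred))    # fraction of correct predictions, 0.8 here
    print(confusion_matrix(y_test, y_pred))  # rows = true classes, columns = predicted classes
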
@@ -50,27 +50,36 @@ def get_training_threads(key="answered", filters=[]):
     d_a2=[]
     #------------------------------------
     if "db" in filters:
-        tt=db_session.query(MailThread).filter(MailThread.istrained==True).all()
+        q=db_session.query(MailThread).filter(MailThread.istrained.is_(True))
+        if "de" in filters:
+            q=q.filter(MailThread.lang=="de")
+        elif "en" in filters:
+            q=q.filter(MailThread.lang=="en")
+        tt=q.all()
         for t in tt:
             t_a.append(t)
             if key =="answered":
-                d_a.append(t.answered)
+                d_a.append(t.is_answered())
             elif key=="maintopic":
                 d_a.append(t.maintopic)
+            elif key=="lang":
+                d_a.append(t.lang)
     else:
-        for i in train:
-            if train[i].has_key(key): # the relevant key must be present in the training data
-                t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
-                if not t is None: # the thread must exist in the database
-                    t_a.append(t)
-                    d_a.append(train[i][key])
+        raise ValueError("Database Filter now required")
     le=LabelEncoder()
     d_a2=le.fit_transform(d_a)
     return (t_a,d_a2,le)


+    # else:
+        # for i in train:
+            # if train[i].has_key(key): # the relevant key must be present in the training data
+                # t=db_session.query(MailThread).filter(MailThread.firstmail==i).first#()
+                # if not t is None: # the thread must exist in the database
+                    # t_a.append(t)
+                    # d_a.append(train[i][key])


 def in_training(i, key="answered"):
     return train.has_key(i) and train[i].has_key(key)

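
Note: the rewritten "db" branch of get_training_threads builds a single SQLAlchemy query and narrows it step by step before executing it once. The same pattern in isolation, assuming the MailThread model and db_session from storage as imported above:

    from storage import MailThread, db_session

    q = db_session.query(MailThread).filter(MailThread.istrained.is_(True))  # emits an IS comparison instead of =
    q = q.filter(MailThread.lang == "de")   # optional narrowing, only applied when the "de" filter is set
    threads = q.all()                       # the query hits the database once, here
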
@@ -102,17 +111,24 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin):
     def transform(self, X,y=None):
         return [t.text() for t in X]

+class ThreadFirstTextExtractor(BaseEstimator, TransformerMixin):
+    def fit(self, x, y=None):
+        return self
+    def transform(self, X,y=None):
+        return [t.text("first") for t in X]
+
 def get_pipe(p=b"pipe1",k=b"answered",filters=[]):
     p=build_pipe(p)
     tt= get_training_threads(k,filters)
+    #print tt
     if len(tt[0]) > 0:
         p.fit(tt[0],tt[1])
         return p,tt[2]
     else:
         return None, None

-def test_pipe(pp,k):
-    tt= get_training_threads(k)
+def test_pipe(pp,k,f=[]):
+    tt= get_training_threads(k,f)
     X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.4)
     if type(pp) is list:
         for p in pp:
@@ -120,8 +136,9 @@ def test_pipe(pp,k):
             p=build_pipe(p)
             p.fit(X_train,y_train)
             ypred=p.predict(X_test)
+            print tt[2].classes_
             print accuracy_score(y_test,ypred)
+            print confusion_matrix(y_test,ypred)



@@ -173,6 +190,137 @@ def build_pipe(p=b"pipe1"):
                 } )),
             ('mlc', MLPClassifier())
         ])
+    elif p=="pipe2d":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('text', Pipeline([('tte',ThreadTextExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                    ('dv',DictVectorizer())
+                    ]))
+                ], transformer_weights={
+                    'subject': 1.3,
+                    'text': 1,
+                    'firsttext': 0.9,
+                    'envelope': 0.2
+                } )),
+            ('mlc', MLPClassifier())
+        ])
+
+    elif p=="pipe2e":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('text', Pipeline([('tte',ThreadTextExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                    ('dv',DictVectorizer())
+                    ]))
+                ], transformer_weights={
+                    'subject': 1.3,
+                    'text': 1,
+                    'firsttext': 0.9,
+                    'envelope': 0.2
+                } )),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
+        ])
+    elif p=="pipe2e1":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('text', Pipeline([('tte',ThreadTextExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
+                    ('cv',CountVectorizer()),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                    ('dv',DictVectorizer())
+                    ]))
+                ], transformer_weights={
+                    'subject': 1.3,
+                    'text': 1,
+                    'firsttext': 0.9,
+                    'envelope': 0.2
+                } )),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,50)))
+        ])
+    elif p=="pipe2f":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                    ('cv',CountVectorizer(ngram_range=(1,1))),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('text', Pipeline([('tte',ThreadTextExtractor()),
+                    ('cv',CountVectorizer(ngram_range=(1,1))),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
+                    ('cv',CountVectorizer(ngram_range=(1,2))),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                    ('dv',DictVectorizer())
+                    ]))
+                ], transformer_weights={
+                    'subject': 1.3,
+                    'text': 1,
+                    'firsttext': 0.9,
+                    'envelope': 0.2
+                } )),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100)))
+        ])
+    elif p=="pipe2g":
+        p = Pipeline([
+            ('union', FeatureUnion(transformer_list=[
+                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                    ('cv',CountVectorizer(ngram_range=(1,1))),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('text', Pipeline([('tte',ThreadTextExtractor()),
+                    ('cv',CountVectorizer(ngram_range=(1,1))),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('firsttext', Pipeline([('tte',ThreadFirstTextExtractor()),
+                    ('cv',CountVectorizer(ngram_range=(1,2))),
+                    ('tfidf', TfidfTransformer())
+                    ])),
+                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                    ('dv',DictVectorizer())
+                    ]))
+                ], transformer_weights={
+                    'subject': 1.3,
+                    'text': 1,
+                    'firsttext': 0.9,
+                    'envelope': 0.2
+                } )),
+            ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,100)))
+        ])
     elif p=="pipe2c":
         p = Pipeline([
             ('union', FeatureUnion(transformer_list=[
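
Note: the added pipe2d..pipe2g variants all follow the same FeatureUnion layout and differ only in CountVectorizer n-gram ranges and MLPClassifier layer sizes. A stripped-down two-branch sketch of that layout (the thread extractors are left out, so this version works directly on raw strings; branch names are illustrative):

    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.neural_network import MLPClassifier

    pipe = Pipeline([
        ('union', FeatureUnion(transformer_list=[
            ('unigrams', Pipeline([('cv', CountVectorizer(ngram_range=(1, 1))),
                                   ('tfidf', TfidfTransformer())])),
            ('bigrams', Pipeline([('cv', CountVectorizer(ngram_range=(1, 2))),
                                  ('tfidf', TfidfTransformer())])),
        ], transformer_weights={'unigrams': 1.3, 'bigrams': 1.0})),  # scales each branch before concatenation
        ('mlc', MLPClassifier(hidden_layer_sizes=(100, 100))),
    ])
    # usage: pipe.fit(list_of_texts, labels); pipe.predict(other_texts)
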
classifier/prediction.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe
+from storage import db_session, MailThread
+
+def predict_threads():
+    pipe1,le=get_pipe("pipe1",b"answered",["db"])
+    pipe2,le2=get_pipe("pipe2g", b"maintopic",["db"])
+    pipe3,le3=get_pipe("pipe2b", b"lang",["db"])
+    q=db_session.query(MailThread).filter(MailThread.istrained.op("IS NOT")(True))
+    mail_threads=q.all()
+    if len(mail_threads) ==0:
+        raise ValueError("no untrained threads found")
+    answered=le.inverse_transform(pipe1.predict(mail_threads))
+    maintopic=le2.inverse_transform(pipe2.predict(mail_threads))
+    lang=le3.inverse_transform(pipe3.predict(mail_threads))
+
+    for i, t in enumerate(mail_threads):
+        t.answered=bool(answered[i])
+        t.opened=bool(answered[i])
+        t.maintopic=str(maintopic[i])
+        t.lang=str(lang[i])
+        db_session.add(t)
+    db_session.commit()
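
Note: predict_threads depends on get_pipe returning the LabelEncoder that was fitted on the training labels, so the numeric classifier output can be mapped back to the stored string/boolean values. The round trip in isolation:

    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder()
    y = le.fit_transform(["studium", "information", "studium"])  # -> array([1, 0, 1])
    print(le.inverse_transform(y))                               # -> ['studium' 'information' 'studium']
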
@@ -13,10 +13,11 @@ package_directory = os.path.dirname(os.path.abspath(__file__))

 cfg = Config(file(os.path.join(package_directory, 'config.cfg')))

+maintopic_values=["studium", "information","ausleihen","jobausschreibung", "umfragen"]

 def render_index(mths,opened=None,code=200):
     return render_template("index.html",mths=mths,
-        title=cfg.title.decode("utf8"),opened=opened
+        title=cfg.title.decode("utf8"),opened=opened,maintopics=maintopic_values
         ), code
 from classifier import get_pipe
 #mail_threads=db_session.query(MailThread).all()
@@ -33,7 +34,6 @@ from classifier import get_pipe
 # t.maintopic=maintopic[i]
 # t.lang=lang[i]

-maintopic_values=["studium", "information","ausleihen"]

 @app.route("/")
 def hello():
@@ -49,10 +49,13 @@ def store_value(id,key,value):
         mth.opened=bool(value)
     if key=="maintopic" and value in maintopic_values:
         mth.maintopic=str(value)
+    if key=="lang" and value in maintopic_values:
+        mth.lang=str(value)
     if key =="trained":
         value = value in ["true", "True", "1", "t"]
         mth.istrained=bool(value)
+    db_session.add(mth)
+    db_session.commit()
 @app.route("/<int:id>")
 def store_answered(id):

@@ -60,6 +63,7 @@ def store_answered(id):
     value = request.args.get('value')
     if not key is None and not value is None:
         store_value(id,key,value)
+    mth=db_session.query(MailThread).filter(MailThread.firstmail==int(id)).first()

     return render_index([mth], opened=id)

@@ -73,6 +77,9 @@ def studium():

 @app.route("/<maintopic>/")
 def maintopic(maintopic):
-    mth=db_session.query(MailThread).filter(MailThread.maintopic=="%s" % maintopic).order_by(desc(MailThread.date)).all()
+    if maintopic == "trained":
+        mth=db_session.query(MailThread).filter(MailThread.istrained==True).order_by(desc(MailThread.date)).all()
+    else:
+        mth=db_session.query(MailThread).filter(MailThread.maintopic=="%s" % maintopic).order_by(desc(MailThread.date)).all()
     return render_index(mth)

@@ -1,3 +1,5 @@
+
+
 <html>
 <head>
 <title>{{title}}</title>
@@ -10,6 +12,9 @@
 .card.answ-1,.card.answ-True {
 background: lightgreen;
 }
+.card.trained-1,.card.trained-True {
+border: solid blue;
+}
 </style>
 <div class="container">
 <div class="row">
@@ -19,7 +24,7 @@
 <div id="accordion" role="tablist" aria-multiselectable="true">

 {% for m in mths %}
-<div class="card answ-{{m.is_answered()}}" style="padding-top: 2pt; padding-bottom:2pt; border-radius:0;margin-top:1pt; margin-bottom:1pt">
+<div class="card answ-{{m.is_answered()}} trained-{{m.istrained}}" style="padding-top: 2pt; padding-bottom:2pt; border-radius:0;margin-top:1pt; margin-bottom:1pt">
 <div class="" role="tab" id="heading{{m.firstmail}}">
 <b class="mb-0">
 <a data-toggle="collapse" data-parent="#accordion" href="#collapse{{m.firstmail}}" aria-expanded="true" aria-controls="collapse1">
@@ -31,22 +36,66 @@
 <div id="collapse{{m.firstmail}}" class="collapse {{'show' if m.firstmail==opened}}" role="tabpanel" aria-labelledby="headingOne">

 <div class="card-block">
-<a href="{{m.firstmail}}?key=answered&value={{(not m.is_answered())}}">answered:{{(not m.is_answered())}}</a>
-{{m.maintopic}}, {{ m.istrained }} <a href="{{m.firstmail}}?key=trained&value={{(not m.istrained)}}">trained:{{(not m.istrained)}}</a>
-<div style="white-space: pre-wrap;font:Courier, monospace; font-size:small; width:50em; border: thin blue solid;">
-{{ m.print_text() }}
-</div>
-</div>
-</div>
-</div>
-{% endfor %}
-</div>
-<div style="white-space: pre-wrap;font:Courier, monospace; font-size:small; width:50em">

-{% for m in mths %}
-ID: {{m.tstr()}}
-{{ m.print_text() }}
+<div class="nav">
+<div class="nav-item dropdown">
+<a class="nav-link dropdown-toggle" id="dropdownMenuLink1_{{m.firstmail}}" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false" href="#"> {% if m.is_answered() %}
+IS answered
+{% else %}
+ISNOT answered
+{% endif %}
+</a>
+<div class="dropdown-menu" arialabelledby="dropdownMenuLink1_{{m.firstmail}}">
+
+<a class="btn" href="{{m.firstmail}}?key=answered&value={{(not m.is_answered())}}">answered:{{(not m.is_answered())}}</a>
+</div>
+</div>
+
+<div class="dropdown nav-item">
+<a class="nav-link dropdown-toggle" id="dropdownMenuLink2_{{m.firstmail}}" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false" href="#"> {{m.maintopic}}</a>
+<div class="dropdown-menu" arialabelledby="dropdownMenuLink2_{{m.firstmail}}">
+{% for t in maintopics %}
+<a class="dropdown-item" href="{{m.firstmail}}?key=maintopic&value={{t}}">{{t}}</a>
 {% endfor %}
+</div>
+</div>
+
+<div class="nav-item dropdown">
+<a class="nav-link dropdown-toggle" id="dropdownMenuLink3_{{m.firstmail}}" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false" href="#"> {% if m.istrained %}
+IS trained
+{% else %}
+ISNOT trained
+{% endif %}
+</a>
+<div class="dropdown-menu" arialabelledby="dropdownMenuLink3_{{m.firstmail}}">
+<a class="btn" href="{{m.firstmail}}?key=trained&value={{(not m.istrained)}}">TRAINED:{{(not m.istrained)}}</a>
+</div>
+</div>
+
+<div class="nav-item dropdown">
+<a class="nav-link dropdown-toggle" id="dropdownMenuLink4_{{m.firstmail}}" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false" href="#"> {{ m.lang }}
+</a>
+<div class="dropdown-menu" arialabelledby="dropdownMenuLink3_{{m.firstmail}}">
+{% for l in ["de", "en"] %}
+
+<a class="btn" href="{{m.firstmail}}?key=lang&value={{l}}">{{(l)}}</a>
+{% endfor %}
+
+</div>
+</div>
+
+</div>
+</div>
+
+{% for txt in m.print_text() %}
+<div style="white-space: pre-wrap;font:Courier, monospace; font-size:small; width:50em; border: thin blue solid;">
+{{ txt }}
+</div>
+{% endfor %}
+</div>
+</div>
+{% endfor %}
+</div>

 </div>
 </div>
run.py (42 lines changed)
@@ -16,7 +16,8 @@ from storage import Mail, MailThread, db_session
 #import yaml
 #import email
 from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe, test_pipe, train_single_thread # , pipe2, pipe2b
+from classifier import predict_threads
+maintopic_values=["studium", "information","ausleihen","jobausschreibung", "umfragen"]

 def predict_thread(p,l,t):
     pre=p.predict([t])
@@ -29,13 +30,19 @@ if len(sys.argv)>1:

     if sys.argv[1] == "fetch_threads":
         print flatten_threads(fetch_threads())
+    if sys.argv[1] == "predict_threads2":
+        predict_threads()
     if sys.argv[1] == "predict_threads":
-        pipe1,le=get_pipe("pipe1",b"answered")
-        pipe2,le2=get_pipe("pipe2b", b"maintopic")
-        pipe3,le3=get_pipe("pipe2b", b"lang")
-        mail_threads=db_session.query(MailThread).filter(MailThread.istrained==False).all()
+        print "predicting threads"
+        pipe1,le=get_pipe("pipe1",b"answered",["db"])
+        pipe2,le2=get_pipe("pipe2g", b"maintopic",["db"])
+        pipe3,le3=get_pipe("pipe2b", b"lang",["db"])
+        q=db_session.query(MailThread).filter(MailThread.istrained.op("IS NOT")(True))

+        mail_threads=q.all()

+        if len(mail_threads) ==0:
+            raise ValueError("no untrained threads found")
         answered=le.inverse_transform(pipe1.predict(mail_threads))
         maintopic=le2.inverse_transform(pipe2.predict(mail_threads))
         lang=le3.inverse_transform(pipe3.predict(mail_threads))
@@ -48,6 +55,16 @@ if len(sys.argv)>1:
             t.lang=str(lang[i])
             db_session.add(t)
         db_session.commit()
+    if sys.argv[1]=="stats":
+        for topic in maintopic_values:
+            print topic
+            n_answ=db_session.query(MailThread).filter(MailThread.maintopic==topic).filter(MailThread.answered.op("IS")(True)).count()
+            n_nansw=db_session.query(MailThread).filter(MailThread.maintopic==topic).filter(MailThread.answered.op("IS NOT")(True)).count()
+            n_ges=db_session.query(MailThread).filter(MailThread.maintopic==topic).count()
+            print "%d answered and %d not answered of %d(%d) that are %d percent answerd" % (n_answ,n_nansw, n_ges,n_answ+n_nansw, float(n_answ)/float(n_ges)*100.0)
+
+
+

     if sys.argv[1] == "run_server":
         from flaskapp import app
@@ -72,7 +89,9 @@ if len(sys.argv)>1:
         print t.text()
     if sys.argv[1] == "compile_threads":
         mth=db_session.query(MailThread).all()
-        for t in mth:
+        l=len(mth)
+        for i,t in enumerate(mth):
+            print "%d/%d" % (i,l)
             t.compile()

     if sys.argv[1] == "trained_threads_from_yml":
@@ -115,9 +134,16 @@ if len(sys.argv)>1:
             print t.text()
             predict_thread(pb,lb,t)
             train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
+    if sys.argv[1] == "benchpipe3":
+        test_pipe(["pipe2d","pipe2e","pipe2e1","pipe2f","pipe2g"],"maintopic",["db","de"])

     if sys.argv[1] == "benchpipe2":
-        test_pipe(["pipe2","pipe2b","pipe2c"],"maintopic")
+        test_pipe(["pipe2","pipe2b","pipe2c","pipe2d"],"maintopic",["db","de"])
+        # print "testing with db training data:"
+        # test_pipe(["pipe2b"],"maintopic",["db"])
+        # test_pipe(["pipe2b"],"maintopic",["db"])
+        # print "testing only with german data"
+        # test_pipe(["pipe2b"],"maintopic",["db","de"])

     if sys.argv[1] == "testpipe2":
         from classifier import ThreadSubjectExtractor, ThreadTextExtractor
@@ -4,12 +4,17 @@ from datetime import date
 from config import Config
 f=file('config.cfg')
 cfg=Config(f)
-server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
-server.login(cfg.user, cfg.password)
-server.select_folder('INBOX')
+try:
+    server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
+    server.login(cfg.user, cfg.password)
+    server.select_folder('INBOX')
+except Error:
+    print "error initializing server"
+    server=None

 def fetch_mail(myid):
+    if server is None:
+        raise ValueError("Server is None")
     m=server.fetch([myid],['ENVELOPE','RFC822'])
     m=m[myid]
     m["id"]=myid
@@ -19,7 +24,9 @@ def fetch_thread(tp):
     return tp

 def fetch_threads():
-    src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,05,01)])
+    if server is None:
+        raise ValueError("Server is None")
+    src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,02,01)])
     #, b'BEFORE', date(2017,08,01)
     return src

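
Note: the guard added around the IMAP connection keeps the module importable when the mail server is unreachable and only fails once a fetch is actually attempted. The same idea in isolation (host and credentials are placeholders, and a generic Exception is caught here instead of the Error type used in the commit):

    import imapclient

    server = None
    try:
        server = imapclient.IMAPClient("imap.example.org", use_uid=True, ssl=True)  # hypothetical host
        server.login("user", "password")
        server.select_folder('INBOX')
    except Exception as exc:
        print("error initializing server: %s" % exc)

    def fetch_mail(myid):
        if server is None:
            raise ValueError("Server is None")
        return server.fetch([myid], ['ENVELOPE', 'RFC822'])
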
@@ -44,12 +51,17 @@ def store_threads(thrds):
         if th == None:
             th=MailThread()
             th.firstmail=t[0]
+            th.body=yaml.dump(t) # body e.g. (27422,27506), (27450,)
+            th.islabeled=False
+            th.opened=True
+            th.istrained=False
         elif not th.body == yaml.dump(t): # otherwise compare the body
             th.body=yaml.dump(t) # body e.g. (27422,27506), (27450,)
             th.islabeled=False
             th.opened=True
-        else:
-            th.body=yaml.dump(t)
+            th.istrained=False
+        # else:
+        # th.body=yaml.dump(t)
         db_session.add(th)
     db_session.commit()

@@ -9,6 +9,7 @@ import yaml
 import email
 from fetch_mail import fetch_mail
 import bs4
+import re
 class FullMailSchema(Schema):
     id=fields.Integer()
     text=fields.String()
@@ -56,6 +57,8 @@ class Mail(Base):
         return mm

     def get_email(self):
+        if self.body is None:
+            raise ValueError("body not yet loaded")
         em=email.message_from_string(yaml.load(self.body))
         return em

@@ -91,11 +94,10 @@ class Mail(Base):

     def dict_envelope(self):
         d={}
-        i=0
-        for p in yaml.load(self.subject):
-            if p is not None:
-                d["subject_"+str(i)]=p
-                i=i+1
+        if self.to_ is None:
+            self.compile_envelope()
+        if self.to_ is None:
+            raise ValueError("Self.to_ of mail not yet compiled")
         i=0
         for p in yaml.load(self.to_):
             if p["host"] is not None:
@@ -126,3 +128,19 @@ class Mail(Base):
             self.text= yaml.dump(b4.get_text())
         else:
             self.text =yaml.dump( pl)
+    def print_text(self):
+        txt=""
+        fr=yaml.load(self.from_)
+        txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(self.date) + "\n"
+        t=yaml.load(self.text)
+        if type(t) is unicode:
+            #txt=txt
+            txt=txt+t
+        else:
+            t=t.decode("ISO-8859-1")
+            txt=txt+t
+        txt=re.sub(r'\n\s*\n',r'\n',txt)
+        txt=re.sub(r'<!--.*-->',r'',txt,flags=re.MULTILINE|re.DOTALL)
+        txt=re.sub(r'\s*>+ .*\n',r'',txt)
+
+        return txt
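
Note: the new Mail.print_text cleans up the decoded body with three regular expressions before it is shown in the template. Their combined effect on a small sample string:

    import re

    txt = "Hello\n\n\n> old quoted line\n<!-- html comment -->\nregards\n"
    txt = re.sub(r'\n\s*\n', r'\n', txt)                                  # collapse runs of blank lines
    txt = re.sub(r'<!--.*-->', r'', txt, flags=re.MULTILINE | re.DOTALL)  # strip HTML comments
    txt = re.sub(r'\s*>+ .*\n', r'', txt)                                 # drop quoted "> ..." reply lines
    print(repr(txt))  # 'Hello\nregards\n'
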
@@ -73,7 +73,7 @@ class MailThread(Base):
         a=[]
         # print "maildicts: "+ str(self.mails())
         for m in self.mails():
-            m.compile_envelope()
+            # m.compile_envelope()
             a.append(m.dict_envelope())
         return a

@@ -85,6 +85,9 @@ class MailThread(Base):
         for i in range(0,len(dc)):
             for k, v in dc[i].iteritems():
                 d["mail_"+str(i)+"_"+k]=v
+        for k, v in dc[-1].iteritems():
+            d["mail_last_"+k]=v
+
         return d

     def subject(self):
@@ -103,30 +106,13 @@ class MailThread(Base):
         self.date=self.mails()[0].date

     def print_text(self,filter="all"):
-        a=u""
+        a=[]
-        def mail_txt(m):
-            #txt ="Gesendet von: "+ str(m.from_mailbox)+"@"+str(m.from_host) +"\n"
-            txt=""
-            fr=yaml.load(m.from_)
-            txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(m.date) + "\n"
-            t=yaml.load(m.text)
-            if type(t) is unicode:
-                #txt=txt
-                txt=txt+t
-            else:
-                t=t.decode("ISO-8859-1")
-                txt=txt+t
-            return txt
-
         if filter=="all":
             mm=self.mails()
             for m in mm:
-                a=a+mail_txt(m)+"\n****........................................***\n"
+                a.append(m.print_text())
         elif filter=="first":
-            a=mail_txt(m[0])
+            a.append(m[0].print_text())
-        a=re.sub(r'\n\s*\n',r'\n',a)
-        a=re.sub(r'<!--.*-->',r'',a,flags=re.MULTILINE|re.DOTALL)
-        a=re.sub(r'\s*>+ .*\n',r'',a)
         return a
     def text(self,filter="all"):
         a=u""
@@ -143,7 +129,7 @@ class MailThread(Base):
             for m in mm:
                 a=a+mail_txt(m)+"\n****........................................***\n"
         elif filter=="first":
-            a=mail_txt(m[0])
+            a=mail_txt(mm[0])
         a=re.sub(r'\n\s*\n',r'\n',a)
         a=re.sub(r'<!--.*-->',r'',a,flags=re.MULTILINE|re.DOTALL)
         a=re.sub(r'\s*>+ .*\n',r'',a)
test.sqlite (binary file not shown)