added flask interface

This commit is contained in:
Andreas Stephanides
2017-08-06 10:16:30 +02:00
parent 4060a77c48
commit ff0bdc6d3b
23 changed files with 14913 additions and 63 deletions

View File

@@ -5,7 +5,7 @@ from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import yaml
from storage import MailThread,db_session
@@ -26,7 +26,7 @@ def store_training_data(i, d,key=b"answered"):
train[i]={}
if not key is None and type(train[i]) is dict:
if not type(d) is data_types[key]:
# print str(type(d)) + " vs " + str(data_types[key])
# print str(type(d)) + " vs " + str(data_types[key])
raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
train[i][key]=d
@@ -47,8 +47,8 @@ def get_training_threads(key="answered"):
if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
t_a.append(t)
d_a.append(train[i][key])
le=LabelEncoder()
d_a2=le.fit_transform(d_a)
le=LabelEncoder()
d_a2=le.fit_transform(d_a)
return (t_a,d_a2,le)
@@ -83,46 +83,102 @@ class ThreadTextExtractor(BaseEstimator, TransformerMixin):
return [t.text() for t in X]
pipe1=Pipeline([('tde', ThreadDictExtractor()),('dv',DictVectorizer()),('clf', MultinomialNB())])
pipe2 = Pipeline([
('union', FeatureUnion(transformer_list=[
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('text', Pipeline([('tte',ThreadTextExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('envelope', Pipeline([('tde', ThreadDictExtractor()),
('dv',DictVectorizer())
]))
], transformer_weights={
'subject': 1,
'text': 0.7,
'envelope': 0.7
} )),
('clf', MultinomialNB())
])
pipe2b = Pipeline([
('union', FeatureUnion(transformer_list=[
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
def build_pipe(p=b"pipe1"):
if p == "pipe1":
p=Pipeline([('tde', ThreadDictExtractor()),
('dv',DictVectorizer()),
('clf', MultinomialNB())
])
elif p=="pipe2":
p = Pipeline([
('union', FeatureUnion(transformer_list=[
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('text', Pipeline([('tte',ThreadTextExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('envelope', Pipeline([('tde', ThreadDictExtractor()),
('dv',DictVectorizer())
]))
], transformer_weights={
'subject': 1,
'text': 0.7,
'envelope': 0.7
} )),
('clf', MultinomialNB())
])
elif p=="pipe2b":
p = Pipeline([
('union', FeatureUnion(transformer_list=[
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('text', Pipeline([('tte',ThreadTextExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
('text', Pipeline([('tte',ThreadTextExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('envelope', Pipeline([('tde', ThreadDictExtractor()),
('dv',DictVectorizer())
]))
], transformer_weights={
'subject': 1,
'text': 0.7,
'envelope': 0.7
} )),
('mlc', MLPClassifier())
])
elif p=="pipe2c":
p = Pipeline([
('union', FeatureUnion(transformer_list=[
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('envelope', Pipeline([('tde', ThreadDictExtractor()),
('dv',DictVectorizer())
]))
], transformer_weights={
'subject': 1,
'text': 0.7,
'envelope': 0.7
} )),
('mlc', MLPClassifier())
])
('text', Pipeline([('tte',ThreadTextExtractor()),
('cv',CountVectorizer()),
('tfidf', TfidfTransformer())
])),
('envelope', Pipeline([('tde', ThreadDictExtractor()),
('dv',DictVectorizer())
]))
], transformer_weights={
'subject': 1,
'text': 1,
'envelope': 0.4
} )),
('mlc', MLPClassifier())
])
else:
raise ValueError("The pipe %s is not a valid pipe")
return p
def get_pipe(p=b"pipe1",k=b"answered"):
p=build_pipe(p)
tt= get_training_threads(k)
p.fit(tt[0],tt[1])
return p,tt[2]
from sklearn.metrics import accuracy_score
def test_pipe(pp,k):
tt= get_training_threads(k)
X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.2)
if type(pp) is list:
for p in pp:
print "pipe: %s" % p
p=build_pipe(p)
p.fit(X_train,y_train)
ypred=p.predict(X_test)
print accuracy_score(y_test,ypred)
#pipe1=get_pipe("pipe1", "answered")
#pipe2=get_pipe("pipe2", "maintopic")
#pipe2b=get_pipe("pipe2b", "maintopic")

View File

@@ -1,4 +1,5 @@
{27070: {maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
{26808: {maintopic: jobausschreibung}, 27017: {maintopic: jobausschreibung}, 27070: {
maintopic: ausleihen}, 27083: {maintopic: ausleihen}, 27086: {maintopic: information},
27094: {maintopic: information}, 27096: {maintopic: jobausschreibung}, 27102: {
maintopic: studium}, 27118: {maintopic: information}, 27127: {maintopic: studium},
27130: {maintopic: information}, 27133: {maintopic: information}, 27141: {maintopic: information},
@@ -8,7 +9,7 @@
27222: {maintopic: information}, 27226: {maintopic: ausleihen}, 27420: {answered: true,
maintopic: studium}, 27422: {answered: true, maintopic: studium}, 27425: {answered: false,
maintopic: studium}, 27431: {answered: false, maintopic: information}, 27434: {
answered: false, maintopic: studium}, 27435: {answered: false}, 27438: {answered: false,
answered: false, maintopic: information}, 27435: {answered: false}, 27438: {answered: false,
maintopic: information}, 27439: {answered: true, maintopic: studium}, 27441: {
answered: false, maintopic: studium}, 27444: {answered: true, maintopic: ausleihen},
27454: {answered: false, maintopic: information}, 27455: {answered: false, maintopic: information},

34
flaskapp/__init__.py Normal file
View File

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
import flask
from flask import Flask,jsonify,send_from_directory, render_template
from config import Config
import yaml
import os
from storage import MailThread,db_session
app = Flask(__name__, template_folder="templates", static_folder="static")
package_directory = os.path.dirname(os.path.abspath(__file__))
cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
def render_index(mths,code=200):
return render_template("index.html",mths=mths,
title=cfg.title.decode("utf8"),
), code
from classifier import get_pipe
mail_threads=db_session.query(MailThread).all()
pipe1,le=get_pipe("pipe1",b"answered")
pipe2,le2=get_pipe("pipe2b", b"maintopic")
answered=le.inverse_transform(pipe1.predict(mail_threads))
maintopic=le2.inverse_transform(pipe2.predict(mail_threads))
for i, t in enumerate(mail_threads):
t.answered=answered[i]
t.maintopic=maintopic[i]
@app.route("/")
def hello():
mth=mail_threads
return render_index(mth)

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,459 @@
/*! normalize.css v5.0.0 | MIT License | github.com/necolas/normalize.css */
html {
font-family: sans-serif;
line-height: 1.15;
-ms-text-size-adjust: 100%;
-webkit-text-size-adjust: 100%;
}
body {
margin: 0;
}
article,
aside,
footer,
header,
nav,
section {
display: block;
}
h1 {
font-size: 2em;
margin: 0.67em 0;
}
figcaption,
figure,
main {
display: block;
}
figure {
margin: 1em 40px;
}
hr {
-webkit-box-sizing: content-box;
box-sizing: content-box;
height: 0;
overflow: visible;
}
pre {
font-family: monospace, monospace;
font-size: 1em;
}
a {
background-color: transparent;
-webkit-text-decoration-skip: objects;
}
a:active,
a:hover {
outline-width: 0;
}
abbr[title] {
border-bottom: none;
text-decoration: underline;
text-decoration: underline dotted;
}
b,
strong {
font-weight: inherit;
}
b,
strong {
font-weight: bolder;
}
code,
kbd,
samp {
font-family: monospace, monospace;
font-size: 1em;
}
dfn {
font-style: italic;
}
mark {
background-color: #ff0;
color: #000;
}
small {
font-size: 80%;
}
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
audio,
video {
display: inline-block;
}
audio:not([controls]) {
display: none;
height: 0;
}
img {
border-style: none;
}
svg:not(:root) {
overflow: hidden;
}
button,
input,
optgroup,
select,
textarea {
font-family: sans-serif;
font-size: 100%;
line-height: 1.15;
margin: 0;
}
button,
input {
overflow: visible;
}
button,
select {
text-transform: none;
}
button,
html [type="button"],
[type="reset"],
[type="submit"] {
-webkit-appearance: button;
}
button::-moz-focus-inner,
[type="button"]::-moz-focus-inner,
[type="reset"]::-moz-focus-inner,
[type="submit"]::-moz-focus-inner {
border-style: none;
padding: 0;
}
button:-moz-focusring,
[type="button"]:-moz-focusring,
[type="reset"]:-moz-focusring,
[type="submit"]:-moz-focusring {
outline: 1px dotted ButtonText;
}
fieldset {
border: 1px solid #c0c0c0;
margin: 0 2px;
padding: 0.35em 0.625em 0.75em;
}
legend {
-webkit-box-sizing: border-box;
box-sizing: border-box;
color: inherit;
display: table;
max-width: 100%;
padding: 0;
white-space: normal;
}
progress {
display: inline-block;
vertical-align: baseline;
}
textarea {
overflow: auto;
}
[type="checkbox"],
[type="radio"] {
-webkit-box-sizing: border-box;
box-sizing: border-box;
padding: 0;
}
[type="number"]::-webkit-inner-spin-button,
[type="number"]::-webkit-outer-spin-button {
height: auto;
}
[type="search"] {
-webkit-appearance: textfield;
outline-offset: -2px;
}
[type="search"]::-webkit-search-cancel-button,
[type="search"]::-webkit-search-decoration {
-webkit-appearance: none;
}
::-webkit-file-upload-button {
-webkit-appearance: button;
font: inherit;
}
details,
menu {
display: block;
}
summary {
display: list-item;
}
canvas {
display: inline-block;
}
template {
display: none;
}
[hidden] {
display: none;
}
html {
-webkit-box-sizing: border-box;
box-sizing: border-box;
}
*,
*::before,
*::after {
-webkit-box-sizing: inherit;
box-sizing: inherit;
}
@-ms-viewport {
width: device-width;
}
html {
-ms-overflow-style: scrollbar;
-webkit-tap-highlight-color: transparent;
}
body {
font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
font-size: 1rem;
font-weight: normal;
line-height: 1.5;
color: #292b2c;
background-color: #fff;
}
[tabindex="-1"]:focus {
outline: none !important;
}
h1, h2, h3, h4, h5, h6 {
margin-top: 0;
margin-bottom: .5rem;
}
p {
margin-top: 0;
margin-bottom: 1rem;
}
abbr[title],
abbr[data-original-title] {
cursor: help;
}
address {
margin-bottom: 1rem;
font-style: normal;
line-height: inherit;
}
ol,
ul,
dl {
margin-top: 0;
margin-bottom: 1rem;
}
ol ol,
ul ul,
ol ul,
ul ol {
margin-bottom: 0;
}
dt {
font-weight: bold;
}
dd {
margin-bottom: .5rem;
margin-left: 0;
}
blockquote {
margin: 0 0 1rem;
}
a {
color: #0275d8;
text-decoration: none;
}
a:focus, a:hover {
color: #014c8c;
text-decoration: underline;
}
a:not([href]):not([tabindex]) {
color: inherit;
text-decoration: none;
}
a:not([href]):not([tabindex]):focus, a:not([href]):not([tabindex]):hover {
color: inherit;
text-decoration: none;
}
a:not([href]):not([tabindex]):focus {
outline: 0;
}
pre {
margin-top: 0;
margin-bottom: 1rem;
overflow: auto;
}
figure {
margin: 0 0 1rem;
}
img {
vertical-align: middle;
}
[role="button"] {
cursor: pointer;
}
a,
area,
button,
[role="button"],
input,
label,
select,
summary,
textarea {
-ms-touch-action: manipulation;
touch-action: manipulation;
}
table {
border-collapse: collapse;
background-color: transparent;
}
caption {
padding-top: 0.75rem;
padding-bottom: 0.75rem;
color: #636c72;
text-align: left;
caption-side: bottom;
}
th {
text-align: left;
}
label {
display: inline-block;
margin-bottom: .5rem;
}
button:focus {
outline: 1px dotted;
outline: 5px auto -webkit-focus-ring-color;
}
input,
button,
select,
textarea {
line-height: inherit;
}
input[type="radio"]:disabled,
input[type="checkbox"]:disabled {
cursor: not-allowed;
}
input[type="date"],
input[type="time"],
input[type="datetime-local"],
input[type="month"] {
-webkit-appearance: listbox;
}
textarea {
resize: vertical;
}
fieldset {
min-width: 0;
padding: 0;
margin: 0;
border: 0;
}
legend {
display: block;
width: 100%;
padding: 0;
margin-bottom: .5rem;
font-size: 1.5rem;
line-height: inherit;
}
input[type="search"] {
-webkit-appearance: none;
}
output {
display: inline-block;
}
[hidden] {
display: none !important;
}
/*# sourceMappingURL=bootstrap-reboot.css.map */

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1 @@
/*! normalize.css v5.0.0 | MIT License | github.com/necolas/normalize.css */html{font-family:sans-serif;line-height:1.15;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%}body{margin:0}article,aside,footer,header,nav,section{display:block}h1{font-size:2em;margin:.67em 0}figcaption,figure,main{display:block}figure{margin:1em 40px}hr{-webkit-box-sizing:content-box;box-sizing:content-box;height:0;overflow:visible}pre{font-family:monospace,monospace;font-size:1em}a{background-color:transparent;-webkit-text-decoration-skip:objects}a:active,a:hover{outline-width:0}abbr[title]{border-bottom:none;text-decoration:underline;text-decoration:underline dotted}b,strong{font-weight:inherit}b,strong{font-weight:bolder}code,kbd,samp{font-family:monospace,monospace;font-size:1em}dfn{font-style:italic}mark{background-color:#ff0;color:#000}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}audio,video{display:inline-block}audio:not([controls]){display:none;height:0}img{border-style:none}svg:not(:root){overflow:hidden}button,input,optgroup,select,textarea{font-family:sans-serif;font-size:100%;line-height:1.15;margin:0}button,input{overflow:visible}button,select{text-transform:none}[type=reset],[type=submit],button,html [type=button]{-webkit-appearance:button}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{border-style:none;padding:0}[type=button]:-moz-focusring,[type=reset]:-moz-focusring,[type=submit]:-moz-focusring,button:-moz-focusring{outline:1px dotted ButtonText}fieldset{border:1px solid silver;margin:0 2px;padding:.35em .625em .75em}legend{-webkit-box-sizing:border-box;box-sizing:border-box;color:inherit;display:table;max-width:100%;padding:0;white-space:normal}progress{display:inline-block;vertical-align:baseline}textarea{overflow:auto}[type=checkbox],[type=radio]{-webkit-box-sizing:border-box;box-sizing:border-box;padding:0}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}details,menu{display:block}summary{display:list-item}canvas{display:inline-block}template{display:none}[hidden]{display:none}html{-webkit-box-sizing:border-box;box-sizing:border-box}*,::after,::before{-webkit-box-sizing:inherit;box-sizing:inherit}@-ms-viewport{width:device-width}html{-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:transparent}body{font-family:-apple-system,system-ui,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,sans-serif;font-size:1rem;font-weight:400;line-height:1.5;color:#292b2c;background-color:#fff}[tabindex="-1"]:focus{outline:0!important}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{cursor:help}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}a{color:#0275d8;text-decoration:none}a:focus,a:hover{color:#014c8c;text-decoration:underline}a:not([href]):not([tabindex]){color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus,a:not([href]):not([tabindex]):hover{color:inherit;text-decoration:none}a:not([href]):not([tabindex]):focus{outline:0}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle}[role=button]{cursor:pointer}[role=button],a,area,button,input,label,select,summary,textarea{-ms-touch-action:manipulation;touch-action:manipulation}table{border-collapse:collapse;background-color:transparent}caption{padding-top:.75rem;padding-bottom:.75rem;color:#636c72;text-align:left;caption-side:bottom}th{text-align:left}label{display:inline-block;margin-bottom:.5rem}button:focus{outline:1px dotted;outline:5px auto -webkit-focus-ring-color}button,input,select,textarea{line-height:inherit}input[type=checkbox]:disabled,input[type=radio]:disabled{cursor:not-allowed}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit}input[type=search]{-webkit-appearance:none}output{display:inline-block}[hidden]{display:none!important}/*# sourceMappingURL=bootstrap-reboot.min.css.map */

View File

@@ -0,0 +1 @@
{"version":3,"sources":["../../scss/_normalize.scss","bootstrap-reboot.css","../../scss/_reboot.scss","../../scss/_variables.scss","../../scss/mixins/_hover.scss"],"names":[],"mappings":"4EAYA,KACE,YAAA,WACA,YAAA,KACA,qBAAA,KACA,yBAAA,KAUF,KACE,OAAA,EAOF,QAAA,MAAA,OAAA,OAAA,IAAA,QAME,QAAA,MAQF,GACE,UAAA,IACA,OAAA,MAAA,EAWF,WAAA,OAAA,KAGE,QAAA,MAOF,OACE,OAAA,IAAA,KAQF,GACE,mBAAA,YAAA,WAAA,YACA,OAAA,EACA,SAAA,QAQF,IACE,YAAA,UAAA,UACA,UAAA,IAWF,EACE,iBAAA,YACA,6BAAA,QAQF,SAAA,QAEE,cAAA,EAQF,YACE,cAAA,KACA,gBAAA,UACA,gBAAA,UAAA,OAOF,EAAA,OAEE,YAAA,QAOF,EAAA,OAEE,YAAA,OAQF,KAAA,IAAA,KAGE,YAAA,UAAA,UACA,UAAA,IAOF,IACE,WAAA,OAOF,KACE,iBAAA,KACA,MAAA,KAOF,MACE,UAAA,IAQF,IAAA,IAEE,UAAA,IACA,YAAA,EACA,SAAA,SACA,eAAA,SAGF,IACE,OAAA,OAGF,IACE,IAAA,MAUF,MAAA,MAEE,QAAA,aAOF,sBACE,QAAA,KACA,OAAA,EAOF,IACE,aAAA,KAOF,eACE,SAAA,OAWF,OAAA,MAAA,SAAA,OAAA,SAKE,YAAA,WACA,UAAA,KACA,YAAA,KACA,OAAA,EAQF,OAAA,MAEE,SAAA,QAQF,OAAA,OAEE,eAAA,KASF,aAAA,cAAA,OAAA,mBAIE,mBAAA,OAOF,gCAAA,+BAAA,gCAAA,yBAIE,aAAA,KACA,QAAA,EAOF,6BAAA,4BAAA,6BAAA,sBAIE,QAAA,IAAA,OAAA,WAOF,SACE,OAAA,IAAA,MAAA,OACA,OAAA,EAAA,IACA,QAAA,MAAA,OAAA,MAUF,OACE,mBAAA,WAAA,WAAA,WACA,MAAA,QACA,QAAA,MACA,UAAA,KACA,QAAA,EACA,YAAA,OAQF,SACE,QAAA,aACA,eAAA,SAOF,SACE,SAAA,KCrKF,gBAAA,aD+KE,mBAAA,WAAA,WAAA,WACA,QAAA,EC1KF,yCAAA,yCDmLE,OAAA,KC9KF,cDuLE,mBAAA,UACA,eAAA,KCnLF,4CAAA,yCD4LE,mBAAA,KAQF,6BACE,mBAAA,OACA,KAAA,QAWF,QAAA,KAEE,QAAA,MAOF,QACE,QAAA,UAUF,OACE,QAAA,aAOF,SACE,QAAA,KCnNF,SD8NE,QAAA,KEtbF,KACE,mBAAA,WAAA,WAAA,WAGF,EAAA,QAAA,SAGE,mBAAA,QAAA,WAAA,QAoBA,cAAgB,MAAA,aAQlB,KAYE,mBAAA,UAGA,4BAAA,YAGF,KACE,YAAA,cAAA,UAAA,mBAAA,WAAA,OC2K4H,iBD3K5H,MAAA,WACA,UAAA,KACA,YAAA,IACA,YAAA,IAEA,MAAA,QAEA,iBAAA,KD2LF,sBClLE,QAAA,YAYF,GAAI,GAAI,GAAI,GAAI,GAAI,GAClB,WAAA,EACA,cAAA,MAOF,EACE,WAAA,EACA,cAAA,KAIF,0BAAA,YAGE,OAAA,KAGF,QACE,cAAA,KACA,WAAA,OACA,YAAA,QAGF,GAAA,GAAA,GAGE,WAAA,EACA,cAAA,KAGF,MAAA,MAAA,MAAA,MAIE,cAAA,EAGF,GACE,YAAA,IAGF,GACE,cAAA,MACA,YAAA,EAGF,WACE,OAAA,EAAA,EAAA,KAQF,EACE,MAAA,QACA,gBAAA,KEhJE,QAAA,QFmJA,MAAA,QACA,gBAAA,UAUJ,8BACE,MAAA,QACA,gBAAA,KEhKE,oCAAA,oCFmKA,MAAA,QACA,gBAAA,KANJ,oCAUI,QAAA,EASJ,IAEE,WAAA,EAEA,cAAA,KAEA,SAAA,KAQF,OAGE,OAAA,EAAA,EAAA,KAQF,IAGE,eAAA,ODsIF,cCzHE,OAAA,QAcF,cAAA,EAAA,KAAA,OAAA,MAAA,MAAA,OAAA,QAAA,SASE,iBAAA,aAAA,aAAA,aAQF,MAEE,gBAAA,SAEA,iBAAA,YAGF,QACE,YAAA,OACA,eAAA,OACA,MAAA,QACA,WAAA,KACA,aAAA,OAGF,GAEE,WAAA,KAQF,MAEE,QAAA,aACA,cAAA,MAOF,aACE,QAAA,IAAA,OACA,QAAA,IAAA,KAAA,yBAGF,OAAA,MAAA,OAAA,SAME,YAAA,QAGF,8BAAA,2BAMI,OAAA,YAKJ,iBAAA,iBAAA,2BAAA,kBASE,mBAAA,QAGF,SAEE,OAAA,SAGF,SAME,UAAA,EAEA,QAAA,EACA,OAAA,EACA,OAAA,EAGF,OAEE,QAAA,MACA,MAAA,KACA,QAAA,EACA,cAAA,MACA,UAAA,OACA,YAAA,QAGF,mBAKE,mBAAA,KAIF,OACE,QAAA,aDsEF,SC9DE,QAAA"}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

3535
flaskapp/static/bootstrap/js/bootstrap.js vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

4
flaskapp/static/jquery-3.2.0.min.js vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,46 @@
<html>
<head>
<title>{{title}}</title>
<script src="static/jquery-3.2.0.min.js" ></script>
<link rel="stylesheet" href="static/bootstrap/css/bootstrap.min.css"/>
<script src="static/bootstrap/js/bootstrap.min.js" ></script>
</head>
<body>
<div class="container">
<div class="row">
<div class="col-12">
<h1>{{title}}</h1>
<div id="accordion" role="tablist" aria-multiselectable="true">
{% for m in mths %}
<div class="card" style="padding-top: 2pt; padding-bottom:2pt; border-radius:0;margin-top:1pt; margin-bottom:1pt">
<div class="" role="tab" id="heading{{m.firstmail}}">
<b class="mb-0">
<a data-toggle="collapse" data-parent="#accordion" href="#collapse{{m.firstmail}}" aria-expanded="true" aria-controls="collapse1">
{{m.tstr()}}
</a>
</b>
</div>
<div id="collapse{{m.firstmail}}" class="collapse" role="tabpanel" aria-labelledby="headingOne">
<div class="card-block">
<div style="white-space: pre-wrap;font:Courier, monospace; font-size:small; width:50em; border: thin blue solid;"> {{ m.text() }} </div>
</div>
</div>
</div>
{% endfor %}
</div>
<div style="white-space: pre-wrap;font:Courier, monospace; font-size:small; width:50em">
{% for m in mths %}
ID: {{m.tstr()}}
{{ m.text() }}
{% endfor %}
</div>
</div>
</div>
</div>
</body>

21
run.py
View File

@@ -14,7 +14,7 @@ from storage.fetch_mail import fetch_threads, flatten_threads
from storage import Mail, MailThread, db_session
import yaml
import email
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2, pipe2b
from classifier import get_training_threads, ThreadDictExtractor, print_answers, in_training, store_training_data, get_pipe, test_pipe # , pipe2, pipe2b
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import numpy
@@ -23,8 +23,8 @@ import numpy
def train_fit_pipe():
tt= get_training_threads(b"answered")
print tt[1]
print tt[0]
# print tt[1]
# print tt[0]
pipe1.fit(tt[0],tt[1])
return pipe1,tt[2]
def train_fit_pipe2():
@@ -80,13 +80,15 @@ def train_single_thread(tid,p,le,key="answered"):
else:
print "couldn't handle %s" % ca
from flaskapp import app
#print "arg1:"+sys.argv[1]
if len(sys.argv)>1:
if sys.argv[1] == "fetch_threads":
print flatten_threads(fetch_threads())
if sys.argv[1] == "run_server":
app.run(port=3000,debug=True)
if sys.argv[1] == "print_threads":
mth=db_session.query(MailThread).all()
for t in mth:
@@ -104,6 +106,10 @@ if len(sys.argv)>1:
print t.firstmail
print t.subject()
print t.text()
if sys.argv[1] == "compile_threads":
mth=db_session.query(MailThread).all()
for t in mth:
t.compile()
if sys.argv[1] == "print_threads2":
@@ -113,8 +119,9 @@ if len(sys.argv)>1:
print "---------------\n"
if sys.argv[1] == "train_thrd2":
p, le=train_fit_pipe2()
pb, lb =train_fit_pipe2b()
p, le=get_pipe("pipe2", "maintopic")
pb, lb =get_pipe("pipe2b", "maintopic")
train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
if sys.argv[1] == "train_all2":
p, labelencoder=train_fit_pipe2()
@@ -130,6 +137,8 @@ if len(sys.argv)>1:
predict_thread(pb,lb,t)
train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
if sys.argv[1] == "benchpipe2":
test_pipe(["pipe2","pipe2b","pipe2c"],"maintopic")
if sys.argv[1] == "testpipe2":
from classifier import ThreadSubjectExtractor, ThreadTextExtractor

View File

@@ -75,9 +75,7 @@ class Mail(Base):
from_array=[]
# print "Status"
# print env
if env.to is None:
print self.id
else:
if not env.to is None:
for t in env.to:
a={"host": t.host, "mail": t.mailbox}
to_array.append(a)
@@ -121,6 +119,8 @@ class Mail(Base):
if p.get_content_subtype()=="html":
b4=bs4.BeautifulSoup(pl,"html.parser")
[s.extract() for s in b4('script')]
[s.extract() for s in b4('style')]
self.text= yaml.dump(b4.get_text())
else:
self.text =yaml.dump( pl)

View File

@@ -9,6 +9,8 @@ import yaml
import email
from mail_model import Mail
#from fetch_mail import fetch_mail
import re
class FullThreadSchema(Schema):
id=fields.Integer()
@@ -28,13 +30,16 @@ class MailThread(Base):
__jsonid__='thread'
__whiteattrs__= ["body"]
__jsonattrs__=None
answered=False
maintopic="information"
def bdy(self):
return yaml.load(self.body)
def to_text(self):
mmm=self.mails()
txt=""
for m in mmm:
m.compile_envelope()
# m.compile_envelope()
txt=txt+"mail: \n"
for f in yaml.load(m.from_):
txt=txt+f["mail"]+"@"+f["host"]
@@ -43,6 +48,10 @@ class MailThread(Base):
txt=txt+"\n"
return txt
def tstr(self):
fr=yaml.load(self.mails()[0].from_)
return "(" + str(self.answered)+ ", "+ str(self.maintopic) + ") " + str(self.firstmail)+": "+str(fr[0]["mail"])+"@"+str(fr[0]["host"]) + " | ".join(yaml.load(self.mails()[0].subject))
def mails(self):
a=[]
# print self.bdy()
@@ -60,6 +69,7 @@ class MailThread(Base):
m.compile_envelope()
a.append(m.dict_envelope())
return a
def mail_flat_dict(self):
a=[]
d={}
@@ -69,25 +79,41 @@ class MailThread(Base):
for k, v in dc[i].iteritems():
d["mail_"+str(i)+"_"+k]=v
return d
def subject(self):
a=""
for m in self.mails():
m.compile_envelope()
a=a + " ".join(yaml.load(m.subject))+"\n"
return a
def text(self):
a=u""
def compile(self):
for m in self.mails():
m.compile_envelope()
m.compile_text()
db_session.add(m)
db_session.commit()
def text(self,filter="all"):
a=u""
def mail_txt(m):
t=yaml.load(m.text)
if type(t) is unicode:
txt=t
else:
# print "withintm:"+str(type(t))
t=t.decode("ISO-8859-1")
txt=t
a=a+txt+"***........................................***\n"
return txt
mm=self.mails()
if filter=="all":
for m in mm:
a=a+mail_txt(m)+"****........................................***\n"
elif filter=="first":
a=mail_txt(m[0])
a=re.sub(r'\n\s*\n',r'\n',a)
a=re.sub(r'<!--.*-->',r'',a,flags=re.MULTILINE|re.DOTALL)
a=re.sub(r'\s*>+ .*\n',r'',a)
return a

Binary file not shown.