diff --git a/classifier/__init__.py b/classifier/__init__.py index e53691d..41c689b 100644 --- a/classifier/__init__.py +++ b/classifier/__init__.py @@ -1,6 +1,7 @@ -from classifier import in_training, print_answers +from classifier import print_answers from classifier import get_pipe, test_pipe, get_training_threads -from classifier import store_training_data +#from classifier import store_training_data +#in_training, from training import train_single_thread diff --git a/classifier/classifier.py b/classifier/classifier.py index 66ef5ab..394cdb3 100644 --- a/classifier/classifier.py +++ b/classifier/classifier.py @@ -8,88 +8,18 @@ from sklearn.neural_network import MLPClassifier from sklearn.model_selection import train_test_split import numpy as np import yaml -from storage import MailThread,db_session from sklearn.metrics import accuracy_score, confusion_matrix +from collections import namedtuple +from storage import get_training_threads #MailThread,db_session -#with open("data.yml", 'r') as stream: -# try: -# train=yaml.load(stream) -# except yaml.YAMLError as exc: -# print(exc) - -data_types= { "answered": bool, "maintopic": str, "lang": str} - -def set_train_data(i,d,key=b"answered"): - global train - #------------------------------------ - if not data_types.has_key(key): - raise ValueError("Key "+str(key)+" unknown") - if not train.has_key(i) or train[i] is None: - train[i]={} - if not type(d) is data_types[key]: - raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d))) - #------------------------------------ - train[i][key]=d - - -def store_training_data(i, d,key=b"answered"): - set_train_data(i,d,key) - with open("data.yml","w") as file: - file.write(yaml.dump(train,default_flow_style=True)) - file.close() - - -# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft) -def get_training_threads(key="answered", filters=[]): - if not data_types.has_key(key): - raise ValueError("Key "+str(key)+" unknown") - #------------------------------------ - t_a=[] - d_a=[] - d_a2=[] - #------------------------------------ - if "db" in filters: - q=db_session.query(MailThread).filter(MailThread.istrained.is_(True)) - if "de" in filters: - q=q.filter(MailThread.lang=="de") - elif "en" in filters: - q=q.filter(MailThread.lang=="en") - tt=q.all() - for t in tt: - t_a.append(t) - if key =="answered": - d_a.append(t.is_answered()) - elif key=="maintopic": - d_a.append(t.maintopic) - elif key=="lang": - d_a.append(t.lang) - else: - raise ValueError("Database Filter now required") - le=LabelEncoder() - d_a2=le.fit_transform(d_a) - return (t_a,d_a2,le) - - - # else: - # for i in train: - # if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein -# t=db_session.query(MailThread).filter(MailThread.firstmail==i).first#() -# if not t is None: # Thread muss in der Datenbank sein -# t_a.append(t) -# d_a.append(train[i][key]) - - -def in_training(i, key="answered"): - return train.has_key(i) and train[i].has_key(key) - +PredictTool=namedtuple("PredictTool",["Pipeline","LabelEncoder"]) def print_answers(l): - - cc=l.classes_ - c_id=l.transform(cc) - for i,c in enumerate(cc): - print str(i) + ": " + str(c) + classes=l.classes_ + classes_encoded=l.transform(cc) + for i, c in zip(classes_encoded,classes): + print str(i) + ": " + str(c) return None @@ -117,15 +47,14 @@ class ThreadFirstTextExtractor(BaseEstimator, TransformerMixin): def transform(self, X,y=None): return [t.text("first") for t in X] -def 
get_pipe(p=b"pipe1",k=b"answered",filters=[]): +def get_pipe(p=b"pipe1",key=b"answered",filters=["db"]): p=build_pipe(p) - tt= get_training_threads(k,filters) - #print tt - if len(tt[0]) > 0: - p.fit(tt[0],tt[1]) - return p,tt[2] + threads, labels, labelenc= get_training_threads(key,filters) + if len(threads) > 0: + p.fit(threads,labels) + return PredictTool(p,labelenc) else: - return None, None + return PredictTool(None, None) def test_pipe(pp,k,f=[]): tt= get_training_threads(k,f) @@ -142,13 +71,13 @@ def test_pipe(pp,k,f=[]): -def build_pipe(p=b"pipe1"): - if p == "pipe1": +def build_pipe(pipe=b"pipe1"): + if pipe == "pipe1": p=Pipeline([('tde', ThreadDictExtractor()), ('dv',DictVectorizer()), ('clf', MultinomialNB()) ]) - elif p=="pipe2": + elif pipe=="pipe2": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -169,7 +98,7 @@ def build_pipe(p=b"pipe1"): } )), ('clf', MultinomialNB()) ]) - elif p=="pipe2b": + elif pipe=="pipe2b": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -190,7 +119,7 @@ def build_pipe(p=b"pipe1"): } )), ('mlc', MLPClassifier()) ]) - elif p=="pipe2d": + elif pipe=="pipe2d": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -217,7 +146,7 @@ def build_pipe(p=b"pipe1"): ('mlc', MLPClassifier()) ]) - elif p=="pipe2e": + elif pipe=="pipe2e": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -243,7 +172,7 @@ def build_pipe(p=b"pipe1"): } )), ('mlc', MLPClassifier(hidden_layer_sizes=(100,100))) ]) - elif p=="pipe2e1": + elif pipe=="pipe2e1": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -269,7 +198,7 @@ def build_pipe(p=b"pipe1"): } )), ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,50))) ]) - elif p=="pipe2f": + elif pipe=="pipe2f": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -295,7 +224,7 @@ def build_pipe(p=b"pipe1"): } )), ('mlc', MLPClassifier(hidden_layer_sizes=(100,100))) ]) - elif p=="pipe2g": + elif pipe=="pipe2g": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), @@ -321,7 +250,7 @@ def build_pipe(p=b"pipe1"): } )), ('mlc', MLPClassifier(hidden_layer_sizes=(100,100,100))) ]) - elif p=="pipe2c": + elif pipe=="pipe2c": p = Pipeline([ ('union', FeatureUnion(transformer_list=[ ('subject', Pipeline([('tse', ThreadSubjectExtractor()), diff --git a/classifier/oldstuff.py b/classifier/oldstuff.py new file mode 100644 index 0000000..8084733 --- /dev/null +++ b/classifier/oldstuff.py @@ -0,0 +1,23 @@ +def set_train_data(i,d,key=b"answered"): + global train + #------------------------------------ + if not data_types.has_key(key): + raise ValueError("Key "+str(key)+" unknown") + if not train.has_key(i) or train[i] is None: + train[i]={} + if not type(d) is data_types[key]: + raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d))) + #------------------------------------ + train[i][key]=d + + +def store_training_data(i, d,key=b"answered"): + set_train_data(i,d,key) + with open("data.yml","w") as file: + file.write(yaml.dump(train,default_flow_style=True)) + file.close() + + +def in_training(i, key="answered"): + return train.has_key(i) and 
train[i].has_key(key) +data_types= { "answered": bool, "maintopic": str, "lang": str} diff --git a/classifier/prediction.py b/classifier/prediction.py index 35eac79..afa6373 100644 --- a/classifier/prediction.py +++ b/classifier/prediction.py @@ -1,22 +1,31 @@ -from classifier import get_training_threads, print_answers, in_training, store_training_data, get_pipe +from classifier import get_pipe from storage import db_session, MailThread def predict_threads(): - pipe1,le=get_pipe("pipe1",b"answered",["db"]) + """ + Predict the language, main topic and answered status of every untrained thread and write the results to the database. This function has no return value. + """ + # Loading pipes for the prediction of each thread + pipe1,le=get_pipe("pipe1",key=b"answered",filters=["db"]) pipe2,le2=get_pipe("pipe2g", b"maintopic",["db"]) pipe3,le3=get_pipe("pipe2b", b"lang",["db"]) + + # Loading untrained MailThreads: q=db_session.query(MailThread).filter(MailThread.istrained.op("IS NOT")(True)) mail_threads=q.all() + if len(mail_threads) ==0: - raise ValueError("no untrained threads found") + raise StandardError("no untrained threads found in database") + answered=le.inverse_transform(pipe1.predict(mail_threads)) maintopic=le2.inverse_transform(pipe2.predict(mail_threads)) lang=le3.inverse_transform(pipe3.predict(mail_threads)) for i, t in enumerate(mail_threads): - t.answered=bool(answered[i]) - t.opened=bool(answered[i]) - t.maintopic=str(maintopic[i]) - t.lang=str(lang[i]) + t.answered, t.opened, t.maintopic, t.lang = ( bool(answered[i]), + bool(answered[i]), + str(maintopic[i]), + str(lang[i]) + ) db_session.add(t) db_session.commit() diff --git a/classifier/training.py b/classifier/training.py index f340d3d..f39fd0c 100644 --- a/classifier/training.py +++ b/classifier/training.py @@ -2,7 +2,7 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder import numpy from storage import Mail, MailThread, db_session -from classifier import store_training_data, print_answers +from classifier import print_answers diff --git a/flaskapp/__init__.py b/flaskapp/__init__.py index a1c1388..1623a94 100644 --- a/flaskapp/__init__.py +++ b/flaskapp/__init__.py @@ -49,13 +49,16 @@ def store_value(id,key,value): mth.opened=bool(value) if key=="maintopic" and value in maintopic_values: mth.maintopic=str(value) - if key=="lang" and value in maintopic_values: + if key=="lang" and value in ["de", "en"]: mth.lang=str(value) if key =="trained": value = value in ["true", "True", "1", "t"] mth.istrained=bool(value) db_session.add(mth) db_session.commit() + + + @app.route("/") def store_answered(id): diff --git a/flaskapp/templates/_macros.html b/flaskapp/templates/_macros.html new file mode 100644 index 0000000..55183ff --- /dev/null +++ b/flaskapp/templates/_macros.html @@ -0,0 +1,89 @@ +{# -*-jinja2-*- #} + + +{% macro render_nav(m, maintopics) %} + + +{% endmacro %} + +{% macro render_inline_thread(m, maintopics) %} +
+ +
+
+ {{render_nav(m,maintopics) }} +
+ + {% for (h,txt) in m.print_mail() %} +
{{h}}
+
{{ txt }}
+ {% endfor %} +
+
+{% endmacro %} diff --git a/flaskapp/templates/index.html b/flaskapp/templates/index.html index 755065e..7c97d2e 100644 --- a/flaskapp/templates/index.html +++ b/flaskapp/templates/index.html @@ -1,104 +1,45 @@ +{# -*-jinja2-*- #} +{% from "_macros.html" import render_inline_thread %} +{% extends "layout.html" %} +{% block header %} + +{% endblock %} - - - {{title}} - - - - - - -
-
-
-

{{title}}

- -
- - {% for m in mths %} -
- +{% block content %} +
+
+
-
- - - - {% for txt in m.print_text() %} -
- {{ txt }} -
- {% endfor %} -
-
+
+ {% for m in mths %} + {{ render_inline_thread(m, maintopics) }} {% endfor %} -
-
-
- +
+{% endblock %} diff --git a/flaskapp/templates/layout.html b/flaskapp/templates/layout.html new file mode 100644 index 0000000..4f75ccb --- /dev/null +++ b/flaskapp/templates/layout.html @@ -0,0 +1,17 @@ +{# -*-jinja2-*- #} + + + + {{title}} + + + + {% block header %} + {% endblock %} + + + {% block content %} + {% endblock %} + + + diff --git a/run.py b/run.py index 768745a..7190863 100644 --- a/run.py +++ b/run.py @@ -70,29 +70,6 @@ if len(sys.argv)>1: from flaskapp import app app.run(port=3000,debug=True) - if sys.argv[1] == "print_threads": - mth=db_session.query(MailThread).all() - for t in mth: - print t.firstmail - print t.mail_flat_dict() - - if sys.argv[1] == "print_thrd": - if len(sys.argv)<3: - mth=db_session.query(MailThread).all() - for t in mth: - print t.firstmail - else: - t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first() - - print t.firstmail - print t.subject() - print t.text() - if sys.argv[1] == "compile_threads": - mth=db_session.query(MailThread).all() - l=len(mth) - for i,t in enumerate(mth): - print "%d/%d" % (i,l) - t.compile() if sys.argv[1] == "trained_threads_from_yml": from classifier.classifier import train @@ -188,10 +165,9 @@ if len(sys.argv)>1: th.body=yaml.dump(t) th.islabeled=False th.opened=True - else: - th.body=yaml.dump(t) - db_session.add(th) - db_session.commit() + th.compile() + db_session.add(th) + db_session.commit() print thrds if sys.argv[1] == "print_raw_mail": diff --git a/storage/__init__.py b/storage/__init__.py index e6ffa63..68ec3e0 100644 --- a/storage/__init__.py +++ b/storage/__init__.py @@ -2,7 +2,41 @@ from database import db_session, init_db from database import Base metadata=Base.metadata - +from sklearn.preprocessing import LabelEncoder +from collections import namedtuple # Two main objects: Mail & MailThread from mail_model import Mail from thread_model import MailThread +def str_bool(s): + return s in ["1", "true", "True", "t","T"] + +# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft) +def get_training_threads(key="answered", filters=[]): + #------------------------------------ + + db_fields= {"answered": lambda t: t.is_answered(), + "lang": lambda t: t.lang, + "maintopic": lambda t: t.maintopic} + + if not db_fields.has_key(key): + raise ValueError("Key "+str(key)+" unknown") + + q=db_session.query(MailThread) + q=q.filter(MailThread.istrained.is_(True)) + + if "de" in filters: + q=q.filter(MailThread.lang=="de") + elif "en" in filters: + q=q.filter(MailThread.lang=="en") + + # load and extract thread fields + threads=q.all() + labels = map(db_fields[key], threads) + + # encode using LabelEncoder + le=LabelEncoder() + labels=le.fit_transform(labels) + + TrainingThreads=namedtuple("TrainingThreads", ["MailThreads","EncodedLabels","LabelEncoder"]) + + return TrainingThreads(threads,labels,le) diff --git a/storage/mail_model.py b/storage/mail_model.py index 23023ef..a17833b 100644 --- a/storage/mail_model.py +++ b/storage/mail_model.py @@ -128,10 +128,14 @@ class Mail(Base): self.text= yaml.dump(b4.get_text()) else: self.text =yaml.dump( pl) + def print_head(self): + fr=yaml.load(self.from_) + return "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(self.date) + def print_text(self): txt="" fr=yaml.load(self.from_) - txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(self.date) + "\n" +# txt= txt+ "Gesendet von: "+str(fr[0]["mail"])+"@"+str(fr[0]["host"])+" am "+ str(self.date) + "\n" t=yaml.load(self.text) if type(t) is unicode: 
#txt=txt diff --git a/storage/thread_model.py b/storage/thread_model.py index c6ff1ab..7cf9116 100644 --- a/storage/thread_model.py +++ b/storage/thread_model.py @@ -105,6 +105,14 @@ class MailThread(Base): db_session.commit() self.date=self.mails()[0].date + def print_mail(self, filter="all"): + a=[] + if filter=="all": + mm=self.mails() + for m in mm: + a.append((m.print_head(), m.print_text())) + return a + def print_text(self,filter="all"): a=[] if filter=="all": @@ -114,6 +122,16 @@ class MailThread(Base): elif filter=="first": a.append(m[0].print_text()) return a + def print_head(self,filter="all"): + a=[] + if filter=="all": + mm=self.mails() + for m in mm: + a.append(m.print_head()) + elif filter=="first": + a.append(m[0].print_head()) + return a + def text(self,filter="all"): a=u"" def mail_txt(m): diff --git a/test.sqlite b/test.sqlite index cc6bf43..65a2d55 100644 Binary files a/test.sqlite and b/test.sqlite differ
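
Usage sketch (not part of the patch above): the refactor replaces bare tuples with named results, so storage.get_training_threads now returns a TrainingThreads(MailThreads, EncodedLabels, LabelEncoder) namedtuple and classifier.get_pipe returns a PredictTool(Pipeline, LabelEncoder). The snippet below illustrates one way these pieces are meant to fit together; it assumes the module layout introduced in this diff and a populated database behind storage.db_session, the variable names are illustrative, and the query for unlabelled threads is copied from predict_threads.

from classifier import get_pipe
from storage import get_training_threads, db_session, MailThread

# Named access to the labelled training data for one target field.
training = get_training_threads(key="maintopic", filters=["db"])
print("%d labelled threads, classes: %s"
      % (len(training.MailThreads), list(training.LabelEncoder.classes_)))

# get_pipe fits the chosen pipeline on all labelled threads and bundles it
# with the matching LabelEncoder in a PredictTool.
tool = get_pipe("pipe2g", key="maintopic", filters=["db"])
if tool.Pipeline is None:
    raise RuntimeError("no labelled threads available to fit the pipeline")

# Predict the main topic of the still-unlabelled threads and map the numeric
# class ids back to the stored string labels (same query as predict_threads).
unlabelled = (db_session.query(MailThread)
              .filter(MailThread.istrained.op("IS NOT")(True))
              .all())
topics = tool.LabelEncoder.inverse_transform(tool.Pipeline.predict(unlabelled))
for thread, topic in zip(unlabelled, topics):
    print("%s -> %s" % (thread.firstmail, topic))

Because both namedtuples keep the arity of the tuples they replace, existing unpacking call sites such as pipe1, le = get_pipe(...) keep working, while new code can read the fields by name via tool.Pipeline and tool.LabelEncoder.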