From 941cbc3d451b8d0b1d05bfac925d77976e1503ec Mon Sep 17 00:00:00 2001
From: Andreas Stephanides <andreas.stephanides@gmail.com>
Date: Fri, 4 Aug 2017 07:49:39 +0200
Subject: [PATCH] init learning cats

---
 classifier.py             | 106 +++++++++++++++++
 classify_mail.py          |  25 ++++
 classify_text.py          |  42 +++++++
 install.sh                |  13 +++
 requirements.txt          |  10 ++
 run.py                    | 238 ++++++++++++++++++++++++++++++++++++++
 storage/__init__.py       |   3 +
 storage/database.py       |  23 ++++
 storage/database_mbase.py |  67 +++++++++++
 storage/fetch_mail.py     |  37 ++++++
 storage/mail_model.py     | 126 ++++++++++++++++++++
 storage/models.py         |   2 +
 storage/thread_model.py   |  93 +++++++++++++++
 test_imap.py              |  62 ++++++++++
 14 files changed, 847 insertions(+)
 create mode 100644 classifier.py
 create mode 100644 classify_mail.py
 create mode 100644 classify_text.py
 create mode 100755 install.sh
 create mode 100644 requirements.txt
 create mode 100644 run.py
 create mode 100644 storage/__init__.py
 create mode 100644 storage/database.py
 create mode 100644 storage/database_mbase.py
 create mode 100644 storage/fetch_mail.py
 create mode 100644 storage/mail_model.py
 create mode 100644 storage/models.py
 create mode 100644 storage/thread_model.py
 create mode 100644 test_imap.py

diff --git a/classifier.py b/classifier.py
new file mode 100644
index 0000000..e0a3081
--- /dev/null
+++ b/classifier.py
@@ -0,0 +1,106 @@
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
+from sklearn.preprocessing import LabelEncoder
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.naive_bayes import MultinomialNB
+
+import numpy as np
+import yaml
+from storage import MailThread,db_session
+
+with open("data.yml", 'r') as stream:  # persisted labels: {firstmail id: {key: value}}
+    try:
+        # NOTE(review): yaml.load can construct arbitrary objects; data.yml must be trusted.
+        train=yaml.load(stream) or {}  # an empty file means "no labels yet"
+    except yaml.YAMLError as exc:
+        print(exc)
+data_types= { "answered": bool, "maintopic": str}  # label key -> required value type
+
+def store_training_data(i, d,key=b"answered"):
+    """Record label ``d`` under ``key`` for thread ``i`` and rewrite data.yml.
+
+    Raises ValueError when ``key`` is not in ``data_types`` and TypeError when
+    ``d`` is not exactly of the type registered for that key.
+    """
+    if key not in data_types:
+        raise ValueError("Key "+str(key)+" unknown")
+    if i not in train:
+        train[i]={}
+    if type(train[i]) is dict:  # entries of any other shape are left untouched
+        if type(d) is not data_types[key]:
+            raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
+        train[i][key]=d
+    with open("data.yml","w") as out:  # the with-block closes the file itself
+        out.write(yaml.dump(train,default_flow_style=True))
+
+
+# Load the training data for a given key (label/property).
+def get_training_threads(key="answered"):
+    """Return (threads, encoded_labels, fitted LabelEncoder) for ``key``."""
+    threads=[]
+    labels=[]
+    for i in train:
+        t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
+        # The thread must be in the database and its entry must carry the key.
+        if t is not None and key in train[i]:
+            threads.append(t)
+            labels.append(train[i][key])
+    le=LabelEncoder()
+    encoded=le.fit_transform(labels)
+    return (threads,encoded,le)
+
+
+def in_training(i, key="answered"):
+    return i in train and key in train[i]  # True once thread i has a label for key
+
+
+def print_answers(l):
+    """Print every class known to the fitted LabelEncoder ``l`` with its index."""
+    # The position in classes_ is the encoded integer, so no transform() is needed.
+    for i,c in enumerate(l.classes_):
+        print str(i) + ":  " + str(c)
+    return None
+
+
+class ThreadDictExtractor(BaseEstimator, TransformerMixin):
+    """Stateless transformer mapping each thread to its ``mail_flat_dict()``."""
+    def fit(self, x, y=None): return self
+    def transform(self, X,y=None):
+        return [thread.mail_flat_dict() for thread in X]
+
+class ThreadSubjectExtractor(BaseEstimator, TransformerMixin):
+    """Stateless transformer mapping each thread to its ``subject()`` string."""
+    def fit(self, x, y=None): return self
+    def transform(self, X,y=None):
+        return [thread.subject() for thread in X]
+
+class ThreadTextExtractor(BaseEstimator, TransformerMixin):
+    """Stateless transformer mapping each thread to its ``text()`` string."""
+    def fit(self, x, y=None): return self
+    def transform(self, X,y=None):
+        return [thread.text() for thread in X]
+
+
+pipe1=Pipeline([('tde', ThreadDictExtractor()),('dv',DictVectorizer()),('clf', MultinomialNB())])
+
+pipe2 = Pipeline([
+    ('union', FeatureUnion(transformer_list=[
+        ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
+                              ('cv',CountVectorizer()),
+                              ('tfidf', TfidfTransformer())
+        ])),
+        ('text',    Pipeline([('tte',ThreadTextExtractor()),
+                              ('cv',CountVectorizer()),
+                              ('tfidf', TfidfTransformer())
+        ])),
+        ('envelope', Pipeline([('tde', ThreadDictExtractor()),
+                               ('dv',DictVectorizer())
+        ]))
+    ], transformer_weights={
+        'subject': 1,
+        'text': 0.7,
+        'envelope': 0.5
+    } )),
+    ('clf', MultinomialNB())
+])
diff --git a/classify_mail.py b/classify_mail.py
new file mode 100644
index 0000000..00a87a7
--- /dev/null
+++ b/classify_mail.py
@@ -0,0 +1,25 @@
+from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline, FeatureUnion
+import sys
+import yaml
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
+
+
+text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
+
+text_ohc = Pipeline([('ohc', OneHotEncoder()),('clf', MultinomialNB())])
+
+combined_features = FeatureUnion([('vect1', CountVectorizer()),('vect2', CountVectorizer())])
+
+
+enc=OneHotEncoder()
+with open("example_1.yaml", 'r') as stream:
+    try:
+        train=yaml.safe_load(stream)
+    except yaml.YAMLError as exc:
+        print(exc)
+
+tc=text_clf.fit(train["data"],train["target"])
diff --git a/classify_text.py b/classify_text.py
new file mode 100644
index 0000000..f95aac4
--- /dev/null
+++ b/classify_text.py
@@ -0,0 +1,42 @@
+from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
+text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
+import sys
+import yaml
+
+
+
+with open("example_1.yaml", 'r') as stream:
+    try:
+        train=yaml.safe_load(stream)
+    except yaml.YAMLError as exc:
+        print(exc)
+
+tc=text_clf.fit(train["data"],train["target"])
+print(sys.argv[1])
+
+answ=(tc.predict([sys.argv[1]]))[0]
+print train["target_names"][answ]
+
+for i in range(0,  (len(train["target_names"]))):
+    print (str(i)+"  "+ train["target_names"][i])
+
+ca=int(raw_input("Correct answer.."))
+
+
+if ca == answ:
+    print ("Yes I got it right")
+else:
+    print("should I remember this?")
+    a=raw_input("shoudIrememberthis?")
+    if a == "y":
+        train["data"].append(sys.argv[1])
+        train["target"].append(ca)
+        dumped=yaml.dump(train,default_flow_style=False)  # serialize once, use twice
+        print dumped
+        with open("example_1.yaml","w") as out:  # closed even if the write fails
+            out.write(dumped)
+    else:
+        print ("Ok, I already forgot")
+    
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..e97f9f8
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+if test ! -d ".env"; then
+    echo "Erzeuge virtuelle Umgebung ...."
+    virtualenv .env
+fi
+echo "Aktiviere virtuelle Python Umgebung ..."
+. .env/bin/activate
+echo "Installiere requirements ..."
+pip install --upgrade pip
+pip install -r requirements.txt
+if test ! -e "config.cfg" -a -e "config.cfg.sample"; then
+    cp config.cfg.sample config.cfg
+fi
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8141473
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+imapclient
+pymysql  # driver behind the mysql+pymysql:// URL in storage/database.py
+config
+scikit-learn  # real distribution name; "sklearn" is only an alias package
+numpy
+scipy
+beautifulsoup4  # real distribution name; "bs4" is only an alias package
+sqlalchemy
+marshmallow<3  # the code unpacks the (data, errors) tuples of marshmallow 2
+PyYAML
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..3ccde40
--- /dev/null
+++ b/run.py
@@ -0,0 +1,238 @@
+from __future__ import unicode_literals
+import imapclient
+from config import Config
+import sys
+from email.header import decode_header
+import email
+import codecs
+import sys
+import bs4
+#sys.stdout = codecs.getwriter('utf8')(sys.stdout)
+from storage.fetch_mail import fetch_mail
+from storage.fetch_mail import fetch_threads, flatten_threads
+
+from storage import Mail, MailThread, db_session
+import yaml
+import email
+from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+import numpy
+
+
+
+def train_fit_pipe():
+        """Fit pipe1 on the "answered" labels; return (pipe1, label encoder)."""
+        threads, labels, encoder = get_training_threads(b"answered")
+        print labels
+        print threads
+        return pipe1.fit(threads, labels), encoder  # fit() returns the pipeline itself
+def train_fit_pipe2():
+        """Fit pipe2 on the "maintopic" labels; return (pipe2, label encoder)."""
+        threads, labels, encoder = get_training_threads(b"maintopic")
+        return pipe2.fit(threads, labels), encoder  # fit() returns the pipeline itself
+
+
+def train_single_thread(tid,p,le,key="answered"):
+        """Predict ``key`` for thread ``tid``, ask the user for the truth and store it.
+
+        A numeric answer is an index from ``print_answers``; other text is stored as a
+        new label.  Raises TypeError for wrongly typed arguments, ValueError for an unknown thread."""
+        if not isinstance(tid, int): raise TypeError("ID must be of type int")
+        if not isinstance(p, Pipeline): raise TypeError("Second Argument needs to be type Pipeline")
+        if not isinstance(le, LabelEncoder): raise TypeError("Third Argument needs to be type LabelEncoder")
+        mth=db_session.query(MailThread).filter(MailThread.firstmail==tid).first()
+        if mth is None: raise ValueError("Thread with firstmail %d not in Database" %tid)
+        pre=p.predict([mth])
+        answ=pre[0]
+        print mth.to_text()
+        print mth.text()
+        print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
+        print_answers(le)
+        ca=raw_input("Correct answer..")
+        try:
+                ca=int(ca)
+        except ValueError:
+                print "String Data"
+        if type(ca)==int:
+                if ca == answ:
+                        print ("Yes I got it right")
+                else:
+                        print("Oh no...!")
+                l=le.inverse_transform([ca])[0]
+                if type(l) is numpy.bool_:  # store_training_data wants a plain bool
+                        l=bool(l)
+                if type(l) is numpy.string_:  # ... or a plain str
+                        l=str(l)
+                store_training_data(tid,l, key)
+        elif not ca.strip() == "":
+                store_training_data(tid, ca, key)
+        else:
+                print "couldn't handle %s" % ca 
+    
+
+#print "arg1:"+sys.argv[1]
+if len(sys.argv)>1:
+    if sys.argv[1] == "fetch_threads":
+        print flatten_threads(fetch_threads())
+
+
+    if sys.argv[1] == "print_threads":
+        mth=db_session.query(MailThread).all()
+        for t in mth:
+            print t.firstmail
+            print t.mail_flat_dict()
+
+    if sys.argv[1] == "print_thrd":  # list all thread ids, or show one thread
+        if len(sys.argv)<3:
+            for t in db_session.query(MailThread).all():
+                print t.firstmail
+        else:
+            # firstmail is an Integer column: compare with an int, as the other commands do
+            t=db_session.query(MailThread).filter(MailThread.firstmail==int(sys.argv[2])).first()
+            if t is None: raise SystemExit("Thread with firstmail %s not in Database" % sys.argv[2])
+            print t.firstmail
+            print t.subject()
+            print t.text()
+            
+        
+    if sys.argv[1] == "print_threads2":
+        mth=db_session.query(MailThread).all()
+        for t in mth:
+            print t.to_text()
+            print "---------------\n"
+    
+    if sys.argv[1] == "train_thrd2":
+        p, le=train_fit_pipe2()
+        train_single_thread(int(sys.argv[2]),p,le,b"maintopic")        
+    if sys.argv[1] == "train_all2":
+        p, labelencoder=train_fit_pipe2()
+        mth=db_session.query(MailThread).all()
+        print mth
+        for t in mth:
+            if not in_training(t.firstmail,"maintopic"):
+                print "---------------------------------------------------"
+                print "---------------------------------------------------"
+                print t.firstmail
+                print t.text()
+                train_single_thread(t.firstmail, p, labelencoder, b"maintopic") 
+
+        
+    if sys.argv[1] == "testpipe2":
+        from classifier import ThreadSubjectExtractor, ThreadTextExtractor
+        pipe2,le=train_fit_pipe2()
+        
+        if len(sys.argv)>2:
+                t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first()
+                print t.to_text()
+                print le.inverse_transform(pipe2.predict([t]))
+            
+
+    if sys.argv[1] == "train_thrd":
+        pipe1, labelencoder=train_fit_pipe()
+        train_single_thread(int(sys.argv[2]),pipe1,labelencoder)        
+
+    if sys.argv[1] == "train_all":
+        pipe1, labelencoder=train_fit_pipe()
+        mth=db_session.query(MailThread).all()
+        print mth
+        for t in mth:
+            if not in_training(t.firstmail):
+                print "---------------------------------------------------"
+                print "---------------------------------------------------"
+                print t.firstmail
+                train_single_thread(t.firstmail,pipe1,labelencoder)        
+        
+    if sys.argv[1] == "print_thread":
+        mth=db_session.query(MailThread).filter(MailThread.firstmail==int(sys.argv[2])).first()
+        print mth.mail_dicts()
+        print mth.mail_flat_dict()
+    
+    if sys.argv[1] == "store_threads":
+        thrds=flatten_threads(fetch_threads())
+        for t in thrds:
+            if type(t[0]) is int:
+                th=db_session.query(MailThread).filter(MailThread.firstmail==t[0]).first()
+                if th == None:
+                    th=MailThread()
+                    th.firstmail=t[0]
+                if not th.body == yaml.dump(t):
+                    th.body=yaml.dump(t)
+                    th.islabeled=False
+                    th.opened=True
+                else:
+                    th.body=yaml.dump(t)
+                db_session.add(th)
+                db_session.commit()
+        print thrds
+    
+
+
+    if sys.argv[1] == "print_mail":
+        mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first()
+        mm.compile_text()
+        mm.compile_envelope()
+        print mm.subject
+        print "----------"
+        print mm.text
+    
+    if sys.argv[1] == "mail_dict_test":
+        mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first()
+        mm.compile_envelope()
+        print mm.dict_envelope()
+
+    
+    if sys.argv[1] == "load_mail":
+        mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first()
+        mm.compile_text()
+        print mm.text
+        env=yaml.load(mm.envelope)
+        print env.subject
+        print env
+
+    
+    if sys.argv[1] == "store_mail":
+        m=fetch_mail(int(sys.argv[2]))
+        mm=Mail()
+        mm.envelope=yaml.dump(m['ENVELOPE'])
+        mm.body=yaml.dump(m['RFC822'])
+        mm.id=m['id']
+        db_session.add(mm)
+        db_session.commit()
+    
+
+    if sys.argv[1] == "fetch_mail":
+        print "fetching mail %d " % int(sys.argv[2])
+        m=fetch_mail(int(sys.argv[2]))
+        hd=decode_header(m['ENVELOPE'].subject)
+        hd2=[]
+        #    print hd
+        for h in hd:
+            if not h[1] is None:
+                hd2.append(h[0].decode(h[1]))
+                #            print h[0].decode(h[1])
+            else:
+                hd2.append(h[0])
+        print "\nBetreff:"
+        for h in hd2:
+            print h
+        print "FROM:"
+        for t in m['ENVELOPE'].from_:
+            print t
+        print "TO:"
+        for t in m['ENVELOPE'].to:
+            print t
+        em=email.message_from_string(m['RFC822'])
+        for p in em.walk():
+            if p.get_content_maintype()=="text":
+                print p.get_payload()
+            elif p.get_content_maintype()=="multipart":
+                print p.get_payload()
+            else:
+                print p.get_content_maintype()
+
+    
+
+    if sys.argv[1] == "initdb":
+        from storage import init_db
+        init_db()
diff --git a/storage/__init__.py b/storage/__init__.py
new file mode 100644
index 0000000..0185283
--- /dev/null
+++ b/storage/__init__.py
@@ -0,0 +1,3 @@
+from database import db_session, init_db
+from mail_model import Mail
+from thread_model import MailThread
diff --git a/storage/database.py b/storage/database.py
new file mode 100644
index 0000000..97df376
--- /dev/null
+++ b/storage/database.py
@@ -0,0 +1,23 @@
+from sqlalchemy import create_engine
+from sqlalchemy.orm import scoped_session, sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+from config import Config
+from database_mbase import MyBase
+import os
+with open('config.cfg') as f:  # Config parses the stream in its constructor
+    cfg=Config(f)
+
+
+if cfg.get("db_main_type") == "mysql":
+    engine = create_engine("mysql+pymysql://%s:%s@localhost/crawler_articles?charset=utf8" % (cfg.get("db_main_user"), cfg.get("db_main_pw"))) 
+else:
+    engine = create_engine('sqlite:///'+ os.path.join(cfg.db_path,cfg.db_mainfile), convert_unicode=True)
+
+
+db_session = scoped_session(sessionmaker(autocommit=False,#  autoflush=False,
+                                         bind=engine))
+
+Base=declarative_base(cls=MyBase)                                         
+def init_db():
+    import models
+    Base.metadata.create_all(bind=engine)
diff --git a/storage/database_mbase.py b/storage/database_mbase.py
new file mode 100644
index 0000000..9ecff65
--- /dev/null
+++ b/storage/database_mbase.py
@@ -0,0 +1,67 @@
+from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Index, TIMESTAMP
+
+from datetime import datetime
+class MyBase(object):
+    """Mixin for the declarative base: id, timestamps and schema-based (de)serialization."""
+    id = Column(Integer, primary_key=True)
+    created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
+    updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    def __json__(self):
+        """Dump the object through ``__schema__``, limited to ``__jsonattrs__`` when set."""
+        if self.__jsonattrs__ is None:
+            return self.__schema__().dump(self)[0]
+        return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
+
+    def update(self,data, partial=True):
+        """Load ``data`` and set the ``__whiteattrs__`` it contains; return (ok, errors)."""
+        data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
+        if len(errors)>0:
+            print errors
+            return (False,errors)
+        for a in self.__whiteattrs__:
+            if a in data:
+                setattr(self,a,data[a])
+        return (True, [])
+
+    @classmethod
+    def deserialize(cls,data):
+        """Build a new instance from ``data``; the schema's load errors are ignored."""
+        data, errors=cls.__schema__().load(data,partial=True)
+        a=cls()
+        for c in cls.__table__.columns:
+            if c.key in data:
+                setattr(a, c.key,data[c.key])
+        return a
+
+class MyBase2(object):
+    """Mixin for a declarative base: id and schema-based (de)serialization."""
+    id = Column(Integer, primary_key=True)
+    # Unlike MyBase, no created_at/updated_at columns are declared here.
+
+    def __json__(self):
+        """Dump the object through ``__schema__``, limited to ``__jsonattrs__`` when set."""
+        if self.__jsonattrs__ is None:
+            return self.__schema__().dump(self)[0]
+        return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
+
+    def update(self,data, partial=True):
+        """Load ``data`` and set the ``__whiteattrs__`` it contains; return (ok, errors)."""
+        data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
+        if len(errors)>0:
+            print errors
+            return (False,errors)
+        for a in self.__whiteattrs__:
+            if a in data:
+                setattr(self,a,data[a])
+        return (True, [])
+
+    @classmethod
+    def deserialize(cls,data):
+        """Build a new instance from ``data``; the schema's load errors are ignored."""
+        data, errors=cls.__schema__().load(data,partial=True)
+        a=cls()
+        for c in cls.__table__.columns:
+            if c.key in data:
+                setattr(a, c.key,data[c.key])
+        return a
+    
diff --git a/storage/fetch_mail.py b/storage/fetch_mail.py
new file mode 100644
index 0000000..a8891ab
--- /dev/null
+++ b/storage/fetch_mail.py
@@ -0,0 +1,37 @@
+import imapclient
+from datetime import date
+
+from config import Config
+with open('config.cfg') as f:  # Config parses the stream in its constructor
+    cfg=Config(f)
+server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
+server.login(cfg.user, cfg.password)
+server.select_folder('INBOX')
+
+
+def fetch_mail(myid):
+    m=server.fetch([myid],['ENVELOPE','RFC822'])
+    m=m[myid]
+    m["id"]=myid
+    return m
+
+def fetch_thread(tp): 
+    return tp
+
+def fetch_threads(subject=b'service', since=date(2017, 7, 1)):
+    """Return the server's THREAD response for mails matching ``subject`` since ``since``."""
+    # The default is spelled 7, 1: a leading zero means octal in Python 2 (08 would not parse).
+    return server.thread(criteria=[b'SUBJECT', subject, b'SINCE', since])
+
+
+def flatten_threads(thrds, array=None, level=0):
+    """Level 0: append one flat id list per thread; deeper: collect ids depth-first."""
+    array = [] if array is None else array  # fresh list instead of a shared default
+    for t in thrds:
+        if level == 0:
+            array.append(flatten_threads(t, [], 1))
+        elif type(t) is tuple:
+            array = array + flatten_threads(t, [], 1)
+        else:
+            array.append(t)
+    return array
diff --git a/storage/mail_model.py b/storage/mail_model.py
new file mode 100644
index 0000000..6e1cb8e
--- /dev/null
+++ b/storage/mail_model.py
@@ -0,0 +1,126 @@
+from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode
+from sqlalchemy.orm import relationship
+from datetime import datetime
+from database import Base
+from database import db_session
+from email.header import decode_header
+from marshmallow import Schema, fields, post_load
+import yaml
+import email
+from fetch_mail import fetch_mail
+import bs4 
+class FullMailSchema(Schema):
+    id=fields.Integer()
+    text=fields.String()
+    body=fields.String()
+    envelope=fields.String()
+    
+
+
+class Mail(Base):
+    __tablename__ = 'mails'
+    id = Column(Integer, primary_key=True)
+    date = Column(DateTime)
+    envelope = Column(Text)
+    body = Column(Text)
+    text = Column(Text)
+    from_ = Column(Text)
+    from_mailbox=Column(String)
+    from_host=Column(String)
+    to_ = Column(Text)
+    to_mailbox = Column(Text)
+    to_host=Column(String)
+    subject = Column(Text)
+    __schema__=FullMailSchema
+    __jsonid__='mail'
+    __whiteattrs__= ["text", "envelope"]
+    __jsonattrs__=None
+
+
+    @classmethod
+    def fetch_mail(self,mid):
+        m=fetch_mail(mid)
+        mm=Mail()
+        mm.envelope=yaml.dump(m['ENVELOPE'])
+        em=email.message_from_string(m['RFC822'])
+        if type(em.get_payload()) is list:
+            pt=[]
+            for p in em.walk():
+                if p.get_content_maintype() == "text":
+                    pt.append(p)
+            em.set_payload(pt)
+        mm.body=yaml.dump(str(em))
+        mm.id=m['id']
+        db_session.add(mm)
+        db_session.commit()
+        return mm
+        
+    def get_email(self):
+        em=email.message_from_string(yaml.load(self.body))
+        return em    
+
+    def compile_envelope(self):
+        """Derive ``subject``, ``to_`` and ``from_`` from the stored envelope.
+
+        ``subject`` becomes a YAML dump of the decoded header parts; ``to_``
+        and ``from_`` become YAML dumps of lists of {"host", "mail"} dicts.
+        """
+        # NOTE(review): yaml.load can construct arbitrary Python objects, so
+        # the stored envelope must only ever come from this application.
+        env=yaml.load(self.envelope)
+
+        parts=[]
+        for raw, charset in decode_header(env.subject):
+            if charset is None:
+                # No charset declared for this part: keep it as received.
+                parts.append(raw)
+            else:
+                parts.append(raw.decode(charset))
+        self.subject=yaml.dump(parts)
+
+        # Mails without a To list are reported by printing their id.
+        if env.to is None:
+            print self.id
+        # An envelope may lack either address list (None); treat it as empty.
+        to_array=[{"host": t.host, "mail": t.mailbox} for t in (env.to or ())]
+        from_array=[{"host": t.host, "mail": t.mailbox} for t in (env.from_ or ())]
+
+        self.to_=yaml.dump(to_array)
+        self.from_=yaml.dump(from_array)
+        return None
+
+    def dict_envelope(self):
+        """Flatten the compiled subject/to/from fields into one feature dict.
+
+        Keys are ``subject_<i>``, ``to_host_<i>``, ``to_mailbox_<i>``,
+        ``from_host_<i>`` and ``from_mailbox_<i>``; None values are left out.
+        """
+        d={}
+        # The position of each part/address becomes the numeric key suffix.
+        for i, part in enumerate(yaml.load(self.subject)):
+            if part is not None:
+                d["subject_"+str(i)]=part
+        for i, addr in enumerate(yaml.load(self.to_)):
+            if addr["host"] is not None:
+                d["to_host_"+str(i)]=addr["host"]
+            if addr["mail"] is not None:
+                d["to_mailbox_"+str(i)]=addr["mail"]
+        for i, addr in enumerate(yaml.load(self.from_)):
+            if addr["host"] is not None:
+                d["from_host_"+str(i)]=addr["host"]
+            if addr["mail"] is not None:
+                d["from_mailbox_"+str(i)]=addr["mail"]
+        return d
+    
+    def compile_text(self):  # store a YAML dump of the mail's text payload in self.text
+        for p in self.get_email().walk():
+            if p.get_content_maintype()=="text":
+                pl=p.get_payload(decode=True)
+                # Every text part overwrites self.text, so the last one walked wins;
+                # HTML parts are first reduced to the result of BeautifulSoup.get_text().
+                if p.get_content_subtype()=="html":
+
+                    b4=bs4.BeautifulSoup(pl,"html.parser")
+                    self.text= yaml.dump(b4.get_text())
+                else:
+                    self.text =yaml.dump( pl)
diff --git a/storage/models.py b/storage/models.py
new file mode 100644
index 0000000..bf6f2b9
--- /dev/null
+++ b/storage/models.py
@@ -0,0 +1,2 @@
+from mail_model import Mail
+from thread_model import MailThread
diff --git a/storage/thread_model.py b/storage/thread_model.py
new file mode 100644
index 0000000..b3f83b2
--- /dev/null
+++ b/storage/thread_model.py
@@ -0,0 +1,93 @@
+from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode
+from sqlalchemy.orm import relationship
+from datetime import datetime
+from database import Base
+from database import db_session
+from email.header import decode_header
+from marshmallow import Schema, fields, post_load
+import yaml
+import email
+from mail_model import Mail
+#from fetch_mail import fetch_mail
+
+class FullThreadSchema(Schema):
+    id=fields.Integer()
+    text=fields.String()
+    body=fields.String()
+    envelope=fields.String()
+
+
+class MailThread(Base):
+    __tablename__ = 'threads'
+    id = Column(Integer, primary_key=True)
+    firstmail  = Column(Integer)
+    islabeled = Column(Boolean)
+    opened = Column(Boolean)
+    body = Column(Text)
+    __schema__=FullThreadSchema
+    __jsonid__='thread'
+    __whiteattrs__= ["body"]
+    __jsonattrs__=None
+    def bdy(self):
+        return yaml.load(self.body)
+    def to_text(self):
+        """Return a short textual listing of the thread.
+
+        Per mail: "mail: ", the sender addresses, " --- " and the subject."""
+        pieces=[]
+        for m in self.mails():
+            m.compile_envelope()
+            pieces.append("mail: \n")
+            for f in yaml.load(m.from_):
+                pieces.append(f["mail"]+"@"+f["host"])
+            pieces.append(" --- "+" ".join(yaml.load(m.subject))+"\n")
+        return "".join(pieces)
+    
+    def mails(self):
+        """Return the Mail object for every id in bdy(); ids missing from the database go through Mail.fetch_mail."""
+        a=[]
+        for m in self.bdy():
+            mail=db_session.query(Mail).filter(Mail.id==int(m)).first()
+            if mail is None:
+                mail=Mail.fetch_mail(int(m))
+            a.append(mail)
+        return a
+        
+    def mail_dicts(self):
+        """Return one dict_envelope() result per mail, after compiling its envelope."""
+        a=[]
+        for m in self.mails():
+            m.compile_envelope()
+            a.append(m.dict_envelope())
+        return a
+    def mail_flat_dict(self):
+        """Merge the envelope dicts of all mails into one flat dict.
+
+        Every key is prefixed with ``mail_<index>_`` of its mail."""
+        d={}
+        for i, mail_dict in enumerate(self.mail_dicts()):
+            for k, v in mail_dict.iteritems():
+                d["mail_"+str(i)+"_"+k]=v
+        return d
+    def subject(self):
+        """Return the subjects of all mails in the thread, one per line."""
+        lines=[]
+        for m in self.mails():
+            m.compile_envelope()
+            lines.append(" ".join(yaml.load(m.subject))+"\n")
+        return "".join(lines)
+    
+    def text(self):
+        """Return the texts of all mails as one unicode string.
+
+        Mails are separated by a blank line; text that is not already
+        unicode is decoded as ISO-8859-1.
+        """
+        chunks=[]
+        for m in self.mails():
+            m.compile_text()
+            body=yaml.load(m.text)
+            if type(body) is not unicode:
+                body=body.decode("ISO-8859-1")
+            chunks.append(body+"\n\n")
+        return u"".join(chunks)
diff --git a/test_imap.py b/test_imap.py
new file mode 100644
index 0000000..27bccc9
--- /dev/null
+++ b/test_imap.py
@@ -0,0 +1,62 @@
+#!.env/bin/python
+from __future__ import unicode_literals
+from imapclient import IMAPClient
+from datetime import date
+import os
+import yaml
+HOST=os.environ.get("IMAP_HOST", "buran.htu.tuwien.ac.at")
+USERNAME=os.environ["IMAP_USER"]  # credentials come from the environment,
+PASSWORD=os.environ["IMAP_PASSWORD"]  # never from a committed file
+server=IMAPClient(HOST, use_uid=True, ssl=True)
+server.login(USERNAME,PASSWORD)
+
+select_info=server.select_folder('INBOX')
+
+messages=server.search([u'SUBJECT', 'service',u'SINCE', date(2017,06,1)])
+#pritn(select_info)
+
+
+#response = server.fetch(messages, ['FLAGS', 'RFC822.SIZE', 'BODY', 'ENVELOPE','X-GM-THRID', 'X-GM-MSGID'])
+#response = server.fetch(messages, ['ENVELOPE'])
+
+#print(response)
+#for msgid, data in response.iteritems():
+#    print('   ID %d: %d bytes, flags=%s, %s' % (msgid,
+#                                            data[b'RFC822.SIZE'],
+#                                                data[b'FLAGS'], data['ENVELOPE']))
+#response = server.thread()
+
+print "\n\n --------------------------------\n"
+response= server.thread(criteria=[u'SUBJECT', 'service',u'SINCE', date(2017,04,1)])
+print(response)
+#resp=server.thread('X-GM')
+#for msgid, data in response.iteritems():
+#    print('   ID %d: \t %s \t %s' % (msgid, data[b'X-GM-THRID'], data[b'X-GM-MSGID']))
+print "\n---------------------\n"
+print response[0], len(response[0])
+
+
+def get_msg(mid):
+    print mid
+    sf=server.fetch([mid],['ENVELOPE'])
+    for msgid, data in sf.iteritems():
+        return {"msgid": msgid, "envelope": data[b'ENVELOPE']}
+    
+
+def get_msg_tuple(ids):
+    r=[]
+    for i in ids:
+        if type(i) is int:
+            r.append(get_msg(i))
+        elif type(i) is tuple:
+            r.append(get_msg_tuple(i))
+    return r
+
+r=[]
+for ids in response:
+        r.append(get_msg_tuple(ids))
+
+dumped=yaml.dump(r,default_flow_style=False)  # serialize once, use twice
+print dumped
+with open("envelopes.yaml","w") as out:  # closed even if the write fails
+    out.write(dumped)