From 941cbc3d451b8d0b1d05bfac925d77976e1503ec Mon Sep 17 00:00:00 2001 From: Andreas Stephanides Date: Fri, 4 Aug 2017 07:49:39 +0200 Subject: [PATCH] init learning cats --- classifier.py | 106 +++++++++++++++++ classify_mail.py | 25 ++++ classify_text.py | 42 +++++++ install.sh | 13 +++ requirements.txt | 10 ++ run.py | 238 ++++++++++++++++++++++++++++++++++++++ storage/__init__.py | 3 + storage/database.py | 23 ++++ storage/database_mbase.py | 67 +++++++++++ storage/fetch_mail.py | 37 ++++++ storage/mail_model.py | 126 ++++++++++++++++++++ storage/models.py | 2 + storage/thread_model.py | 93 +++++++++++++++ test_imap.py | 62 ++++++++++ 14 files changed, 847 insertions(+) create mode 100644 classifier.py create mode 100644 classify_mail.py create mode 100644 classify_text.py create mode 100755 install.sh create mode 100644 requirements.txt create mode 100644 run.py create mode 100644 storage/__init__.py create mode 100644 storage/database.py create mode 100644 storage/database_mbase.py create mode 100644 storage/fetch_mail.py create mode 100644 storage/mail_model.py create mode 100644 storage/models.py create mode 100644 storage/thread_model.py create mode 100644 test_imap.py diff --git a/classifier.py b/classifier.py new file mode 100644 index 0000000..e0a3081 --- /dev/null +++ b/classifier.py @@ -0,0 +1,106 @@ +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction import DictVectorizer +from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer +from sklearn.preprocessing import LabelEncoder +from sklearn.pipeline import Pipeline, FeatureUnion +from sklearn.naive_bayes import MultinomialNB + +import numpy as np +import yaml +from storage import MailThread,db_session + +with open("data.yml", 'r') as stream: + try: + train=yaml.load(stream) + except yaml.YAMLError as exc: + print(exc) + +data_types= { "answered": bool, "maintopic": str} + +def store_training_data(i, d,key=b"answered"): + global train + if not data_types.has_key(key): + raise ValueError("Key "+str(key)+" unknown") + if not train.has_key(i): + train[i]={} + if not key is None and type(train[i]) is dict: + if not type(d) is data_types[key]: +# print str(type(d)) + " vs " + str(data_types[key]) + raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d))) + train[i][key]=d + + + with open("data.yml","w") as file: + file.write(yaml.dump(train,default_flow_style=True)) + file.close() + + +# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft) +def get_training_threads(key="answered"): + t_a=[] + d_a=[] + d_a2=[] + for i in train: + t=db_session.query(MailThread).filter(MailThread.firstmail==i).first() + if not t is None: # Thread muss in der Datenbank sein + if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein + t_a.append(t) + d_a.append(train[i][key]) + le=LabelEncoder() + d_a2=le.fit_transform(d_a) + return (t_a,d_a2,le) + + +def in_training(i, key="answered"): + return train.has_key(i) and train[i].has_key(key) + + +def print_answers(l): + cc=l.classes_ + c_id=l.transform(cc) + for i,c in enumerate(cc): + print str(i) + ": " + str(c) + return None + + +class ThreadDictExtractor(BaseEstimator, TransformerMixin): + def fit(self, x, y=None): + return self + def transform(self, X,y=None): + return [t.mail_flat_dict() for t in X] + +class ThreadSubjectExtractor(BaseEstimator, TransformerMixin): + def fit(self, x, y=None): + return self + def transform(self, X,y=None): + return [t.subject() for t in X] + +class ThreadTextExtractor(BaseEstimator, TransformerMixin): + def fit(self, x, y=None): + return self + def transform(self, X,y=None): + return [t.text() for t in X] + + +pipe1=Pipeline([('tde', ThreadDictExtractor()),('dv',DictVectorizer()),('clf', MultinomialNB())]) + +pipe2 = Pipeline([ + ('union', FeatureUnion(transformer_list=[ + ('subject', Pipeline([('tse', ThreadSubjectExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('text', Pipeline([('tte',ThreadTextExtractor()), + ('cv',CountVectorizer()), + ('tfidf', TfidfTransformer()) + ])), + ('envelope', Pipeline([('tde', ThreadDictExtractor()), + ('dv',DictVectorizer()) + ])) + ], transformer_weights={ + 'subject': 1, + 'text': 0.7, + 'envelope': 0.5 + } )), + ('clf', MultinomialNB()) +]) diff --git a/classify_mail.py b/classify_mail.py new file mode 100644 index 0000000..00a87a7 --- /dev/null +++ b/classify_mail.py @@ -0,0 +1,25 @@ +from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer +from sklearn.feature_extraction import DictVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline, FeatureUnion +import sys +import yaml +from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import LabelEncoder + + +text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())]) + +text_ohc = Pipeline([('ohc', OneHotEncoder()),('clf', MultinomialNB())]) + +combined_features = FeatureUnion([('vect1', CountVectorizer()),('vect2', CountVectorizer())]) + + +enc=OneHotEncoder() +with open("example_1.yaml", 'r') as stream: + try: + train=yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + +tc=text_clf.fit(train["data"],train["target"]) diff --git a/classify_text.py b/classify_text.py new file mode 100644 index 0000000..f95aac4 --- /dev/null +++ b/classify_text.py @@ -0,0 +1,42 @@ +from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import Pipeline +text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())]) +import sys +import yaml + + + +with open("example_1.yaml", 'r') as stream: + try: + train=yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + +tc=text_clf.fit(train["data"],train["target"]) +print(sys.argv[1]) + +answ=(tc.predict([sys.argv[1]]))[0] +print train["target_names"][answ] + +for i in range(0, (len(train["target_names"]))): + print (str(i)+" "+ train["target_names"][i]) + +ca=int(raw_input("Correct answer..")) + + +if ca == answ: + print ("Yes I got it right") +else: + print("should I remember this?") + a=raw_input("shoudIrememberthis?") + if a == "y": + train["data"].append(sys.argv[1]) + train["target"].append(ca) + print yaml.dump(train,default_flow_style=False) + file=open("example_1.yaml","w") + file.write(yaml.dump(train,default_flow_style=False)) + file.close() + else: + print ("Ok, I already forgot") + diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..e97f9f8 --- /dev/null +++ b/install.sh @@ -0,0 +1,13 @@ +#!/bin/bash +if test ! -d ".env"; then + echo "Erzeuge virtuelle Umgebung ...." + virtualenv .env +fi +echo "Aktiviere virtuelle Python Umgebung ..." +. .env/bin/activate +echo "Installiere requirements ..." +pip install --upgrade pip +pip install -r requirements.txt +if test ! -e "config.cfg" -a -e "config.cfg.sample"; then + cp config.cfg.sample config.cfg +fi diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8141473 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +imapclient +email +config +sklearn +numpy +scipy +bs4 +sqlalchemy +marshmallow +PyYAML diff --git a/run.py b/run.py new file mode 100644 index 0000000..3ccde40 --- /dev/null +++ b/run.py @@ -0,0 +1,238 @@ +from __future__ import unicode_literals +import imapclient +from config import Config +import sys +from email.header import decode_header +import email +import codecs +import sys +import bs4 +#sys.stdout = codecs.getwriter('utf8')(sys.stdout) +from storage.fetch_mail import fetch_mail +from storage.fetch_mail import fetch_threads, flatten_threads + +from storage import Mail, MailThread, db_session +import yaml +import email +from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2 +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder +import numpy + + + +def train_fit_pipe(): + tt= get_training_threads(b"answered") + print tt[1] + print tt[0] + pipe1.fit(tt[0],tt[1]) + return pipe1,tt[2] +def train_fit_pipe2(): + tt= get_training_threads(b"maintopic") + pipe2.fit(tt[0],tt[1]) + return pipe2,tt[2] + + +def train_single_thread(tid,p,le,key="answered"): + if (not type(tid) is int): raise TypeError("ID must be of type int") + if not type(p) is Pipeline: raise TypeError("Second Argument needs to be type Pipeline") + if not type(le) is LabelEncoder: raise TypeError("Second Argument needs to be type LabelEncoder") + mth=db_session.query(MailThread).filter(MailThread.firstmail==tid).first() + if mth is None: raise ValueError("Thread with firstmail %d not in Database" %tid) + # Predict the value + pre=p.predict([mth]) + answ=pre[0] + # + print mth.to_text() + print mth.text() + print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0]) + print_answers(le) + + ca=raw_input("Correct answer..") + try: + ca=int(ca) + + except ValueError: + print "String Data" + if type(ca)==int: + if ca == answ: + print ("Yes I got it right") + else: + print("Oh no...!") + l=le.inverse_transform([ca])[0] + if type(l) is numpy.bool_: + l=bool(l) + if type(l) is numpy.string_: + l=str(l) + store_training_data(tid,l, key) + elif not ca.strip() == "": + store_training_data(tid, ca, key) + else: + print "couldn't handle %s" % ca + + +#print "arg1:"+sys.argv[1] +if len(sys.argv)>1: + if sys.argv[1] == "fetch_threads": + print flatten_threads(fetch_threads()) + + + if sys.argv[1] == "print_threads": + mth=db_session.query(MailThread).all() + for t in mth: + print t.firstmail + print t.mail_flat_dict() + + if sys.argv[1] == "print_thrd": + if len(sys.argv)<3: + mth=db_session.query(MailThread).all() + for t in mth: + print t.firstmail + else: + t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first() + + print t.firstmail + print t.subject() + print t.text() + + + if sys.argv[1] == "print_threads2": + mth=db_session.query(MailThread).all() + for t in mth: + print t.to_text() + print "---------------\n" + + if sys.argv[1] == "train_thrd2": + p, le=train_fit_pipe2() + train_single_thread(int(sys.argv[2]),p,le,b"maintopic") + if sys.argv[1] == "train_all2": + p, labelencoder=train_fit_pipe2() + mth=db_session.query(MailThread).all() + print mth + for t in mth: + if not in_training(t.firstmail,"maintopic"): + print "---------------------------------------------------" + print "---------------------------------------------------" + print t.firstmail + print t.text() + train_single_thread(t.firstmail, p, labelencoder, b"maintopic") + + + if sys.argv[1] == "testpipe2": + from classifier import ThreadSubjectExtractor, ThreadTextExtractor + pipe2,le=train_fit_pipe2() + + if len(sys.argv)>2: + t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first() + print t.to_text() + print le.inverse_transform(pipe2.predict([t])) + + + if sys.argv[1] == "train_thrd": + pipe1, labelencoder=train_fit_pipe() + train_single_thread(int(sys.argv[2]),pipe1,labelencoder) + + if sys.argv[1] == "train_all": + pipe1, labelencoder=train_fit_pipe() + mth=db_session.query(MailThread).all() + print mth + for t in mth: + if not in_training(t.firstmail): + print "---------------------------------------------------" + print "---------------------------------------------------" + print t.firstmail + train_single_thread(t.firstmail,pipe1,labelencoder) + + if sys.argv[1] == "print_thread": + mth=db_session.query(MailThread).filter(MailThread.firstmail==int(sys.argv[2])).first() + print mth.mail_dicts() + print mth.mail_flat_dict() + + if sys.argv[1] == "store_threads": + thrds=flatten_threads(fetch_threads()) + for t in thrds: + if type(t[0]) is int: + th=db_session.query(MailThread).filter(MailThread.firstmail==t[0]).first() + if th == None: + th=MailThread() + th.firstmail=t[0] + if not th.body == yaml.dump(t): + th.body=yaml.dump(t) + th.islabeled=False + th.opened=True + else: + th.body=yaml.dump(t) + db_session.add(th) + db_session.commit() + print thrds + + + + if sys.argv[1] == "print_mail": + mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first() + mm.compile_text() + mm.compile_envelope() + print mm.subject + print "----------" + print mm.text + + if sys.argv[1] == "mail_dict_test": + mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first() + mm.compile_envelope() + print mm.dict_envelope() + + + if sys.argv[1] == "load_mail": + mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first() + mm.compile_text() + print mm.text + env=yaml.load(mm.envelope) + print env.subject + print env + + + if sys.argv[1] == "store_mail": + m=fetch_mail(int(sys.argv[2])) + mm=Mail() + mm.envelope=yaml.dump(m['ENVELOPE']) + mm.body=yaml.dump(m['RFC822']) + mm.id=m['id'] + db_session.add(mm) + db_session.commit() + + + if sys.argv[1] == "fetch_mail": + print "fetching mail %d " % int(sys.argv[2]) + m=fetch_mail(int(sys.argv[2])) + hd=decode_header(m['ENVELOPE'].subject) + hd2=[] + # print hd + for h in hd: + if not h[1] is None: + hd2.append(h[0].decode(h[1])) + # print h[0].decode(h[1]) + else: + hd2.append(h[0]) + print "\nBetreff:" + for h in hd2: + print h + print "FROM:" + for t in m['ENVELOPE'].from_: + print t + print "TO:" + for t in m['ENVELOPE'].to: + print t + em=email.message_from_string(m['RFC822']) + for p in em.walk(): + if p.get_content_maintype()=="text": + print p.get_payload() + elif p.get_content_maintype()=="multipart": + print p.get_payload() + else: + print p.get_content_maintype() + + + + if sys.argv[1] == "initdb": + from storage import init_db + init_db() diff --git a/storage/__init__.py b/storage/__init__.py new file mode 100644 index 0000000..0185283 --- /dev/null +++ b/storage/__init__.py @@ -0,0 +1,3 @@ +from database import db_session, init_db +from mail_model import Mail +from thread_model import MailThread diff --git a/storage/database.py b/storage/database.py new file mode 100644 index 0000000..97df376 --- /dev/null +++ b/storage/database.py @@ -0,0 +1,23 @@ +from sqlalchemy import create_engine +from sqlalchemy.orm import scoped_session, sessionmaker +from sqlalchemy.ext.declarative import declarative_base +from config import Config +from database_mbase import MyBase +import os +f=file('config.cfg') +cfg=Config(f) + + +if cfg.get("db_main_type") == "mysql": + engine = create_engine("mysql+pymysql://%s:%s@localhost/crawler_articles?charset=utf8" % (cfg.get("db_main_user"), cfg.get("db_main_pw"))) +else: + engine = create_engine('sqlite:///'+ os.path.join(cfg.db_path,cfg.db_mainfile), convert_unicode=True) + + +db_session = scoped_session(sessionmaker(autocommit=False,# autoflush=False, + bind=engine)) + +Base=declarative_base(cls=MyBase) +def init_db(): + import models + Base.metadata.create_all(bind=engine) diff --git a/storage/database_mbase.py b/storage/database_mbase.py new file mode 100644 index 0000000..9ecff65 --- /dev/null +++ b/storage/database_mbase.py @@ -0,0 +1,67 @@ +from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Index, TIMESTAMP + +from datetime import datetime +class MyBase(object): + id = Column(Integer, primary_key=True) + created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False) + updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False) + def __json__(self): + if self.__jsonattrs__ is None: + return self.__schema__().dump(self)[0] + else: + return self.__schema__(only=self.__jsonattrs__).dump(self)[0] +# def __init__(self, data={}): +# self.update(data,False) + + def update(self,data, partial=True): + data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial) + if len(errors)>0: + print errors + return (False,errors) + else: + for a in self.__whiteattrs__: + if data.has_key(a): + setattr(self,a,data[a]) + return (True, []) + + @classmethod + def deserialize(cls,data): + data, errors=cls.__schema__().load(data,partial=True) + a=cls() + for c in cls.__table__.columns: + if data.has_key(c.key): + setattr(a, c.key,data[c.key]) + return a + +class MyBase2(object): + id = Column(Integer, primary_key=True) +# created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False) +# updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False) + def __json__(self): + if self.__jsonattrs__ is None: + return self.__schema__().dump(self)[0] + else: + return self.__schema__(only=self.__jsonattrs__).dump(self)[0] +# def __init__(self, data={}): +# self.update(data,False) + + def update(self,data, partial=True): + data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial) + if len(errors)>0: + print errors + return (False,errors) + else: + for a in self.__whiteattrs__: + if data.has_key(a): + setattr(self,a,data[a]) + return (True, []) + + @classmethod + def deserialize(cls,data): + data, errors=cls.__schema__().load(data,partial=True) + a=cls() + for c in cls.__table__.columns: + if data.has_key(c.key): + setattr(a, c.key,data[c.key]) + return a + diff --git a/storage/fetch_mail.py b/storage/fetch_mail.py new file mode 100644 index 0000000..a8891ab --- /dev/null +++ b/storage/fetch_mail.py @@ -0,0 +1,37 @@ +import imapclient +from datetime import date + +from config import Config +f=file('config.cfg') +cfg=Config(f) +server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True) +server.login(cfg.user, cfg.password) +server.select_folder('INBOX') + + +def fetch_mail(myid): + m=server.fetch([myid],['ENVELOPE','RFC822']) + m=m[myid] + m["id"]=myid + return m + +def fetch_thread(tp): + return tp + +def fetch_threads(): + src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,07,01)]) + #, b'BEFORE', date(2017,08,01) + return src + + +def flatten_threads(thrds, array=[], level=0): + if level > 0: + for t in thrds: + if type(t) is tuple: + array = array + (flatten_threads(t,[],1)) + else: + array.append(t) + else: + for t in thrds: + array.append(flatten_threads(t,[],1)) + return array diff --git a/storage/mail_model.py b/storage/mail_model.py new file mode 100644 index 0000000..6e1cb8e --- /dev/null +++ b/storage/mail_model.py @@ -0,0 +1,126 @@ +from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode +from sqlalchemy.orm import relationship +from datetime import datetime +from database import Base +from database import db_session +from email.header import decode_header +from marshmallow import Schema, fields, post_load +import yaml +import email +from fetch_mail import fetch_mail +import bs4 +class FullMailSchema(Schema): + id=fields.Integer() + text=fields.String() + body=fields.String() + envelope=fields.String() + + + +class Mail(Base): + __tablename__ = 'mails' + id = Column(Integer, primary_key=True) + date = Column(DateTime) + envelope = Column(Text) + body = Column(Text) + text = Column(Text) + from_ = Column(Text) + from_mailbox=Column(String) + from_host=Column(String) + to_ = Column(Text) + to_mailbox = Column(Text) + to_host=Column(String) + subject = Column(Text) + __schema__=FullMailSchema + __jsonid__='mail' + __whiteattrs__= ["text", "envelope"] + __jsonattrs__=None + + + @classmethod + def fetch_mail(self,mid): + m=fetch_mail(mid) + mm=Mail() + mm.envelope=yaml.dump(m['ENVELOPE']) + em=email.message_from_string(m['RFC822']) + if type(em.get_payload()) is list: + pt=[] + for p in em.walk(): + if p.get_content_maintype() == "text": + pt.append(p) + em.set_payload(pt) + mm.body=yaml.dump(str(em)) + mm.id=m['id'] + db_session.add(mm) + db_session.commit() + return mm + + def get_email(self): + em=email.message_from_string(yaml.load(self.body)) + return em + + def compile_envelope(self): + env=yaml.load(self.envelope) + hd=decode_header(env.subject) + hd2=[] + + for h in hd: + if not h[1] is None: + hd2.append(h[0].decode(h[1])) + # print h[0].decode(h[1]) + else: + hd2.append(h[0]) + self.subject=yaml.dump(hd2) + to_array=[] + from_array=[] +# print "Status" + # print env + if env.to is None: + print self.id + else: + for t in env.to: + a={"host": t.host, "mail": t.mailbox} + to_array.append(a) + self.to_=yaml.dump(to_array) + for t in env.from_: + a={"host": t.host, "mail": t.mailbox} + from_array.append(a) + self.to_=yaml.dump(to_array) + self.from_=yaml.dump(from_array) + return None + + def dict_envelope(self): + d={} + i=0 + for p in yaml.load(self.subject): + if p is not None: + d["subject_"+str(i)]=p + i=i+1 + i=0 + for p in yaml.load(self.to_): + if p["host"] is not None: + d["to_host_"+str(i)]=p["host"] + if p["mail"] is not None: + d["to_mailbox_"+str(i)]=p["mail"] + i=i+1 + i=0 + for p in yaml.load(self.from_): + if p["host"] is not None: + d["from_host_"+str(i)]=p["host"] + if p["mail"] is not None: + d["from_mailbox_"+str(i)]=p["mail"] + i=i+1 + return d + + def compile_text(self): + for p in self.get_email().walk(): + if p.get_content_maintype()=="text": + pl=p.get_payload(decode=True) +# print pl +# print p.get_content_type() + if p.get_content_subtype()=="html": + + b4=bs4.BeautifulSoup(pl,"html.parser") + self.text= yaml.dump(b4.get_text()) + else: + self.text =yaml.dump( pl) diff --git a/storage/models.py b/storage/models.py new file mode 100644 index 0000000..bf6f2b9 --- /dev/null +++ b/storage/models.py @@ -0,0 +1,2 @@ +from mail_model import Mail +from thread_model import MailThread diff --git a/storage/thread_model.py b/storage/thread_model.py new file mode 100644 index 0000000..b3f83b2 --- /dev/null +++ b/storage/thread_model.py @@ -0,0 +1,93 @@ +from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode +from sqlalchemy.orm import relationship +from datetime import datetime +from database import Base +from database import db_session +from email.header import decode_header +from marshmallow import Schema, fields, post_load +import yaml +import email +from mail_model import Mail +#from fetch_mail import fetch_mail + +class FullThreadSchema(Schema): + id=fields.Integer() + text=fields.String() + body=fields.String() + envelope=fields.String() + + +class MailThread(Base): + __tablename__ = 'threads' + id = Column(Integer, primary_key=True) + firstmail = Column(Integer) + islabeled = Column(Boolean) + opened = Column(Boolean) + body = Column(Text) + __schema__=FullThreadSchema + __jsonid__='thread' + __whiteattrs__= ["body"] + __jsonattrs__=None + def bdy(self): + return yaml.load(self.body) + def to_text(self): + mmm=self.mails() + txt="" + for m in mmm: + m.compile_envelope() + txt=txt+"mail: \n" + for f in yaml.load(m.from_): + txt=txt+f["mail"]+"@"+f["host"] + txt=txt+" --- " + txt=txt+" ".join(yaml.load(m.subject)) + txt=txt+"\n" + return txt + + def mails(self): + a=[] +# print self.bdy() + for m in self.bdy(): + mail=db_session.query(Mail).filter(Mail.id==int(m)).first() + if mail is None: + mail=Mail.fetch_mail(int(m)) + a.append(mail) + return a + + def mail_dicts(self): + a=[] +# print "maildicts: "+ str(self.mails()) + for m in self.mails(): + m.compile_envelope() + a.append(m.dict_envelope()) + return a + def mail_flat_dict(self): + a=[] + d={} + dc=self.mail_dicts() +# print dc + for i in range(0,len(dc)): + for k, v in dc[i].iteritems(): + d["mail_"+str(i)+"_"+k]=v + return d + def subject(self): + a="" + for m in self.mails(): + m.compile_envelope() + a=a + " ".join(yaml.load(m.subject))+"\n" + + return a + + def text(self): + a=u"" + for m in self.mails(): + m.compile_text() + t=yaml.load(m.text) + if type(t) is unicode: + txt=t + else: +# print "withintm:"+str(type(t)) + t=t.decode("ISO-8859-1") + txt=t + a=a+txt+"\n\n" + + return a diff --git a/test_imap.py b/test_imap.py new file mode 100644 index 0000000..27bccc9 --- /dev/null +++ b/test_imap.py @@ -0,0 +1,62 @@ +#!.env/bin/python +from __future__ import unicode_literals +from imapclient import IMAPClient +from datetime import date +import yaml +HOST="buran.htu.tuwien.ac.at" +USERNAME="andis" +PASSWORD="t4MJAvU2" +ssl=True +server=IMAPClient(HOST, use_uid=True, ssl=ssl) +server.login(USERNAME,PASSWORD) + +select_info=server.select_folder('INBOX') + +messages=server.search([u'SUBJECT', 'service',u'SINCE', date(2017,06,1)]) +#pritn(select_info) + + +#response = server.fetch(messages, ['FLAGS', 'RFC822.SIZE', 'BODY', 'ENVELOPE','X-GM-THRID', 'X-GM-MSGID']) +#response = server.fetch(messages, ['ENVELOPE']) + +#print(response) +#for msgid, data in response.iteritems(): +# print(' ID %d: %d bytes, flags=%s, %s' % (msgid, +# data[b'RFC822.SIZE'], +# data[b'FLAGS'], data['ENVELOPE'])) +#response = server.thread() + +print "\n\n --------------------------------\n" +response= server.thread(criteria=[u'SUBJECT', 'service',u'SINCE', date(2017,04,1)]) +print(response) +#resp=server.thread('X-GM') +#for msgid, data in response.iteritems(): +# print(' ID %d: \t %s \t %s' % (msgid, data[b'X-GM-THRID'], data[b'X-GM-MSGID'])) +print "\n---------------------\n" +print response[0], len(response[0]) + + +def get_msg(mid): + print mid + sf=server.fetch([mid],['ENVELOPE']) + for msgid, data in sf.iteritems(): + return {"msgid": msgid, "envelope": data[b'ENVELOPE']} + + +def get_msg_tuple(ids): + r=[] + for i in ids: + if type(i) is int: + r.append(get_msg(i)) + elif type(i) is tuple: + r.append(get_msg_tuple(i)) + return r + +r=[] +for ids in response: + r.append(get_msg_tuple(ids)) + +print yaml.dump(r,default_flow_style=False) +file=open("envelopes.yaml","w") +file.write(yaml.dump(r,default_flow_style=False)) +file.close()