init learning cats
This commit is contained in:
106
classifier.py
Normal file
106
classifier.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from sklearn.base import BaseEstimator, TransformerMixin
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.pipeline import Pipeline, FeatureUnion
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
|
||||
import numpy as np
|
||||
import yaml
|
||||
from storage import MailThread,db_session
|
||||
|
||||
with open("data.yml", 'r') as stream:
|
||||
try:
|
||||
train=yaml.load(stream)
|
||||
except yaml.YAMLError as exc:
|
||||
print(exc)
|
||||
|
||||
data_types= { "answered": bool, "maintopic": str}
|
||||
|
||||
def store_training_data(i, d,key=b"answered"):
|
||||
global train
|
||||
if not data_types.has_key(key):
|
||||
raise ValueError("Key "+str(key)+" unknown")
|
||||
if not train.has_key(i):
|
||||
train[i]={}
|
||||
if not key is None and type(train[i]) is dict:
|
||||
if not type(d) is data_types[key]:
|
||||
# print str(type(d)) + " vs " + str(data_types[key])
|
||||
raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
|
||||
train[i][key]=d
|
||||
|
||||
|
||||
with open("data.yml","w") as file:
|
||||
file.write(yaml.dump(train,default_flow_style=True))
|
||||
file.close()
|
||||
|
||||
|
||||
# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft)
|
||||
def get_training_threads(key="answered"):
|
||||
t_a=[]
|
||||
d_a=[]
|
||||
d_a2=[]
|
||||
for i in train:
|
||||
t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
|
||||
if not t is None: # Thread muss in der Datenbank sein
|
||||
if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
|
||||
t_a.append(t)
|
||||
d_a.append(train[i][key])
|
||||
le=LabelEncoder()
|
||||
d_a2=le.fit_transform(d_a)
|
||||
return (t_a,d_a2,le)
|
||||
|
||||
|
||||
def in_training(i, key="answered"):
|
||||
return train.has_key(i) and train[i].has_key(key)
|
||||
|
||||
|
||||
def print_answers(l):
|
||||
cc=l.classes_
|
||||
c_id=l.transform(cc)
|
||||
for i,c in enumerate(cc):
|
||||
print str(i) + ": " + str(c)
|
||||
return None
|
||||
|
||||
|
||||
class ThreadDictExtractor(BaseEstimator, TransformerMixin):
|
||||
def fit(self, x, y=None):
|
||||
return self
|
||||
def transform(self, X,y=None):
|
||||
return [t.mail_flat_dict() for t in X]
|
||||
|
||||
class ThreadSubjectExtractor(BaseEstimator, TransformerMixin):
|
||||
def fit(self, x, y=None):
|
||||
return self
|
||||
def transform(self, X,y=None):
|
||||
return [t.subject() for t in X]
|
||||
|
||||
class ThreadTextExtractor(BaseEstimator, TransformerMixin):
|
||||
def fit(self, x, y=None):
|
||||
return self
|
||||
def transform(self, X,y=None):
|
||||
return [t.text() for t in X]
|
||||
|
||||
|
||||
pipe1=Pipeline([('tde', ThreadDictExtractor()),('dv',DictVectorizer()),('clf', MultinomialNB())])
|
||||
|
||||
pipe2 = Pipeline([
|
||||
('union', FeatureUnion(transformer_list=[
|
||||
('subject', Pipeline([('tse', ThreadSubjectExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('text', Pipeline([('tte',ThreadTextExtractor()),
|
||||
('cv',CountVectorizer()),
|
||||
('tfidf', TfidfTransformer())
|
||||
])),
|
||||
('envelope', Pipeline([('tde', ThreadDictExtractor()),
|
||||
('dv',DictVectorizer())
|
||||
]))
|
||||
], transformer_weights={
|
||||
'subject': 1,
|
||||
'text': 0.7,
|
||||
'envelope': 0.5
|
||||
} )),
|
||||
('clf', MultinomialNB())
|
||||
])
|
||||
25
classify_mail.py
Normal file
25
classify_mail.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline, FeatureUnion
|
||||
import sys
|
||||
import yaml
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
|
||||
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
|
||||
|
||||
text_ohc = Pipeline([('ohc', OneHotEncoder()),('clf', MultinomialNB())])
|
||||
|
||||
combined_features = FeatureUnion([('vect1', CountVectorizer()),('vect2', CountVectorizer())])
|
||||
|
||||
|
||||
enc=OneHotEncoder()
|
||||
with open("example_1.yaml", 'r') as stream:
|
||||
try:
|
||||
train=yaml.safe_load(stream)
|
||||
except yaml.YAMLError as exc:
|
||||
print(exc)
|
||||
|
||||
tc=text_clf.fit(train["data"],train["target"])
|
||||
42
classify_text.py
Normal file
42
classify_text.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
|
||||
import sys
|
||||
import yaml
|
||||
|
||||
|
||||
|
||||
with open("example_1.yaml", 'r') as stream:
|
||||
try:
|
||||
train=yaml.safe_load(stream)
|
||||
except yaml.YAMLError as exc:
|
||||
print(exc)
|
||||
|
||||
tc=text_clf.fit(train["data"],train["target"])
|
||||
print(sys.argv[1])
|
||||
|
||||
answ=(tc.predict([sys.argv[1]]))[0]
|
||||
print train["target_names"][answ]
|
||||
|
||||
for i in range(0, (len(train["target_names"]))):
|
||||
print (str(i)+" "+ train["target_names"][i])
|
||||
|
||||
ca=int(raw_input("Correct answer.."))
|
||||
|
||||
|
||||
if ca == answ:
|
||||
print ("Yes I got it right")
|
||||
else:
|
||||
print("should I remember this?")
|
||||
a=raw_input("shoudIrememberthis?")
|
||||
if a == "y":
|
||||
train["data"].append(sys.argv[1])
|
||||
train["target"].append(ca)
|
||||
print yaml.dump(train,default_flow_style=False)
|
||||
file=open("example_1.yaml","w")
|
||||
file.write(yaml.dump(train,default_flow_style=False))
|
||||
file.close()
|
||||
else:
|
||||
print ("Ok, I already forgot")
|
||||
|
||||
13
install.sh
Executable file
13
install.sh
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
|
||||
if test ! -d ".env"; then
|
||||
echo "Erzeuge virtuelle Umgebung ...."
|
||||
virtualenv .env
|
||||
fi
|
||||
echo "Aktiviere virtuelle Python Umgebung ..."
|
||||
. .env/bin/activate
|
||||
echo "Installiere requirements ..."
|
||||
pip install --upgrade pip
|
||||
pip install -r requirements.txt
|
||||
if test ! -e "config.cfg" -a -e "config.cfg.sample"; then
|
||||
cp config.cfg.sample config.cfg
|
||||
fi
|
||||
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
imapclient
|
||||
email
|
||||
config
|
||||
sklearn
|
||||
numpy
|
||||
scipy
|
||||
bs4
|
||||
sqlalchemy
|
||||
marshmallow
|
||||
PyYAML
|
||||
238
run.py
Normal file
238
run.py
Normal file
@@ -0,0 +1,238 @@
|
||||
from __future__ import unicode_literals
|
||||
import imapclient
|
||||
from config import Config
|
||||
import sys
|
||||
from email.header import decode_header
|
||||
import email
|
||||
import codecs
|
||||
import sys
|
||||
import bs4
|
||||
#sys.stdout = codecs.getwriter('utf8')(sys.stdout)
|
||||
from storage.fetch_mail import fetch_mail
|
||||
from storage.fetch_mail import fetch_threads, flatten_threads
|
||||
|
||||
from storage import Mail, MailThread, db_session
|
||||
import yaml
|
||||
import email
|
||||
from classifier import get_training_threads, ThreadDictExtractor, pipe1, print_answers, in_training, store_training_data, pipe2
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import numpy
|
||||
|
||||
|
||||
|
||||
def train_fit_pipe():
|
||||
tt= get_training_threads(b"answered")
|
||||
print tt[1]
|
||||
print tt[0]
|
||||
pipe1.fit(tt[0],tt[1])
|
||||
return pipe1,tt[2]
|
||||
def train_fit_pipe2():
|
||||
tt= get_training_threads(b"maintopic")
|
||||
pipe2.fit(tt[0],tt[1])
|
||||
return pipe2,tt[2]
|
||||
|
||||
|
||||
def train_single_thread(tid,p,le,key="answered"):
|
||||
if (not type(tid) is int): raise TypeError("ID must be of type int")
|
||||
if not type(p) is Pipeline: raise TypeError("Second Argument needs to be type Pipeline")
|
||||
if not type(le) is LabelEncoder: raise TypeError("Second Argument needs to be type LabelEncoder")
|
||||
mth=db_session.query(MailThread).filter(MailThread.firstmail==tid).first()
|
||||
if mth is None: raise ValueError("Thread with firstmail %d not in Database" %tid)
|
||||
# Predict the value
|
||||
pre=p.predict([mth])
|
||||
answ=pre[0]
|
||||
#
|
||||
print mth.to_text()
|
||||
print mth.text()
|
||||
print "Status is answered is estimated to be: " + str(le.inverse_transform(pre)[0])
|
||||
print_answers(le)
|
||||
|
||||
ca=raw_input("Correct answer..")
|
||||
try:
|
||||
ca=int(ca)
|
||||
|
||||
except ValueError:
|
||||
print "String Data"
|
||||
if type(ca)==int:
|
||||
if ca == answ:
|
||||
print ("Yes I got it right")
|
||||
else:
|
||||
print("Oh no...!")
|
||||
l=le.inverse_transform([ca])[0]
|
||||
if type(l) is numpy.bool_:
|
||||
l=bool(l)
|
||||
if type(l) is numpy.string_:
|
||||
l=str(l)
|
||||
store_training_data(tid,l, key)
|
||||
elif not ca.strip() == "":
|
||||
store_training_data(tid, ca, key)
|
||||
else:
|
||||
print "couldn't handle %s" % ca
|
||||
|
||||
|
||||
#print "arg1:"+sys.argv[1]
|
||||
if len(sys.argv)>1:
|
||||
if sys.argv[1] == "fetch_threads":
|
||||
print flatten_threads(fetch_threads())
|
||||
|
||||
|
||||
if sys.argv[1] == "print_threads":
|
||||
mth=db_session.query(MailThread).all()
|
||||
for t in mth:
|
||||
print t.firstmail
|
||||
print t.mail_flat_dict()
|
||||
|
||||
if sys.argv[1] == "print_thrd":
|
||||
if len(sys.argv)<3:
|
||||
mth=db_session.query(MailThread).all()
|
||||
for t in mth:
|
||||
print t.firstmail
|
||||
else:
|
||||
t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first()
|
||||
|
||||
print t.firstmail
|
||||
print t.subject()
|
||||
print t.text()
|
||||
|
||||
|
||||
if sys.argv[1] == "print_threads2":
|
||||
mth=db_session.query(MailThread).all()
|
||||
for t in mth:
|
||||
print t.to_text()
|
||||
print "---------------\n"
|
||||
|
||||
if sys.argv[1] == "train_thrd2":
|
||||
p, le=train_fit_pipe2()
|
||||
train_single_thread(int(sys.argv[2]),p,le,b"maintopic")
|
||||
if sys.argv[1] == "train_all2":
|
||||
p, labelencoder=train_fit_pipe2()
|
||||
mth=db_session.query(MailThread).all()
|
||||
print mth
|
||||
for t in mth:
|
||||
if not in_training(t.firstmail,"maintopic"):
|
||||
print "---------------------------------------------------"
|
||||
print "---------------------------------------------------"
|
||||
print t.firstmail
|
||||
print t.text()
|
||||
train_single_thread(t.firstmail, p, labelencoder, b"maintopic")
|
||||
|
||||
|
||||
if sys.argv[1] == "testpipe2":
|
||||
from classifier import ThreadSubjectExtractor, ThreadTextExtractor
|
||||
pipe2,le=train_fit_pipe2()
|
||||
|
||||
if len(sys.argv)>2:
|
||||
t=db_session.query(MailThread).filter(MailThread.firstmail==sys.argv[2]).first()
|
||||
print t.to_text()
|
||||
print le.inverse_transform(pipe2.predict([t]))
|
||||
|
||||
|
||||
if sys.argv[1] == "train_thrd":
|
||||
pipe1, labelencoder=train_fit_pipe()
|
||||
train_single_thread(int(sys.argv[2]),pipe1,labelencoder)
|
||||
|
||||
if sys.argv[1] == "train_all":
|
||||
pipe1, labelencoder=train_fit_pipe()
|
||||
mth=db_session.query(MailThread).all()
|
||||
print mth
|
||||
for t in mth:
|
||||
if not in_training(t.firstmail):
|
||||
print "---------------------------------------------------"
|
||||
print "---------------------------------------------------"
|
||||
print t.firstmail
|
||||
train_single_thread(t.firstmail,pipe1,labelencoder)
|
||||
|
||||
if sys.argv[1] == "print_thread":
|
||||
mth=db_session.query(MailThread).filter(MailThread.firstmail==int(sys.argv[2])).first()
|
||||
print mth.mail_dicts()
|
||||
print mth.mail_flat_dict()
|
||||
|
||||
if sys.argv[1] == "store_threads":
|
||||
thrds=flatten_threads(fetch_threads())
|
||||
for t in thrds:
|
||||
if type(t[0]) is int:
|
||||
th=db_session.query(MailThread).filter(MailThread.firstmail==t[0]).first()
|
||||
if th == None:
|
||||
th=MailThread()
|
||||
th.firstmail=t[0]
|
||||
if not th.body == yaml.dump(t):
|
||||
th.body=yaml.dump(t)
|
||||
th.islabeled=False
|
||||
th.opened=True
|
||||
else:
|
||||
th.body=yaml.dump(t)
|
||||
db_session.add(th)
|
||||
db_session.commit()
|
||||
print thrds
|
||||
|
||||
|
||||
|
||||
if sys.argv[1] == "print_mail":
|
||||
mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first()
|
||||
mm.compile_text()
|
||||
mm.compile_envelope()
|
||||
print mm.subject
|
||||
print "----------"
|
||||
print mm.text
|
||||
|
||||
if sys.argv[1] == "mail_dict_test":
|
||||
mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first()
|
||||
mm.compile_envelope()
|
||||
print mm.dict_envelope()
|
||||
|
||||
|
||||
if sys.argv[1] == "load_mail":
|
||||
mm=db_session.query(Mail).filter(Mail.id==int(sys.argv[2])).first()
|
||||
mm.compile_text()
|
||||
print mm.text
|
||||
env=yaml.load(mm.envelope)
|
||||
print env.subject
|
||||
print env
|
||||
|
||||
|
||||
if sys.argv[1] == "store_mail":
|
||||
m=fetch_mail(int(sys.argv[2]))
|
||||
mm=Mail()
|
||||
mm.envelope=yaml.dump(m['ENVELOPE'])
|
||||
mm.body=yaml.dump(m['RFC822'])
|
||||
mm.id=m['id']
|
||||
db_session.add(mm)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
if sys.argv[1] == "fetch_mail":
|
||||
print "fetching mail %d " % int(sys.argv[2])
|
||||
m=fetch_mail(int(sys.argv[2]))
|
||||
hd=decode_header(m['ENVELOPE'].subject)
|
||||
hd2=[]
|
||||
# print hd
|
||||
for h in hd:
|
||||
if not h[1] is None:
|
||||
hd2.append(h[0].decode(h[1]))
|
||||
# print h[0].decode(h[1])
|
||||
else:
|
||||
hd2.append(h[0])
|
||||
print "\nBetreff:"
|
||||
for h in hd2:
|
||||
print h
|
||||
print "FROM:"
|
||||
for t in m['ENVELOPE'].from_:
|
||||
print t
|
||||
print "TO:"
|
||||
for t in m['ENVELOPE'].to:
|
||||
print t
|
||||
em=email.message_from_string(m['RFC822'])
|
||||
for p in em.walk():
|
||||
if p.get_content_maintype()=="text":
|
||||
print p.get_payload()
|
||||
elif p.get_content_maintype()=="multipart":
|
||||
print p.get_payload()
|
||||
else:
|
||||
print p.get_content_maintype()
|
||||
|
||||
|
||||
|
||||
if sys.argv[1] == "initdb":
|
||||
from storage import init_db
|
||||
init_db()
|
||||
3
storage/__init__.py
Normal file
3
storage/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from database import db_session, init_db
|
||||
from mail_model import Mail
|
||||
from thread_model import MailThread
|
||||
23
storage/database.py
Normal file
23
storage/database.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import scoped_session, sessionmaker
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from config import Config
|
||||
from database_mbase import MyBase
|
||||
import os
|
||||
f=file('config.cfg')
|
||||
cfg=Config(f)
|
||||
|
||||
|
||||
if cfg.get("db_main_type") == "mysql":
|
||||
engine = create_engine("mysql+pymysql://%s:%s@localhost/crawler_articles?charset=utf8" % (cfg.get("db_main_user"), cfg.get("db_main_pw")))
|
||||
else:
|
||||
engine = create_engine('sqlite:///'+ os.path.join(cfg.db_path,cfg.db_mainfile), convert_unicode=True)
|
||||
|
||||
|
||||
db_session = scoped_session(sessionmaker(autocommit=False,# autoflush=False,
|
||||
bind=engine))
|
||||
|
||||
Base=declarative_base(cls=MyBase)
|
||||
def init_db():
|
||||
import models
|
||||
Base.metadata.create_all(bind=engine)
|
||||
67
storage/database_mbase.py
Normal file
67
storage/database_mbase.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Index, TIMESTAMP
|
||||
|
||||
from datetime import datetime
|
||||
class MyBase(object):
|
||||
id = Column(Integer, primary_key=True)
|
||||
created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
|
||||
updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
||||
def __json__(self):
|
||||
if self.__jsonattrs__ is None:
|
||||
return self.__schema__().dump(self)[0]
|
||||
else:
|
||||
return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
|
||||
# def __init__(self, data={}):
|
||||
# self.update(data,False)
|
||||
|
||||
def update(self,data, partial=True):
|
||||
data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
|
||||
if len(errors)>0:
|
||||
print errors
|
||||
return (False,errors)
|
||||
else:
|
||||
for a in self.__whiteattrs__:
|
||||
if data.has_key(a):
|
||||
setattr(self,a,data[a])
|
||||
return (True, [])
|
||||
|
||||
@classmethod
|
||||
def deserialize(cls,data):
|
||||
data, errors=cls.__schema__().load(data,partial=True)
|
||||
a=cls()
|
||||
for c in cls.__table__.columns:
|
||||
if data.has_key(c.key):
|
||||
setattr(a, c.key,data[c.key])
|
||||
return a
|
||||
|
||||
class MyBase2(object):
|
||||
id = Column(Integer, primary_key=True)
|
||||
# created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
|
||||
# updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
|
||||
def __json__(self):
|
||||
if self.__jsonattrs__ is None:
|
||||
return self.__schema__().dump(self)[0]
|
||||
else:
|
||||
return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
|
||||
# def __init__(self, data={}):
|
||||
# self.update(data,False)
|
||||
|
||||
def update(self,data, partial=True):
|
||||
data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
|
||||
if len(errors)>0:
|
||||
print errors
|
||||
return (False,errors)
|
||||
else:
|
||||
for a in self.__whiteattrs__:
|
||||
if data.has_key(a):
|
||||
setattr(self,a,data[a])
|
||||
return (True, [])
|
||||
|
||||
@classmethod
|
||||
def deserialize(cls,data):
|
||||
data, errors=cls.__schema__().load(data,partial=True)
|
||||
a=cls()
|
||||
for c in cls.__table__.columns:
|
||||
if data.has_key(c.key):
|
||||
setattr(a, c.key,data[c.key])
|
||||
return a
|
||||
|
||||
37
storage/fetch_mail.py
Normal file
37
storage/fetch_mail.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import imapclient
|
||||
from datetime import date
|
||||
|
||||
from config import Config
|
||||
f=file('config.cfg')
|
||||
cfg=Config(f)
|
||||
server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
|
||||
server.login(cfg.user, cfg.password)
|
||||
server.select_folder('INBOX')
|
||||
|
||||
|
||||
def fetch_mail(myid):
|
||||
m=server.fetch([myid],['ENVELOPE','RFC822'])
|
||||
m=m[myid]
|
||||
m["id"]=myid
|
||||
return m
|
||||
|
||||
def fetch_thread(tp):
|
||||
return tp
|
||||
|
||||
def fetch_threads():
|
||||
src=server.thread(criteria=[b'SUBJECT', b'service', b'SINCE', date(2017,07,01)])
|
||||
#, b'BEFORE', date(2017,08,01)
|
||||
return src
|
||||
|
||||
|
||||
def flatten_threads(thrds, array=[], level=0):
|
||||
if level > 0:
|
||||
for t in thrds:
|
||||
if type(t) is tuple:
|
||||
array = array + (flatten_threads(t,[],1))
|
||||
else:
|
||||
array.append(t)
|
||||
else:
|
||||
for t in thrds:
|
||||
array.append(flatten_threads(t,[],1))
|
||||
return array
|
||||
126
storage/mail_model.py
Normal file
126
storage/mail_model.py
Normal file
@@ -0,0 +1,126 @@
|
||||
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode
|
||||
from sqlalchemy.orm import relationship
|
||||
from datetime import datetime
|
||||
from database import Base
|
||||
from database import db_session
|
||||
from email.header import decode_header
|
||||
from marshmallow import Schema, fields, post_load
|
||||
import yaml
|
||||
import email
|
||||
from fetch_mail import fetch_mail
|
||||
import bs4
|
||||
class FullMailSchema(Schema):
|
||||
id=fields.Integer()
|
||||
text=fields.String()
|
||||
body=fields.String()
|
||||
envelope=fields.String()
|
||||
|
||||
|
||||
|
||||
class Mail(Base):
|
||||
__tablename__ = 'mails'
|
||||
id = Column(Integer, primary_key=True)
|
||||
date = Column(DateTime)
|
||||
envelope = Column(Text)
|
||||
body = Column(Text)
|
||||
text = Column(Text)
|
||||
from_ = Column(Text)
|
||||
from_mailbox=Column(String)
|
||||
from_host=Column(String)
|
||||
to_ = Column(Text)
|
||||
to_mailbox = Column(Text)
|
||||
to_host=Column(String)
|
||||
subject = Column(Text)
|
||||
__schema__=FullMailSchema
|
||||
__jsonid__='mail'
|
||||
__whiteattrs__= ["text", "envelope"]
|
||||
__jsonattrs__=None
|
||||
|
||||
|
||||
@classmethod
|
||||
def fetch_mail(self,mid):
|
||||
m=fetch_mail(mid)
|
||||
mm=Mail()
|
||||
mm.envelope=yaml.dump(m['ENVELOPE'])
|
||||
em=email.message_from_string(m['RFC822'])
|
||||
if type(em.get_payload()) is list:
|
||||
pt=[]
|
||||
for p in em.walk():
|
||||
if p.get_content_maintype() == "text":
|
||||
pt.append(p)
|
||||
em.set_payload(pt)
|
||||
mm.body=yaml.dump(str(em))
|
||||
mm.id=m['id']
|
||||
db_session.add(mm)
|
||||
db_session.commit()
|
||||
return mm
|
||||
|
||||
def get_email(self):
|
||||
em=email.message_from_string(yaml.load(self.body))
|
||||
return em
|
||||
|
||||
def compile_envelope(self):
|
||||
env=yaml.load(self.envelope)
|
||||
hd=decode_header(env.subject)
|
||||
hd2=[]
|
||||
|
||||
for h in hd:
|
||||
if not h[1] is None:
|
||||
hd2.append(h[0].decode(h[1]))
|
||||
# print h[0].decode(h[1])
|
||||
else:
|
||||
hd2.append(h[0])
|
||||
self.subject=yaml.dump(hd2)
|
||||
to_array=[]
|
||||
from_array=[]
|
||||
# print "Status"
|
||||
# print env
|
||||
if env.to is None:
|
||||
print self.id
|
||||
else:
|
||||
for t in env.to:
|
||||
a={"host": t.host, "mail": t.mailbox}
|
||||
to_array.append(a)
|
||||
self.to_=yaml.dump(to_array)
|
||||
for t in env.from_:
|
||||
a={"host": t.host, "mail": t.mailbox}
|
||||
from_array.append(a)
|
||||
self.to_=yaml.dump(to_array)
|
||||
self.from_=yaml.dump(from_array)
|
||||
return None
|
||||
|
||||
def dict_envelope(self):
|
||||
d={}
|
||||
i=0
|
||||
for p in yaml.load(self.subject):
|
||||
if p is not None:
|
||||
d["subject_"+str(i)]=p
|
||||
i=i+1
|
||||
i=0
|
||||
for p in yaml.load(self.to_):
|
||||
if p["host"] is not None:
|
||||
d["to_host_"+str(i)]=p["host"]
|
||||
if p["mail"] is not None:
|
||||
d["to_mailbox_"+str(i)]=p["mail"]
|
||||
i=i+1
|
||||
i=0
|
||||
for p in yaml.load(self.from_):
|
||||
if p["host"] is not None:
|
||||
d["from_host_"+str(i)]=p["host"]
|
||||
if p["mail"] is not None:
|
||||
d["from_mailbox_"+str(i)]=p["mail"]
|
||||
i=i+1
|
||||
return d
|
||||
|
||||
def compile_text(self):
|
||||
for p in self.get_email().walk():
|
||||
if p.get_content_maintype()=="text":
|
||||
pl=p.get_payload(decode=True)
|
||||
# print pl
|
||||
# print p.get_content_type()
|
||||
if p.get_content_subtype()=="html":
|
||||
|
||||
b4=bs4.BeautifulSoup(pl,"html.parser")
|
||||
self.text= yaml.dump(b4.get_text())
|
||||
else:
|
||||
self.text =yaml.dump( pl)
|
||||
2
storage/models.py
Normal file
2
storage/models.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from mail_model import Mail
|
||||
from thread_model import MailThread
|
||||
93
storage/thread_model.py
Normal file
93
storage/thread_model.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode
|
||||
from sqlalchemy.orm import relationship
|
||||
from datetime import datetime
|
||||
from database import Base
|
||||
from database import db_session
|
||||
from email.header import decode_header
|
||||
from marshmallow import Schema, fields, post_load
|
||||
import yaml
|
||||
import email
|
||||
from mail_model import Mail
|
||||
#from fetch_mail import fetch_mail
|
||||
|
||||
class FullThreadSchema(Schema):
|
||||
id=fields.Integer()
|
||||
text=fields.String()
|
||||
body=fields.String()
|
||||
envelope=fields.String()
|
||||
|
||||
|
||||
class MailThread(Base):
|
||||
__tablename__ = 'threads'
|
||||
id = Column(Integer, primary_key=True)
|
||||
firstmail = Column(Integer)
|
||||
islabeled = Column(Boolean)
|
||||
opened = Column(Boolean)
|
||||
body = Column(Text)
|
||||
__schema__=FullThreadSchema
|
||||
__jsonid__='thread'
|
||||
__whiteattrs__= ["body"]
|
||||
__jsonattrs__=None
|
||||
def bdy(self):
|
||||
return yaml.load(self.body)
|
||||
def to_text(self):
|
||||
mmm=self.mails()
|
||||
txt=""
|
||||
for m in mmm:
|
||||
m.compile_envelope()
|
||||
txt=txt+"mail: \n"
|
||||
for f in yaml.load(m.from_):
|
||||
txt=txt+f["mail"]+"@"+f["host"]
|
||||
txt=txt+" --- "
|
||||
txt=txt+" ".join(yaml.load(m.subject))
|
||||
txt=txt+"\n"
|
||||
return txt
|
||||
|
||||
def mails(self):
|
||||
a=[]
|
||||
# print self.bdy()
|
||||
for m in self.bdy():
|
||||
mail=db_session.query(Mail).filter(Mail.id==int(m)).first()
|
||||
if mail is None:
|
||||
mail=Mail.fetch_mail(int(m))
|
||||
a.append(mail)
|
||||
return a
|
||||
|
||||
def mail_dicts(self):
|
||||
a=[]
|
||||
# print "maildicts: "+ str(self.mails())
|
||||
for m in self.mails():
|
||||
m.compile_envelope()
|
||||
a.append(m.dict_envelope())
|
||||
return a
|
||||
def mail_flat_dict(self):
|
||||
a=[]
|
||||
d={}
|
||||
dc=self.mail_dicts()
|
||||
# print dc
|
||||
for i in range(0,len(dc)):
|
||||
for k, v in dc[i].iteritems():
|
||||
d["mail_"+str(i)+"_"+k]=v
|
||||
return d
|
||||
def subject(self):
|
||||
a=""
|
||||
for m in self.mails():
|
||||
m.compile_envelope()
|
||||
a=a + " ".join(yaml.load(m.subject))+"\n"
|
||||
|
||||
return a
|
||||
|
||||
def text(self):
|
||||
a=u""
|
||||
for m in self.mails():
|
||||
m.compile_text()
|
||||
t=yaml.load(m.text)
|
||||
if type(t) is unicode:
|
||||
txt=t
|
||||
else:
|
||||
# print "withintm:"+str(type(t))
|
||||
t=t.decode("ISO-8859-1")
|
||||
txt=t
|
||||
a=a+txt+"\n\n"
|
||||
|
||||
return a
|
||||
62
test_imap.py
Normal file
62
test_imap.py
Normal file
@@ -0,0 +1,62 @@
|
||||
#!.env/bin/python
|
||||
from __future__ import unicode_literals
|
||||
from imapclient import IMAPClient
|
||||
from datetime import date
|
||||
import yaml
|
||||
HOST="buran.htu.tuwien.ac.at"
|
||||
USERNAME="andis"
|
||||
PASSWORD="t4MJAvU2"
|
||||
ssl=True
|
||||
server=IMAPClient(HOST, use_uid=True, ssl=ssl)
|
||||
server.login(USERNAME,PASSWORD)
|
||||
|
||||
select_info=server.select_folder('INBOX')
|
||||
|
||||
messages=server.search([u'SUBJECT', 'service',u'SINCE', date(2017,06,1)])
|
||||
#pritn(select_info)
|
||||
|
||||
|
||||
#response = server.fetch(messages, ['FLAGS', 'RFC822.SIZE', 'BODY', 'ENVELOPE','X-GM-THRID', 'X-GM-MSGID'])
|
||||
#response = server.fetch(messages, ['ENVELOPE'])
|
||||
|
||||
#print(response)
|
||||
#for msgid, data in response.iteritems():
|
||||
# print(' ID %d: %d bytes, flags=%s, %s' % (msgid,
|
||||
# data[b'RFC822.SIZE'],
|
||||
# data[b'FLAGS'], data['ENVELOPE']))
|
||||
#response = server.thread()
|
||||
|
||||
print "\n\n --------------------------------\n"
|
||||
response= server.thread(criteria=[u'SUBJECT', 'service',u'SINCE', date(2017,04,1)])
|
||||
print(response)
|
||||
#resp=server.thread('X-GM')
|
||||
#for msgid, data in response.iteritems():
|
||||
# print(' ID %d: \t %s \t %s' % (msgid, data[b'X-GM-THRID'], data[b'X-GM-MSGID']))
|
||||
print "\n---------------------\n"
|
||||
print response[0], len(response[0])
|
||||
|
||||
|
||||
def get_msg(mid):
|
||||
print mid
|
||||
sf=server.fetch([mid],['ENVELOPE'])
|
||||
for msgid, data in sf.iteritems():
|
||||
return {"msgid": msgid, "envelope": data[b'ENVELOPE']}
|
||||
|
||||
|
||||
def get_msg_tuple(ids):
|
||||
r=[]
|
||||
for i in ids:
|
||||
if type(i) is int:
|
||||
r.append(get_msg(i))
|
||||
elif type(i) is tuple:
|
||||
r.append(get_msg_tuple(i))
|
||||
return r
|
||||
|
||||
r=[]
|
||||
for ids in response:
|
||||
r.append(get_msg_tuple(ids))
|
||||
|
||||
print yaml.dump(r,default_flow_style=False)
|
||||
file=open("envelopes.yaml","w")
|
||||
file.write(yaml.dump(r,default_flow_style=False))
|
||||
file.close()
|
||||
Reference in New Issue
Block a user