init learning cats

This commit is contained in:
Andreas Stephanides
2017-08-04 07:49:39 +02:00
commit 941cbc3d45
14 changed files with 847 additions and 0 deletions

3
storage/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from database import db_session, init_db
from mail_model import Mail
from thread_model import MailThread

23
storage/database.py Normal file
View File

@@ -0,0 +1,23 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from config import Config
from database_mbase import MyBase
import os
# Load the application configuration.  The handle is closed as soon as the
# Config object has been built.
# NOTE(review): this relies on Config parsing the stream inside its
# constructor -- confirm against the installed ``config`` version.
with open('config.cfg') as f:
    cfg = Config(f)

# Pick the database backend: MySQL when db_main_type says so, otherwise a
# SQLite file located at <db_path>/<db_mainfile>.
if cfg.get("db_main_type") == "mysql":
    engine = create_engine(
        "mysql+pymysql://%s:%s@localhost/crawler_articles?charset=utf8"
        % (cfg.get("db_main_user"), cfg.get("db_main_pw")))
else:
    engine = create_engine(
        'sqlite:///' + os.path.join(cfg.db_path, cfg.db_mainfile),
        convert_unicode=True)

# Thread-local session registry bound to the engine chosen above.
db_session = scoped_session(sessionmaker(autocommit=False,  # autoflush=False,
                                         bind=engine))

# Declarative base class for the models; MyBase contributes the shared
# columns and (de)serialisation helpers.
Base = declarative_base(cls=MyBase)
def init_db():
    """Create the tables of every model registered on ``Base``."""
    # Imported for its side effect only: loading ``models`` defines the
    # model classes so their tables are present in Base.metadata.  It is
    # done here rather than at module level because the model modules
    # import from this module.
    import models
    metadata = Base.metadata
    metadata.create_all(bind=engine)

67
storage/database_mbase.py Normal file
View File

@@ -0,0 +1,67 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Index, TIMESTAMP
from datetime import datetime
class MyBase(object):
    """Base mixin giving models an id, timestamps and schema helpers.

    The helpers read three attributes from the concrete class:
    ``__schema__`` (called to build a schema object), ``__whiteattrs__``
    (attribute names ``update`` may assign) and ``__jsonattrs__`` (attribute
    names to dump, or ``None`` to dump with the unrestricted schema).
    """

    id = Column(Integer, primary_key=True)
    created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
    updated_at = Column(TIMESTAMP, default=datetime.utcnow,
                        onupdate=datetime.utcnow, nullable=False)

    def __json__(self):
        """Return the first element of the schema's ``dump`` of this object."""
        attrs = self.__jsonattrs__
        if attrs is not None:
            schema = self.__schema__(only=attrs)
        else:
            schema = self.__schema__()
        return schema.dump(self)[0]

    def update(self, data, partial=True):
        """Load *data* through the whitelist schema and assign the result.

        Returns ``(False, errors)`` (after printing the errors) when the
        schema reports any, otherwise ``(True, [])`` once every whitelisted
        attribute present in the loaded data has been set on ``self``.
        """
        schema = self.__schema__(only=self.__whiteattrs__)
        data, errors = schema.load(data, partial=partial)
        if len(errors) > 0:
            print(errors)
            return (False, errors)
        for name in self.__whiteattrs__:
            if name in data:
                setattr(self, name, data[name])
        return (True, [])

    @classmethod
    def deserialize(cls, data):
        """Build a new instance from *data* loaded through the schema.

        Only keys matching a column of ``cls.__table__`` are copied; load
        errors are not inspected.
        """
        data, errors = cls.__schema__().load(data, partial=True)
        instance = cls()
        for column in cls.__table__.columns:
            key = column.key
            if key in data:
                setattr(instance, key, data[key])
        return instance
class MyBase2(object):
    """Base mixin declaring only an ``id`` column plus schema helpers.

    Unlike ``MyBase`` it declares no timestamp columns.  Concrete classes
    supply ``__schema__``, ``__whiteattrs__`` and ``__jsonattrs__``.
    """

    id = Column(Integer, primary_key=True)

    def __json__(self):
        """Dump this object via its schema and return the first result item."""
        only = self.__jsonattrs__
        if only is None:
            dumped = self.__schema__().dump(self)
        else:
            dumped = self.__schema__(only=only).dump(self)
        return dumped[0]

    def update(self, data, partial=True):
        """Validate *data* against the whitelist schema and apply it.

        On validation errors they are printed and ``(False, errors)`` is
        returned; otherwise each whitelisted attribute found in the loaded
        data is assigned and ``(True, [])`` is returned.
        """
        loaded, errors = self.__schema__(
            only=self.__whiteattrs__).load(data, partial=partial)
        if len(errors) > 0:
            print(errors)
            return (False, errors)
        for attr in self.__whiteattrs__:
            if attr in loaded:
                setattr(self, attr, loaded[attr])
        return (True, [])

    @classmethod
    def deserialize(cls, data):
        """Create an instance populated from schema-loaded *data*.

        Values are copied for every table column whose key appears in the
        loaded data; any load errors are ignored.
        """
        loaded, errors = cls.__schema__().load(data, partial=True)
        obj = cls()
        for col in cls.__table__.columns:
            if col.key in loaded:
                setattr(obj, col.key, loaded[col.key])
        return obj

37
storage/fetch_mail.py Normal file
View File

@@ -0,0 +1,37 @@
import imapclient
from datetime import date
from config import Config
# Read the IMAP settings; the config file is closed once Config is built.
# NOTE(review): this relies on Config parsing the stream inside its
# constructor -- confirm against the installed ``config`` version.
with open('config.cfg') as f:
    cfg = Config(f)

# Importing this module connects to the mail server, logs in and selects the
# INBOX; the functions below share this single connection.
server = imapclient.IMAPClient(cfg.host, use_uid=True, ssl=True)
server.login(cfg.user, cfg.password)
server.select_folder('INBOX')
def fetch_mail(myid):
    """Fetch the ENVELOPE and RFC822 data of message *myid*.

    Returns the server's response entry for that message with an extra
    ``"id"`` key set to *myid*.
    """
    response = server.fetch([myid], ['ENVELOPE', 'RFC822'])
    message = response[myid]
    message["id"] = myid
    return message
def fetch_thread(tp):
    """Return *tp* unchanged; no fetching is performed here."""
    return tp
def fetch_threads(subject=b'service', since=date(2017, 7, 1)):
    """Ask the IMAP server to thread the messages matching a search.

    Args:
        subject: value for the ``SUBJECT`` search key.
        since: value for the ``SINCE`` search key.

    Returns:
        Whatever ``server.thread`` returns for those criteria.

    Called without arguments it searches for subject ``service`` since
    2017-07-01, the values that used to be hard-coded here.
    """
    return server.thread(criteria=[b'SUBJECT', subject, b'SINCE', since])
def flatten_threads(thrds, array=None, level=0):
    """Flatten nested thread tuples.

    Args:
        thrds: iterable of threads.  At ``level == 0`` every element is
            itself an iterable (typically a possibly nested tuple of message
            ids); at ``level > 0`` elements are ids or nested tuples.
        array: optional list to append the results to.  A fresh list is
            created when omitted, so separate calls never share state.
        level: ``0`` to produce one flat list per thread, anything greater
            to flatten *thrds* itself into a single list.

    Returns:
        ``array`` (or the new list): a list of flat lists at level 0, a flat
        list of ids otherwise.
    """
    if array is None:
        array = []
    if level > 0:
        for t in thrds:
            if isinstance(t, tuple):
                # Nested reply chain: splice its ids in place.
                array.extend(flatten_threads(t, [], 1))
            else:
                array.append(t)
    else:
        for t in thrds:
            # Each top-level thread becomes its own flat list.
            array.append(flatten_threads(t, [], 1))
    return array

126
storage/mail_model.py Normal file
View File

@@ -0,0 +1,126 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode
from sqlalchemy.orm import relationship
from datetime import datetime
from database import Base
from database import db_session
from email.header import decode_header
from marshmallow import Schema, fields, post_load
import yaml
import email
from fetch_mail import fetch_mail
import bs4
class FullMailSchema(Schema):
    """Schema declaring the ``id``, ``text``, ``body`` and ``envelope`` fields.

    Used as ``Mail.__schema__``.
    """

    id = fields.Integer()
    text = fields.String()
    body = fields.String()
    envelope = fields.String()
class Mail(Base):
    """One e-mail message stored in the ``mails`` table.

    Several text columns hold YAML documents written with ``yaml.dump``:

    * ``envelope`` -- the ``ENVELOPE`` item of the fetched message,
    * ``body``     -- the message source as a string,
    * ``text``     -- the text extracted by ``compile_text``,
    * ``subject``  -- list of decoded subject header fragments,
    * ``to_`` / ``from_`` -- lists of ``{"host": ..., "mail": ...}`` dicts.

    ``from_mailbox``, ``from_host``, ``to_mailbox`` and ``to_host`` are not
    written by any method of this class.

    NOTE(review): the YAML columns are read back with ``yaml.load`` without
    an explicit Loader, which can construct arbitrary Python objects.  The
    stored envelope appears to depend on that (it is later accessed through
    attributes such as ``.subject`` and ``.to``), so this is flagged rather
    than replaced with ``safe_load`` -- confirm the data is trusted.
    """

    __tablename__ = 'mails'
    id = Column(Integer, primary_key=True)
    date = Column(DateTime)
    envelope = Column(Text)
    body = Column(Text)
    text = Column(Text)
    from_ = Column(Text)
    from_mailbox = Column(String)
    from_host = Column(String)
    to_ = Column(Text)
    to_mailbox = Column(Text)
    to_host = Column(String)
    subject = Column(Text)

    __schema__ = FullMailSchema
    __jsonid__ = 'mail'
    __whiteattrs__ = ["text", "envelope"]
    __jsonattrs__ = None

    @classmethod
    def fetch_mail(cls, mid):
        """Fetch message *mid* from the mail server and persist it.

        The envelope and the message source are stored as YAML; for a
        multipart message only the ``text/*`` parts are kept.  The new row
        is added to the session and committed.

        Returns:
            The newly stored instance.
        """
        # The bare name resolves to the module-level fetch_mail helper, not
        # to this classmethod.
        raw = fetch_mail(mid)
        mail = cls()
        mail.envelope = yaml.dump(raw['ENVELOPE'])
        message = email.message_from_string(raw['RFC822'])
        if isinstance(message.get_payload(), list):
            # Multipart message: replace the payload with its text parts.
            text_parts = [part for part in message.walk()
                          if part.get_content_maintype() == "text"]
            message.set_payload(text_parts)
        mail.body = yaml.dump(str(message))
        mail.id = raw['id']
        db_session.add(mail)
        db_session.commit()
        return mail

    def get_email(self):
        """Return the stored body parsed into an ``email`` message object."""
        return email.message_from_string(yaml.load(self.body))

    def compile_envelope(self):
        """Derive ``subject``, ``to_`` and ``from_`` from the stored envelope.

        The subject header is split with ``decode_header`` and every
        fragment carrying a charset is decoded with it.  Recipients and
        senders become lists of ``{"host", "mail"}`` dicts.  A missing
        recipient list prints this mail's id and yields an empty list; a
        missing sender list also yields an empty list instead of raising.
        Nothing is committed to the database.
        """
        env = yaml.load(self.envelope)

        # decode_header yields (value, charset) pairs; charset is None for
        # fragments that were not encoded.
        subject_parts = []
        for value, charset in decode_header(env.subject):
            if charset is not None:
                subject_parts.append(value.decode(charset))
            else:
                subject_parts.append(value)
        self.subject = yaml.dump(subject_parts)

        if env.to is None:
            print(self.id)
            to_array = []
        else:
            to_array = [{"host": t.host, "mail": t.mailbox} for t in env.to]
        # Treat an absent From list like an absent To list.
        from_array = [{"host": t.host, "mail": t.mailbox}
                      for t in (env.from_ or ())]

        self.to_ = yaml.dump(to_array)
        self.from_ = yaml.dump(from_array)
        return None

    def dict_envelope(self):
        """Return the compiled envelope as one flat dict.

        Keys are ``subject_<i>``, ``to_host_<i>``, ``to_mailbox_<i>``,
        ``from_host_<i>`` and ``from_mailbox_<i>``, where ``<i>`` is the
        position in the respective list; ``None`` values are left out.
        Requires ``compile_envelope`` to have filled ``subject``, ``to_``
        and ``from_`` first.
        """
        d = {}
        for i, part in enumerate(yaml.load(self.subject)):
            if part is not None:
                d["subject_" + str(i)] = part
        for prefix, dumped in (("to", self.to_), ("from", self.from_)):
            for i, addr in enumerate(yaml.load(dumped)):
                if addr["host"] is not None:
                    d[prefix + "_host_" + str(i)] = addr["host"]
                if addr["mail"] is not None:
                    d[prefix + "_mailbox_" + str(i)] = addr["mail"]
        return d

    def compile_text(self):
        """Store the decoded payload of the message's text parts in ``text``.

        HTML parts are reduced to their text with BeautifulSoup.  Every
        text part overwrites ``text``, so the last one wins; ``text`` is
        left untouched when the message has no text part.
        """
        for part in self.get_email().walk():
            if part.get_content_maintype() != "text":
                continue
            payload = part.get_payload(decode=True)
            if part.get_content_subtype() == "html":
                soup = bs4.BeautifulSoup(payload, "html.parser")
                self.text = yaml.dump(soup.get_text())
            else:
                self.text = yaml.dump(payload)

2
storage/models.py Normal file
View File

@@ -0,0 +1,2 @@
from mail_model import Mail
from thread_model import MailThread

93
storage/thread_model.py Normal file
View File

@@ -0,0 +1,93 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey, Unicode
from sqlalchemy.orm import relationship
from datetime import datetime
from database import Base
from database import db_session
from email.header import decode_header
from marshmallow import Schema, fields, post_load
import yaml
import email
from mail_model import Mail
#from fetch_mail import fetch_mail
class FullThreadSchema(Schema):
    """Schema declaring the ``id``, ``text``, ``body`` and ``envelope`` fields.

    Used as ``MailThread.__schema__``.
    """

    id = fields.Integer()
    text = fields.String()
    body = fields.String()
    envelope = fields.String()
class MailThread(Base):
    """A mail conversation stored in the ``threads`` table.

    ``body`` holds a YAML document that loads to an iterable of mail ids;
    each id is converted with ``int`` before it is looked up.
    """

    __tablename__ = 'threads'
    id = Column(Integer, primary_key=True)
    firstmail = Column(Integer)
    islabeled = Column(Boolean)
    opened = Column(Boolean)
    body = Column(Text)

    __schema__ = FullThreadSchema
    __jsonid__ = 'thread'
    __whiteattrs__ = ["body"]
    __jsonattrs__ = None

    def bdy(self):
        """Return ``body`` parsed from YAML."""
        return yaml.load(self.body)

    def to_text(self):
        """Return a summary with one ``senders --- subject`` entry per mail."""
        thread_mails = self.mails()
        out = ""
        for mail in thread_mails:
            mail.compile_envelope()
            out = out + "mail: \n"
            for sender in yaml.load(mail.from_):
                out = out + sender["mail"] + "@" + sender["host"]
            out = out + " --- "
            out = out + " ".join(yaml.load(mail.subject))
            out = out + "\n"
        return out

    def mails(self):
        """Return the ``Mail`` objects of this thread, in ``body`` order.

        Mails missing from the database are fetched (and stored) through
        ``Mail.fetch_mail``.
        """
        found = []
        for raw_id in self.bdy():
            mail = db_session.query(Mail).filter(
                Mail.id == int(raw_id)).first()
            if mail is None:
                mail = Mail.fetch_mail(int(raw_id))
            found.append(mail)
        return found

    def mail_dicts(self):
        """Return ``dict_envelope()`` of every mail after compiling it."""
        envelopes = []
        for mail in self.mails():
            mail.compile_envelope()
            envelopes.append(mail.dict_envelope())
        return envelopes

    def mail_flat_dict(self):
        """Merge all envelope dicts into one, keys prefixed ``mail_<i>_``."""
        flat = {}
        for index, envelope in enumerate(self.mail_dicts()):
            for key, value in envelope.items():
                flat["mail_" + str(index) + "_" + key] = value
        return flat

    def subject(self):
        """Return the subjects of all mails, one per line."""
        lines = ""
        for mail in self.mails():
            mail.compile_envelope()
            lines = lines + " ".join(yaml.load(mail.subject)) + "\n"
        return lines

    def text(self):
        """Return the text of all mails, each followed by a blank line.

        Text that does not load as ``unicode`` is decoded as ISO-8859-1.
        """
        combined = u""
        for mail in self.mails():
            mail.compile_text()
            content = yaml.load(mail.text)
            if type(content) is not unicode:
                content = content.decode("ISO-8859-1")
            combined = combined + content + "\n\n"
        return combined