Files
fachschaften/articles/model.py
2017-02-17 10:02:20 +01:00

188 lines
6.0 KiB
Python

from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields, post_load
from src.sections.model import Section
#import json
import json
import flask
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
import hashlib
#import clogger
import logging
#from crawler.compiler.mqueues import put_fetch_queue
from src import clogger
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
def calc_fingerprint(a):
    """Return the fingerprint digest for the article object *a*.

    Reads the ``url``, ``title`` and ``published_date`` attributes of *a*,
    converts the date with ``str()`` and delegates the hashing to
    ``calc_fingerprint_h``.
    """
    hash_input = {
        "url": a.url,
        "title": a.title,
        "published": str(a.published_date),
    }
    return calc_fingerprint_h(hash_input)
def _fingerprint_bytes(value):
    """Return *value* as UTF-8 encoded bytes suitable for ``hashlib``."""
    if isinstance(value, bytes):
        # Already encoded (a Python 2 ``str`` or a Python 3 ``bytes``).
        return value
    text_type = type(u"")  # ``unicode`` on Python 2, ``str`` on Python 3
    if not isinstance(value, text_type):
        # e.g. a datetime: use its text representation.
        value = text_type(value)
    return value.encode("utf-8")


def calc_fingerprint_h(a):
    """Return the MD5 hex digest that identifies the article described by *a*.

    *a* is a mapping with the keys ``"url"``, ``"title"`` and ``"published"``.
    The digest covers the URL, the title and the published value, in that
    order.  A published value of ``None`` or of the string ``"None"``
    contributes nothing to the digest; any other value is hashed through its
    text representation, exactly as supplied.

    Every part is encoded explicitly as UTF-8 before hashing, so non-ASCII
    URLs and titles are accepted and the function runs on both Python 2 and
    Python 3.

    Raises:
        KeyError: if one of the three keys is missing from *a*.
    """
    published = a["published"]
    if published is None or published == "None":
        published = ""

    digest = hashlib.md5()
    for part in (a["url"], a["title"], published):
        digest.update(_fingerprint_bytes(part))
    return digest.hexdigest()
class FullArticleSchema(Schema):
    """Marshmallow schema covering all serialized fields of an article.

    Besides the article's own fields it declares three dump-only
    ``organization_*`` fields.
    """

    # Identity and relation to other articles.
    id = fields.Integer()
    parent_id = fields.Integer(allow_none=True)
    url = fields.String()
    is_primary = fields.Boolean(allow_none=True)
    fingerprint = fields.String()
    hash = fields.String(allow_none=True)

    # Timestamps.
    last_fetched = fields.DateTime(allow_none=True)
    first_fetched = fields.DateTime(allow_none=True)
    published_date = fields.DateTime()
    date = fields.DateTime(allow_none=True)

    # Content.
    text = fields.String()
    title = fields.String()
    author = fields.String(allow_none=True)
    section_id = fields.Integer()
    sourcetype = fields.String()

    # Only ever written on dump, never accepted on load.
    organization_name = fields.String(dump_only=True)
    organization_image = fields.String(dump_only=True)
    organization_id = fields.Integer(dump_only=True)

    image = fields.String(allow_none=True)

    # Disabled hook that would build Article objects on load:
    # @post_load
    # def make_article(self, data):
    #     return Article.deserialize(data)
class Article(Base):
    """ORM model for one article, stored in the ``articles`` table."""

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    # Digest produced by calc_fingerprint(); unique across all articles.
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    # Serialization metadata read by code outside this class.
    __schema__ = FullArticleSchema
    __jsonid__ = 'article'
    __whiteattrs__ = []
    __jsonattrs__ = None

    def __init__(self, url=None, title=None, published_date=None):
        """Create an article and stamp ``first_fetched`` with the current time."""
        self.url = url
        self.title = title
        self.published_date = published_date
        self.first_fetched = datetime.now()

#    def __json__(self):
#        return ArticleSchema().dump(self)[0]

    def organization_name(self):
        """Return the name of the organization of this article's section."""
        return self.section.organization.name

    def organization_image(self):
        """Return the image of the organization of this article's section."""
        return self.section.organization.image

    def organization_id(self):
        """Return the id of the organization of this article's section."""
        return self.section.organization.id

#    def dict(self):
#        return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url}
#    @classmethod
#    def deserialize(cls,data):
#        a=Article()
#        for c in Article.__table__.columns:
#            if data.has_key(c.key):
#                setattr(a, c.key,data[c.key])
#        return a
#    @classmethod
#    def sections(self):
#        sects=db_session.query(Article.section).distinct().all()
#        for i in range(len(sects)):
#            sects[i]=sects[i][0]
#        return sects

    @classmethod
    def from_hash(cls, a):
        """Return the article matching the mapping *a*, creating it if needed.

        *a* must contain the keys ``"url"``, ``"title"`` and ``"published"``.
        The article is looked up by the fingerprint of *a*; when no row
        matches, a new article is created, fingerprinted, added to the
        session and committed.

        Raises:
            KeyError: if one of the required keys is missing from *a*.
        """
        fp = calc_fingerprint_h(a)
        aa = Article.query.filter(Article.fingerprint == fp).first()
        if aa is None:
            clogger.debug("new Article")
            # The published value is stored exactly as supplied.
            # NOTE(review): string dates are not parsed here (no date parser
            # is imported in this module), so callers presumably pass
            # datetime objects -- confirm.
            aa = Article(a["url"], a["title"], a["published"])
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        """Copy fetched fields from the mapping *a* onto this article.

        ``a["text"]`` is required; byte strings are decoded as UTF-8 and text
        that is already decoded is stored unchanged.  The keys ``"image"``,
        ``"author"``, ``"title"``, ``"sourcetype"``, ``"section"`` and
        ``"published_date"`` are optional and only applied when present.
        Nothing is committed here.

        Raises:
            KeyError: if *a* has no ``"text"`` key.
        """
        text = a["text"]
        if isinstance(text, bytes):
            # Decoding text that is already decoded would fail, so only raw
            # bytes are decoded.
            text = text.decode('utf8')
        self.text = text

        for key in ("image", "author", "title", "sourcetype"):
            if key in a:
                setattr(self, key, a[key])

        if "section" in a:
            self.section = Section.find_or_create(a["section"])
#        if "last_fetched" in a:
#            self.last_fetched=a["last_fetched"]
        if "published_date" in a:
            self.published_date = a["published_date"]
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
class ArticleSchema(Schema):
    """Reduced marshmallow schema for an article.

    Declares a subset of the fields in ``FullArticleSchema``, without any
    ``allow_none`` or ``dump_only`` options.
    """

    id = fields.Integer()

    # Content.
    text = fields.String()
    title = fields.String()
    author = fields.String()
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()

    # Timestamps.
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()

    section_id = fields.Integer()