Files
fachschaften/articles/model.py
Andreas Stephanides 0c1b586962 loaddump_Articles
2017-02-08 07:14:36 +01:00

170 lines
5.5 KiB
Python

from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields, post_load
from src.sections.model import Section
#import json
import json
import flask
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
import hashlib
#import clogger
import logging
#from crawler.compiler.mqueues import put_fetch_queue
from src import clogger
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
def calc_fingerprint(a):
    """Return the fingerprint of an article-like object.

    Reads the ``url``, ``title`` and ``published_date`` attributes of *a*,
    packs them into the mapping layout expected by ``calc_fingerprint_h``
    and returns that function's result.  ``published_date`` is passed in its
    ``str()`` form, so a missing date arrives there as the string ``"None"``.
    """
    hash_input = {
        "url": a.url,
        "title": a.title,
        "published": str(a.published_date),
    }
    return calc_fingerprint_h(hash_input)
def calc_fingerprint_h(a):
    """Return the MD5 hex digest identifying an article described by a mapping.

    The digest is computed over, in this order, ``a["url"]``, ``a["title"]``
    and ``a["published"]``.  A ``published`` value of ``None`` or of the
    string ``"None"`` (what ``str(None)`` yields) contributes nothing to the
    hash.  Any other ``published`` value is hashed in its text form.

    Args:
        a: Mapping with the keys ``"url"``, ``"title"`` and ``"published"``.
            ``title`` must be a text string, because it is UTF-8 encoded
            directly.

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        KeyError: If one of the three keys is missing from *a*.
    """
    # Pick the text type of the running interpreter so the same code works
    # on Python 2 (``unicode``) and Python 3 (``str``).
    try:
        text_type = unicode  # noqa: F821 -- only defined on Python 2
    except NameError:
        text_type = str

    published = a["published"]
    # NOTE(review): an earlier revision tried to run string dates through a
    # ``parse`` helper, but its ``is str`` test was never true for string
    # values, so the value has always been hashed unchanged.  That effective
    # behaviour is kept so existing fingerprints stay valid.
    if published is None or published == "None":
        pp = ""
    else:
        pp = published

    h = hashlib.md5()
    # Encode explicitly: hashing text directly depends on an implicit ASCII
    # conversion, which fails for non-ASCII URLs.  For ASCII input the bytes
    # (and therefore the digest) are the same as before.
    h.update(text_type(a["url"]).encode("utf-8"))
    h.update(a["title"].encode("utf-8"))
    h.update(text_type(pp).encode("utf-8"))
    return h.hexdigest()
class Article(Base):
    """ORM model for one row of the ``articles`` table.

    Besides the column definitions the class offers helpers to serialize an
    article (``__json__``, ``dict``), to rebuild one from a plain mapping
    (``deserialize``), to look one up or create it from crawler data
    (``from_hash``) and to copy crawler data onto an existing instance
    (``process_hash``).
    """

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    # Digest from calc_fingerprint(); the database enforces uniqueness.
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    # Filled with the construction time in __init__.
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    def __init__(self, url=None, title=None, published_date=None):
        """Create an article and stamp ``first_fetched`` with ``datetime.now()``."""
        self.url = url
        self.title = title
        self.published_date = published_date
        self.first_fetched = datetime.now()

    def __json__(self):
        """Return this article serialized through ``ArticleSchema``.

        Takes element ``[0]`` of the ``dump`` result (presumably the data
        part of a marshmallow 2 ``(data, errors)`` result -- confirm against
        the installed marshmallow version).
        """
        return ArticleSchema().dump(self)[0]

    def dict(self):
        """Return a plain dict of the article's main attributes.

        ``id`` is rendered as a decimal string; ``section`` is the related
        ``Section`` object itself, not its id.
        """
        return {
            "id": str(int(self.id)),
            "title": self.title,
            "text": self.text,
            "author": self.author,
            "section": self.section,
            "sourcetype": self.sourcetype,
            "last_fetched": self.last_fetched,
            "first_fetched": self.first_fetched,
            "published_date": self.published_date,
            "date": self.date,
            "image": self.image,
            "url": self.url,
        }

    @classmethod
    def deserialize(cls, data):
        """Build a new ``Article`` from a mapping keyed by column name.

        Every table column whose key is present in *data* is copied onto the
        new instance; columns absent from *data* keep the value ``__init__``
        gave them.  The instance is not added to the session.
        """
        a = Article()
        for c in Article.__table__.columns:
            # ``in`` instead of ``has_key``: works for any mapping and on
            # both Python 2 and Python 3.
            if c.key in data:
                setattr(a, c.key, data[c.key])
        return a

    @classmethod
    def from_hash(cls, a):
        """Return the stored article matching mapping *a*, creating it if needed.

        The lookup key is ``calc_fingerprint_h(a)``.  When no row has that
        fingerprint, a new article is built from ``a["url"]``, ``a["title"]``
        and ``a["published"]``, given its fingerprint, added to the session
        and committed.

        Args:
            a: Mapping with at least the keys ``"url"``, ``"title"`` and
                ``"published"``.

        Returns:
            The existing or newly created ``Article``.
        """
        fp = calc_fingerprint_h(a)
        aa = Article.query.filter(Article.fingerprint == fp).first()
        if aa is None:
            clogger.debug("new Article")
            # NOTE(review): an earlier revision meant to parse string dates
            # here, but its ``is str`` test was never true for string values
            # and the ``parse`` helper is not imported in this module, so
            # ``published`` has always been stored unchanged.  Callers should
            # pass a datetime (or None) -- confirm against the crawler.
            aa = Article(a["url"], a["title"], a["published"])
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        """Copy crawler data from mapping *a* onto this article.

        ``a["text"]`` is required; byte strings are decoded as UTF-8, any
        other value is stored unchanged.  The keys ``image``, ``author``,
        ``title``, ``sourcetype``, ``section`` and ``published_date`` are
        optional and only applied when present.  ``section`` is resolved
        through ``Section.find_or_create``.  Nothing is committed here.
        """
        text = a["text"]
        # Decode only real byte strings: already-decoded text must not be
        # pushed through ``decode`` again (breaks on non-ASCII characters).
        if isinstance(text, bytes):
            text = text.decode('utf8')
        self.text = text

        for key in ("image", "author", "title", "sourcetype"):
            if key in a:
                setattr(self, key, a[key])
        if "section" in a:
            self.section = Section.find_or_create(a["section"])
        if "published_date" in a:
            self.published_date = a["published_date"]
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
class FullArticleSchema(Schema):
    """Marshmallow schema with one field per ``Article`` column.

    Loading through this schema yields an ``Article`` instance, built by the
    ``make_article`` post-load hook.
    """

    # NOTE(review): only the fields flagged ``allow_none=True`` accept None;
    # verify that matches the rows this schema has to load.
    id = fields.Integer()
    parent_id = fields.Integer(allow_none=True)
    url = fields.String()
    is_primary = fields.Boolean(allow_none=True)
    fingerprint = fields.String()
    hash = fields.String(allow_none=True)
    last_fetched = fields.DateTime(allow_none=True)
    first_fetched = fields.DateTime(allow_none=True)
    published_date = fields.DateTime()
    date = fields.DateTime(allow_none=True)
    text = fields.String()
    title = fields.String()
    author = fields.String(allow_none=True)
    section_id = fields.Integer()
    sourcetype = fields.String()
    image = fields.String(allow_none=True)

    @post_load
    def make_article(self, data):
        """Convert the loaded field mapping into an ``Article``."""
        article = Article.deserialize(data)
        return article
class ArticleSchema(Schema):
    """Marshmallow schema exposing a subset of ``Article`` attributes.

    ``Article.__json__`` dumps through this schema.
    """

    id = fields.Integer()

    # Content and source information.
    text = fields.String()
    title = fields.String()
    author = fields.String()
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()

    # Timestamps.
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()

    # Foreign key of the related section.
    section_id = fields.Integer()