140 lines
4.6 KiB
Python
140 lines
4.6 KiB
Python
|
|
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
|
|
from sqlalchemy.orm import relationship
|
|
from datetime import datetime
|
|
from src.database import Base
|
|
from src.database import db_session
|
|
from marshmallow import Schema, fields
|
|
from src.sections.model import Section
|
|
|
|
#import json
|
|
import json
|
|
import flask
|
|
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
|
|
import hashlib
|
|
|
|
#import clogger
|
|
import logging
|
|
#from crawler.compiler.mqueues import put_fetch_queue
|
|
from src import clogger
|
|
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
|
|
|
|
|
|
|
|
|
|
def calc_fingerprint(a):
    """Return the fingerprint of an article-like object.

    Reads ``a.url``, ``a.title`` and ``a.published_date`` and hands them to
    ``calc_fingerprint_h`` in the dictionary form that function expects.
    The publication date is passed as ``str(a.published_date)``, so a missing
    date arrives there as the literal string ``"None"``.
    """
    hash_input = {
        "url": a.url,
        "title": a.title,
        "published": str(a.published_date),
    }
    return calc_fingerprint_h(hash_input)
|
|
|
|
def _fingerprint_bytes(value):
    """Return *value* as bytes that can be fed to a hashlib object.

    Byte strings are passed through unchanged; every other value is converted
    to text and encoded as UTF-8.
    """
    if isinstance(value, bytes):
        return value
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # Python 3: str is already the text type
    return text_type(value).encode("utf-8")


def calc_fingerprint_h(a):
    """Return the MD5 hex digest identifying an article described by a dict.

    Args:
        a: mapping with the keys ``"url"``, ``"title"`` and ``"published"``.
            ``"published"`` may be ``None`` or the string ``"None"`` (which is
            what ``str(None)`` produces); both contribute nothing to the hash.
            ``"title"`` must be a text or byte string.

    Returns:
        The hexadecimal MD5 digest of url + title + published.

    Raises:
        KeyError: if one of the three keys is missing.
        AttributeError: if ``a["title"]`` is not a string (e.g. ``None``).
    """
    published = a["published"]
    if published is None or published == "None":
        published = ""
    # The published value is hashed in the form it was given.  An earlier
    # ``a["published"] is str`` test could never be true (it compared a value
    # with the type object), so no date parsing ever took place; keeping it
    # that way leaves existing fingerprints valid.

    title = a["title"]
    if not isinstance(title, bytes):
        title = title.encode("utf-8")

    h = hashlib.md5()
    # hashlib needs bytes.  Encoding explicitly as UTF-8 yields the same
    # digest as before for ASCII input and no longer fails on non-ASCII URLs.
    h.update(_fingerprint_bytes(a["url"]))
    h.update(title)
    h.update(_fingerprint_bytes(published))
    return h.hexdigest()
|
|
|
|
class Article(Base):
    """ORM model mapped to the ``articles`` table.

    ``fingerprint`` is declared unique and is what ``from_hash`` uses to find
    an already stored article before creating a new one.
    """

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    def __init__(self, url=None, title=None, published_date=None):
        """Create an article and stamp ``first_fetched`` with the current time."""
        self.url = url
        self.title = title
        self.published_date = published_date
        self.first_fetched = datetime.now()

    def __json__(self):
        """Return the article serialized through ``ArticleSchema``."""
        # NOTE(review): indexing the dump() result with [0] matches a
        # (data, errors) style return value -- confirm against the installed
        # marshmallow version.
        return ArticleSchema().dump(self)[0]

    def dict(self):
        """Return the article's fields as a plain dictionary.

        ``id`` is rendered as a string; ``section`` is the related object as
        loaded through the relationship, not its id.
        """
        return {
            "id": str(int(self.id)),
            "title": self.title,
            "text": self.text,
            "author": self.author,
            "section": self.section,
            "sourcetype": self.sourcetype,
            "last_fetched": self.last_fetched,
            "first_fetched": self.first_fetched,
            "published_date": self.published_date,
            "date": self.date,
            "image": self.image,
            "url": self.url,
        }

    @classmethod
    def from_hash(cls, a):
        """Return the article matching dict *a*, creating it if necessary.

        Args:
            a: mapping with the keys ``"url"``, ``"title"`` and
                ``"published"``.

        Returns:
            The stored ``Article`` whose fingerprint matches *a*, or a newly
            created one that has been added to ``db_session`` and committed.
        """
        fp = calc_fingerprint_h(a)
        aa = Article.query.filter(Article.fingerprint == fp).first()
        if aa is None:
            clogger.debug( "new Article")
            # The published value is stored as received.  A previous
            # ``a["published"] is str`` test could never be true, so string
            # dates were never parsed here; that behaviour is kept.
            aa = Article(a["url"], a["title"], a["published"])
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        """Copy the fetched values in dict *a* onto this article.

        ``"text"`` is required; ``"image"``, ``"author"``, ``"title"``,
        ``"sourcetype"``, ``"section"`` and ``"published_date"`` are applied
        only when present.  Nothing is committed here.

        Raises:
            KeyError: if *a* has no ``"text"`` entry.
        """
        text = a["text"]
        # Decode only byte strings: calling .decode() on text that is already
        # unicode fails for non-ASCII content.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        self.text = text

        for key in ("image", "author", "title", "sourcetype"):
            if key in a:
                setattr(self, key, a[key])
        if "section" in a:
            self.section = Section.find_or_create(a["section"])
        if "published_date" in a:
            self.published_date = a["published_date"]
|
|
|
|
|
|
|
|
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
|
|
|
|
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
|
|
|
|
class ArticleSchema(Schema):
    """Marshmallow schema declaring the Article fields that get serialized.

    ``Article.__json__`` dumps an article through this schema.
    """

    id = fields.Integer()

    # Textual content and origin.
    text = fields.String()
    title = fields.String()
    author = fields.String()
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()

    # Timestamps.
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()

    # Foreign key of the related section.
    section_id = fields.Integer()
|
|
|