init commit
This commit is contained in:
139
articles/model.py
Normal file
139
articles/model.py
Normal file
@@ -0,0 +1,139 @@
|
||||
|
||||
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
|
||||
from sqlalchemy.orm import relationship
|
||||
from datetime import datetime
|
||||
from src.database import Base
|
||||
from src.database import db_session
|
||||
from marshmallow import Schema, fields
|
||||
from src.sections.model import Section
|
||||
|
||||
#import json
|
||||
import json
|
||||
import flask
|
||||
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
|
||||
import hashlib
|
||||
|
||||
#import clogger
|
||||
import logging
|
||||
#from crawler.compiler.mqueues import put_fetch_queue
|
||||
from src import clogger
|
||||
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
|
||||
|
||||
|
||||
|
||||
|
||||
def calc_fingerprint(a):
    """Return the fingerprint of an article-like object.

    Reads ``a.url``, ``a.title`` and ``a.published_date``, packs them into
    the dict layout expected by ``calc_fingerprint_h`` (keys ``url``,
    ``title``, ``published``) and returns that function's result. The
    published date is converted with ``str()`` first, so a missing date is
    passed on as the string ``"None"``.
    """
    hash_input = {
        "url": a.url,
        "title": a.title,
        "published": str(a.published_date),
    }
    return calc_fingerprint_h(hash_input)
|
||||
|
||||
def calc_fingerprint_h(a):
    """Return an MD5 hex digest identifying an article description.

    Args:
        a: Mapping with the keys ``"url"``, ``"title"`` and ``"published"``.
            ``"published"`` may be ``None``, the string ``"None"``, or any
            value with a stable text representation (e.g. a datetime or an
            already formatted string). ``"title"`` must be a text string.

    Returns:
        The hexadecimal MD5 digest of url + title + published, each encoded
        as UTF-8. A missing published date (``None`` or ``"None"``)
        contributes nothing to the digest.

    Raises:
        KeyError: If one of the three keys is missing.
        AttributeError: If ``a["title"]`` has no ``encode`` method
            (for example ``None``).
    """
    published = a["published"]
    # calc_fingerprint() passes str(published_date), so an absent date shows
    # up here as the literal string "None"; treat it like a real None.
    if published is None or published == "None":
        published = ""
    # NOTE: the previous version tested `a["published"] is str`, which is
    # always False, so string timestamps were never parsed (and no `parse`
    # function is imported in this module). The value is hashed as given,
    # which keeps the digests identical to the ones computed before.

    digest = hashlib.md5()
    # md5 needs bytes. Encoding explicitly as UTF-8 yields the same bytes as
    # the old implicit ASCII conversion for ASCII input, and additionally
    # supports non-ASCII URLs instead of raising on them.
    digest.update(u"{0}".format(a["url"]).encode("utf-8"))
    digest.update(a["title"].encode("utf-8"))
    digest.update(u"{0}".format(published).encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
class ArticleSchema(Schema):
    """Marshmallow schema describing the serialized form of an article.

    Only the attributes declared here are part of the dumped output; it is
    the schema used by ``Article.__json__``.
    """

    # Primary key and main content.
    id = fields.Integer()
    text = fields.String()
    title = fields.String()
    author = fields.String()

    # Source metadata.
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()

    # Timestamps.
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()

    # Foreign key of the related section.
    section_id = fields.Integer()
|
||||
|
||||
class Article(Base):
    """ORM model for one row of the ``articles`` table.

    Instances are normally obtained through ``Article.from_hash`` (look up
    by fingerprint or create) and then filled in with ``process_hash``.
    """

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    # Digest produced by calc_fingerprint(); unique, used to detect
    # already known articles in from_hash().
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    def __init__(self, url=None, title=None, published_date=None):
        """Create an article and stamp ``first_fetched`` with the current
        local time (naive ``datetime.now()``)."""
        self.url = url
        self.title = title
        self.published_date = published_date
        self.first_fetched = datetime.now()

    def __json__(self):
        """Return the article serialized through ``ArticleSchema``."""
        # NOTE(review): indexing the dump result with [0] assumes a
        # marshmallow version whose dump() returns a (data, errors) pair --
        # confirm against the installed version.
        return ArticleSchema().dump(self)[0]

    def dict(self):
        """Return the article as a plain dict.

        ``id`` is rendered as a string; ``section`` is the related Section
        object itself, not its id. Raises ``TypeError`` when ``id`` is still
        ``None`` (``int(None)``).
        """
        return {
            "id": str(int(self.id)),
            "title": self.title,
            "text": self.text,
            "author": self.author,
            "section": self.section,
            "sourcetype": self.sourcetype,
            "last_fetched": self.last_fetched,
            "first_fetched": self.first_fetched,
            "published_date": self.published_date,
            "date": self.date,
            "image": self.image,
            "url": self.url,
        }

    # Disabled helper, kept for reference:
    # @classmethod
    # def sections(self):
    #     sects=db_session.query(Article.section).distinct().all()
    #     for i in range(len(sects)):
    #         sects[i]=sects[i][0]
    #     return sects

    @classmethod
    def from_hash(cls, a):
        """Return the article matching ``a``, creating it if necessary.

        Args:
            a: Mapping with the keys ``"url"``, ``"title"`` and
                ``"published"``.

        Returns:
            The existing ``Article`` whose fingerprint equals
            ``calc_fingerprint_h(a)``; otherwise a new ``Article`` that has
            been added to ``db_session`` and committed.
        """
        fp = calc_fingerprint_h(a)
        aa = Article.query.filter(Article.fingerprint == fp).first()
        if aa is None:
            clogger.debug("new Article")
            # NOTE(review): the previous code tried to parse string
            # timestamps behind an `a["published"] is str` test, which is
            # always False, and no `parse` function is imported in this
            # module. The value has therefore always been stored as given;
            # callers should pass a datetime (or None) here.
            aa = Article(a["url"], a["title"], a["published"])
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        """Copy fetched article data from the mapping ``a`` onto this row.

        ``a["text"]`` is required; ``image``, ``author``, ``title``,
        ``sourcetype``, ``section`` and ``published_date`` are applied only
        when present. Nothing is committed here.

        Raises:
            KeyError: If ``a`` has no ``"text"`` entry.
        """
        text = a["text"]
        # Decode only real byte strings: calling .decode() on text that is
        # already unicode fails for non-ASCII content on Python 2 and does
        # not exist at all on Python 3.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        self.text = text

        # Plain attributes that are copied verbatim when supplied.
        for key in ("image", "author", "title", "sourcetype"):
            if key in a:
                setattr(self, key, a[key])

        if "section" in a:
            self.section = Section.find_or_create(a["section"])

        # Disabled, kept for reference:
        # if "last_fetched" in a:
        #     self.last_fetched=a["last_fetched"]

        if "published_date" in a:
            self.published_date = a["published_date"]
|
||||
|
||||
|
||||
|
||||
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
|
||||
|
||||
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
|
||||
Reference in New Issue
Block a user