from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields, post_load
from src.sections.model import Section
import json
import flask
import hashlib
import logging
from src import clogger

# Text type of the running interpreter: ``unicode`` on Python 2, ``str`` on 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str


def _to_bytes(value):
    """Return *value* as UTF-8 encoded bytes, ready to be fed to hashlib.

    Byte strings are returned unchanged; every other value is converted to
    text first (so ``None`` becomes ``"None"``) and then encoded as UTF-8.
    For pure-ASCII input this yields exactly the bytes that Python 2's
    implicit ASCII conversion produced, so existing digests stay the same.
    """
    if isinstance(value, bytes):
        return value
    if not isinstance(value, _text_type):
        value = _text_type(value)
    return value.encode("utf-8")


def calc_fingerprint(a):
    """Return the fingerprint of an article-like object.

    :param a: object exposing ``url``, ``title`` and ``published_date``.
    :returns: hexadecimal MD5 digest computed by :func:`calc_fingerprint_h`.

    ``published_date`` is passed on as ``str(...)``, so a missing date
    arrives as the string ``"None"``, which :func:`calc_fingerprint_h`
    treats like ``None``.
    """
    return calc_fingerprint_h({
        "url": a.url,
        "title": a.title,
        "published": str(a.published_date),
    })


def calc_fingerprint_h(a):
    """Return the fingerprint of an article given as a mapping.

    :param a: mapping with the keys ``"url"``, ``"title"`` and
        ``"published"``.
    :returns: hexadecimal MD5 digest over url, title and publication date,
        in that order.
    :raises KeyError: if one of the three keys is missing.
    :raises AttributeError: if the title is neither a byte string nor text
        (for example ``None``).
    """
    published = a["published"]
    # NOTE(review): the previous code tested ``a["published"] is str`` before
    # calling ``parse``.  That identity test is never true for a string value
    # and ``parse`` is not imported in this module, so string dates have
    # always been hashed verbatim.  That behaviour is kept here so stored
    # fingerprints remain valid; confirm whether date parsing was intended.
    if published is not None and published != "None":
        pp = published
    else:
        pp = ""

    digest = hashlib.md5()
    # Encode explicitly: handing text to hashlib relies on the implicit ASCII
    # codec on Python 2 and fails for non-ASCII URLs.
    digest.update(_to_bytes(a["url"]))
    title = a["title"]
    # Byte-string titles are hashed as they are; text titles are UTF-8 encoded.
    digest.update(title if isinstance(title, bytes) else title.encode("utf-8"))
    digest.update(_to_bytes(pp))
    return digest.hexdigest()


class FullArticleSchema(Schema):
    """Marshmallow schema with the complete set of article fields.

    The ``organization_*`` fields are dump-only.  No ``post_load`` hook is
    registered, so loading does not construct model objects.
    """

    id = fields.Integer()
    parent_id = fields.Integer(allow_none=True)
    url = fields.String()
    is_primary = fields.Boolean(allow_none=True)
    fingerprint = fields.String()
    hash = fields.String(allow_none=True)
    last_fetched = fields.DateTime(allow_none=True)
    first_fetched = fields.DateTime(allow_none=True)
    published_date = fields.DateTime()
    date = fields.DateTime(allow_none=True)
    text = fields.String()
    title = fields.String()
    author = fields.String(allow_none=True)
    section_id = fields.Integer()
    sourcetype = fields.String()
    organization_name = fields.String(dump_only=True)
    organization_image = fields.String(dump_only=True)
    organization_id = fields.Integer(dump_only=True)
    image = fields.String(allow_none=True)
class Article(Base):
    """ORM model for one article, stored in the ``articles`` table."""

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    # Unique per article; computed by calc_fingerprint / calc_fingerprint_h.
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    # Serialization metadata; how these are consumed is defined outside this
    # file.
    __schema__ = FullArticleSchema
    __jsonid__ = 'article'
    __whiteattrs__ = []
    __jsonattrs__ = None

    def __init__(self, url=None, title=None, published_date=None):
        """Create an article and stamp ``first_fetched`` with the current time.

        :param url: address of the article.
        :param title: headline of the article.
        :param published_date: publication date, stored unchanged.
        """
        self.url = url
        self.title = title
        self.published_date = published_date
        # datetime.now() without a tz argument yields a naive local timestamp.
        self.first_fetched = datetime.now()

    def organization_name(self):
        """Return the name of the organization behind this article's section.

        :raises AttributeError: when no section (or organization) is attached.
        """
        return self.section.organization.name

    def organization_image(self):
        """Return the image of the organization behind this article's section.

        :raises AttributeError: when no section (or organization) is attached.
        """
        return self.section.organization.image

    def organization_id(self):
        """Return the id of the organization behind this article's section.

        :raises AttributeError: when no section (or organization) is attached.
        """
        return self.section.organization.id

    @classmethod
    def from_hash(cls, a):
        """Return the article matching mapping *a*, creating it if necessary.

        The fingerprint of *a* is looked up first.  When no stored article
        carries it, a new one is built from ``a["url"]``, ``a["title"]`` and
        ``a["published"]``, added to the session and committed.

        :param a: mapping with the keys ``"url"``, ``"title"`` and
            ``"published"``.
        :returns: the existing or the newly created :class:`Article`.
        """
        fp = calc_fingerprint_h(a)
        # ``query`` is not defined in this file; presumably provided by Base.
        aa = Article.query.filter(Article.fingerprint == fp).first()
        if aa is None:
            clogger.debug("new Article")
            # NOTE(review): the previous code tested ``a["published"] is str``
            # before calling ``parse``.  That identity test is never true for
            # a string value and ``parse`` is not imported in this module, so
            # the value has always been stored as given.  That behaviour is
            # kept; confirm whether date parsing was intended.
            aa = Article(a["url"], a["title"], a["published"])
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        """Copy crawled values from mapping *a* onto this article.

        ``"text"`` is required.  ``"image"``, ``"author"``, ``"title"``,
        ``"sourcetype"``, ``"section"`` and ``"published_date"`` are applied
        only when present.

        :param a: mapping of crawled values.
        :raises KeyError: if ``"text"`` is missing.
        """
        text = a["text"]
        # Decode byte strings only: calling .decode() on text that is already
        # unicode round-trips through ASCII on Python 2 and fails for
        # non-ASCII content.  Any other value is stored unchanged.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        self.text = text

        for key in ("image", "author", "title", "sourcetype"):
            if key in a:
                setattr(self, key, a[key])

        if "section" in a:
            # Section.find_or_create is defined in src.sections.model;
            # presumably it returns a Section for the given value.
            self.section = Section.find_or_create(a["section"])
        if "published_date" in a:
            self.published_date = a["published_date"]


class ArticleSchema(Schema):
    """Marshmallow schema with a reduced set of article fields."""

    id = fields.Integer()
    text = fields.String()
    title = fields.String()
    author = fields.String()
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()
    section_id = fields.Integer()