init commit

This commit is contained in:
Andreas Stephanides
2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions

139
articles/model.py Normal file
View File

@@ -0,0 +1,139 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields
from src.sections.model import Section
#import json
import json
import flask
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
import hashlib
#import clogger
import logging
#from crawler.compiler.mqueues import put_fetch_queue
from src import clogger
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
def calc_fingerprint(a):
    """Return the fingerprint of an article-like object.

    Reads ``a.url``, ``a.title`` and ``a.published_date`` and delegates to
    ``calc_fingerprint_h``. The date is passed as ``str(...)``, so a missing
    date reaches the helper as the literal text ``"None"``.
    """
    described = {
        "url": a.url,
        "title": a.title,
        "published": str(a.published_date),
    }
    return calc_fingerprint_h(described)
def _fingerprint_bytes(value):
    """Return *value* as UTF-8 bytes so it can be fed to a hashlib digest."""
    text_type = type(u"")  # ``unicode`` on Python 2, ``str`` on Python 3
    if not isinstance(value, (bytes, text_type)):
        # e.g. a datetime (or None): hash its text form.
        value = text_type(value)
    if isinstance(value, text_type):
        value = value.encode("utf-8")
    return value


def calc_fingerprint_h(a):
    """Return the MD5 hex digest identifying the article described by *a*.

    Args:
        a: mapping with the keys ``"url"``, ``"title"`` and ``"published"``.
            ``"published"`` may be ``None``, the text ``"None"``, a string or
            any object whose text form should be hashed (e.g. a datetime).

    Returns:
        The 32-character hexadecimal MD5 digest of url + title + published,
        each encoded as UTF-8. A missing publication date contributes
        nothing to the digest.

    Raises:
        KeyError: if one of the three keys is missing.
    """
    published = a["published"]
    # calc_fingerprint() passes str(published_date), so a missing date can
    # arrive here as the literal text "None" as well as a real None.
    if published is None or published == "None":
        published = ""
    # NOTE(review): the previous code tested `a["published"] is str`, which is
    # never true for a string value, so string dates were always hashed as
    # given. It also called `parse`, which this file does not import
    # (presumably dateutil.parser.parse - confirm). String dates are therefore
    # still hashed verbatim; normalise them here once `parse` is available.
    digest = hashlib.md5()
    # Explicit UTF-8 encoding instead of the implicit ASCII codec: same digest
    # for ASCII input, and non-ASCII urls/titles no longer raise.
    for part in (a["url"], a["title"], published):
        digest.update(_fingerprint_bytes(part))
    return digest.hexdigest()
class ArticleSchema(Schema):
    """Marshmallow schema declaring the article fields used for serialization."""

    # Identity and content
    id = fields.Integer()
    text = fields.String()
    title = fields.String()
    author = fields.String()
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()

    # Timestamps
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()

    # Foreign key of the related section
    section_id = fields.Integer()
class Article(Base):
    """ORM model for one article row in the ``articles`` table."""

    __tablename__ = 'articles'

    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    # Unique digest; from_hash() looks existing rows up by this value.
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    def __init__(self, url=None, title=None, published_date=None):
        """Create an article and stamp ``first_fetched`` with the current local time."""
        self.url = url
        self.title = title
        self.published_date = published_date
        self.first_fetched = datetime.now()

    def __json__(self):
        """Return the article serialized through ``ArticleSchema``."""
        # Takes element 0 of the dump result - presumably the data part of a
        # (data, errors) pair as returned by marshmallow 2; TODO confirm the
        # marshmallow version in use.
        return ArticleSchema().dump(self)[0]

    def dict(self):
        """Return the article's fields as a plain dict.

        ``id`` is rendered as a string; ``section`` holds the related
        ``section`` attribute itself, not its id.
        """
        return {
            "id": str(int(self.id)),
            "title": self.title,
            "text": self.text,
            "author": self.author,
            "section": self.section,
            "sourcetype": self.sourcetype,
            "last_fetched": self.last_fetched,
            "first_fetched": self.first_fetched,
            "published_date": self.published_date,
            "date": self.date,
            "image": self.image,
            "url": self.url,
        }

    # @classmethod
    # def sections(self):
    #     sects=db_session.query(Article.section).distinct().all()
    #     for i in range(len(sects)):
    #         sects[i]=sects[i][0]
    #     return sects

    @classmethod
    def from_hash(cls, a):
        """Return the article matching mapping *a*, creating it if necessary.

        Args:
            a: mapping with the keys ``"url"``, ``"title"`` and
                ``"published"``.

        Returns:
            The existing row whose fingerprint equals
            ``calc_fingerprint_h(a)``, or a newly added and committed article
            when no such row exists.
        """
        fp = calc_fingerprint_h(a)
        aa = cls.query.filter(cls.fingerprint == fp).first()
        if aa is None:
            clogger.debug("new Article")
            # NOTE(review): the previous code tested `a["published"] is str`,
            # which is never true for a string value, so the date was always
            # stored exactly as supplied; it also called `parse`, which this
            # file does not import (presumably dateutil.parser.parse -
            # confirm). The pass-through is kept; convert string dates here
            # once `parse` is available.
            aa = cls(a["url"], a["title"], a["published"])
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        """Copy crawled values from mapping *a* onto this article.

        ``"text"`` is required (``KeyError`` if missing); ``"image"``,
        ``"author"``, ``"title"``, ``"sourcetype"``, ``"section"`` and
        ``"published_date"`` are applied only when present. Nothing is
        committed here.
        """
        text = a["text"]
        # Decode only real byte strings: calling .decode() on already-decoded
        # text raises for non-ASCII content on Python 2 and does not exist on
        # Python 3.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        self.text = text

        # Optional plain attributes are copied verbatim.
        for key in ("image", "author", "title", "sourcetype"):
            if key in a:
                setattr(self, key, a[key])

        if "section" in a:
            self.section = Section.find_or_create(a["section"])
        # if "last_fetched" in a:
        #     self.last_fetched=a["last_fetched"]
        if "published_date" in a:
            self.published_date = a["published_date"]
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)