fachschaften/compiler/models.py

from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema, fields, ValidationError
import json
import flask
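
# Models for the crawl pipeline: CrawlUrl records which URLs of which type
# should be fetched and lets a fetch job be queued, CrawlCache stores the raw
# fetched content per URL, and the marshmallow schemas define their JSON form.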

def add_url(tpe, url):
    """Find or create the CrawlUrl, persist it, and queue it for fetching."""
    cu = CrawlUrl.find_or_create(tpe, url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()
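
# Example usage (a sketch: the type string and URL are illustrative, and a
# configured db_session2 plus a worker consuming the fetch queue are assumed):
#   add_url("fachschaft", "https://example.org/")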

class CrawlUrlSchema(Schema):
    id = fields.Integer()
    tpe = fields.String()
    url = fields.String()
    last_fetched = fields.DateTime()
    fetched = fields.DateTime()

class CrawlUrl(Base2):
    __tablename__ = 'crawlurls'
    id = Column(Integer, primary_key=True)
    tpe = Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)
    __schema__ = CrawlUrlSchema
    __jsonid__ = 'crawlurl'
    __whiteattrs__ = ["id", "tpe", "url"]
    __jsonattrs__ = None

    def fetched(self):
        # Return the cached response for this URL, if one has been stored.
        return CrawlCache.query.filter(CrawlCache.url == self.url).first()

    @classmethod
    def find_or_create(cls, tpe, url):
        aa = cls.query.filter(cls.url == url).filter(cls.tpe == tpe).first()
        if aa is None:
            # Assumes Base2 exposes SQLAlchemy's default keyword-argument
            # constructor (the positional __init__ below is commented out).
            aa = cls(tpe=tpe, url=url)
        return aa

    def schedule(self):
        # Enqueue a fetch job as (priority, type, url) with priority 0.
        put_fetch_queue((0, self.tpe, self.url))

    # def __init__(self, tpe, url):
    #     self.url = url
    #     self.tpe = tpe

    def __json__(self):
        # Used by the (currently disabled) flask JSON encoder hook at the
        # bottom of this file; with marshmallow 2.x, dump() returns a
        # (data, errors) result, so [0] is the serialized dict.
        return CrawlUrlSchema().dump(self)[0]

class CrawlCacheSchema(Schema):
    id = fields.Integer()
    raw = fields.String()
    url = fields.String()
    fetched = fields.DateTime()

class CrawlCache(Base2):
    __tablename__ = 'crawlcache'
    id = Column(Integer, primary_key=True)
    url = Column(String(250))
    fetched = Column(DateTime)
    raw = Column(Text)
    __schema__ = CrawlCacheSchema
    __jsonattrs__ = None
    __jsonid__ = 'crawlcache'
    __whiteattrs__ = []

    def __init__(self, url, raw):
        self.url = url
        self.raw = raw
        self.fetched = datetime.utcnow()

    def __json__(self):
        # [0] extracts the data part of marshmallow 2.x's (data, errors)
        # result, mirroring CrawlUrl.__json__.
        return CrawlCacheSchema().dump(self)[0]

    @classmethod
    def store(cls, url, raw):
        # Persist the raw response body for a URL, stamped with the fetch time.
        cc = cls(url, raw)
        db_session2.add(cc)
        db_session2.commit()

#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)
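
# Example usage (a sketch; the URL and payload are illustrative):
#   CrawlCache.store("https://example.org/", "<html>...</html>")
#   cached = CrawlCache.query.filter(CrawlCache.url == "https://example.org/").first()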