"""Crawl-URL scheduling and fetched-page cache models.

NOTE(review): depends on project modules (src.database, mqueues) not visible
here; Base2 / db_session2 are assumed to follow the standard SQLAlchemy
declarative-base / scoped-session pattern -- confirm against src.database.
"""
import json

from datetime import datetime

import flask
from marshmallow import Schema, fields, ValidationError
from sqlalchemy import Boolean, Column, DateTime, Integer, String, Text

from mqueues import put_fetch_queue
from src.database import Base2, db_session2


def add_url(tpe, url):
    """Find or create the CrawlUrl for (tpe, url), persist it, and schedule a fetch."""
    cu = CrawlUrl.find_or_create(tpe, url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()


class CrawlUrlSchema(Schema):
    """Marshmallow serialization schema for CrawlUrl rows."""
    id = fields.Integer()
    tpe = fields.String()
    url = fields.String()
    last_fetched = fields.DateTime()
    fetched = fields.DateTime()


class CrawlUrl(Base2):
    """A URL known to the crawler, looked up by the (tpe, url) pair."""
    __tablename__ = 'crawlurls'
    id = Column(Integer, primary_key=True)
    tpe = Column(String(250))  # crawl type/category tag; semantics defined by callers
    url = Column(String(250))
    last_fetched = Column(DateTime)

    # JSON-export metadata, presumably consumed by project serialization
    # helpers not visible in this file.
    __schema__ = CrawlUrlSchema
    __jsonid__ = 'crawlurl'
    __whiteattrs__ = ["id", "tpe", "url"]
    __jsonattrs__ = None

    def fetched(self):
        """Return the cached fetch result (CrawlCache row) for this URL, or None.

        Fixed: SQLAlchemy ``Query`` has no ``find`` method -- the original
        ``query.find(...)`` raised AttributeError -- and the result was
        discarded instead of returned.
        """
        return CrawlCache.query.filter(CrawlCache.url == self.url).first()

    @classmethod
    def find_or_create(cls, tpe, url):
        """Return the existing CrawlUrl for (tpe, url), or a new un-persisted one.

        The caller is responsible for adding a newly created instance to the
        session (see ``add_url``).
        """
        existing = cls.query.filter(cls.url == url).filter(cls.tpe == tpe).first()
        if existing is None:
            # Fixed: the declarative default constructor takes keyword
            # arguments, not a single positional dict -- the original
            # CrawlUrl({"tpe": ..., "url": ...}) raised TypeError.
            existing = cls(tpe=tpe, url=url)
        return existing

    def schedule(self):
        """Enqueue this URL on the fetch queue at top priority (0)."""
        put_fetch_queue((0, self.tpe, self.url))

    def __json__(self):
        # marshmallow 2.x: dump() returns (data, errors); [0] extracts data.
        # NOTE(review): CrawlCache.__json__ omits the [0] -- one of the two
        # is wrong for the installed marshmallow version; confirm and align.
        return CrawlUrlSchema().dump(self)[0]


class CrawlCacheSchema(Schema):
    """Marshmallow serialization schema for CrawlCache rows."""
    id = fields.Integer()
    raw = fields.String()
    url = fields.String()
    fetched = fields.DateTime()


class CrawlCache(Base2):
    """Raw fetched page content keyed by URL, with the fetch timestamp."""
    __tablename__ = 'crawlcache'
    id = Column(Integer, primary_key=True)
    url = Column(String(250))
    fetched = Column(DateTime)
    raw = Column(Text)

    # JSON-export metadata, mirroring CrawlUrl's convention.
    __schema__ = CrawlCacheSchema
    __jsonattrs__ = None
    __jsonid__ = 'crawlcache'
    __whiteattrs__ = []

    def __init__(self, url, rw):
        """Build a cache entry for *url* holding raw body *rw*, stamped now.

        Keeps the naive-UTC ``utcnow()`` timestamp to match existing rows
        (switching to timezone-aware datetimes would change stored values).
        """
        self.url = url
        self.raw = rw
        self.fetched = datetime.utcnow()

    def __json__(self):
        # NOTE(review): unlike CrawlUrl.__json__, this returns dump(self)
        # without [0]; on marshmallow 2.x that is the (data, errors) tuple,
        # not the data dict -- confirm which form downstream consumers expect.
        return CrawlCacheSchema().dump(self)

    @classmethod
    def store(cls, url, rw):
        """Persist a new cache entry for *url* containing raw body *rw*."""
        cc = cls(url, rw)
        db_session2.add(cc)
        db_session2.commit()


#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)