83 lines
2.3 KiB
Python
83 lines
2.3 KiB
Python
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
|
|
from datetime import datetime
|
|
from src.database import Base2
|
|
from src.database import db_session2
|
|
from mqueues import put_fetch_queue
|
|
from marshmallow import Schema,fields,ValidationError
|
|
import json
|
|
import flask
|
|
|
|
def add_url(tpe, url):
    """Register a (tpe, url) pair and queue it for fetching.

    The pair is resolved through ``CrawlUrl.find_or_create``, the resulting
    object is added to ``db_session2`` and committed, and finally its
    ``schedule`` method is called.
    """
    crawl_url = CrawlUrl.find_or_create(tpe, url)
    db_session2.add(crawl_url)
    db_session2.commit()
    # Scheduling runs only after the commit has gone through.
    crawl_url.schedule()
|
|
|
|
|
|
class CrawlUrlSchema(Schema):
    """Marshmallow schema declaring the fields dumped for a crawl URL."""

    # Scalar identity fields.
    id = fields.Integer()
    tpe = fields.String()
    url = fields.String()

    # Timestamp fields.
    last_fetched = fields.DateTime()
    fetched = fields.DateTime()
|
|
|
|
class CrawlUrl(Base2):
    """A URL registered for crawling, stored in the ``crawlurls`` table.

    A row is looked up by its ``(tpe, url)`` pair (see ``find_or_create``).
    No ``__init__`` is defined on this class, so instances are built by
    whatever constructor ``Base2`` provides.
    """

    __tablename__ = 'crawlurls'

    id = Column(Integer, primary_key=True)
    tpe = Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)

    # Serialisation metadata. Nothing in this file reads these attributes;
    # presumably they are consumed by Base2 or other project helpers.
    __schema__ = CrawlUrlSchema
    __jsonid__ = 'crawlurl'
    __whiteattrs__ = ["id", "tpe", "url"]
    __jsonattrs__ = None

    def fetched(self):
        """Return when this URL's content was most recently cached.

        Returns:
            The ``fetched`` datetime of the newest ``CrawlCache`` row whose
            ``url`` equals ``self.url``, or ``None`` when no such row exists.
        """
        # SQLAlchemy's Query offers ``filter``, not ``find``; the previous
        # call raised AttributeError and its result was never returned.
        newest = (
            CrawlCache.query
            .filter(CrawlCache.url == self.url)
            .order_by(CrawlCache.fetched.desc())
            .first()
        )
        return None if newest is None else newest.fetched

    @classmethod
    def find_or_create(cls, tpe, url):
        """Return the row matching ``(tpe, url)``, building one if absent.

        A newly built object is NOT added to the session or committed here;
        that is left to the caller.

        Args:
            tpe: Value matched against the ``tpe`` column.
            url: Value matched against the ``url`` column.

        Returns:
            The first matching instance, or a fresh unsaved instance.
        """
        match = cls.query.filter(cls.url == url).filter(cls.tpe == tpe).first()
        if match is None:
            # NOTE(review): relies on Base2 supplying a constructor that takes
            # a dict of column values -- confirm against src.database.
            match = cls({"tpe": tpe, "url": url})
        return match

    def schedule(self):
        """Put ``(0, self.tpe, self.url)`` on the fetch queue."""
        put_fetch_queue((0, self.tpe, self.url))

    # Former explicit constructor, kept for reference:
    # def __init__(self, tpe, url):
    #     self.url=url
    #     self.tpe=tpe

    def __json__(self):
        """Return this row serialised through ``CrawlUrlSchema``."""
        dumped = CrawlUrlSchema().dump(self)
        # marshmallow 2 returns a (data, errors) result; marshmallow 3 returns
        # the data dict directly. Hand back the data in either case.
        return dumped if isinstance(dumped, dict) else dumped[0]
|
|
|
|
class CrawlCacheSchema(Schema):
    """Marshmallow schema declaring the fields dumped for a cache entry."""

    id = fields.Integer()

    # Stored payload and the URL it belongs to.
    raw = fields.String()
    url = fields.String()

    fetched = fields.DateTime()
|
|
|
|
class CrawlCache(Base2):
    """Raw content fetched for a URL, stored in the ``crawlcache`` table."""

    __tablename__ = 'crawlcache'

    id = Column(Integer, primary_key=True)
    url = Column(String(250))
    fetched = Column(DateTime)
    raw = Column(Text)

    # Serialisation metadata. Nothing in this file reads these attributes;
    # presumably they are consumed by Base2 or other project helpers.
    __schema__ = CrawlCacheSchema
    __jsonattrs__ = None
    __jsonid__ = 'crawlcache'
    __whiteattrs__ = []

    def __init__(self, url, rw):
        """Build a cache entry for ``url`` holding the raw payload ``rw``.

        ``fetched`` is set to the current time as a naive UTC datetime.
        """
        self.url = url
        self.raw = rw
        self.fetched = datetime.utcnow()

    def __json__(self):
        """Return this row serialised through ``CrawlCacheSchema``."""
        dumped = CrawlCacheSchema().dump(self)
        # marshmallow 2 returns a (data, errors) result; marshmallow 3 returns
        # the data dict directly. Return only the data so the output has the
        # same shape as CrawlUrl.__json__.
        return dumped if isinstance(dumped, dict) else dumped[0]

    @classmethod
    def store(cls, url, rw):
        """Create a cache entry, add it to ``db_session2`` and commit.

        Args:
            url: URL the payload was fetched from.
            rw: Raw payload to store.

        Returns:
            The newly stored instance.
        """
        entry = cls(url, rw)
        db_session2.add(entry)
        db_session2.commit()
        return entry
|
|
|
|
|
|
|
|
|
|
#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)
|