div. updates

This commit is contained in:
andis
2017-02-17 10:02:20 +01:00
parent bdfa16728d
commit b71803c050
14 changed files with 224 additions and 65 deletions

View File

@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
tpe=Column(String(250))
url = Column(String(250))
last_fetched = Column(DateTime)
__schema__=CrawlUrlSchema
__jsonid__='crawlurl'
__whiteattrs__=["id","tpe","url"]
__jsonattrs__=None
def fetched(self):
    """Return the cache entry stored for this URL, if there is one.

    Queries ``CrawlCache`` for the first row whose ``url`` column equals
    ``self.url``.

    Returns:
        The matching ``CrawlCache`` row, or ``None`` when no row matches.
    """
    # Use ``filter`` (the query method ``find_or_create`` also relies on)
    # and hand the row back to the caller instead of discarding it.
    return CrawlCache.query.filter(CrawlCache.url == self.url).first()
# Look up the CrawlUrl row whose `url` and `tpe` both match the arguments and
# return it; when the query yields no row, construct a new CrawlUrl and return
# that instead. No session add/commit is visible in this method.
# The first parameter of a classmethod receives the class; it is named `self` here.
@classmethod
def find_or_create(self, tpe, url):
# Two chained filters: both the url and the tpe condition must hold.
aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
if aa is None:
# NOTE(review): `aa` is assigned twice in a row with two different constructor
# call shapes (positional tpe/url, then a single dict). The first result is
# immediately overwritten. This text looks like a diff with its +/- markers
# stripped, so one of the two lines is probably a leftover -- confirm which
# constructor signature is current and drop the other.
aa=CrawlUrl(tpe,url)
aa=CrawlUrl({"tpe":tpe,"url": url})
return aa
def schedule(self):
    """Hand this URL over to the fetch queue.

    Builds the tuple ``(0, self.tpe, self.url)`` and passes it to
    ``put_fetch_queue``. Returns ``None``.
    """
    # NOTE(review): the leading 0 is presumably a priority/ordering key --
    # confirm against put_fetch_queue.
    entry = (0, self.tpe, self.url)
    put_fetch_queue(entry)
def __init__(self, tpe, url):
    """Store the URL and its type on the new instance.

    Args:
        tpe: Value assigned to the ``tpe`` attribute.
        url: Value assigned to the ``url`` attribute.
    """
    # Targets are assigned left to right, so ``url`` is set before ``tpe``.
    self.url, self.tpe = url, tpe
# def __init__(self, tpe, url):
# self.url=url
# self.tpe=tpe
def __json__(self):
    """Serialize this object through ``CrawlUrlSchema``.

    Returns:
        The first element of whatever ``CrawlUrlSchema().dump(self)``
        returns.
    """
    # NOTE(review): indexing with [0] presumes dump() returns a
    # (data, errors)-style pair, as marshmallow 2.x does -- confirm the
    # schema library and version in use.
    dumped = CrawlUrlSchema().dump(self)
    return dumped[0]
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
url=Column(String(250))
fetched=Column(DateTime)
raw=Column(Text)
__schema__=CrawlCacheSchema
__jsonattrs__=None
__jsonid__='crawlcache'
__whiteattrs__= []
def __init__(self, url,rw):
self.url=url
self.raw=rw