improved fb fetching (since)

Andreas Stephanides
2017-01-19 15:09:24 +01:00
parent 7e90ce324d
commit 2fd82f0bc9
3 changed files with 23 additions and 2 deletions

View File

@@ -98,7 +98,7 @@ def handle(handler,msg):
     lg.debug(msg['data'])
 def make_article_json(art):
-    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title + " " + art.url}
+    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title +" (" +str(art.section.title())+")" + " " + art.url}
     if art.image != None:
         lg.debug("http://crawler.fachschaften.at/"+str(art.image))
         res["thumb_url"]="http://crawler.fachschaften.at/"+str(art.image)

View File

@@ -8,6 +8,8 @@ from gevent import spawn
 from src import clogger
 from src.fb import graph
 from hashlib import md5
+from src.database import db_session2
+from models import CrawlUrl
 import errno
 import urlparse
 def announce_articleid(id):
@@ -38,15 +40,32 @@ def fetch_page(furl):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
-            tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"))
+            fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            if cc is None:
+                tx = json.dumps(graph.get_object(id=furl))
+            else:
+                tx = cc.raw
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
         else:
             tx=s.get(furl).text
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
         CrawlCache.store(furl,tx)
     else:
         #if furl is not None:
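
The substantive change here is that the Facebook Graph request now carries a since parameter (a Unix timestamp roughly ten days in the past), the result is cached under the full request string, and CrawlUrl.last_fetched is updated whenever a URL is actually fetched. A minimal sketch of how the since value and the request string are built; "somepage/feed" is a made-up placeholder for the real u[1]+u[2] path taken from the fb-style URL:

# Illustration only, not part of the commit.
from datetime import datetime, timedelta

current_time = datetime.utcnow()
# Unix timestamp (seconds since 1970-01-01 UTC) for ten days ago; the Graph API
# accepts "since" to restrict a feed to posts newer than this point in time.
fb_time_since = str(int((current_time - timedelta(days=10) - datetime(1970, 1, 1)).total_seconds()))

# "somepage/feed" is a hypothetical page path standing in for u[1]+u[2].
furl = "somepage/feed" + "?since=" + fb_time_since + "&fields=story,created_time,id,message,attachments"
# This full string is also the cache key, so the cache entry changes as the ten-day window moves.
print(furl)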

View File

@@ -5,7 +5,9 @@ from mprocess import do_process
 from fetching import fetch_page
 from gevent import spawn
 from itertools import repeat
+from models import CrawlUrl
 from src import clogger
+from src.database import db_session2
 def start_workers(f,c,p):
     for _ in range(f):
         clogger.debug("spawn fetchworker")