From 80c42c04cdbb56dac4a64d2c1277c260b20dc95c Mon Sep 17 00:00:00 2001
From: Andreas Stephanides
Date: Fri, 17 Feb 2017 09:58:35 +0100
Subject: [PATCH] fetch all fb

---
 compiler/fetching.py | 7 +++++--
 compiler/mprocess.py | 9 ++++++---
 compiler/mworker.py  | 2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/compiler/fetching.py b/compiler/fetching.py
index 54dfb10..6fe24b0 100644
--- a/compiler/fetching.py
+++ b/compiler/fetching.py
@@ -36,7 +36,7 @@ from datetime import datetime, timedelta
 
 
 
-def fetch_page(furl):
+def fetch_page(furl,p={}):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
@@ -49,7 +49,10 @@ def fetch_page(furl):
     clogger.debug("fetching url: "+ str(furl))
     if u[0]=='fb':
         fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
-        furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+        if p.has_key("nofollow") and p["nofollow"]==False:
+            furl=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"
+        else:
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None:
         tx = json.dumps(graph.get_object(id=furl))
diff --git a/compiler/mprocess.py b/compiler/mprocess.py
index 44f067a..30f2ed9 100644
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -5,7 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
-
+from sqlalchemy.exc import InvalidRequestError
 from compiler import article_types
 from fixing import fix_link
 # process article expects an hash with raw data for the article and puts it into an
@@ -28,9 +28,12 @@ def process_article(art):
     aa.last_fetched = datetime.now()
     aa.sourcetype = art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-# announce_articleid(aa.id)
     return aa
 
 # process a single found url
diff --git a/compiler/mworker.py b/compiler/mworker.py
index 477a443..3edd4d0 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -56,7 +56,7 @@ def run_fetch():
     tc, tpe, url= fetch_queue.get(True, 100)
     clogger.info("Fechted url:"+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
-        rw=fetch_page(url)
+        rw = fetch_page(url, p)
     else:
         rw="dummytext"
     compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},p))