From 80c42c04cdbb56dac4a64d2c1277c260b20dc95c Mon Sep 17 00:00:00 2001
From: Andreas Stephanides
Date: Fri, 17 Feb 2017 09:58:35 +0100
Subject: [PATCH] fetch all fb

---
 compiler/fetching.py | 7 +++++--
 compiler/mprocess.py | 9 ++++++---
 compiler/mworker.py  | 2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/compiler/fetching.py b/compiler/fetching.py
index 54dfb10..6fe24b0 100644
--- a/compiler/fetching.py
+++ b/compiler/fetching.py
@@ -36,7 +36,7 @@ from datetime import datetime, timedelta
 
 
 
-def fetch_page(furl):
+def fetch_page(furl,p={}):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
@@ -49,7 +49,10 @@ def fetch_page(furl):
     clogger.debug("fetching url: "+ str(furl))
     if u[0]=='fb':
         fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
-        furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+        if p.has_key("nofollow") and p["nofollow"]==False:
+            furl=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"
+        else:
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None:
         tx = json.dumps(graph.get_object(id=furl))
diff --git a/compiler/mprocess.py b/compiler/mprocess.py
index 44f067a..30f2ed9 100644
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -5,7 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
-
+from sqlalchemy.exc import InvalidRequestError
 from compiler import article_types
 from fixing import fix_link
 # process article expects an hash with raw data for the article and puts it into an
@@ -28,9 +28,12 @@ def process_article(art):
     aa.last_fetched = datetime.now()
     aa.sourcetype = art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-# announce_articleid(aa.id)
     return aa
 
 # process a single found url
diff --git a/compiler/mworker.py b/compiler/mworker.py
index 477a443..3edd4d0 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -56,7 +56,7 @@ def run_fetch():
     tc, tpe, url= fetch_queue.get(True, 100)
     clogger.info("Fechted url:"+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
-        rw=fetch_page(url)
+        rw = fetch_page(url, p)
     else:
         rw="dummytext"
     compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},p))