fetch all fb

Andreas Stephanides
2017-02-17 09:58:35 +01:00
parent eb071d9f95
commit 80c42c04cd
3 changed files with 12 additions and 6 deletions


@@ -36,7 +36,7 @@ from datetime import datetime, timedelta
-def fetch_page(furl):
+def fetch_page(furl,p={}):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
@@ -49,7 +49,10 @@ def fetch_page(furl):
     clogger.debug("fetching url: "+ str(furl))
     if u[0]=='fb':
         fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
-        furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+        if p.has_key("nofollow") and p["nofollow"]==False:
+            furl=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"
+        else:
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None:
         tx = json.dumps(graph.get_object(id=furl))
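
Taken together, the two hunks above thread an optional parameter dict p through fetch_page: for fb: URLs, passing nofollow=False drops the since= window from the Graph API query so the whole feed is fetched (the "fetch all fb" of the commit message), while the default path keeps the 10-day window. Note that the new p={} default is a mutable default argument shared across calls in Python; the sketch below sidesteps it with p=None. A minimal sketch of the same branch in Python 3, where dict.has_key() no longer exists (field list and URL layout copied from the diff; build_fb_request is a hypothetical helper, not project code):

from datetime import datetime, timedelta
from urllib.parse import urlparse

FIELDS = "story,created_time,id,message,attachments"

def build_fb_request(furl, p=None):
    # hypothetical helper: rebuilds the Graph API request string the
    # way the patched fetch_page does, assuming the same fb:// layout
    p = p or {}
    u = urlparse(furl)
    if u.scheme != 'fb':
        return furl
    if p.get("nofollow") is False:
        # explicit nofollow=False: no time window, fetch everything
        return u.netloc + u.path + "?fields=" + FIELDS
    # default: only posts from the last 10 days, as in the diff
    since = int((datetime.utcnow() - timedelta(days=10)
                 - datetime(1970, 1, 1)).total_seconds())
    return u.netloc + u.path + "?since=%d&fields=%s" % (since, FIELDS)

(The original test is p["nofollow"]==False, which would also match 0; the is False spelling above is the stricter reading.)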


@@ -5,7 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+from sqlalchemy.exc import InvalidRequestError
 from compiler import article_types
 from fixing import fix_link
 # process article expects an hash with raw data for the article and puts it into an
@@ -28,9 +28,12 @@ def process_article(art):
     aa.last_fetched = datetime.now()
     aa.sourcetype = art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
     # announce_articleid(aa.id)
     return aa
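
The second file gains a guarded commit: if db_session.commit() raises InvalidRequestError, the session is rolled back and the error logged instead of the exception killing the worker and leaving the session unusable. The except InvalidRequestError,e: spelling is Python 2; a minimal sketch of the same guard in Python 3 syntax (safe_commit is a hypothetical wrapper, session/logger names assumed):

from sqlalchemy.exc import InvalidRequestError

def safe_commit(session, logger):
    # hypothetical wrapper around the pattern in the diff: a failed
    # commit leaves the session dirty, so roll back before reuse
    try:
        session.commit()
        return True
    except InvalidRequestError as e:
        session.rollback()
        logger.error(e)
        return False

Catching the broader sqlalchemy.exc.SQLAlchemyError would also cover IntegrityError and friends, at the cost of hiding more failure modes behind a log line.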


@@ -56,7 +56,7 @@ def run_fetch():
     tc, tpe, url= fetch_queue.get(True, 100)
     clogger.info("Fechted url:"+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
-        rw=fetch_page(url)
+        rw = fetch_page(url, p)
     else:
         rw="<p> dummytext</p>"
     compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},p))
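
Finally, run_fetch forwards the params dict to fetch_page(url, p) so the nofollow flag survives the trip through the queue, and p is re-attached to the tuple pushed onto compile_queue. The visible context line only unpacks three values (tc, tpe, url) from fetch_queue.get, so p presumably arrives via an unpacking change outside this hunk. Two small smells worth flagging: "Fechted" is a typo in the log message, and tpe is not "dummyarticle" tests identity rather than equality, which only works by accident of string interning; != is the safe spelling. A minimal sketch of the loop with both fixed (queue and function names from the diff; fetch_page is injected as an argument since it lives in another module, and p is assumed to ride on the queue tuple):

def run_fetch(fetch_queue, compile_queue, fetch_page, clogger):
    # hypothetical standalone version of the worker loop in the diff
    while True:
        tc, tpe, url, p = fetch_queue.get(True, 100)  # assumption: p is on the tuple
        clogger.info("Fetched url: " + url)
        if tpe != "dummyarticle" and tpe != "dummyindex":
            rw = fetch_page(url, p)
        else:
            rw = "<p> dummytext</p>"
        compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}, p))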