Improved Facebook fetching (added a `since` timestamp to Graph API requests)

2017-01-19 15:09:24 +01:00
parent 7e90ce324d
commit 2fd82f0bc9
3 changed files with 23 additions and 2 deletions
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -98,7 +98,7 @@ def handle(handler,msg):
        lg.debug(msg['data'])

 def make_article_json(art):
-    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title + " " + art.url}
+    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title +" (" +str(art.section.title())+")" + " " + art.url}
    if art.image != None:
        lg.debug("http://crawler.fachschaften.at/"+str(art.image))
        res["thumb_url"]="http://crawler.fachschaften.at/"+str(art.image)
--- a/compiler/fetching.py
+++ b/compiler/fetching.py
@@ -8,6 +8,8 @@ from gevent import spawn
 from src import clogger
 from src.fb import graph
 from hashlib import md5
+from src.database import db_session2
+from models import CrawlUrl
 import errno
 import urlparse
 def announce_articleid(id):
@@ -38,15 +40,32 @@ def fetch_page(furl):
    current_time = datetime.utcnow()
    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
    u=urlparse.urlparse(furl)
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
+ 
    if u[0] == '':
        furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
    if cc is None or u[0]=='fb': # no caching for Facebook
        clogger.debug("fetching url:  "+ str(furl))
        if u[0]=='fb':
-            tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"))
+            fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            if cc is None:
+                tx = json.dumps(graph.get_object(id=furl))
+            else:
+                tx = cc.raw
+                if not cu==None:
+                    cu.last_fetched = datetime.utcnow()
+                    db_session2.add(cu)
+                    db_session2.commit()
+
        else:
            tx=s.get(furl).text
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
        CrawlCache.store(furl,tx)
    else:
    #if furl is not None:
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -5,7 +5,9 @@ from mprocess import do_process
 from fetching import fetch_page
 from gevent import spawn
 from itertools import repeat
+from models import CrawlUrl
 from src import clogger
+from src.database import db_session2
 def start_workers(f,c,p):
    for _ in range(f):
        clogger.debug("spawn fetchworker")