diff --git a/bot/bot.py b/bot/bot.py
index 8ba4b67..c55d459 100644
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -98,7 +98,7 @@ def handle(handler,msg):
     lg.debug(msg['data'])
 
 def make_article_json(art):
-    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title + " " + art.url}
+    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title +" (" +str(art.section.title())+")" + " " + art.url}
     if art.image != None:
         lg.debug("http://crawler.fachschaften.at/"+str(art.image))
         res["thumb_url"]="http://crawler.fachschaften.at/"+str(art.image)
diff --git a/compiler/fetching.py b/compiler/fetching.py
index 1d8646f..54dfb10 100644
--- a/compiler/fetching.py
+++ b/compiler/fetching.py
@@ -8,6 +8,8 @@ from gevent import spawn
 from src import clogger
 from src.fb import graph
 from hashlib import md5
+from src.database import db_session2
+from models import CrawlUrl
 import errno
 import urlparse
 def announce_articleid(id):
@@ -38,15 +40,32 @@ def fetch_page(furl):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
+
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
-            tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"))
+            fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            if cc is None:
+                tx = json.dumps(graph.get_object(id=furl))
+            else:
+                tx = cc.raw
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
+
         else:
             tx=s.get(furl).text
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
         CrawlCache.store(furl,tx)
     else:
         #if furl is not None:
diff --git a/compiler/mworker.py b/compiler/mworker.py
index b623978..106868d 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -5,7 +5,9 @@ from mprocess import do_process
 from fetching import fetch_page
 from gevent import spawn
 from itertools import repeat
+from models import CrawlUrl
 from src import clogger
+from src.database import db_session2
 def start_workers(f,c,p):
     for _ in range(f):
         clogger.debug("spawn fetchworker")
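The fetch_page change above limits Facebook fetches to the last ten days by passing a "since" value to the Graph API, which expects seconds since the Unix epoch, and it keys the CrawlCache lookup on the full query string so the time-bounded request itself is cached. A minimal standard-library sketch of that conversion and of building the object path the same way the diff does; the helper names fb_since_epoch and fb_object_path are illustrative only and do not exist in the repository:

    # Sketch of the "since" timestamp and Graph object path used in fetch_page.
    # Assumption: plain stdlib only; CrawlUrl/db_session2 bookkeeping is omitted.
    from datetime import datetime, timedelta

    def fb_since_epoch(days=10, now=None):
        """Return 'now - days' as a Unix timestamp string (seconds since
        1970-01-01 UTC), the format the Graph API expects for 'since'."""
        now = now or datetime.utcnow()
        delta = now - timedelta(days=days) - datetime(1970, 1, 1)
        return str(int(delta.total_seconds()))

    def fb_object_path(netloc, path, days=10):
        """Build the object id/query string the way the diff does, so the
        CrawlCache lookup keys on the exact time-bounded request."""
        return (netloc + path + "?since=" + fb_since_epoch(days)
                + "&fields=story,created_time,id,message,attachments")

    # Example (hypothetical page/path):
    #   fb_object_path("somepage", "/posts")
    #   -> "somepage/posts?since=1456...&fields=story,created_time,id,message,attachments"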