improved fb fetching (since)
@@ -98,7 +98,7 @@ def handle(handler,msg):
     lg.debug(msg['data'])

 def make_article_json(art):
-    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title + " " + art.url}
+    res={"type":"article", "title":art.title+" (" +str(art.section.title())+")", "id": str(art.id), "url": art.url, "message_text": art.title +" (" +str(art.section.title())+")" + " " + art.url}
     if art.image != None:
         lg.debug("http://crawler.fachschaften.at/"+str(art.image))
         res["thumb_url"]="http://crawler.fachschaften.at/"+str(art.image)
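For illustration, a minimal sketch of the dict that make_article_json now builds. The field names and the concatenation come from the diff; the article values are invented. The only change is that message_text now carries the section title that was previously present only in title.

    # Hypothetical article values; only the dict shape is taken from the diff above.
    title, section, art_id, url = "Mensa closed", "News", "42", "http://example.org/a/42"

    res = {
        "type": "article",
        "title": title + " (" + section + ")",        # "Mensa closed (News)"
        "id": art_id,
        "url": url,
        # before: title + " " + url
        # after:  title + " (" + section + ")" + " " + url
        "message_text": title + " (" + section + ")" + " " + url,
    }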
@@ -8,6 +8,8 @@ from gevent import spawn
 from src import clogger
 from src.fb import graph
 from hashlib import md5
+from src.database import db_session2
+from models import CrawlUrl
 import errno
 import urlparse
 def announce_articleid(id):
@@ -38,15 +40,32 @@ def fetch_page(furl):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
+
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
-            tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"))
+            fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            if cc is None:
+                tx = json.dumps(graph.get_object(id=furl))
+            else:
+                tx = cc.raw
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
+
         else:
             tx=s.get(furl).text
+            if not cu==None:
+                cu.last_fetched = datetime.utcnow()
+                db_session2.add(cu)
+                db_session2.commit()
         CrawlCache.store(furl,tx)
     else:
         #if furl is not None:
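The heart of the change, sketched standalone: the Graph API path now carries a since parameter (Unix epoch seconds, ten days back from current_time) in addition to the fields list, and that timestamped path doubles as the CrawlCache key that is checked before graph.get_object is called. A minimal sketch, assuming a crawl URL of the form fb://<page>/<path>; the real page ids are not part of the diff.

    from datetime import datetime, timedelta
    try:
        from urlparse import urlparse        # Python 2, as used in the module above
    except ImportError:
        from urllib.parse import urlparse    # Python 3 fallback for this sketch

    furl = "fb://somepage/posts"              # hypothetical crawl URL
    u = urlparse(furl)                        # u[1] = netloc, u[2] = path, as in the diff

    # Epoch seconds for "now minus 10 days", matching the fb_time_since computation
    current_time = datetime.utcnow()
    fb_time_since = str(int((current_time - timedelta(days=10) - datetime(1970, 1, 1)).total_seconds()))

    # Graph API object id with the since window baked in; this same string is reused
    # as the CrawlCache lookup key before graph.get_object(id=...) would be called.
    graph_path = u[1] + u[2] + "?since=" + fb_time_since + "&fields=story,created_time,id,message,attachments"
    print(graph_path)
    # e.g. somepage/posts?since=1700000000&fields=story,created_time,id,message,attachments

Alongside this, the hunk updates CrawlUrl.last_fetched through db_session2 whenever a URL is actually processed, in both the Facebook branch and the plain-HTTP branch.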
@@ -5,7 +5,9 @@ from mprocess import do_process
 from fetching import fetch_page
 from gevent import spawn
 from itertools import repeat
+from models import CrawlUrl
 from src import clogger
+from src.database import db_session2
 def start_workers(f,c,p):
     for _ in range(f):
         clogger.debug("spawn fetchworker")
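The last hunk only adds imports to the worker module; the worker body itself is not part of this diff. Purely as an illustration of the gevent fan-out pattern that start_workers and spawn point at, with the queue layout and worker body being assumptions rather than the repository's code:

    # Illustrative gevent fan-out; only spawn / fetch_page / clogger appear in the diff.
    from gevent import spawn, joinall
    from gevent.queue import Queue, Empty

    def fetch_worker(q):
        while True:
            try:
                url = q.get(timeout=1)      # stop once no work is left
            except Empty:
                return
            print("fetching " + url)        # stand-in for a call like fetch_page(url)

    def start_fetchers(f, urls):
        q = Queue()
        for url in urls:
            q.put(url)
        workers = [spawn(fetch_worker, q) for _ in range(f)]
        joinall(workers)

    start_fetchers(2, ["http://example.org/a", "http://example.org/b"])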