fetch all fb

Andreas Stephanides
2017-02-17 09:58:35 +01:00
parent eb071d9f95
commit 80c42c04cd
3 changed files with 12 additions and 6 deletions


@@ -36,7 +36,7 @@ from datetime import datetime, timedelta
-def fetch_page(furl):
+def fetch_page(furl,p={}):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
@@ -49,7 +49,10 @@ def fetch_page(furl):
     clogger.debug("fetching url: "+ str(furl))
     if u[0]=='fb':
         fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
-        furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+        if p.has_key("nofollow") and p["nofollow"]==False:
+            furl=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"
+        else:
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None:
         tx = json.dumps(graph.get_object(id=furl))
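
Taken together, the two hunks above thread an optional parameter dict p through fetch_page: for fb: URLs, passing nofollow=False drops the since= window from the Graph API query so the whole feed is fetched (the "fetch all fb" of the commit message), while the default path keeps the 10-day window. Note that the new p={} default is a mutable default argument shared across calls in Python; the sketch below sidesteps it with p=None. A minimal sketch of the same branch in Python 3, where dict.has_key() no longer exists (field list and URL layout copied from the diff; build_fb_request is a hypothetical helper, not project code):

from datetime import datetime, timedelta
from urllib.parse import urlparse

FIELDS = "story,created_time,id,message,attachments"

def build_fb_request(furl, p=None):
    # hypothetical helper: rebuilds the Graph API request string the
    # way the patched fetch_page does, assuming the same fb:// layout
    p = p or {}
    u = urlparse(furl)
    if u.scheme != 'fb':
        return furl
    if p.get("nofollow") is False:
        # explicit nofollow=False: no time window, fetch everything
        return u.netloc + u.path + "?fields=" + FIELDS
    # default: only posts from the last 10 days, as in the diff
    since = int((datetime.utcnow() - timedelta(days=10)
                 - datetime(1970, 1, 1)).total_seconds())
    return u.netloc + u.path + "?since=%d&fields=%s" % (since, FIELDS)

(The original test is p["nofollow"]==False, which would also match 0; the is False spelling above is the stricter reading.)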


@@ -5,7 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+from sqlalchemy.exc import InvalidRequestError
 from compiler import article_types
 from fixing import fix_link
 # process article expects an hash with raw data for the article and puts it into an
@@ -28,9 +28,12 @@ def process_article(art):
     aa.last_fetched = datetime.now()
     aa.sourcetype = art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
     # announce_articleid(aa.id)
     return aa
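
The second file gains a guarded commit: if db_session.commit() raises InvalidRequestError, the session is rolled back and the error logged instead of the exception killing the worker and leaving the session unusable. The except InvalidRequestError,e: spelling is Python 2; a minimal sketch of the same guard in Python 3 syntax (safe_commit is a hypothetical wrapper, session/logger names assumed):

from sqlalchemy.exc import InvalidRequestError

def safe_commit(session, logger):
    # hypothetical wrapper around the pattern in the diff: a failed
    # commit leaves the session dirty, so roll back before reuse
    try:
        session.commit()
        return True
    except InvalidRequestError as e:
        session.rollback()
        logger.error(e)
        return False

Catching the broader sqlalchemy.exc.SQLAlchemyError would also cover IntegrityError and friends, at the cost of hiding more failure modes behind a log line.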


@@ -56,7 +56,7 @@ def run_fetch():
     tc, tpe, url= fetch_queue.get(True, 100)
     clogger.info("Fechted url:"+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
-        rw=fetch_page(url)
+        rw = fetch_page(url, p)
     else:
         rw="<p> dummytext</p>"
     compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},p))
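
Finally, run_fetch forwards the params dict to fetch_page(url, p) so the nofollow flag survives the trip through the queue, and p is re-attached to the tuple pushed onto compile_queue. The visible context line only unpacks three values (tc, tpe, url) from fetch_queue.get, so p presumably arrives via an unpacking change outside this hunk. Two small smells worth flagging: "Fechted" is a typo in the log message, and tpe is not "dummyarticle" tests identity rather than equality, which only works by accident of string interning; != is the safe spelling. A minimal sketch of the loop with both fixed (queue and function names from the diff; fetch_page is injected as an argument since it lives in another module, and p is assumed to ride on the queue tuple):

def run_fetch(fetch_queue, compile_queue, fetch_page, clogger):
    # hypothetical standalone version of the worker loop in the diff
    while True:
        tc, tpe, url, p = fetch_queue.get(True, 100)  # assumption: p is on the tuple
        clogger.info("Fetched url: " + url)
        if tpe != "dummyarticle" and tpe != "dummyindex":
            rw = fetch_page(url, p)
        else:
            rw = "<p> dummytext</p>"
        compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}, p))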