fetch all fb
@@ -36,7 +36,7 @@ from datetime import datetime, timedelta



-def fetch_page(furl):
+def fetch_page(furl,p={}):
     current_time = datetime.utcnow()
     ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
@@ -49,6 +49,9 @@ def fetch_page(furl):
     clogger.debug("fetching url: "+ str(furl))
     if u[0]=='fb':
         fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
-        furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
+        if p.has_key("nofollow") and p["nofollow"]==False:
+            furl=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"
+        else:
+            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None:
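
This hunk is what makes "fetch all fb" possible: when the caller passes p with nofollow set to False, fetch_page drops the since= filter from the Facebook Graph request and asks for the whole feed instead of only the last ten days. Below is a minimal standalone sketch of that URL construction; build_fb_url, netloc and path are hypothetical names, and Python 3 dict access stands in for the module's Python 2 has_key.

from datetime import datetime, timedelta

def build_fb_url(netloc, path, p=None):
    # p mirrors fetch_page's options dict; defaulting to None avoids the
    # mutable-default pitfall of the real signature (p={}), which is harmless
    # there only because p is never modified.
    p = p or {}
    # Unix timestamp for "ten days ago", as computed in fetch_page
    fb_time_since = str(int((datetime.utcnow() - timedelta(days=10)
                             - datetime(1970, 1, 1)).total_seconds()))
    fields = "fields=story,created_time,id,message,attachments"
    if "nofollow" in p and p["nofollow"] == False:
        # nofollow=False: no since= filter, request the full feed
        return netloc + path + "?" + fields
    # default: only posts from the last ten days
    return netloc + path + "?since=" + fb_time_since + "&" + fields

# e.g. build_fb_url("graph.facebook.com", "/somepage/feed", {"nofollow": False})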
@@ -5,7 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
-
+from sqlalchemy.exc import InvalidRequestError
 from compiler import article_types
 from fixing import fix_link
 # process article expects an hash with raw data for the article and puts it into an
@@ -28,9 +28,12 @@ def process_article(art):
     aa.last_fetched = datetime.now()
     aa.sourcetype = art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-    # announce_articleid(aa.id)
     return aa

 # process a single found url
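
The try/except added here follows the usual SQLAlchemy pattern: once a commit fails, the session has to be rolled back before it can be used again for the next article. A minimal sketch of that guard, written with Python 3 except syntax (the project itself uses the Python 2 form "except InvalidRequestError,e") and a hypothetical safe_commit helper:

from sqlalchemy.exc import InvalidRequestError

def safe_commit(session, logger):
    # Commit pending changes; on failure, roll back so the session stays
    # usable instead of raising on every subsequent operation.
    try:
        session.commit()
    except InvalidRequestError as e:
        session.rollback()
        logger.error(e)

Catching only InvalidRequestError mirrors the commit; any other SQLAlchemy error would still propagate to the caller.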
@@ -56,7 +56,7 @@ def run_fetch():
     tc, tpe, url= fetch_queue.get(True, 100)
     clogger.info("Fechted url:"+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
-        rw=fetch_page(url)
+        rw = fetch_page(url, p)
     else:
         rw="<p> dummytext</p>"
     compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},p))
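
With this last change, p reaches fetch_page as well as compile_queue, so the nofollow option can influence how a page is fetched, not only how it is compiled. A sketch of one iteration of the fetch worker, with the queues, fetch_page and p passed in as parameters because their setup lies outside the shown context (run_fetch_once is a hypothetical name, and the original identity test "is not" is written as "not in" here):

def run_fetch_once(fetch_queue, compile_queue, fetch_page, p):
    # Pull one job, fetch it (or substitute dummy markup), and hand it on.
    tc, tpe, url = fetch_queue.get(True, 100)
    if tpe not in ("dummyarticle", "dummyindex"):
        rw = fetch_page(url, p)   # p now reaches fetch_page too
    else:
        rw = "<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}, p))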