merge
@@ -40,6 +40,7 @@ def cleanup_cache():
    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
    CrawlCache.query.filter(CrawlCache.fetched<ten_weeks_ago).delete()


def get_cached_page(furl):
    current_time = datetime.utcnow()
    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
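Context for the hunk above: cleanup_cache deletes CrawlCache rows whose fetched timestamp is older than twice cfg.cache_days, while get_cached_page treats only rows newer than cfg.cache_days as hits. A minimal, self-contained sketch of that windowing pattern in plain SQLAlchemy 1.4+ (CacheEntry, max_age_days and the in-memory SQLite engine are illustrative, not the project's real model or config):

from datetime import datetime, timedelta

from sqlalchemy import Column, DateTime, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class CacheEntry(Base):
    __tablename__ = "crawl_cache"
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)
    raw = Column(String)
    fetched = Column(DateTime)

engine = create_engine("sqlite://")   # illustrative in-memory database
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

def cleanup_cache(max_age_days=14):
    # sweep: drop everything fetched before the cutoff (the diff uses cfg.cache_days*2)
    cutoff = datetime.utcnow() - timedelta(days=max_age_days * 2)
    session.query(CacheEntry).filter(CacheEntry.fetched < cutoff).delete()
    session.commit()

def get_cached_page(url, max_age_days=14):
    # lookup: only rows younger than the cutoff count as cache hits
    cutoff = datetime.utcnow() - timedelta(days=max_age_days)
    return (session.query(CacheEntry)
            .filter(CacheEntry.url == url)
            .filter(CacheEntry.fetched > cutoff)
            .first())

Deriving both cutoffs from one setting keeps the sweep and the lookup consistent with a single retention window.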
@@ -51,7 +52,8 @@ def get_cached_page(furl):
    return cc


-def fetch_page(furl):
+def fetch_page(furl,p={}):
    u=urlparse.urlparse(furl)
    current_time = datetime.utcnow()
    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
@@ -62,8 +64,13 @@ def fetch_page(furl):
    clogger.debug("fetching url: "+ str(furl))
    if u[0]=='fb':
        fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
        if p.has_key("nofollow") and p["nofollow"]==False:
            furl=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"
        else:
            furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
        cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
        # cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()

        if cc is None:
            tx = json.dumps(graph.get_object(id=furl))
        else:
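The fb branch above rewrites an fb:// URL into a Graph API request: the field list is always appended, and a since timestamp (Unix epoch seconds, ten days back) is added unless the caller explicitly passed nofollow=False. A hedged Python 3 sketch of just that URL-building step; build_fb_request, lookback_days and FB_FIELDS are illustrative names, and the original (Python 2) code goes on to call graph.get_object with the result:

from datetime import datetime, timedelta
from urllib.parse import urlencode, urlparse

FB_FIELDS = "story,created_time,id,message,attachments"

def build_fb_request(furl, params=None, lookback_days=10):
    # furl looks like "fb://<netloc><path>"; keep netloc+path, rebuild the query string
    params = params or {}
    u = urlparse(furl)
    query = {"fields": FB_FIELDS}
    if params.get("nofollow") is not False:
        # only an explicit nofollow=False skips the time window, mirroring the diff
        since = datetime.utcnow() - timedelta(days=lookback_days)
        query["since"] = str(int((since - datetime(1970, 1, 1)).total_seconds()))
    return u.netloc + u.path + "?" + urlencode(query)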
@@ -5,7 +5,7 @@ from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file
#from src import app
from sqlalchemy.exc import InvalidRequestError

from compiler import article_types
from fixing import fix_link
@@ -20,14 +20,14 @@ def process_article(art):
        clogger.error("Invalid article hash:" + str(art))
        aa=None
    else:
-        art["text"]=fix_html(art["text"],art["url"])
+        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"]=fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
-        aa.last_fetched=datetime.now()
-        aa.sourcetype=art["sourcetype"]
+        aa.last_fetched = datetime.now()
+        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
    try:
        db_session.commit()
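process_article above finishes by adding the Article and committing inside a try block, and this file imports InvalidRequestError from sqlalchemy.exc. The matching except clause is outside this hunk, so the following is only a sketch of a typical commit-or-rollback helper under that assumption, with the session and logger passed in rather than imported from the app:

from sqlalchemy.exc import SQLAlchemyError  # InvalidRequestError is a subclass of this

def safe_commit(session, logger):
    # flush the pending unit of work; on any SQLAlchemy error, roll back
    # so the session stays usable for the next article
    try:
        session.commit()
        return True
    except SQLAlchemyError as exc:
        session.rollback()
        logger.error("commit failed, rolled back: %s", exc)
        return False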
@@ -92,4 +92,5 @@ def do_process(tpe, cont,params={}):
        if a.has_key("url")==False:
            a["url"]=cont["url"]
        process_article(a)
    db_session.remove()
    return
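A side note on the hunk above: dict.has_key exists only in Python 2. Ported to Python 3, the same url default would be written with the in operator or setdefault; a tiny illustrative helper (ensure_url and its argument names are not from the project):

def ensure_url(article, container):
    # Python 3 spelling of `if a.has_key("url")==False: a["url"]=cont["url"]`
    article.setdefault("url", container["url"])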
@@ -8,6 +8,8 @@ from itertools import repeat
from models import CrawlUrl
from src import clogger
from src.database import db_session2,db_session
from Queue import Empty

def start_workers(f,c,p):
    for _ in range(f):
        clogger.debug("spawn fetchworker")
@@ -18,17 +20,26 @@ def start_workers(f,c,p):
        spawn(work_process)

def work_fetch():
    try:
        while True:
            run_fetch()
    except Empty:
        clogger.info("Fetch - Worker died")

def work_process():
    try:
        while True:
            run_process()
            # db_session.close()
    except Empty:
        clogger.info("Process - Worker died")


def work_compile():
    try:
        while True:
            run_compile()
    except Empty:
        clogger.info("Compile - Worker died")


def queue_url(tpe, url,params={"nofollow": False}):
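The three work_* functions above share one shape: loop forever pulling work, and let Queue.Empty end the worker once its queue stays idle (the get calls gain that timeout later in this commit). A generic Python 3 sketch of the loop with the queue, handler and timeout injected; worker_loop is an illustrative name, and the project spawns these functions with spawn, presumably gevent's:

import logging
import queue  # Python 3 name of the Python 2 `Queue` module used above

log = logging.getLogger("crawler")

def worker_loop(task_queue, handle, timeout=100):
    # block up to `timeout` seconds per item; queue.Empty ends the worker
    try:
        while True:
            handle(task_queue.get(True, timeout))
    except queue.Empty:
        log.info("worker died: queue idle for %ss", timeout)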
@@ -38,16 +49,15 @@ def queue_url(tpe, url,params={"nofollow": False}):
def queue_url_upd(tpe, url,params={"nofollow": True}):
    fetch_queue.put((True,tpe,url,params))


# fetch a page from the url list
def run_fetch():
    try:
-        tc, tpe, url, p= fetch_queue.get()
+        tc, tpe, url, p= fetch_queue.get(True, 100)
    except ValueError:
-        tc, tpe, url= fetch_queue.get()
-    clogger.debug("fetched : "+url)
+        tc, tpe, url= fetch_queue.get(True, 100)
+    clogger.info("Fechted url:"+url)
    if tpe is not "dummyarticle" and tpe is not "dummyindex":
-        rw=fetch_page(url)
+        rw = fetch_page(url, p)
    else:
        rw="<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},p))
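run_fetch above now unpacks four-element (tc, tpe, url, params) tuples with a ValueError fallback for older three-element items, forwards params into fetch_page, and hands the raw payload to the compile queue. A self-contained sketch of that hand-off with fetch_page injected as a stub so it runs on its own; apart from the tuple layout, the names and the example URL are illustrative:

import queue

fetch_queue = queue.Queue()
compile_queue = queue.Queue()

def run_fetch_once(fetch_page, timeout=100):
    # pull one job, fetch it, pass the raw payload (plus params) downstream
    tc, tpe, url, p = fetch_queue.get(True, timeout)
    raw = fetch_page(url, p)
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": raw}, p))

# usage: enqueue one job, then drain it with a stub fetcher
fetch_queue.put((False, "news", "http://example.com/article", {"nofollow": False}))
run_fetch_once(lambda url, p: "<p>raw html</p>")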
@@ -56,7 +66,7 @@ def run_fetch():

#comile something from the compile list
def run_compile():
-    tc,tpe,h, p = compile_queue.get()
+    tc,tpe,h, p = compile_queue.get(True, 100)
    if p.has_key('parent_item'):
        h["parent_item"]=p["parent_item"]
    h=do_compile(tpe,h,p)
@@ -65,7 +75,7 @@ def run_compile():
    # compile_queue.task_done()

def run_process():
-    tc,tpe,h,p = process_queue.get()
+    tc,tpe,h,p = process_queue.get(True, 100)
    do_process(tpe, h,p)
    return h
    # process_queue.task_done()