from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from models import CrawlUrl
from src import clogger
from src.database import db_session2


def start_workers(f, c, p):
    """Spawn f fetch workers, c compile workers and p process workers as greenlets."""
    for _ in range(f):
        clogger.debug("spawn fetchworker")
        spawn(work_fetch)
    for _ in range(c):
        spawn(work_compile)
    for _ in range(p):
        spawn(work_process)


def work_fetch():
    # worker loop: handle fetch jobs forever
    while True:
        run_fetch()


def work_process():
    # worker loop: handle process jobs forever
    while True:
        run_process()


def work_compile():
    # worker loop: handle compile jobs forever
    while True:
        run_compile()


def queue_url(tpe, url, params=None):
    # enqueue a URL for a full fetch (pagination is followed)
    if params is None:
        params = {"nofollow": False}
    fetch_queue.put((0, tpe, url, params))


# nofollow=True: don't follow pagination recursively, only fetch an update
def queue_url_upd(tpe, url, params=None):
    if params is None:
        params = {"nofollow": True}
    fetch_queue.put((0, tpe, url, params))


# fetch a page from the fetch queue
def run_fetch():
    tc, tpe, url, params = fetch_queue.get()
    if tpe not in ("dummyarticle", "dummyindex"):
        rw = fetch_page(url)
    else:
        rw = "\ndummytext\n"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}, params))
    return rw
    # fetch_queue.task_done()


# compile something from the compile queue
def run_compile():
    tc, tpe, h, params = compile_queue.get()
    h = do_compile(tpe, h)
    process_queue.put((0, tpe, h, params))
    return h
    # compile_queue.task_done()


# process something from the process queue
def run_process():
    tc, tpe, h, params = process_queue.get()
    do_process(tpe, h, params)
    return h
    # process_queue.task_done()