diff --git a/__init__.py b/__init__.py
index 8771806..fae53d5 100644
--- a/__init__.py
+++ b/__init__.py
@@ -10,6 +10,7 @@ import logging
 file_handler=logging.FileHandler(cfg.logfile)
 file_handler.setLevel(logging.DEBUG)
 stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
 clt=logging.getLogger('mylogger')
 clt.setLevel(logging.DEBUG)
 
@@ -21,6 +22,7 @@ clogger=clt
 download_path=cfg.download_path
 lg=clt
 
+
 from gevent import spawn, monkey
 monkey.patch_all()
 from .compiler import start_workers
diff --git a/compiler/controller.py b/compiler/controller.py
index dc7d25e..a95946b 100644
--- a/compiler/controller.py
+++ b/compiler/controller.py
@@ -17,6 +17,12 @@ def urls_que(id):
     mworker.queue_url(cu.tpe, cu.url)
     return cu
 
+def urls_que_upd(id):
+    cu=CrawlUrl.query.get(id)
+    mworker.queue_url_upd(cu.tpe, cu.url)
+    return cu
+
+
 def url_add(url,tpe):
     u=CrawlUrl.find_or_create(tpe, url)
     db_session2.add(u)
diff --git a/compiler/mprocess.py b/compiler/mprocess.py
index 86062bb..fb87028 100644
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -29,7 +29,7 @@ def process_article(art):
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
     db_session.commit()
-    clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
 #    announce_articleid(aa.id)
     return aa
 
@@ -47,15 +47,20 @@ def process_urllist(urllist, tpe, parent_url):
         process_url(u,tpe, parent_url)
 
 
-def do_process(tpe, cont):
+def do_process(tpe, cont,params={}):
     urllist=[]
 #    clogger.debug("process :" + str(cont))
     if "article_links" in cont:
         process_urllist(cont["article_links"], article_types[tpe], cont["url"])
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
+
+    if params.has_key("nofollow") and params["nofollow"]:
+        nofollow=True
+    else:
+        nofollow=False
 
-    if "next_page" in cont and cont["next_page"] is not None:
+    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
         process_url(cont["next_page"],tpe, cont["url"])
 
     if "article" in cont:
diff --git a/compiler/mworker.py b/compiler/mworker.py
index 106868d..f22144b 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -29,32 +29,36 @@ def work_compile():
         run_compile()
 
-def queue_url(tpe, url):
-    fetch_queue.put((0,tpe,url))
+def queue_url(tpe, url,params={}):
+    fetch_queue.put((0,tpe,url,params))
+
+#param nofollow = True : Don't follow pagination recursively, only fetch an update
+def queue_url_upd(tpe, url,params={"nofollow": True}):
+    fetch_queue.put((0,tpe,url,params))
 
 # fetch a page from the url list
 def run_fetch():
-    tc, tpe, url = fetch_queue.get()
+    tc, tpe, url,params = fetch_queue.get()
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
         rw="dummytext"
" - compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw})) + compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},params)) return rw # fetch_queue.task_done() #comile something from the compile list def run_compile(): - tc,tpe,h = compile_queue.get() + tc,tpe,h,params = compile_queue.get() h=do_compile(tpe,h) - process_queue.put((0,tpe, h)) + process_queue.put((0,tpe, h,params)) return h # compile_queue.task_done() def run_process(): - tc,tpe,h = process_queue.get() - do_process(tpe, h) + tc,tpe,h,params = process_queue.get() + do_process(tpe, h,params) return h # process_queue.task_done() diff --git a/compiler/views.py b/compiler/views.py index 424be49..b9770df 100644 --- a/compiler/views.py +++ b/compiler/views.py @@ -16,7 +16,7 @@ import mworker from compiler import do_compile from fetching import fetch_page -from .controller import urls_test, start_workers, urls_que, url_add +from .controller import urls_test, start_workers, urls_que, url_add, urls_que_upd #flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None) @@ -87,11 +87,20 @@ def urls_json(id): @compiler_pages.route("/urls//que.json") def urls_que_json(id): # Lade Alle Urls - cu=urls_que(id) + cu=urls_que_upd(id) cc=CrawlCache.query.filter(CrawlCache.url==cu.url) return jsonify(urls=cu, cache=cc) +# que an existing CrawlUrl for fetching +@compiler_pages.route("/urls//que_all") +@compiler_pages.route("/urls//que_all.json") +def urls_queall_json(id): + # Lade Alle Urls + cu=urls_que(id) + cc=CrawlCache.query.filter(CrawlCache.url==cu.url) + return jsonify(urls=cu, cache=cc) + @compiler_pages.route("/urls/que.lst") def urls_que_lst(): # Lade Alle Urls