que all / que

Andreas Stephanides
2017-01-24 11:25:07 +01:00
parent 5006c9fbe3
commit debf6b0ccc
5 changed files with 39 additions and 13 deletions

View File

@@ -10,6 +10,7 @@ import logging
 file_handler=logging.FileHandler(cfg.logfile)
 file_handler.setLevel(logging.DEBUG)
 stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
 clt=logging.getLogger('mylogger')
 clt.setLevel(logging.DEBUG)
@@ -21,6 +22,7 @@ clogger=clt
 download_path=cfg.download_path
 lg=clt
 from gevent import spawn, monkey
 monkey.patch_all()
 from .compiler import start_workers
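The only change here is the added stream_handler.setLevel(logging.DEBUG), which brings console output in line with the file log. A minimal sketch of how this logger setup fits together, assuming the handlers are attached to 'mylogger' elsewhere in the module (the addHandler calls and cfg.logfile are not part of the shown hunk):

import logging
import sys

logfile = "crawler.log"   # stand-in for cfg.logfile; the real path comes from the config module

file_handler = logging.FileHandler(logfile)
file_handler.setLevel(logging.DEBUG)

stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.DEBUG)   # the line added by this commit: stdout now also emits DEBUG

clt = logging.getLogger('mylogger')
clt.setLevel(logging.DEBUG)
clt.addHandler(file_handler)     # assumed wiring; the addHandler calls sit outside the shown hunk
clt.addHandler(stream_handler)

clt.debug("logger wired up")     # goes to both the log file and stdout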

View File

@@ -17,6 +17,12 @@ def urls_que(id):
     mworker.queue_url(cu.tpe, cu.url)
     return cu
+def urls_que_upd(id):
+    cu=CrawlUrl.query.get(id)
+    mworker.queue_url_upd(cu.tpe, cu.url)
+    return cu
 def url_add(url,tpe):
     u=CrawlUrl.find_or_create(tpe, url)
     db_session2.add(u)
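The new urls_que_upd mirrors urls_que but routes through mworker.queue_url_upd, which (see the worker change below) tags the job so that pagination is not followed. A hedged sketch of the difference in what lands on the fetch queue, assuming a loaded CrawlUrl instance cu and the mworker module from this commit:

# Full crawl: pagination from this URL will be followed.
mworker.queue_url(cu.tpe, cu.url)        # fetch_queue item: (0, cu.tpe, cu.url, {})

# Update only: refetch just this page, ignore its "next_page" links.
mworker.queue_url_upd(cu.tpe, cu.url)    # fetch_queue item: (0, cu.tpe, cu.url, {"nofollow": True})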

View File

@@ -29,7 +29,7 @@ def process_article(art):
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
     db_session.commit()
-    clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
 #   announce_articleid(aa.id)
     return aa
@@ -47,15 +47,20 @@ def process_urllist(urllist, tpe, parent_url):
         process_url(u,tpe, parent_url)
-def do_process(tpe, cont):
+def do_process(tpe, cont,params={}):
     urllist=[]
 #   clogger.debug("process :" + str(cont))
     if "article_links" in cont:
         process_urllist(cont["article_links"], article_types[tpe], cont["url"])
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
+    if params.has_key("nofollow") and params["nofollow"]:
+        nofollow=True
+    else:
+        nofollow=False
-    if "next_page" in cont and cont["next_page"] is not None:
+    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
         process_url(cont["next_page"],tpe, cont["url"])
     if "article" in cont:
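Two portability notes on this hunk: dict.has_key() exists only in Python 2 (under Python 3 the check would be params.get("nofollow") or "nofollow" in params), and a mutable default argument like params={} is created once and shared between calls. A hedged sketch of the same pagination guard written with those points in mind, using the same helper names as the diff:

def do_process(tpe, cont, params=None):
    # None instead of a mutable {} default; the {} in the diff is shared across calls.
    params = params or {}
    nofollow = bool(params.get("nofollow"))   # Python 3 replacement for params.has_key("nofollow")
    if "next_page" in cont and cont["next_page"] is not None and not nofollow:
        process_url(cont["next_page"], tpe, cont["url"])
    # ... article_links / index_links / article handling as in the diff ...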

View File

@@ -29,32 +29,36 @@ def work_compile():
         run_compile()
-def queue_url(tpe, url):
-    fetch_queue.put((0,tpe,url))
+def queue_url(tpe, url,params={}):
+    fetch_queue.put((0,tpe,url,params))
+#param nofollow = True : Don't follow pagination recursivly to only fetch an update
+def queue_url_upd(tpe, url,params={"nofollow": True}):
+    fetch_queue.put((0,tpe,url,params))
 # fetch a page from the url list
 def run_fetch():
-    tc, tpe, url = fetch_queue.get()
+    tc, tpe, url,params = fetch_queue.get()
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
         rw="<p> dummytext</p>"
-    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
+    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},params))
     return rw
 #   fetch_queue.task_done()
 #comile something from the compile list
 def run_compile():
-    tc,tpe,h = compile_queue.get()
+    tc,tpe,h,params = compile_queue.get()
     h=do_compile(tpe,h)
-    process_queue.put((0,tpe, h))
+    process_queue.put((0,tpe, h,params))
     return h
 #   compile_queue.task_done()
 def run_process():
-    tc,tpe,h = process_queue.get()
-    do_process(tpe, h)
+    tc,tpe,h,params = process_queue.get()
+    do_process(tpe, h,params)
     return h
 #   process_queue.task_done()
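The extra tuple element threads the params dict unchanged through all three queues (fetch, then compile, then process), so a nofollow flag set in queue_url_upd is still visible when do_process runs. Two caveats, shown in a hedged sketch of the first stage with the same fetch_queue/compile_queue/fetch_page names as the module: `is not` compares object identity rather than string value, and params={} as a default is shared between calls.

def queue_url(tpe, url, params=None):
    # The diff uses params={}; a None default avoids sharing one dict between calls.
    fetch_queue.put((0, tpe, url, params or {}))

def run_fetch():
    tc, tpe, url, params = fetch_queue.get()
    # `tpe is not "dummyarticle"` tests identity, which only happens to work for some
    # interned strings; != is the reliable comparison.
    if tpe != "dummyarticle" and tpe != "dummyindex":
        rw = fetch_page(url)
    else:
        rw = "<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}, params))
    return rw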

View File

@@ -16,7 +16,7 @@ import mworker
 from compiler import do_compile
 from fetching import fetch_page
-from .controller import urls_test, start_workers, urls_que, url_add
+from .controller import urls_test, start_workers, urls_que, url_add, urls_que_upd
 #flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)
@@ -87,11 +87,20 @@ def urls_json(id):
 @compiler_pages.route("/urls/<int:id>/que.json")
 def urls_que_json(id):
     # Lade Alle Urls
-    cu=urls_que(id)
+    cu=urls_que_upd(id)
     cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
     return jsonify(urls=cu, cache=cc)
+# que an existing CrawlUrl for fetching
+@compiler_pages.route("/urls/<int:id>/que_all")
+@compiler_pages.route("/urls/<int:id>/que_all.json")
+def urls_queall_json(id):
+    # Lade Alle Urls
+    cu=urls_que(id)
+    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
+    return jsonify(urls=cu, cache=cc)
 @compiler_pages.route("/urls/que.lst")
 def urls_que_lst():
     # Lade Alle Urls
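With this hunk the two endpoints split the behaviour: /urls/<int:id>/que.json now queues an update-only fetch via urls_que_upd, while the new /urls/<int:id>/que_all and /urls/<int:id>/que_all.json keep the original full re-crawl via urls_que. A hedged sketch of how a client might call them, assuming the blueprint is mounted at the application root on a local development server (host, port, and the example id 42 are assumptions):

import requests   # hypothetical client code, not part of the commit

BASE = "http://localhost:5000"

# Update only: refetch URL 42 itself without following its pagination.
requests.get(BASE + "/urls/42/que.json")

# Full re-crawl: queue URL 42 and follow its "next_page" links again.
requests.get(BASE + "/urls/42/que_all.json")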