que all / que

Andreas Stephanides
2017-01-24 11:25:07 +01:00
parent 5006c9fbe3
commit debf6b0ccc
5 changed files with 39 additions and 13 deletions

View File

@@ -17,6 +17,12 @@ def urls_que(id):
     mworker.queue_url(cu.tpe, cu.url)
     return cu
+def urls_que_upd(id):
+    cu=CrawlUrl.query.get(id)
+    mworker.queue_url_upd(cu.tpe, cu.url)
+    return cu
 def url_add(url,tpe):
     u=CrawlUrl.find_or_create(tpe, url)
     db_session2.add(u)
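
Note: the new urls_que_upd mirrors urls_que but hands the URL to mworker.queue_url_upd, which (per the worker change further down) queues it with nofollow set, so only the page itself is refreshed. A minimal comparison sketch, assuming the CrawlUrl model and mworker module from this repo; the wrapper name is hypothetical, not part of the codebase:

def queue_one(id, update_only=False):
    # Hypothetical wrapper: both paths look up the CrawlUrl, only the worker call differs.
    cu = CrawlUrl.query.get(id)
    if update_only:
        mworker.queue_url_upd(cu.tpe, cu.url)   # params default to {"nofollow": True}
    else:
        mworker.queue_url(cu.tpe, cu.url)       # full crawl, next_page links are followed
    return cu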

View File

@@ -29,7 +29,7 @@ def process_article(art):
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
     db_session.commit()
-    clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
     # announce_articleid(aa.id)
     return aa
@@ -47,15 +47,20 @@ def process_urllist(urllist, tpe, parent_url):
         process_url(u,tpe, parent_url)
-def do_process(tpe, cont):
+def do_process(tpe, cont,params={}):
     urllist=[]
     # clogger.debug("process :" + str(cont))
     if "article_links" in cont:
         process_urllist(cont["article_links"], article_types[tpe], cont["url"])
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
+    if params.has_key("nofollow") and params["nofollow"]:
+        nofollow=True
+    else:
+        nofollow=False
-    if "next_page" in cont and cont["next_page"] is not None:
+    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
         process_url(cont["next_page"],tpe, cont["url"])
     if "article" in cont:

View File

@@ -29,32 +29,36 @@ def work_compile():
     run_compile()
-def queue_url(tpe, url):
-    fetch_queue.put((0,tpe,url))
+def queue_url(tpe, url,params={}):
+    fetch_queue.put((0,tpe,url,params))
+# param nofollow = True : don't follow pagination recursively, to only fetch an update
+def queue_url_upd(tpe, url,params={"nofollow": True}):
+    fetch_queue.put((0,tpe,url,params))
 # fetch a page from the url list
 def run_fetch():
-    tc, tpe, url = fetch_queue.get()
+    tc, tpe, url,params = fetch_queue.get()
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
         rw="<p> dummytext</p>"
-    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
+    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},params))
     return rw
     # fetch_queue.task_done()
 # compile something from the compile list
 def run_compile():
-    tc,tpe,h = compile_queue.get()
+    tc,tpe,h,params = compile_queue.get()
     h=do_compile(tpe,h)
-    process_queue.put((0,tpe, h))
+    process_queue.put((0,tpe, h,params))
     return h
     # compile_queue.task_done()
 def run_process():
-    tc,tpe,h = process_queue.get()
-    do_process(tpe, h)
+    tc,tpe,h,params = process_queue.get()
+    do_process(tpe, h,params)
     return h
     # process_queue.task_done()
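
Note: every queue item now carries the params dict as a fourth element, so a flag set in queue_url survives the fetch, compile and process stages unchanged. A stripped-down, runnable sketch of that hand-off; the queue and function names mirror the repo, but the payloads and the final return are simplified stand-ins for fetch_page/do_compile/do_process:

try:
    from queue import PriorityQueue      # Python 3
except ImportError:
    from Queue import PriorityQueue      # Python 2, which this code appears to target

fetch_queue, compile_queue, process_queue = PriorityQueue(), PriorityQueue(), PriorityQueue()

def queue_url(tpe, url, params={}):
    fetch_queue.put((0, tpe, url, params))                   # params enters the pipeline here

def run_fetch():
    tc, tpe, url, params = fetch_queue.get()
    compile_queue.put((0, tpe, {"url": url, "raw": "<p>dummy</p>"}, params))

def run_compile():
    tc, tpe, h, params = compile_queue.get()
    process_queue.put((0, tpe, h, params))                   # forwarded untouched

def run_process():
    tc, tpe, h, params = process_queue.get()
    return tpe, h, params                                    # the real code calls do_process(tpe, h, params)

queue_url("index", "http://example.org", {"nofollow": True})
run_fetch(); run_compile()
print(run_process()[2])                                      # -> {'nofollow': True}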

View File

@@ -16,7 +16,7 @@ import mworker
 from compiler import do_compile
 from fetching import fetch_page
-from .controller import urls_test, start_workers, urls_que, url_add
+from .controller import urls_test, start_workers, urls_que, url_add, urls_que_upd
 #flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)
@@ -87,11 +87,20 @@ def urls_json(id):
 @compiler_pages.route("/urls/<int:id>/que.json")
 def urls_que_json(id):
     # Load all URLs
-    cu=urls_que(id)
+    cu=urls_que_upd(id)
     cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
     return jsonify(urls=cu, cache=cc)
+# queue an existing CrawlUrl for fetching
+@compiler_pages.route("/urls/<int:id>/que_all")
+@compiler_pages.route("/urls/<int:id>/que_all.json")
+def urls_queall_json(id):
+    # Load all URLs
+    cu=urls_que(id)
+    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
+    return jsonify(urls=cu, cache=cc)
 @compiler_pages.route("/urls/que.lst")
 def urls_que_lst():
     # Load all URLs
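
Note: with this change, GET /urls/<id>/que.json becomes an update-only requeue (it goes through urls_que_upd and therefore nofollow), while the new que_all / que_all.json routes keep the previous behaviour of queueing a full crawl. A hedged usage sketch; the host, port and the id 3 are made up, and the blueprint is assumed to be mounted at the application root:

import requests

base = "http://localhost:5000"
requests.get(base + "/urls/3/que.json")      # re-fetch URL 3 only (nofollow update)
requests.get(base + "/urls/3/que_all.json")  # queue URL 3 for a full crawl, pagination followed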