que all / que
@@ -10,6 +10,7 @@ import logging
file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.DEBUG)
stream_handler=logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.DEBUG)

clt=logging.getLogger('mylogger')
clt.setLevel(logging.DEBUG)
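This hunk configures two DEBUG-level handlers, one writing to cfg.logfile and one to stdout. The calls that attach them to the logger are outside the hunk; a minimal sketch of how they would typically be wired to 'mylogger' (the formatter string, the literal log path standing in for cfg.logfile, and the addHandler wiring are assumptions, not part of the commit):

    import logging, sys

    clt = logging.getLogger('mylogger')
    clt.setLevel(logging.DEBUG)
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(message)s')  # assumed format
    for h in (logging.FileHandler('crawler.log'), logging.StreamHandler(sys.stdout)):
        h.setLevel(logging.DEBUG)   # both handlers pass everything through
        h.setFormatter(fmt)
        clt.addHandler(h)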
@@ -21,6 +22,7 @@ clogger=clt
download_path=cfg.download_path

lg=clt

from gevent import spawn, monkey
monkey.patch_all()
from .compiler import start_workers
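monkey.patch_all() runs before start_workers is imported so that blocking I/O inside the worker greenlets cooperates with gevent's event loop. A rough sketch of that pattern, assuming start_workers spawns greenlet loops that drain a queue (worker/handle names below are illustrative, not taken from the diff):

    from gevent import monkey
    monkey.patch_all()      # patch the standard library before importing I/O-heavy modules

    import gevent
    from gevent.queue import Queue

    fetch_queue = Queue()

    def worker():
        while True:
            job = fetch_queue.get()   # yields to other greenlets while waiting
            handle(job)               # hypothetical per-job handler

    def start_workers(n=4):
        return [gevent.spawn(worker) for _ in range(n)]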
@@ -17,6 +17,12 @@ def urls_que(id):
    mworker.queue_url(cu.tpe, cu.url)
    return cu

def urls_que_upd(id):
    cu=CrawlUrl.query.get(id)
    mworker.queue_url_upd(cu.tpe, cu.url)
    return cu


def url_add(url,tpe):
    u=CrawlUrl.find_or_create(tpe, url)
    db_session2.add(u)
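The new urls_que_upd mirrors urls_que but goes through mworker.queue_url_upd, which (per the later hunks) enqueues the URL with {"nofollow": True} so pagination is not followed. A short usage sketch (the ID is illustrative):

    cu = urls_que(42)       # full crawl of CrawlUrl 42: next_page links are followed
    cu = urls_que_upd(42)   # update-only: queued with nofollow, pagination is skipped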
@@ -29,7 +29,7 @@ def process_article(art):
    aa.sourcetype=art["sourcetype"]
    db_session.add(aa)
    db_session.commit()
    clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
    # announce_articleid(aa.id)
    return aa
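The only change here is raising the article log line from debug to info. Note that the line builds its message by concatenating str(aa.id) with aa.title.encode("utf-8"), which only joins cleanly on Python 2; a sketch of an equivalent call using the logging module's lazy interpolation (an assumption, not part of the commit):

    clogger.info("Updated/Added Article %s: %s", aa.id, aa.title)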
@@ -47,15 +47,20 @@ def process_urllist(urllist, tpe, parent_url):
    process_url(u,tpe, parent_url)


def do_process(tpe, cont):
def do_process(tpe, cont,params={}):
    urllist=[]
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe , cont["url"])

    if params.has_key("nofollow") and params["nofollow"]:
        nofollow=True
    else:
        nofollow=False

    if "next_page" in cont and cont["next_page"] is not None:
    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
        process_url(cont["next_page"],tpe, cont["url"])

    if "article" in cont:
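do_process now takes a params dict and skips the next_page recursion when nofollow is set. dict.has_key is Python-2-only and a mutable default argument is shared across calls; a hedged Python 3 sketch of the same check (the rest of do_process is omitted, so this is not a drop-in replacement):

    def do_process(tpe, cont, params=None):
        params = params or {}                     # avoid the shared mutable default
        nofollow = bool(params.get("nofollow"))   # replaces has_key()
        next_page = cont.get("next_page")
        if next_page is not None and not nofollow:
            process_url(next_page, tpe, cont["url"])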
@@ -29,32 +29,36 @@ def work_compile():
    run_compile()


def queue_url(tpe, url):
    fetch_queue.put((0,tpe,url))
def queue_url(tpe, url,params={}):
    fetch_queue.put((0,tpe,url,params))

# param nofollow = True: don't follow pagination recursively, to only fetch an update
def queue_url_upd(tpe, url,params={"nofollow": True}):
    fetch_queue.put((0,tpe,url,params))


# fetch a page from the url list
def run_fetch():
    tc, tpe, url = fetch_queue.get()
    tc, tpe, url,params = fetch_queue.get()
    if tpe is not "dummyarticle" and tpe is not "dummyindex":
        rw=fetch_page(url)
    else:
        rw="<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},params))
    return rw
    # fetch_queue.task_done()

# compile something from the compile list
def run_compile():
    tc,tpe,h = compile_queue.get()
    tc,tpe,h,params = compile_queue.get()
    h=do_compile(tpe,h)
    process_queue.put((0,tpe, h))
    process_queue.put((0,tpe, h,params))
    return h
    # compile_queue.task_done()

def run_process():
    tc,tpe,h = process_queue.get()
    do_process(tpe, h)
    tc,tpe,h,params = process_queue.get()
    do_process(tpe, h,params)
    return h
    # process_queue.task_done()
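Each queue item grows from a 3-tuple to a 4-tuple so params travels the whole fetch, compile, process pipeline. A minimal end-to-end sketch of that flow with gevent queues (queue names follow the diff; fetch_page, do_compile and do_process are the module's real stages and are assumed importable, the rest is illustrative). It also sidesteps the `tpe is not "dummyarticle"` check, which tests identity rather than equality and would normally be written with `not in`:

    from gevent.queue import Queue

    fetch_queue, compile_queue, process_queue = Queue(), Queue(), Queue()

    def queue_url_upd(tpe, url, params=None):
        # update-only fetch: mark the item so pagination is not followed
        fetch_queue.put((0, tpe, url, params or {"nofollow": True}))

    def run_once():
        tc, tpe, url, params = fetch_queue.get()
        raw = fetch_page(url) if tpe not in ("dummyarticle", "dummyindex") else "<p> dummytext</p>"
        compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": raw}, params))

        tc, tpe, h, params = compile_queue.get()
        process_queue.put((0, tpe, do_compile(tpe, h), params))

        tc, tpe, h, params = process_queue.get()
        do_process(tpe, h, params)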
@@ -16,7 +16,7 @@ import mworker

from compiler import do_compile
from fetching import fetch_page
from .controller import urls_test, start_workers, urls_que, url_add
from .controller import urls_test, start_workers, urls_que, url_add, urls_que_upd

#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)
@@ -87,11 +87,20 @@ def urls_json(id):
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
    # Load all URLs
    cu=urls_que(id)
    cu=urls_que_upd(id)
    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
    return jsonify(urls=cu, cache=cc)


# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que_all")
@compiler_pages.route("/urls/<int:id>/que_all.json")
def urls_queall_json(id):
    # Load all URLs
    cu=urls_que(id)
    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
    return jsonify(urls=cu, cache=cc)

@compiler_pages.route("/urls/que.lst")
def urls_que_lst():
    # Load all URLs
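With these routes, /urls/<id>/que.json now triggers an update-only fetch through urls_que_upd, while the new /urls/<id>/que_all and /urls/<id>/que_all.json keep the old full-crawl behaviour via urls_que. A hedged example of exercising both with Flask's test client (the application import and the ID are assumptions):

    from myapp import app          # hypothetical module that registers compiler_pages

    client = app.test_client()
    r = client.get("/urls/7/que.json")      # update-only: queued with nofollow
    r = client.get("/urls/7/que_all.json")  # full crawl: pagination will be followed
    print(r.get_json()["urls"])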