que all / que
@@ -10,6 +10,7 @@ import logging
 file_handler=logging.FileHandler(cfg.logfile)
 file_handler.setLevel(logging.DEBUG)
 stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
 
 clt=logging.getLogger('mylogger')
 clt.setLevel(logging.DEBUG)
@@ -21,6 +22,7 @@ clogger=clt
 download_path=cfg.download_path
 
 lg=clt
+
 from gevent import spawn, monkey
 monkey.patch_all()
 from .compiler import start_workers
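
Note: the only functional change in the two hunks above is the added stream_handler.setLevel(logging.DEBUG), so the stdout handler now explicitly accepts DEBUG records just like the file handler. A minimal sketch of the resulting setup; the addHandler() calls are not shown in the diff, so their placement here is an assumption, and "crawler.log" merely stands in for cfg.logfile:

    import logging
    import sys

    # Sketch of the configuration these hunks produce (assumptions marked).
    file_handler = logging.FileHandler("crawler.log")   # stand-in for cfg.logfile
    file_handler.setLevel(logging.DEBUG)

    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.DEBUG)               # the line this commit adds

    clt = logging.getLogger('mylogger')
    clt.setLevel(logging.DEBUG)
    clt.addHandler(file_handler)                         # assumed, outside the hunk
    clt.addHandler(stream_handler)                       # assumed, outside the hunk

    clt.debug("handlers configured")                     # now reaches both the file and stdout
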
@@ -17,6 +17,12 @@ def urls_que(id):
     mworker.queue_url(cu.tpe, cu.url)
     return cu
 
+def urls_que_upd(id):
+    cu=CrawlUrl.query.get(id)
+    mworker.queue_url_upd(cu.tpe, cu.url)
+    return cu
+
+
 def url_add(url,tpe):
     u=CrawlUrl.find_or_create(tpe, url)
     db_session2.add(u)
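
Note: the new urls_que_upd() mirrors the existing urls_que() helper but routes through mworker.queue_url_upd(), which (see the worker hunk further down) enqueues the URL with a {"nofollow": True} parameter so only that page is refreshed and its pagination is not crawled. A sketch of the two call paths, with the queue tuple shapes taken from the worker hunk in this commit; the id 42 is purely illustrative:

    cu = CrawlUrl.query.get(42)            # hypothetical id

    mworker.queue_url(cu.tpe, cu.url)      # full crawl; the worker enqueues roughly:
                                           #   fetch_queue.put((0, cu.tpe, cu.url, {}))

    mworker.queue_url_upd(cu.tpe, cu.url)  # update-only; the worker enqueues roughly:
                                           #   fetch_queue.put((0, cu.tpe, cu.url, {"nofollow": True}))
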
@@ -29,7 +29,7 @@ def process_article(art):
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
     db_session.commit()
-    clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
     # announce_articleid(aa.id)
     return aa
 
@@ -47,7 +47,7 @@ def process_urllist(urllist, tpe, parent_url):
         process_url(u,tpe, parent_url)
 
 
-def do_process(tpe, cont):
+def do_process(tpe, cont,params={}):
     urllist=[]
     # clogger.debug("process :" + str(cont))
     if "article_links" in cont:
@@ -55,7 +55,12 @@ def do_process(tpe, cont):
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
 
-    if "next_page" in cont and cont["next_page"] is not None:
+    if params.has_key("nofollow") and params["nofollow"]:
+        nofollow=True
+    else:
+        nofollow=False
+
+    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
         process_url(cont["next_page"],tpe, cont["url"])
 
     if "article" in cont:
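
Note: dict.has_key() exists only in Python 2 and raises AttributeError on Python 3, and the mutable default argument params={} is shared across calls. The committed code works on Python 2; if this module ever moves to Python 3, an equivalent spelling of the nofollow check would look roughly like this (a sketch, not the committed code):

    def do_process(tpe, cont, params=None):
        params = params or {}
        nofollow = bool(params.get("nofollow", False))

        # ... article_links / index_links handling unchanged ...

        if "next_page" in cont and cont["next_page"] is not None and not nofollow:
            process_url(cont["next_page"], tpe, cont["url"])
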
@@ -29,32 +29,36 @@ def work_compile():
         run_compile()
 
 
-def queue_url(tpe, url):
-    fetch_queue.put((0,tpe,url))
+def queue_url(tpe, url,params={}):
+    fetch_queue.put((0,tpe,url,params))
 
+#param nofollow = True : Don't follow pagination recursivly to only fetch an update
+def queue_url_upd(tpe, url,params={"nofollow": True}):
+    fetch_queue.put((0,tpe,url,params))
+
 
 # fetch a page from the url list
 def run_fetch():
-    tc, tpe, url = fetch_queue.get()
+    tc, tpe, url,params = fetch_queue.get()
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
         rw="<p> dummytext</p>"
-    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
+    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},params))
     return rw
     # fetch_queue.task_done()
 
 #comile something from the compile list
 def run_compile():
-    tc,tpe,h = compile_queue.get()
+    tc,tpe,h,params = compile_queue.get()
     h=do_compile(tpe,h)
-    process_queue.put((0,tpe, h))
+    process_queue.put((0,tpe, h,params))
     return h
     # compile_queue.task_done()
 
 def run_process():
-    tc,tpe,h = process_queue.get()
-    do_process(tpe, h)
+    tc,tpe,h,params = process_queue.get()
+    do_process(tpe, h,params)
     return h
     # process_queue.task_done()
 
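
Note: every queue item is now a 4-tuple (priority, tpe, payload, params), and params travels unchanged from queue_url()/queue_url_upd() through fetch_queue, compile_queue and process_queue into do_process(). One caveat in the unchanged context lines: tpe is not "dummyarticle" compares object identity rather than equality and only happens to work because of CPython string interning; a membership test is the reliable spelling, e.g. (sketch, not part of this commit):

    if tpe not in ("dummyarticle", "dummyindex"):
        rw = fetch_page(url)
    else:
        rw = "<p> dummytext</p>"
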
@@ -16,7 +16,7 @@ import mworker
 
 from compiler import do_compile
 from fetching import fetch_page
-from .controller import urls_test, start_workers, urls_que, url_add
+from .controller import urls_test, start_workers, urls_que, url_add, urls_que_upd
 
 #flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)
 
@@ -87,11 +87,20 @@ def urls_json(id):
 @compiler_pages.route("/urls/<int:id>/que.json")
 def urls_que_json(id):
     # Lade Alle Urls
-    cu=urls_que(id)
+    cu=urls_que_upd(id)
     cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
     return jsonify(urls=cu, cache=cc)
 
 
+# que an existing CrawlUrl for fetching
+@compiler_pages.route("/urls/<int:id>/que_all")
+@compiler_pages.route("/urls/<int:id>/que_all.json")
+def urls_queall_json(id):
+    # Lade Alle Urls
+    cu=urls_que(id)
+    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
+    return jsonify(urls=cu, cache=cc)
+
 @compiler_pages.route("/urls/que.lst")
 def urls_que_lst():
     # Lade Alle Urls
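
Note: with this commit, GET /urls/<id>/que.json queues an update-only fetch via urls_que_upd() (nofollow), while the new /urls/<id>/que_all and /urls/<id>/que_all.json routes keep the previous full-crawl behaviour via urls_que(). A rough usage sketch; the host, port and blueprint mount point are assumptions, and the id 7 is illustrative:

    import requests

    base = "http://localhost:5000"

    # refresh a single known URL without following its pagination
    requests.get(base + "/urls/7/que.json")

    # re-queue the URL and also follow next_page links recursively
    requests.get(base + "/urls/7/que_all.json")
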