que all / que

Andreas Stephanides
2017-01-24 11:25:07 +01:00
parent 5006c9fbe3
commit debf6b0ccc
5 changed files with 39 additions and 13 deletions

View File

@@ -10,6 +10,7 @@ import logging
 file_handler=logging.FileHandler(cfg.logfile)
 file_handler.setLevel(logging.DEBUG)
 stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
 clt=logging.getLogger('mylogger')
 clt.setLevel(logging.DEBUG)
@@ -21,6 +22,7 @@ clogger=clt
 download_path=cfg.download_path
 lg=clt
 from gevent import spawn, monkey
 monkey.patch_all()
 from .compiler import start_workers
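The only change here is the added stream_handler.setLevel(logging.DEBUG), which brings console output in line with the file log. A minimal sketch of how this logger setup fits together, assuming the handlers are attached to 'mylogger' elsewhere in the module (the addHandler calls and cfg.logfile are not part of the shown hunk):

import logging
import sys

logfile = "crawler.log"   # stand-in for cfg.logfile; the real path comes from the config module

file_handler = logging.FileHandler(logfile)
file_handler.setLevel(logging.DEBUG)

stream_handler = logging.StreamHandler(sys.stdout)
stream_handler.setLevel(logging.DEBUG)   # the line added by this commit: stdout now also emits DEBUG

clt = logging.getLogger('mylogger')
clt.setLevel(logging.DEBUG)
clt.addHandler(file_handler)     # assumed wiring; the addHandler calls sit outside the shown hunk
clt.addHandler(stream_handler)

clt.debug("logger wired up")     # goes to both the log file and stdout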

View File

@@ -17,6 +17,12 @@ def urls_que(id):
     mworker.queue_url(cu.tpe, cu.url)
     return cu
+def urls_que_upd(id):
+    cu=CrawlUrl.query.get(id)
+    mworker.queue_url_upd(cu.tpe, cu.url)
+    return cu
 def url_add(url,tpe):
     u=CrawlUrl.find_or_create(tpe, url)
     db_session2.add(u)
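The new urls_que_upd mirrors urls_que but routes through mworker.queue_url_upd, which (see the worker change below) tags the job so that pagination is not followed. A hedged sketch of the difference in what lands on the fetch queue, assuming a loaded CrawlUrl instance cu and the mworker module from this commit:

# Full crawl: pagination from this URL will be followed.
mworker.queue_url(cu.tpe, cu.url)        # fetch_queue item: (0, cu.tpe, cu.url, {})

# Update only: refetch just this page, ignore its "next_page" links.
mworker.queue_url_upd(cu.tpe, cu.url)    # fetch_queue item: (0, cu.tpe, cu.url, {"nofollow": True})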

View File

@@ -29,7 +29,7 @@ def process_article(art):
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
     db_session.commit()
-    clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
 #   announce_articleid(aa.id)
     return aa
@@ -47,15 +47,20 @@ def process_urllist(urllist, tpe, parent_url):
         process_url(u,tpe, parent_url)
-def do_process(tpe, cont):
+def do_process(tpe, cont,params={}):
     urllist=[]
 #   clogger.debug("process :" + str(cont))
     if "article_links" in cont:
         process_urllist(cont["article_links"], article_types[tpe], cont["url"])
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
+    if params.has_key("nofollow") and params["nofollow"]:
+        nofollow=True
+    else:
+        nofollow=False
-    if "next_page" in cont and cont["next_page"] is not None:
+    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
         process_url(cont["next_page"],tpe, cont["url"])
     if "article" in cont:
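Two portability notes on this hunk: dict.has_key() exists only in Python 2 (under Python 3 the check would be params.get("nofollow") or "nofollow" in params), and a mutable default argument like params={} is created once and shared between calls. A hedged sketch of the same pagination guard written with those points in mind, using the same helper names as the diff:

def do_process(tpe, cont, params=None):
    # None instead of a mutable {} default; the {} in the diff is shared across calls.
    params = params or {}
    nofollow = bool(params.get("nofollow"))   # Python 3 replacement for params.has_key("nofollow")
    if "next_page" in cont and cont["next_page"] is not None and not nofollow:
        process_url(cont["next_page"], tpe, cont["url"])
    # ... article_links / index_links / article handling as in the diff ...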

View File

@@ -29,32 +29,36 @@ def work_compile():
         run_compile()
-def queue_url(tpe, url):
-    fetch_queue.put((0,tpe,url))
+def queue_url(tpe, url,params={}):
+    fetch_queue.put((0,tpe,url,params))
+#param nofollow = True : Don't follow pagination recursivly to only fetch an update
+def queue_url_upd(tpe, url,params={"nofollow": True}):
+    fetch_queue.put((0,tpe,url,params))
 # fetch a page from the url list
 def run_fetch():
-    tc, tpe, url = fetch_queue.get()
+    tc, tpe, url,params = fetch_queue.get()
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
         rw="<p> dummytext</p>"
-    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
+    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw},params))
     return rw
 #   fetch_queue.task_done()
 #comile something from the compile list
 def run_compile():
-    tc,tpe,h = compile_queue.get()
+    tc,tpe,h,params = compile_queue.get()
     h=do_compile(tpe,h)
-    process_queue.put((0,tpe, h))
+    process_queue.put((0,tpe, h,params))
     return h
 #   compile_queue.task_done()
 def run_process():
-    tc,tpe,h = process_queue.get()
-    do_process(tpe, h)
+    tc,tpe,h,params = process_queue.get()
+    do_process(tpe, h,params)
     return h
 #   process_queue.task_done()
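The extra tuple element threads the params dict unchanged through all three queues (fetch, then compile, then process), so a nofollow flag set in queue_url_upd is still visible when do_process runs. Two caveats, shown in a hedged sketch of the first stage with the same fetch_queue/compile_queue/fetch_page names as the module: `is not` compares object identity rather than string value, and params={} as a default is shared between calls.

def queue_url(tpe, url, params=None):
    # The diff uses params={}; a None default avoids sharing one dict between calls.
    fetch_queue.put((0, tpe, url, params or {}))

def run_fetch():
    tc, tpe, url, params = fetch_queue.get()
    # `tpe is not "dummyarticle"` tests identity, which only happens to work for some
    # interned strings; != is the reliable comparison.
    if tpe != "dummyarticle" and tpe != "dummyindex":
        rw = fetch_page(url)
    else:
        rw = "<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}, params))
    return rw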

View File

@@ -16,7 +16,7 @@ import mworker
 from compiler import do_compile
 from fetching import fetch_page
-from .controller import urls_test, start_workers, urls_que, url_add
+from .controller import urls_test, start_workers, urls_que, url_add, urls_que_upd
 #flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)
@@ -87,11 +87,20 @@ def urls_json(id):
 @compiler_pages.route("/urls/<int:id>/que.json")
 def urls_que_json(id):
     # Lade Alle Urls
-    cu=urls_que(id)
+    cu=urls_que_upd(id)
     cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
     return jsonify(urls=cu, cache=cc)
+# que an existing CrawlUrl for fetching
+@compiler_pages.route("/urls/<int:id>/que_all")
+@compiler_pages.route("/urls/<int:id>/que_all.json")
+def urls_queall_json(id):
+    # Lade Alle Urls
+    cu=urls_que(id)
+    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
+    return jsonify(urls=cu, cache=cc)
 @compiler_pages.route("/urls/que.lst")
 def urls_que_lst():
     # Lade Alle Urls
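With this hunk the two endpoints split the behaviour: /urls/<int:id>/que.json now queues an update-only fetch via urls_que_upd, while the new /urls/<int:id>/que_all and /urls/<int:id>/que_all.json keep the original full re-crawl via urls_que. A hedged sketch of how a client might call them, assuming the blueprint is mounted at the application root on a local development server (host, port, and the example id 42 are assumptions):

import requests   # hypothetical client code, not part of the commit

BASE = "http://localhost:5000"

# Update only: refetch URL 42 itself without following its pagination.
requests.get(BASE + "/urls/42/que.json")

# Full re-crawl: queue URL 42 and follow its "next_page" links again.
requests.get(BASE + "/urls/42/que_all.json")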