from src import clogger  # Logger for the crawler
from src.models import Article  # Article model
from datetime import datetime
from src.database import db_session
from sqlalchemy.exc import InvalidRequestError  # used in process_article; assumes db_session is a SQLAlchemy session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
#from src import app

from compiler import article_types

# process_article expects a hash (dict) with the raw data for one article and stores it
# as an Article object in the database; building the object via Article.from_hash is
# intended to prevent duplicates.

def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h

def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)  # expected to return the matching Article, new or already stored
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        try:
            db_session.commit()
        except InvalidRequestError as e:
            db_session.rollback()
            clogger.error(e)
        clogger.info("Updated/Added Article " + str(aa.id) + ": " + aa.title.encode("utf-8"))
    return aa
    # app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
    # db_session.close()
    # announce_articleid(aa.id)
    #
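
# Illustrative example (documentation only, not used anywhere by the crawler): the
# minimal shape of a raw article hash accepted by is_article_hash() and
# process_article(). The key names come from the checks above; all values are
# made-up placeholders.
_EXAMPLE_ARTICLE_HASH = {
    "text": "<p>raw article body</p>",       # raw HTML, cleaned via fix_html()
    "url": "http://example.org/article/1",   # source url, also used to resolve links and files
    "sourcetype": "rss",                     # placeholder; copied onto Article.sourcetype
    "section": "politics",                   # placeholder section name
    # optional: "image": "/img/teaser.jpg"   # would be resolved against "url" via fix_file()
}
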
# process a single found url
def process_url(url, tpe, parent_url, params=None):
    #clogger.debug("process URL of type "+ tpe + ": " + url)
    if params is None:  # avoid the shared mutable default argument
        params = {}
    if parent_url is not None:
        url = fix_link(parent_url, url)  # resolve the url relative to the page it was found on
    params.update({"nofollow": False})
    put_fetch_queue((0, tpe, url, params))  # queue entry: (priority, type, url, params)

# process a url list
def process_urllist(urllist, tpe, parent_url, params=None):
    if params is None:  # avoid the shared mutable default argument
        params = {}
    for u in urllist:
        if isinstance(u, basestring):
            process_url(u, tpe, parent_url, params)
        elif isinstance(u, tuple):
            params.update({"parent_item": u[1]})  # entries may be (url, parent_item) tuples
            process_url(u[0], tpe, parent_url, params)
        else:
            clogger.error("url has wrong type: " + str(type(u)))
# process the compiled content hash of one fetched page
def do_process(tpe, cont, params=None):
    if params is None:  # avoid the shared mutable default argument
        params = {}
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"], params)
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])

    if "nofollow" in params and params["nofollow"]:
        nofollow = True
    else:
        nofollow = False

    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
        process_url(cont["next_page"], tpe, cont["url"])

    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)

    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
    return
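
# Illustrative sketch (documentation only; every value is a placeholder): the keys
# do_process() dispatches on in a compiled content hash. "url" is used to resolve
# relative links and as the fallback article url; the remaining keys are optional
# and handled independently.
#
#   cont = {
#       "url": "http://example.org/index.html",         # page the content was compiled from
#       "article_links": [...],                         # urls of single articles, queued with article_types[tpe]
#       "index_links": [...],                           # urls of further index pages, queued with the same type
#       "next_page": "http://example.org/index2.html",  # followed unless params["nofollow"] is set
#       "article": {...},                               # one raw article hash (see process_article)
#       "articles": [{...}, ...],                       # several raw article hashes, each needs a "title"
#   }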