from src import clogger                 # Logger for crawler
from src.models import Article          # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
from compiler import article_types


# process_article expects a hash with the raw data for one article and puts it
# into an Article object stored in the database; it is intended to prevent
# duplicates.
def is_article_hash(h):
    # A valid article hash carries the raw text plus the metadata needed later.
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h


def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.info("Updated/Added Article " + str(aa.id) + ": " + aa.title)
        # announce_articleid(aa.id)
    return aa


# Process a single found URL: resolve it against its parent page and queue it
# for fetching.
def process_url(url, tpe, parent_url):
    # clogger.debug("process URL of type " + tpe + ": " + url)
    if parent_url is not None:
        url = fix_link(parent_url, url)
    put_fetch_queue((0, tpe, url, {"nofollow": False}))


# Process a list of URLs.
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u, tpe, parent_url)


def do_process(tpe, cont, params=None):
    if params is None:
        params = {}
    # clogger.debug("process: " + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    nofollow = bool(params.get("nofollow", False))
    if "next_page" in cont and cont["next_page"] is not None and not nofollow:
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
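

# ---------------------------------------------------------------------------
# Illustrative sketch of how do_process() can be driven, assuming the
# surrounding project provides a working db_session, put_fetch_queue and
# article_types mapping. The source-type key "news" and the URLs below are
# hypothetical and only serve as an example payload.
if __name__ == "__main__":
    # Content hash roughly as a compiler/parser step would hand it over.
    cont = {
        "url": "http://example.org/index",
        # Links to individual articles; each gets queued for fetching with the
        # article type looked up in article_types.
        "article_links": ["/story-1", "/story-2"],
        # Pagination link; followed unless params["nofollow"] is set.
        "next_page": "/index?page=2",
        # An already-parsed article; "sourcetype" is filled in by do_process.
        "article": {
            "text": "<p>raw html</p>",
            "url": "http://example.org/story-0",
            "section": "frontpage",
        },
    }
    do_process("news", cont, params={"nofollow": False})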