fachschaften/compiler/mprocess.py
Andreas Stephanides 80c42c04cd fetch all fb
2017-02-17 09:58:35 +01:00

from src import clogger # Logger for crawler
from src.models import Article # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
from sqlalchemy.exc import InvalidRequestError
from compiler import article_types
# process_article expects a hash with raw data for the article and stores it
# in an Article object in the database; it is intended to prevent duplicates.
def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h
def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        # normalize the raw HTML and any attached image relative to the source url
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        try:
            db_session.commit()
        except InvalidRequestError as e:
            db_session.rollback()
            clogger.error(e)
        clogger.info("Updated/Added Article " + str(aa.id) + ": " + aa.title.encode("utf-8"))
    return aa
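# Hedged usage sketch for process_article: the optional "image" key and all
# values here are assumptions; the required keys mirror is_article_hash above.
#   art = {"text": raw_html, "url": page_url, "sourcetype": "news",
#          "section": "events", "image": "img/logo.png"}
#   article = process_article(art)  # returns the Article, or None for an invalid hash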
# process a single found url
def process_url(url, tpe, parent_url, params=None):
    # clogger.debug("process URL of type " + tpe + ": " + url)
    # avoid a shared mutable default argument; the dict is mutated below
    if params is None:
        params = {}
    if parent_url is not None:
        url = fix_link(parent_url, url)
    params.update({"nofollow": False})
    put_fetch_queue((0, tpe, url, params))
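# Hedged example of the resulting queue entry, assuming fix_link resolves the
# relative url against parent_url; the type name "index" is an assumption:
#   process_url("page2.html", "index", "http://example.org/news/")
#   # roughly equivalent to:
#   # put_fetch_queue((0, "index", "http://example.org/news/page2.html",
#   #                  {"nofollow": False}))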
# process a url list
def process_urllist(urllist, tpe, parent_url, params=None):
    if params is None:
        params = {}
    for u in urllist:
        if isinstance(u, basestring):
            process_url(u, tpe, parent_url, params)
        elif isinstance(u, tuple):
            # (url, parent_item) tuples carry the parent item along in params
            params.update({"parent_item": u[1]})
            process_url(u[0], tpe, parent_url, params)
        else:
            clogger.error("url has wrong type: " + str(type(u)))
def do_process(tpe, cont, params=None):
    if params is None:
        params = {}
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"], params)
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    nofollow = bool(params.get("nofollow"))
    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
    db_session.remove()
    return
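# Hedged sketch of a cont hash as consumed by do_process; which of these keys
# a given parser actually emits, and the type name "newsindex", are assumptions:
#   cont = {"url": "http://example.org/news/",
#           "article_links": ["http://example.org/news/1"],
#           "next_page": "http://example.org/news/?page=2",
#           "articles": [{"title": "t", "text": "<p>...</p>", "section": "events"}]}
#   do_process("newsindex", cont)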