from src import clogger                 # Logger for crawler
from src.models import Article          # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
from compiler import article_types


# process_article expects a hash with the raw data for one article and puts it
# into an Article object stored in the database; it is intended to prevent
# duplicates.
def is_article_hash(h):
    # A valid article hash carries the raw text plus the metadata needed later.
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h


def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.info("Updated/Added Article " + str(aa.id) + ": " + aa.title)
        # announce_articleid(aa.id)
    return aa


# Process a single found URL: resolve it against its parent page and queue it
# for fetching.
def process_url(url, tpe, parent_url):
    # clogger.debug("process URL of type " + tpe + ": " + url)
    if parent_url is not None:
        url = fix_link(parent_url, url)
    put_fetch_queue((0, tpe, url, {"nofollow": False}))


# Process a list of URLs.
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u, tpe, parent_url)


def do_process(tpe, cont, params=None):
    if params is None:
        params = {}
    # clogger.debug("process: " + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    nofollow = bool(params.get("nofollow", False))
    if "next_page" in cont and cont["next_page"] is not None and not nofollow:
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
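

# ---------------------------------------------------------------------------
# Illustrative sketch of how do_process() can be driven, assuming the
# surrounding project provides a working db_session, put_fetch_queue and
# article_types mapping. The source-type key "news" and the URLs below are
# hypothetical and only serve as an example payload.
if __name__ == "__main__":
    # Content hash roughly as a compiler/parser step would hand it over.
    cont = {
        "url": "http://example.org/index",
        # Links to individual articles; each gets queued for fetching with the
        # article type looked up in article_types.
        "article_links": ["/story-1", "/story-2"],
        # Pagination link; followed unless params["nofollow"] is set.
        "next_page": "/index?page=2",
        # An already-parsed article; "sourcetype" is filled in by do_process.
        "article": {
            "text": "<p>raw html</p>",
            "url": "http://example.org/story-0",
            "section": "frontpage",
        },
    }
    do_process("news", cont, params={"nofollow": False})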