from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from fetching import downloadfile
import bleach


def fix_link(url, link):
    # Resolve relative links against the page URL; leave absolute URLs alone.
    r = urlparse(link)
    if not r.scheme:
        return urljoin(url, link)
    return link


def fix_file(url, link):
    # Resolve the link, then download it and return the local reference.
    u = fix_link(url, link)
    return downloadfile(u)


def load_file(url, link):
    return fix_file(url, link)


def fix_html(html, baseurl):
    # Keep only a small whitelist of tags; everything else is stripped.
    html = bleach.clean(
        html,
        tags=['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong', 'ul', 'li'],
        strip=True,
    )
    sp = BeautifulSoup(html, "html.parser")

    # Download images and point their src attributes at the local copies.
    for t in sp.find_all("img"):
        if t.attrs.get("src"):
            t.attrs["src"] = fix_file(baseurl, t.attrs["src"])

    # Rewrite relative hrefs so they still resolve outside the original site.
    for t in sp.find_all("a"):
        if "href" in t.attrs:
            t.attrs["href"] = fix_link(baseurl, t.attrs["href"])

    # Drop any script tags that survived sanitization.
    for t in sp.find_all("script"):
        t.extract()

    # Neutralize a <base> tag so it no longer overrides relative URLs.
    b = sp.find("base")
    if b is not None:
        b.attrs["href"] = ""

    return sp
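

# Minimal usage sketch, not part of the original module. Assumptions:
# the "requests" package is available, fetching.downloadfile returns a
# string usable as a src/href value, and the page URL below is a placeholder.
if __name__ == "__main__":
    import requests

    page_url = "https://example.com/article.html"
    resp = requests.get(page_url)
    cleaned = fix_html(resp.text, page_url)
    # str() serializes the BeautifulSoup tree back to HTML.
    print(str(cleaned))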