from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from fetching import downloadfile
import bleach


def fix_link(url, link):
    # Resolve relative links against the page URL; leave absolute URLs alone.
    r = urlparse(link)
    if not r.scheme:
        return urljoin(url, link)
    return link


def fix_file(url, link):
    # Resolve the link, then download it and return the local reference.
    u = fix_link(url, link)
    return downloadfile(u)


def load_file(url, link):
    return fix_file(url, link)


def fix_html(html, baseurl):
    # Keep only a small whitelist of tags; everything else is stripped.
    html = bleach.clean(
        html,
        tags=['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong', 'ul', 'li'],
        strip=True,
    )
    sp = BeautifulSoup(html, "html.parser")

    # Download images and point their src attributes at the local copies.
    for t in sp.find_all("img"):
        if t.attrs.get("src"):
            t.attrs["src"] = fix_file(baseurl, t.attrs["src"])

    # Rewrite relative hrefs so they still resolve outside the original site.
    for t in sp.find_all("a"):
        if "href" in t.attrs:
            t.attrs["href"] = fix_link(baseurl, t.attrs["href"])

    # Drop any script tags that survived sanitization.
    for t in sp.find_all("script"):
        t.extract()

    # Neutralize a <base> tag so it no longer overrides relative URLs.
    b = sp.find("base")
    if b is not None:
        b.attrs["href"] = ""

    return sp
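

# Minimal usage sketch, not part of the original module. Assumptions:
# the "requests" package is available, fetching.downloadfile returns a
# string usable as a src/href value, and the page URL below is a placeholder.
if __name__ == "__main__":
    import requests

    page_url = "https://example.com/article.html"
    resp = requests.get(page_url)
    cleaned = fix_html(resp.text, page_url)
    # str() serializes the BeautifulSoup tree back to HTML.
    print(str(cleaned))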