38 lines
1.0 KiB
Python
38 lines
1.0 KiB
Python
from bs4 import BeautifulSoup
|
|
from urlparse import urlparse, urlunparse, urljoin
|
|
from fetching import downloadfile
|
|
import bleach
|
|
|
|
def fix_link(url, link):
    """Return *link* as an absolute URL, using *url* as the base if needed.

    A link that already carries a scheme is returned unchanged; a link
    without one is resolved against *url* with ``urljoin``.
    """
    has_scheme = urlparse(link).scheme not in (None, '')
    if has_scheme:
        return link
    return urljoin(url, link)
|
|
|
|
def fix_file(url, link):
    """Resolve *link* against *url* and pass the result to ``downloadfile``.

    Returns whatever ``downloadfile`` returns for the resolved URL.
    """
    return downloadfile(fix_link(url, link))
|
|
|
|
def load_file(url, link):
    """Delegate to ``fix_file`` with the same arguments and return its result."""
    fetched = fix_file(url, link)
    return fetched
|
|
|
|
|
|
def fix_html(html, baseurl):
    """Sanitize an HTML fragment and make its images and links absolute.

    The markup is first passed through ``bleach.clean`` with a small
    whitelist of tags; anything else is stripped rather than escaped.
    Every ``<img src>`` is then replaced by the result of ``fix_file`` and
    every ``<a href>`` by the result of ``fix_link``, both resolved
    against *baseurl*.

    Args:
        html: the raw HTML text to clean.
        baseurl: URL used to resolve relative ``src`` / ``href`` values.

    Returns:
        The ``BeautifulSoup`` tree of the cleaned, rewritten markup.
    """
    allowed_tags = ['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong',
                    'ul', 'li']
    # bleach's default attribute whitelist only covers <a>, <abbr> and
    # <acronym>.  Without an explicit entry for <img>, ``src`` is stripped
    # during cleaning and the image rewriting below never runs.
    allowed_attributes = {'a': ['href', 'title'], 'img': ['src']}
    html = bleach.clean(html, tags=allowed_tags,
                        attributes=allowed_attributes, strip=True)

    # NOTE(review): no parser is named, so bs4 picks among the installed
    # ones and the resulting tree can differ between environments; consider
    # pinning one once callers' expectations are confirmed.
    sp = BeautifulSoup(html)

    for img in sp.find_all("img"):
        src = img.attrs.get("src")
        # An empty src would resolve to the page URL itself, which is not
        # an image, so leave such tags alone.
        if not src:
            continue
        img.attrs["src"] = fix_file(baseurl, src)

    for anchor in sp.find_all("a"):
        if "href" in anchor.attrs:
            anchor.attrs["href"] = fix_link(baseurl, anchor.attrs["href"])

    # <script> and <base> are not in the whitelist above, so bleach should
    # already have removed them; these passes are kept as a safety net.
    for script in sp.find_all("script"):
        script.extract()

    base = sp.find("base")
    if base is not None:
        base.attrs["href"] = ""

    return sp
|