init commit
This commit is contained in:
37
compiler/fixing.py
Normal file
37
compiler/fixing.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from urlparse import urlparse, urlunparse, urljoin
|
||||
from fetching import downloadfile
|
||||
import bleach
|
||||
|
||||
def fix_link(url, link):
    """Return an absolute form of ``link``.

    A link that already carries a URL scheme is returned untouched; a link
    without one is resolved against ``url`` with ``urljoin``.
    """
    scheme = urlparse(link).scheme
    # A missing scheme marks the link as relative to the page it came from.
    return urljoin(url, link) if scheme in (None, '') else link
|
||||
|
||||
def fix_file(url, link):
    """Resolve ``link`` against ``url`` and hand the result to ``downloadfile``.

    Returns whatever ``downloadfile`` returns for the resolved URL.
    NOTE(review): ``downloadfile`` comes from the ``fetching`` module; callers
    here store its result in an ``src`` attribute, so it presumably yields a
    URL or path string -- confirm against ``fetching``.
    """
    return downloadfile(fix_link(url, link))
|
||||
|
||||
def load_file(url, link):
    """Alias for ``fix_file``: forwards both arguments and returns its result."""
    fetched = fix_file(url, link)
    return fetched
|
||||
|
||||
|
||||
def fix_html(html, baseurl):
    """Sanitize an HTML fragment and rewrite its image and link targets.

    The markup is first reduced to a small tag whitelist with ``bleach``,
    then parsed with BeautifulSoup. Every ``<img src>`` is replaced by the
    result of ``fix_file`` (which resolves the URL against ``baseurl`` and
    passes it to ``downloadfile``), and every ``<a href>`` is made absolute
    with ``fix_link``.

    Args:
        html: The HTML markup to clean.
        baseurl: URL of the page the markup came from; relative references
            are resolved against it.

    Returns:
        The modified ``BeautifulSoup`` object (not a string).
    """
    allowed_tags = ['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong', 'ul', 'li']
    # bleach's default attribute whitelist only covers a/abbr/acronym, so
    # without an explicit entry every <img> would lose its ``src`` here and
    # the image loop below would never have anything to rewrite.
    allowed_attributes = {'a': ['href', 'title'], 'img': ['src']}
    html = bleach.clean(
        html,
        tags=allowed_tags,
        attributes=allowed_attributes,
        strip=True,
    )
    soup = BeautifulSoup(html)

    for image in soup.find_all("img"):
        src = image.attrs.get("src")
        if src is not None:
            image.attrs["src"] = fix_file(baseurl, src)

    for anchor in soup.find_all("a"):
        if "href" in anchor.attrs:
            anchor.attrs["href"] = fix_link(baseurl, anchor.attrs["href"])

    # <script> and <base> are not in the tag whitelist, so bleach should
    # already have stripped them; these steps are kept as a safety net.
    for script in soup.find_all("script"):
        script.extract()

    base = soup.find("base")
    if base is not None:
        base.attrs["href"] = ""

    return soup
|
||||
Reference in New Issue
Block a user