init commit

This commit is contained in:
Andreas Stephanides
2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions

37
compiler/fixing.py Normal file
View File

@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach
def fix_link(url, link):
    """Return an absolute form of *link*, using *url* as the base.

    A link that already carries a URL scheme (``http:``, ``mailto:``, ...)
    is returned unchanged; anything else is resolved against *url* with
    ``urljoin``.
    """
    scheme = urlparse(link).scheme
    if scheme not in (None, ''):
        # Already has a scheme: leave it alone.
        return link
    return urljoin(url, link)
def fix_file(url, link):
    """Resolve *link* against *url* and pass the result to ``downloadfile``.

    Returns whatever ``downloadfile`` returns for the resolved URL.
    """
    return downloadfile(fix_link(url, link))
def load_file(url, link):
    """Delegate to ``fix_file`` with the same arguments and return its result."""
    fetched = fix_file(url, link)
    return fetched
def fix_html(html, baseurl):
    """Sanitize an HTML fragment and rewrite its image and link URLs.

    The markup is first reduced by ``bleach.clean`` to a small tag allowlist
    (disallowed tags are stripped rather than escaped) and then parsed with
    BeautifulSoup.  Each ``<img src>`` is replaced by the result of
    ``fix_file`` and each ``<a href>`` by the result of ``fix_link``, both
    called with *baseurl* as the base.

    Args:
        html: markup to clean.
        baseurl: URL that relative ``src``/``href`` values are resolved
            against.

    Returns:
        The BeautifulSoup object holding the cleaned, rewritten markup.
    """
    allowed_tags = ['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong', 'ul', 'li']
    # bleach's default attribute allowlist only covers a/abbr/acronym, so
    # without an explicit entry every <img> loses its "src" during cleaning
    # and the image loop below never has anything to rewrite.
    allowed_attributes = {'a': ['href', 'title'], 'img': ['src']}
    html = bleach.clean(html, tags=allowed_tags, attributes=allowed_attributes, strip=True)

    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed and the resulting tree can differ between environments.
    # Left unchanged because callers may rely on the current output shape.
    sp = BeautifulSoup(html)

    for img in sp.find_all("img"):
        src = img.attrs.get("src")
        # Skip missing/empty sources: an empty src would resolve to baseurl
        # itself and hand the page, not an image, to fix_file.
        if src:
            img.attrs["src"] = fix_file(baseurl, src)

    for anchor in sp.find_all("a"):
        if "href" in anchor.attrs:
            anchor.attrs["href"] = fix_link(baseurl, anchor.attrs["href"])

    # <script> and <base> are not in the tag allowlist, so bleach should
    # already have stripped them; the two steps below are kept as a second
    # line of defence.
    for script in sp.find_all("script"):
        script.extract()
    base = sp.find("base")
    if base is not None:
        base.attrs["href"] = ""
    return sp