from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime
import re
import urllib.parse as urlparse

from src import clogger, cfg
from src.compiler.fixing import fix_link

# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
|
|
|
|
# h=html.find("h2", {"class":"item-page-title"})
|
|
# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
|
|
|
|
# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
|
|
# d["published"]=parse(h1.strip())
|
|
# d["text"]=h.encode_contents().strip()
def fscharticle(url, raw, params=None):
    """Parse a Fachschaft Chemie article page into the shared article dict."""
    if params is None:
        params = {}
    if raw is None:
        raise ValueError("fscharticle: no page content for %s" % url)

    html = BeautifulSoup(raw, "html.parser")
    d = {}

    # Title from the page heading.
    h = html.find("h2", {"class": "item-page-title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["author"] = None

    # Article body container: drop the duplicated heading, then take the
    # first image in the body (empty string if there is none).
    h = html.find("div", {"class": "item-page"})
    if h is not None:
        h2 = h.find("h2", {"class": "item-page-title"})
        if h2 is not None:
            h2.extract()
        # d["text"] = h.encode_contents().strip()
        h2 = h.find("img")
        if h2 is not None:
            d["image"] = h2.attrs["src"]
        else:
            d["image"] = ""

    # Author and publication date come from the feed entry that linked to
    # this page, if the caller passed it along.
    if "parent_item" in params:
        pi = params["parent_item"]
        if "author_detail" in pi:
            d["author"] = pi["author_detail"]
        if "published" in pi:
            d["published"] = parse(pi["published"])
        d["pi"] = pi

    d["sourcetype"] = "fscharticle"
    d["section"] = "Fachschaft Chemie"
    d["url"] = url
    return {"article": d}
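
# Minimal manual check (a sketch): the HTML file and URL below are
# placeholders, not part of the project.
if __name__ == "__main__":
    with open("sample_article.html", "rb") as f:
        page = f.read()
    result = fscharticle(
        "https://example.org/artikel.html",  # placeholder URL
        page,
        {"parent_item": {"published": "2024-01-01T12:00:00Z"}},
    )
    art = result["article"]
    print(art.get("title"), art.get("published"), art.get("image"))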