from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime
import re
from urllib.parse import urlparse

from src import clogger, cfg
from src.compiler.fixing import fix_link

# Keys of the returned article dict:
# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]

# Earlier date-parsing attempts, kept for reference:
# h = html.find("h2", {"class": "item-page-title"})
# h1 = re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4', h.text.strip())
# h1 = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', h.text.strip())
# d["published"] = parse(h1.strip())
# d["text"] = h.encode_contents().strip()


def fscharticle(url, raw, params={}):
    """Parse a Fachschaft Chemie article page into an article dict."""
    if raw is None:
        raise ValueError("fscharticle: no raw HTML given for %s" % url)

    html = BeautifulSoup(raw, "html.parser")
    d = {}

    # Title: taken from the <h2 class="item-page-title"> heading.
    h = html.find("h2", {"class": "item-page-title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["author"] = None

    # Body container: <div class="item-page">. Drop the duplicated title
    # heading and use the first image as the article image.
    h = html.find("div", {"class": "item-page"})
    if h is not None:
        h2 = h.find("h2", {"class": "item-page-title"})
        if h2 is not None:
            h2.extract()
        # d["text"] = h.encode_contents().strip()
        h2 = h.find("img")
        if h2 is not None:
            d["image"] = h2.attrs.get("src", "")
        else:
            d["image"] = ""

    # Author and publication date come from the parent feed item, if provided.
    if "parent_item" in params:
        pi = params["parent_item"]
        if "author_detail" in pi:
            d["author"] = pi["author_detail"]
        if "published" in pi:
            d["published"] = parse(pi["published"])
        d["pi"] = pi

    d["sourcetype"] = "fscharticle"
    d["section"] = "Fachschaft Chemie"
    return {"article": d}
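

# Minimal usage sketch, assuming the page is fetched with `requests` (the
# project's own fetch layer is not shown here). The URL and the
# `parent_item` dict below are hypothetical stand-ins for an upstream
# feed entry with "author_detail" and "published" keys, which is what the
# lookups above expect.
if __name__ == "__main__":
    import requests

    test_url = "https://example.org/fachschaft/article.html"  # hypothetical URL
    resp = requests.get(test_url, timeout=10)
    parent_item = {
        "author_detail": "Fachschaft Chemie",
        "published": "2015-03-01T12:00:00Z",
    }
    result = fscharticle(test_url, resp.text, {"parent_item": parent_item})
    print(result["article"].get("title"), result["article"].get("image"))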