from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime
import re
import urllib.parse as urlparse

from src import clogger, cfg
from src.compiler.fixing import fix_link

# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
|
|
|
|
# h=html.find("h2", {"class":"item-page-title"})
|
|
# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
|
|
|
|
# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
|
|
# d["published"]=parse(h1.strip())
|
|
# d["text"]=h.encode_contents().strip()
def fscharticle(url, raw, params=None):
    """Parse a Fachschaft Chemie article page into the shared article dict."""
    if params is None:
        params = {}
    if raw is None:
        raise ValueError("fscharticle: no page content for %s" % url)

    html = BeautifulSoup(raw, "html.parser")
    d = {}

    # Title from the page heading.
    h = html.find("h2", {"class": "item-page-title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["author"] = None

    # Article body container: drop the duplicated heading, then take the
    # first image in the body (empty string if there is none).
    h = html.find("div", {"class": "item-page"})
    if h is not None:
        h2 = h.find("h2", {"class": "item-page-title"})
        if h2 is not None:
            h2.extract()
        # d["text"] = h.encode_contents().strip()
        h2 = h.find("img")
        if h2 is not None:
            d["image"] = h2.attrs["src"]
        else:
            d["image"] = ""

    # Author and publication date come from the feed entry that linked to
    # this page, if the caller passed it along.
    if "parent_item" in params:
        pi = params["parent_item"]
        if "author_detail" in pi:
            d["author"] = pi["author_detail"]
        if "published" in pi:
            d["published"] = parse(pi["published"])
        d["pi"] = pi

    d["sourcetype"] = "fscharticle"
    d["section"] = "Fachschaft Chemie"
    d["url"] = url
    return {"article": d}
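
# Minimal manual check (a sketch): the HTML file and URL below are
# placeholders, not part of the project.
if __name__ == "__main__":
    with open("sample_article.html", "rb") as f:
        page = f.read()
    result = fscharticle(
        "https://example.org/artikel.html",  # placeholder URL
        page,
        {"parent_item": {"published": "2024-01-01T12:00:00Z"}},
    )
    art = result["article"]
    print(art.get("title"), art.get("published"), art.get("image"))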