fachschaften/compiler/compiler.py

from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
from src import clogger, cfg
from fixing import fix_link
import feedparser

#from fetching import downloadfile
import json
def do_compile(tpe, cont):
    """Run the compiler registered for *tpe* on a fetched object.

    tpe  -- key into the module-level ``compiler`` registry.
    cont -- dict holding at least "url" and "raw" (the fetched content).

    Returns the dict produced by the selected compiler.  When *cont* is not
    a dict, has no "url", or no compiler is registered for *tpe*, the
    problem is logged (for the first two cases) and *cont* is returned
    unchanged.
    """
    if not isinstance(cont, dict):
        # Do not index into cont here: it is not a dict, so cont["url"]
        # would raise and hide the message we are trying to log.
        clogger.error("Type Error for do compile for :" + repr(cont))
        return cont
    # Starting to compile a generic object
    if "url" not in cont:
        clogger.error("no url can't compile " + str(tpe))
        return cont
    clogger.debug("compile: type:" + str(tpe) + "| " + str(cont["url"]))
    if tpe in compiler:
        cont = compiler[tpe](cont["url"], cont["raw"])
    return cont

from comp import rssfeed
from comp import fbfeed
def dummyarticle(url, raw):
    """Wrap *raw* in a placeholder article record with fixed dummy metadata.

    Returns a dict with the source "url" and an "article" dict whose "text"
    is *raw*; every other article field is a constant.
    """
    article = {
        "url": url,
        "section": "dummysection",
        "sourcetype": "dummy",
        "title": "dummytitle",
        "text": raw,
        "image": "fff",
        "author": "me",
        "published": None,
    }
    return {"url": url, "article": article}


def htufeed(url, raw):
    """Compile a feed document into an index of article links.

    *raw* is handed to ``feedparser.parse``; the "link" of every entry is
    collected in feed order.  The result never has a next page.
    """
    feed = feedparser.parse(raw)
    links = [entry['link'] for entry in feed['entries']]
    return {
        "url": url,
        "next_page": None,
        "article_links": links,
        "objecttype": "index",
    }


def htuarticle(url, raw):
    """Compile an HTU wiki page into an article dict.

    url -- address the page was fetched from; stored as the article url.
    raw -- HTML source of the page.

    Returns ``{"article": d}``.  ``d`` always has "title", "text",
    "section" ("HTU") and "url"; "image"/"image2", "published" and
    "author" are only present when the matching markup exists.  Missing
    title or body markup yields empty strings instead of an exception.
    """
    sp = BeautifulSoup(raw)
    d = {}

    og_image = sp.find("meta", {"property": "og:image"})
    if og_image is not None:
        d["image"] = og_image.attrs["content"]
        d["image2"] = d["image"]

    rev_info = sp.find("div", {"class": "patternRevInfo"})
    if rev_info is not None:
        # Rewrite "... - <day> <month> <year> - <time> ..." into
        # "<year>/<month>/<day> <time>" before handing it to dateutil.
        stamp = re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4', unicode(rev_info.text.strip()))
        d["published"] = parse(stamp)
        # The author link lives inside the revision info block, so it can
        # only be looked up when that block exists.
        author_link = rev_info.find("a")
        if author_link is not None:
            d["author"] = author_link.text.strip()

    topic = sp.find("div", {"class": "foswikiTopic"})
    heading = topic.find("h4") if topic is not None else None
    if heading is not None:
        d["title"] = heading.text.strip()
        heading.extract()  # remove the heading so it is not repeated in the text
    else:
        # No in-body heading: fall back to the page's WEBTOPIC meta tag.
        topic_meta = sp.find("meta", {"name": "WEBTOPIC"})
        if topic_meta is not None:
            d["title"] = topic_meta.attrs.get("content", "")
        else:
            d["title"] = ""

    if topic is not None:
        d["text"] = (topic.encode_contents()).strip()
    else:
        d["text"] = ""
    d["section"] = "HTU"
    d["url"] = url
    return {"article": d}


def fetarticle(url, raw):
    """Compile a FET news page (schema.org ``itemprop`` markup) into an article dict.

    url -- address the page was fetched from; stored as the article url.
    raw -- HTML source of the page.

    Returns ``{"article": d}``.  ``d`` always has "title", "text" and
    "url" (title/text are empty strings when their markup is missing);
    "author", "section", "published" and "image"/"image2" are only set
    when the corresponding element is present.
    """
    sp = BeautifulSoup(raw)
    d = {}

    name = sp.find("h1", {"itemprop": "name"})
    # A page without the headline element must not abort the whole compile.
    d["title"] = unicode(name.text).strip() if name is not None else ""

    body = sp.find("div", {"itemprop": "articleBody"})
    if body is not None:
        d["text"] = (body.encode_contents()).strip()
    else:
        d["text"] = ""
    d["url"] = url

    author = sp.find("span", {"itemprop": "author"})
    if author is not None:
        d["author"] = author.text.strip()

    section = sp.find("span", {"itemprop": "articleSection"})
    if section is not None:
        d["section"] = "FET - " + section.text.strip()

    published = sp.find("span", {"itemprop": "datePublished"})
    if published is not None:
        # Reorder "dd.mm.yyyy" into "yyyy/mm/dd" so dateutil cannot
        # confuse day and month.
        stamp = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(published.text).strip())
        d["published"] = parse(stamp.strip())

    og_image = sp.find("meta", {"property": "og:image"})
    if og_image is not None:
        d["image"] = og_image.attrs["content"]
        d["image2"] = d["image"]
    return {"article": d}


def fsarcharticle(url, raw):
    """Compile a fsarch page into an article dict.

    url -- address the page was fetched from; stored as the article url.
    raw -- HTML source of the page.

    Returns ``{"article": d}``.  The title is taken from the
    ``h1.title`` inside ``<article>``, falling back to the first
    ``h1.title`` anywhere on the page, and finally to "".  "text" is only
    set when ``<article>`` contains a ``div.content``.  "published" and
    "author" are always None and "image" is always "".
    """
    sp = BeautifulSoup(raw)
    d = {"url": url, "published": None}

    page_heading = sp.find("h1", {"class": "title"})
    article = sp.find("article")
    heading = None
    if article is not None:
        content = article.find("div", {"class": "content"})
        # An article without a content div simply has no text.
        if content is not None:
            d["text"] = content.encode_contents().strip()
        heading = article.find("h1", {"class": "title"})
    if heading is None:
        # Keep the page-wide heading as fallback instead of discarding it.
        heading = page_heading
    d["title"] = heading.text.strip() if heading is not None else ""

    d["image"] = ""
    d["sourcetype"] = "fsarcharticle"
    d["section"] = "fsarch"
    d["author"] = None
    return {"article": d}

def fsbizarticle(url, raw):
    """Compile a fsbiz blog post into an article dict.

    url -- address the page was fetched from; stored as the article url.
    raw -- HTML source of the page.

    Returns ``{"article": d}``.  "title" and "text" are only set when
    ``h1.entry-title`` / ``div.entry-content`` exist.  "published" is the
    parsed ``datetime`` attribute of ``time.entry-date`` or None;
    "author" is the text of the link inside ``span.author`` or None.
    """
    sp = BeautifulSoup(raw)
    d = {}

    heading = sp.find("h1", {"class": "entry-title"})
    if heading is not None:
        d["title"] = heading.text.strip()
    d["url"] = url

    stamp = sp.find("time", {"class": "entry-date"})
    # A <time> element without a datetime attribute counts as "no date".
    when = stamp.attrs.get("datetime") if stamp is not None else None
    d["published"] = parse(when) if when is not None else None

    content = sp.find("div", {"class": "entry-content"})
    if content is not None:
        d["text"] = content.encode_contents().strip()
    d["image"] = ""
    d["sourcetype"] = "fsbizarticle"
    d["section"] = "fsbiz"

    d["author"] = None
    author = sp.find("span", {"class": "author"})
    if author is not None:
        author_link = author.find("a")
        # The author span is not guaranteed to wrap a link.
        if author_link is not None:
            d["author"] = author_link.text.strip()
    return {"article": d}

def fetindex(url, raw):
    """Compile a FET news listing page into an index dict.

    url -- address the listing was fetched from.
    raw -- HTML source of the listing.

    Returns a dict with "url", "next_page" (the link inside
    ``li.next_page`` resolved with ``fix_link``, or None), "article_links"
    (the href of every link inside ``ul#neuigkeiten``, as found in the
    page) and "objecttype" set to "index".

    Raises ValueError when *raw* is None.
    """
    if raw is None:
        raise ValueError("fetindex: no content to compile for %r" % (url,))
    html = BeautifulSoup(raw)

    next_link = None
    pager = html.find("li", {"class": "next_page"})
    if pager is not None:
        anchor = pager.find("a")
        # The pager item can be rendered without a usable link.
        if anchor is not None and "href" in anchor.attrs:
            next_link = fix_link(url, anchor.attrs["href"])

    article_links = []
    news = html.find("ul", {"id": "neuigkeiten"})
    if news is not None:
        # href=True skips anchors that carry no href attribute.
        article_links = [a.attrs["href"] for a in news.find_all("a", href=True)]
    return {"url": url, "next_page": next_link, "article_links": article_links, "objecttype": "index"}

def fsarchindex(url, raw):
    """Compile the fsarch start page into an index dict.

    url -- address the page was fetched from.
    raw -- HTML source of the page.

    Every link with an href inside the first ``<article>`` is inspected:
    hrefs matching ``fachschaftarchitektur.at`` go to "article_links",
    hrefs matching ``facebook.com/events`` go to "facebook_links".
    "url" in the result is the *url* argument, "next_page" is always
    None and "objecttype" is "index".

    Raises ValueError when *raw* is None.
    """
    if raw is None:
        raise ValueError("fsarchindex: no content to compile for %r" % (url,))
    html = BeautifulSoup(raw)
    article = html.find("article")

    article_links = []
    facebook_links = []
    # A page without <article> yields an empty index instead of crashing.
    anchors = article.find_all("a", href=True) if article is not None else []
    for anchor in anchors:
        # Use a separate name here: reusing `url` would overwrite the
        # index address that is reported back in the result.
        href = anchor.attrs["href"]
        if re.search(r"fachschaftarchitektur\.at", href):
            article_links.append(href)
        if re.search(r"facebook\.com/events", href):
            facebook_links.append(href)

    return {"url": url, "next_page": None, "article_links": article_links, "facebook_links": facebook_links, "objecttype": "index"}


def fsbizindex(url, raw):
    """Compile the fsbiz blog listing into an index dict.

    url -- address the listing was fetched from.
    raw -- HTML source of the listing.

    Returns a dict with "url", "article_links" (the href of the link in
    each ``h1.entry-title`` inside ``section#primary``) and "objecttype"
    set to "index".  Headings without a link are skipped.

    Raises ValueError when *raw* is None.
    """
    if raw is None:
        raise ValueError("fsbizindex: no content to compile for %r" % (url,))
    # Trace through the project logger rather than writing to stdout.
    clogger.debug("compile_fsbizindex")
    html = BeautifulSoup(raw)
    primary = html.find("section", {"id": "primary"})

    article_links = []
    headings = primary.find_all("h1", {"class": "entry-title"}) if primary is not None else []
    for heading in headings:
        anchor = heading.find("a")
        if anchor is not None and "href" in anchor.attrs:
            article_links.append(anchor.attrs["href"])
    return {"url": url, "article_links": article_links, "objecttype": "index"}


def fsmbindex(url, raw):
    """Compile a FSMB news listing, which carries the articles inline.

    url -- address the listing was fetched from.
    raw -- HTML source of the listing.

    Returns a dict with "url", "next_page" (href of ``a.next`` or None),
    "articles" and "objecttype" set to "articles".  One article dict is
    built per ``div.block`` inside
    ``div#main > div.inside > div.mod_newslist``; each has "info" (text
    of its direct ``p.info`` children) and "section" ("FSMB"), plus
    "title", "text", "published" and "author" when the matching markup
    is present.  "articles" is an empty list when the news list
    container cannot be found.

    Raises ValueError when *raw* is None.
    """
    if raw is None:
        raise ValueError("fsmbindex: no content to compile for %r" % (url,))
    html = BeautifulSoup(raw)

    pager = html.find("a", {"class": "next"})
    next_page = pager.attrs.get("href") if pager is not None else None

    # Walk down to the news list one container at a time so that a missing
    # level ends the search instead of raising on None.
    newslist = html
    for attrs in ({"id": "main"}, {"class": "inside"}, {"class": "mod_newslist"}):
        newslist = newslist.find("div", attrs)
        if newslist is None:
            break

    articles = []
    blocks = newslist.find_all("div", {"class": "block"}) if newslist is not None else []
    for block in blocks:
        entry = {}
        heading = block.find("h3")
        if heading is not None:
            entry["title"] = heading.text.strip()
        body = block.find("div", {"class": "ce_text"})
        if body is not None:
            entry["text"] = (body.encode_contents()).strip()
        entry["info"] = []
        for info in block.find_all("p", {"class": "info"}, recursive=False):
            info_text = unicode(info.text)
            entry["info"].append(info_text)
            # The info line containing "von" carries both date and author.
            if re.search(r'von', str(info)):
                # Reorder "dd.mm.yyyy" into "yyyy/mm/dd" for dateutil.
                stamp = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', info_text)
                entry["published"] = parse(stamp.strip())
                entry["author"] = re.sub(r'^.*von(.*)$', r'\1', info_text).strip()
        entry["section"] = "FSMB"
        articles.append(entry)
    return {"url": url, "next_page": next_page, "articles": articles, "objecttype": "articles"}

# Built-in registry mapping a type name to the callable that compiles it.
# NOTE(review): this literal is rebound two statements below and therefore
# never used as the registry; it also has no "fsbizarticle" entry although
# article_types refers to that name.
compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

# The effective registry comes from the configuration. Each configured value
# is evaluated as a Python expression in this module's namespace and stored
# back under the same key, so the object behind cfg.compiler is modified too.
# NOTE(review): eval() executes whatever the configuration contains -- only
# safe while cfg is fully trusted; presumably the values are names of the
# functions defined above (TODO confirm against the config file).
compiler = cfg.compiler
for i in compiler:
    compiler[i]=eval(compiler[i])


# Index type -> article type. Presumably used by callers to choose the
# compiler for links found in an index; the usage is not visible in this file.
article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}