from bs4 import BeautifulSoup #import crawler.objects.models #from crawler.objects.models import Object from dateutil.parser import parse from datetime import datetime import re import urlparse from src import clogger, cfg from src.compiler.fixing import fix_link import feedparser #from fetching import downloadfile import json def do_compile(tpe, cont, params={}): if type(cont) != dict: clogger.error("Type Error for do compile for :"+str(cont["url"])) # Starting to compile an generic object if "url" not in cont: clogger.error("no url can't compile "+tpe) else: clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"])) if tpe in compiler: cont=compiler[tpe](cont["url"], cont["raw"],params) else: clogger.error("Compiler for "+tpe+" not found.") return cont from comp import rssfeed from comp import fbfeed from comp.fsch import fscharticle def dummyarticle(url, raw): return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}} def htufeed(url,raw,params={}): al=[] f=feedparser.parse(raw) for e in f['entries']: al.append(e['link']) return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"} def htuarticle(url,raw,params={}): sp=BeautifulSoup(raw) d={} h=sp.find("meta", {"property": "og:image"}) if h is not None: d["image"]=h.attrs["content"] d["image2"]=d["image"] h=sp.find("div", {"class": "patternRevInfo"}) if h is not None: # clogger.debug(h.text.strip()) h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip())) # clogger.debug(h1) d["published"]=parse(h1) # clogger.debug(parse(h1)) # clogger.debug(d["published"]) h=h.find("a") if h is not None: d["author"]=h.text.strip() h=sp.find("div", {"class": "foswikiTopic"}) h1=h.find("h4") if h1 is not None: d["title"]= h1.text.strip() h1.extract() # remove head else: h1=sp.find("meta", {"name": "WEBTOPIC"}) d["title"]= h1.attrs["content"] d["text"]=(h.encode_contents()).strip() d["section"]="HTU" d["url"]=url # clogger.debug(d) return {"article": d} def fetarticle(url, raw,params={}): sp=BeautifulSoup(raw) d={} h=sp.find("h1", {"itemprop": "name"}) d["title"]=unicode(h.text).strip() h=sp.find("div", {"itemprop": "articleBody"}) if h is not None: d["text"]=(h.encode_contents()).strip() else: d["text"]="" d["url"]=url h=sp.find("span", {"itemprop": "author"}) if h is not None: d["author"]=h.text.strip() h=sp.find("span", {"itemprop": "articleSection"}) if h is not None: d["section"]= "FET - " + h.text.strip() h=sp.find("span", {"itemprop": "datePublished"}) if h is not None: h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip()) d["published"]=parse(h1.strip()) h=sp.find("meta", {"property": "og:image"}) if h is not None: d["image"]=h.attrs["content"] d["image2"]=d["image"] # hh=sp.find_all("div", {"class":"media"}) # for h in hh: # if h is not None: # h=h.find("div", {"class": "pull-left"}) # if h is not None: # h=h.find("a") # if h is not None: # d["image2"]=downloadfile(fix_link(url,h.attrs["href"])) return {"article": d} def fsarcharticle(url, raw,params={}): sp=BeautifulSoup(raw) d={} h=sp.find("h1", {"class": "title"}) if h is not None: d["title"]=h.text.strip() d["url"]=url d["published"]=None h=sp.find("article") if h is not None: h=h.find("div", {"class": "content"}) d["text"]=h.encode_contents().strip() h=sp.find("article") if h is not None: h=h.find("h1", {"class": "title"}) if h is not None: d["title"]=h.text.strip() else: d["title"]="" d["image"]="" d["sourcetype"]="fsarcharticle" d["section"]="fsarch" d["author"]=None return {"article": d} def fsbizarticle(url, raw,params={}): sp=BeautifulSoup(raw) d={} h=sp.find("h1", {"class": "entry-title"}) if h is not None: d["title"]=h.text.strip() d["url"]=url h=sp.find("time", {"class": "entry-date"}) if h is not None: d["published"] = parse(h.attrs["datetime"]) else: d["published"]=None h=sp.find("div", {"class": "entry-content"}) if h is not None: d["text"]=h.encode_contents().strip() d["image"]="" d["sourcetype"]="fsbizarticle" d["section"]="fsbiz" h=sp.find("span", {"class": "author"}) d["author"]=None if h is not None: d["author"]=h.find("a").text.strip() return {"article": d} def fetindex(url, raw,params={}): if raw is None: raise Error # clogger.debug("compile_fetindex: "+str(url)) html=BeautifulSoup(raw) h = html.find("li", {"class": "next_page" }) if h is not None: nl=h.find("a") nl=fix_link(url,nl.attrs["href"]) else: nl=None h= html.find("ul", {"id": "neuigkeiten"}) al = [] if h is not None: links=h.find_all("a") for t in links: al.append(t.attrs["href"]) return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" } def fsarchindex(url, raw,params={}): if raw is None: raise Error html=BeautifulSoup(raw) h= html.find("article") print unicode(h) links=h.find_all("a") al = [] fl=[] for t in links: url=t.attrs["href"] if re.search("fachschaftarchitektur\.at", url): al.append(t.attrs["href"]) if re.search("facebook\.com/events", url): fl.append(t.attrs["href"]) return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"} def fsbizindex(url, raw,params={}): if raw is None: raise Error print "compile_fsbizindex" html=BeautifulSoup(raw) h= html.find("section", {"id": "primary"}) links=h.find_all("h1", {"class": "entry-title"}) al = [] for t in links: al.append(t.find("a").attrs["href"]) return {"url": url,"article_links": al,"objecttype": "index"} def fsmbindex(url, raw,params={}): if raw is None: raise Error html=BeautifulSoup(raw) h= html.find("a",{"class": "next"}) if h is not None: np=h.attrs["href"] else: np=None h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"}) if h is not None: ats=h.find_all("div",{"class": "block"}) articles=[] for a in ats: aa={} h=a.find("h3") if h is not None: aa["title"] = h.text.strip() h=a.find("div", {"class": "ce_text"}) if h is not None: aa["text"] = (h.encode_contents()).strip() aa["info"]=[] hh=a.find_all("p", {"class": "info"},recursive=False) for h in hh: aa["info"].append(unicode(h.text)) if re.search(r'von', str(h)): h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text)) aa["published"] =parse(h1.strip()) aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh) aa["section"]="FSMB" articles.append(aa) return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"} compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed, "fscharticle": fscharticle} compiler = cfg.compiler for i in compiler: compiler[i]=eval(compiler[i]) article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle", "fschfeed": "fscharticle"}