init commit
258
compiler/compiler.py
Normal file
@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
import feedparser  # used by htufeed() below; this import was missing
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json


def do_compile(tpe, cont):
    # Dispatch the fetched content dict to the type-specific compiler function.
    if not isinstance(cont, dict):
        clogger.error("type error in do_compile for: " + str(cont))
        return cont
    # Starting to compile a generic object
    if "url" not in cont:
        clogger.error("no url, can't compile " + tpe)
    else:
        clogger.debug("compile: type:" + str(tpe) + " | " + str(cont["url"]))
        if tpe in compiler:
            cont = compiler[tpe](cont["url"], cont["raw"])
    return cont


from comp import rssfeed  # generic RSS compiler, referenced by the compiler mapping below

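# Usage sketch (illustrative, not part of the original module): do_compile()
# expects the dict produced by the fetch stage, carrying at least "url" and
# "raw"; the type string selects the compiler function. The URL and HTML below
# are made up.
#
#   page = {"url": "https://www.fet.at/aktuelles", "raw": "<html>...</html>"}
#   result = do_compile("fetindex", page)
#   # for an index type, result["article_links"] then holds the extracted links
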

def dummyarticle(url, raw):
    return {"url": url,
            "article": {"url": url, "section": "dummysection", "sourcetype": "dummy",
                        "title": "dummytitle", "text": raw, "image": "fff",
                        "author": "me", "published": None}}


def htufeed(url, raw):
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}


def htuarticle(url, raw):
    # Compile an HTU (Foswiki) article page into the common article dict.
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
        d["image2"] = d["image"]
    h = sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1 = re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4', unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"] = parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h = h.find("a")
        if h is not None:
            d["author"] = h.text.strip()
    h = sp.find("div", {"class": "foswikiTopic"})
    h1 = h.find("h4")
    if h1 is not None:
        d["title"] = h1.text.strip()
        h1.extract()  # remove the heading from the article body
    else:
        h1 = sp.find("meta", {"name": "WEBTOPIC"})
        d["title"] = h1.attrs["content"]
    d["text"] = (h.encode_contents()).strip()
    d["section"] = "HTU"
    d["url"] = url
    # clogger.debug(d)
    return {"article": d}

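# Date-parsing note for htuarticle() (assumption about the Foswiki markup): the
# patternRevInfo text is expected to end in something like
# "... - 03 Mar 2014 - 18:35", which the regex above reorders to
# "2014/Mar/03 18:35" before handing it to dateutil.parser.parse().
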
def fetarticle(url, raw):
    # Compile a FET article page (schema.org itemprop markup) into the common article dict.
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("h1", {"itemprop": "name"})
    d["title"] = unicode(h.text).strip()
    h = sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"] = (h.encode_contents()).strip()
    else:
        d["text"] = ""
    d["url"] = url
    h = sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"] = h.text.strip()
    h = sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"] = "FET - " + h.text.strip()
    h = sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"] = parse(h.encode_contents().strip())
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
        d["image2"] = d["image"]
    # hh = sp.find_all("div", {"class": "media"})
    # for h in hh:
    #     if h is not None:
    #         h = h.find("div", {"class": "pull-left"})
    #         if h is not None:
    #             h = h.find("a")
    #             if h is not None:
    #                 d["image2"] = downloadfile(fix_link(url, h.attrs["href"]))
    return {"article": d}


def fsarcharticle(url, raw):
    # Compile an article page from the Fachschaft Architektur site.
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["url"] = url
    d["published"] = None
    h = sp.find("article")
    if h is not None:
        h = h.find("div", {"class": "content"})
        d["text"] = h.encode_contents().strip()
    h = sp.find("article")
    if h is not None:
        h = h.find("h1", {"class": "title"})
        if h is not None:
            d["title"] = h.text.strip()
        else:
            d["title"] = ""
    d["image"] = ""
    d["sourcetype"] = "fsarcharticle"
    d["section"] = "fsarch"
    d["author"] = None
    return {"article": d}


def fetindex(url, raw):
    # Compile a FET listing page: next-page link plus the article links it contains.
    if raw is None:
        raise ValueError("fetindex: raw is None for " + str(url))
    # clogger.debug("compile_fetindex: " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl = h.find("a")
        nl = fix_link(url, nl.attrs["href"])
    else:
        nl = None
    h = html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links = h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index"}


def fsarchindex(url, raw):
    # Compile the Fachschaft Architektur index page: collect article and Facebook event links.
    if raw is None:
        raise ValueError("fsarchindex: raw is None for " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("article")
    clogger.debug(unicode(h))
    links = h.find_all("a")
    al = []
    fl = []
    for t in links:
        link = t.attrs["href"]
        if re.search(r"fachschaftarchitektur\.at", link):
            al.append(link)
        if re.search(r"facebook\.com/events", link):
            fl.append(link)
    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(url, raw):
    # Compile the fsbiz index page: one link per entry title.
    if raw is None:
        raise ValueError("fsbizindex: raw is None for " + str(url))
    clogger.debug("compile_fsbizindex")
    html = BeautifulSoup(raw)
    h = html.find("section", {"id": "primary"})
    links = h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url, "article_links": al, "objecttype": "index"}


def fbfeed(url, raw):
    # Compile a Facebook Graph API feed (JSON) into a list of article dicts.
    js = json.loads(raw)
    arts = []
    u = urlparse.urlparse(url)
    for m in js["data"]:
        aa = {}
        aa["url"] = urlparse.urlunsplit(("http", "www.facebook.at", m["id"], "", ""))
        aa["published"] = parse(m["created_time"])
        if "message" in m:
            aa["text"] = m["message"]
        else:
            try:
                # fall back to the description of the linked object
                h = graph.get_object(id=m["id"].split("_")[1])
                if "description" in h:
                    aa["text"] = h["description"]
                else:
                    aa["text"] = json.dumps(h)
            except GraphAPIError:
                aa["text"] = ""
        if "story" in m:
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1] + " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"] = "Facebook: " + u[1]
        arts.append(aa)
    return {"url": url, "next_page": js.get("paging", {}).get("next"), "articles": arts}

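# Input sketch for fbfeed() (assumption, for illustration only): raw is expected
# to be the JSON body of a Graph API page-feed request. A minimal payload this
# parser handles would look roughly like:
#
#   {"data": [{"id": "123_456",
#              "created_time": "2014-05-01T12:00:00+0000",
#              "message": "post text",
#              "story": "Some page shared a link."}],
#    "paging": {"next": "https://graph.facebook.com/..."}}
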
def fsmbindex(url, raw):
    # Compile the FSMB news list: next-page link plus fully inlined articles.
    if raw is None:
        raise ValueError("fsmbindex: raw is None for " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("a", {"class": "next"})
    if h is not None:
        np = h.attrs["href"]
    else:
        np = None
    h = html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    articles = []
    if h is not None:
        ats = h.find_all("div", {"class": "block"})
        for a in ats:
            aa = {}
            h = a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h = a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"] = []
            hh = a.find_all("p", {"class": "info"}, recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    # the info line carries a dd.mm.yyyy date and the author after "von"
                    h1 = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(h.text))
                    aa["published"] = parse(h1.strip())
                    aa["author"] = re.sub(r'^.*von(.*)$', r'\1', unicode(h.text)).strip()
            aa["section"] = "FSMB"
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles, "objecttype": "articles"}


# Default mapping from object type to compiler function. It is overridden
# below by cfg.compiler, which stores the function names as strings.
compiler = {"fetindex": fetindex, "fetarticle": fetarticle,
            "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle,
            "fsmbindex": fsmbindex, "fsbizindex": fsbizindex,
            "dummyarticle": dummyarticle, "htuarticle": htuarticle,
            "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

compiler = cfg.compiler
for i in compiler:
    # resolve the configured function name string to the callable defined above
    compiler[i] = eval(compiler[i])

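# Configuration sketch (assumption, for illustration): cfg.compiler is expected
# to map type names to the *names* of compiler functions as strings, which the
# eval() loop above turns into callables, e.g.
#
#   compiler = {"fetindex": "fetindex", "fetarticle": "fetarticle",
#               "fschfeed": "rssfeed"}
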
# For each index type, the compiler used for the article pages it links to.
article_types = {"fetindex": "fetarticle", "fsarchindex": "fsarcharticle",
                 "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle",
                 "htufeed": "htuarticle"}
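# Pipeline sketch (illustrative, not part of the original code): article_types
# links an index compiler to the compiler for the pages it references, so a
# crawl step could look roughly like this (fetch_raw is a hypothetical helper):
#
#   idx = do_compile("fetindex", {"url": start_url, "raw": fetch_raw(start_url)})
#   for link in idx["article_links"]:
#       art = do_compile(article_types["fetindex"],
#                        {"url": link, "raw": fetch_raw(link)})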