init commit
258
compiler/compiler.py
Normal file
@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
import feedparser  # used by htufeed() below; this import was missing
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json


def do_compile(tpe, cont):
    # Dispatch the fetched content dict to the type-specific compiler function.
    if not isinstance(cont, dict):
        clogger.error("type error in do_compile for: " + str(cont))
        return cont
    # Starting to compile a generic object
    if "url" not in cont:
        clogger.error("no url, can't compile " + tpe)
    else:
        clogger.debug("compile: type:" + str(tpe) + " | " + str(cont["url"]))
        if tpe in compiler:
            cont = compiler[tpe](cont["url"], cont["raw"])
    return cont


from comp import rssfeed  # generic RSS compiler, referenced by the compiler mapping below

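# Usage sketch (illustrative, not part of the original module): do_compile()
# expects the dict produced by the fetch stage, carrying at least "url" and
# "raw"; the type string selects the compiler function. The URL and HTML below
# are made up.
#
#   page = {"url": "https://www.fet.at/aktuelles", "raw": "<html>...</html>"}
#   result = do_compile("fetindex", page)
#   # for an index type, result["article_links"] then holds the extracted links
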

def dummyarticle(url, raw):
    return {"url": url,
            "article": {"url": url, "section": "dummysection", "sourcetype": "dummy",
                        "title": "dummytitle", "text": raw, "image": "fff",
                        "author": "me", "published": None}}


def htufeed(url, raw):
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}


def htuarticle(url, raw):
    # Compile an HTU (Foswiki) article page into the common article dict.
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
        d["image2"] = d["image"]
    h = sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1 = re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4', unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"] = parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h = h.find("a")
        if h is not None:
            d["author"] = h.text.strip()
    h = sp.find("div", {"class": "foswikiTopic"})
    h1 = h.find("h4")
    if h1 is not None:
        d["title"] = h1.text.strip()
        h1.extract()  # remove the heading from the article body
    else:
        h1 = sp.find("meta", {"name": "WEBTOPIC"})
        d["title"] = h1.attrs["content"]
    d["text"] = (h.encode_contents()).strip()
    d["section"] = "HTU"
    d["url"] = url
    # clogger.debug(d)
    return {"article": d}

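# Date-parsing note for htuarticle() (assumption about the Foswiki markup): the
# patternRevInfo text is expected to end in something like
# "... - 03 Mar 2014 - 18:35", which the regex above reorders to
# "2014/Mar/03 18:35" before handing it to dateutil.parser.parse().
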
def fetarticle(url, raw):
    # Compile a FET article page (schema.org itemprop markup) into the common article dict.
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("h1", {"itemprop": "name"})
    d["title"] = unicode(h.text).strip()
    h = sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"] = (h.encode_contents()).strip()
    else:
        d["text"] = ""
    d["url"] = url
    h = sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"] = h.text.strip()
    h = sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"] = "FET - " + h.text.strip()
    h = sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"] = parse(h.encode_contents().strip())
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
        d["image2"] = d["image"]
    # hh = sp.find_all("div", {"class": "media"})
    # for h in hh:
    #     if h is not None:
    #         h = h.find("div", {"class": "pull-left"})
    #         if h is not None:
    #             h = h.find("a")
    #             if h is not None:
    #                 d["image2"] = downloadfile(fix_link(url, h.attrs["href"]))
    return {"article": d}


def fsarcharticle(url, raw):
    # Compile an article page from the Fachschaft Architektur site.
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["url"] = url
    d["published"] = None
    h = sp.find("article")
    if h is not None:
        h = h.find("div", {"class": "content"})
        d["text"] = h.encode_contents().strip()
    h = sp.find("article")
    if h is not None:
        h = h.find("h1", {"class": "title"})
        if h is not None:
            d["title"] = h.text.strip()
        else:
            d["title"] = ""
    d["image"] = ""
    d["sourcetype"] = "fsarcharticle"
    d["section"] = "fsarch"
    d["author"] = None
    return {"article": d}


def fetindex(url, raw):
    # Compile a FET listing page: next-page link plus the article links it contains.
    if raw is None:
        raise ValueError("fetindex: raw is None for " + str(url))
    # clogger.debug("compile_fetindex: " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl = h.find("a")
        nl = fix_link(url, nl.attrs["href"])
    else:
        nl = None
    h = html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links = h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index"}


def fsarchindex(url, raw):
    # Compile the Fachschaft Architektur index page: collect article and Facebook event links.
    if raw is None:
        raise ValueError("fsarchindex: raw is None for " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("article")
    clogger.debug(unicode(h))
    links = h.find_all("a")
    al = []
    fl = []
    for t in links:
        link = t.attrs["href"]
        if re.search(r"fachschaftarchitektur\.at", link):
            al.append(link)
        if re.search(r"facebook\.com/events", link):
            fl.append(link)
    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(url, raw):
    # Compile the fsbiz index page: one link per entry title.
    if raw is None:
        raise ValueError("fsbizindex: raw is None for " + str(url))
    clogger.debug("compile_fsbizindex")
    html = BeautifulSoup(raw)
    h = html.find("section", {"id": "primary"})
    links = h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url, "article_links": al, "objecttype": "index"}


def fbfeed(url, raw):
    # Compile a Facebook Graph API feed (JSON) into a list of article dicts.
    js = json.loads(raw)
    arts = []
    u = urlparse.urlparse(url)
    for m in js["data"]:
        aa = {}
        aa["url"] = urlparse.urlunsplit(("http", "www.facebook.at", m["id"], "", ""))
        aa["published"] = parse(m["created_time"])
        if "message" in m:
            aa["text"] = m["message"]
        else:
            try:
                # fall back to the description of the linked object
                h = graph.get_object(id=m["id"].split("_")[1])
                if "description" in h:
                    aa["text"] = h["description"]
                else:
                    aa["text"] = json.dumps(h)
            except GraphAPIError:
                aa["text"] = ""
        if "story" in m:
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1] + " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"] = "Facebook: " + u[1]
        arts.append(aa)
    return {"url": url, "next_page": js.get("paging", {}).get("next"), "articles": arts}

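# Input sketch for fbfeed() (assumption, for illustration only): raw is expected
# to be the JSON body of a Graph API page-feed request. A minimal payload this
# parser handles would look roughly like:
#
#   {"data": [{"id": "123_456",
#              "created_time": "2014-05-01T12:00:00+0000",
#              "message": "post text",
#              "story": "Some page shared a link."}],
#    "paging": {"next": "https://graph.facebook.com/..."}}
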
def fsmbindex(url, raw):
    # Compile the FSMB news list: next-page link plus fully inlined articles.
    if raw is None:
        raise ValueError("fsmbindex: raw is None for " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("a", {"class": "next"})
    if h is not None:
        np = h.attrs["href"]
    else:
        np = None
    h = html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    articles = []
    if h is not None:
        ats = h.find_all("div", {"class": "block"})
        for a in ats:
            aa = {}
            h = a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h = a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"] = []
            hh = a.find_all("p", {"class": "info"}, recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    # the info line carries a dd.mm.yyyy date and the author after "von"
                    h1 = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(h.text))
                    aa["published"] = parse(h1.strip())
                    aa["author"] = re.sub(r'^.*von(.*)$', r'\1', unicode(h.text)).strip()
            aa["section"] = "FSMB"
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles, "objecttype": "articles"}


# Default mapping from object type to compiler function. It is overridden
# below by cfg.compiler, which stores the function names as strings.
compiler = {"fetindex": fetindex, "fetarticle": fetarticle,
            "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle,
            "fsmbindex": fsmbindex, "fsbizindex": fsbizindex,
            "dummyarticle": dummyarticle, "htuarticle": htuarticle,
            "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

compiler = cfg.compiler
for i in compiler:
    # resolve the configured function name string to the callable defined above
    compiler[i] = eval(compiler[i])

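# Configuration sketch (assumption, for illustration): cfg.compiler is expected
# to map type names to the *names* of compiler functions as strings, which the
# eval() loop above turns into callables, e.g.
#
#   compiler = {"fetindex": "fetindex", "fetarticle": "fetarticle",
#               "fschfeed": "rssfeed"}
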
# For each index type, the compiler used for the article pages it links to.
article_types = {"fetindex": "fetarticle", "fsarchindex": "fsarcharticle",
                 "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle",
                 "htufeed": "htuarticle"}
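# Pipeline sketch (illustrative, not part of the original code): article_types
# links an index compiler to the compiler for the pages it references, so a
# crawl step could look roughly like this (fetch_raw is a hypothetical helper):
#
#   idx = do_compile("fetindex", {"url": start_url, "raw": fetch_raw(start_url)})
#   for link in idx["article_links"]:
#       art = do_compile(article_types["fetindex"],
#                        {"url": link, "raw": fetch_raw(link)})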