init commit

2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions
--- a/compiler/compile.py
+++ b/compiler/compile.py
@@ -0,0 +1,153 @@
+from bs4 import BeautifulSoup
+import crawler.objects.models 
+#from crawler.objects.models import Object
+from dateutil.parser import parse
+from datetime import datetime
+import re
+def hello():
+    """Return the fixed greeting string ``"hello"``."""
+    greeting = "hello"
+    return greeting
+
+
+def fetarticle(o):
+    """Extract article fields from a FET article page.
+
+    Parses ``o.raw_fixed`` with BeautifulSoup and reads the ``itemprop``
+    annotated elements plus the ``og:image`` meta tag.
+
+    Args:
+        o: object exposing ``raw_fixed`` (presumably the page HTML -- confirm
+            against the crawler model) and ``url``.
+
+    Returns:
+        ``{"article": d}`` where ``d`` always contains ``title``, ``text``
+        and ``url``. ``author``, ``section``, ``published``, ``image`` and
+        ``image2`` are only set when the matching element is found.
+    """
+    sp = BeautifulSoup(o.raw_fixed)
+    d = {}
+
+    # Guard the headline like every other field: a page without it would
+    # otherwise fail with AttributeError on ``None.text``.
+    h = sp.find("h1", {"itemprop": "name"})
+    d["title"] = unicode(h.text).strip() if h is not None else u""
+
+    h = sp.find("div", {"itemprop": "articleBody"})
+    d["text"] = h.encode_contents().strip() if h is not None else ""
+    d["url"] = o.url
+
+    h = sp.find("span", {"itemprop": "author"})
+    if h is not None:
+        d["author"] = h.text.strip()
+
+    h = sp.find("span", {"itemprop": "articleSection"})
+    if h is not None:
+        d["section"] = "FET - " + h.text.strip()
+
+    h = sp.find("span", {"itemprop": "datePublished"})
+    if h is not None:
+        # The element's inner markup is handed to dateutil as the date text.
+        d["published"] = parse(h.encode_contents().strip())
+
+    h = sp.find("meta", {"property": "og:image"})
+    if h is not None:
+        d["image"] = h.attrs["content"]
+
+    # For each div.media, look for a nested div.pull-left and the first <a>
+    # inside it. download_file is called for every link found; when several
+    # match, the value from the last call is what remains in image2.
+    for media in sp.find_all("div", {"class": "media"}):
+        holder = media.find("div", {"class": "pull-left"})
+        link = holder.find("a") if holder is not None else None
+        if link is not None:
+            d["image2"] = crawler.objects.models.download_file(link.attrs["href"])
+    return {"article": d}
+
+def fsarcharticle(o):
+    """Extract article fields from a fsarch article page.
+
+    Parses ``o.raw_fixed`` with BeautifulSoup and reads the body
+    (``div.content``) and headline (``h1.title``) from inside the first
+    ``<article>`` element.
+
+    Args:
+        o: object exposing ``raw_fixed`` and ``url``.
+
+    Returns:
+        ``{"article": d}`` with the keys ``url``, ``published``, ``text``,
+        ``title``, ``image``, ``sourcetype``, ``section`` and ``author``.
+        ``text`` and ``title`` are empty strings when the corresponding
+        element is not found; ``published`` and ``author`` are always None.
+    """
+    sp = BeautifulSoup(o.raw_fixed)
+
+    # Resolve <article> once and tolerate its absence instead of failing
+    # with AttributeError on ``None.find``.
+    article = sp.find("article")
+    content = None
+    heading = None
+    if article is not None:
+        content = article.find("div", {"class": "content"})
+        heading = article.find("h1", {"class": "title"})
+
+    # Only the headline inside <article> is used: a page-wide h1.title
+    # lookup would always be overwritten by this one.
+    d = {
+        "url": o.url,
+        "published": None,
+        "text": content.encode_contents().strip() if content is not None else "",
+        "title": heading.text.strip() if heading is not None else "",
+        "image": "",
+        "sourcetype": "fsarcharticle",
+        "section": "fsarch",
+        "author": None,
+    }
+    return {"article": d}
+
+def fetindex(o):
+    """Collect article links and the next-page link from a FET index page.
+
+    Args:
+        o: object exposing ``raw``, ``raw_fixed`` and ``url``.
+
+    Returns:
+        dict with ``url``, ``next_page`` (the pager link passed through
+        ``crawler.objects.models.fix_link``, or None when there is no pager
+        link), ``article_links`` (the ``href`` of every anchor inside
+        ``ul#neuigkeiten``; empty when that list is missing) and
+        ``objecttype`` set to ``"index"``.
+
+    Raises:
+        ValueError: if ``o.raw`` is None.
+    """
+    if o.raw is None:
+        # ``Error`` is not defined anywhere in this module, so raise a
+        # concrete built-in exception instead of triggering a NameError.
+        raise ValueError("fetindex: object has no raw content")
+    # Single-argument parenthesised print emits the same output on Python 2.
+    print("compile_fetindex")
+    html = BeautifulSoup(o.raw_fixed)
+
+    nl = None
+    pager = html.find("li", {"class": "next_page"})
+    anchor = pager.find("a") if pager is not None else None
+    if anchor is not None and anchor.get("href") is not None:
+        nl = crawler.objects.models.fix_link(o.url, anchor.attrs["href"])
+
+    listing = html.find("ul", {"id": "neuigkeiten"})
+    if listing is None:
+        al = []
+    else:
+        # href=True skips anchors without an href attribute.
+        al = [a.attrs["href"] for a in listing.find_all("a", href=True)]
+    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index"}
+
+def fsarchindex(o):
+    """Collect article and Facebook event links from a fsarch index page.
+
+    Every anchor inside the first ``<article>`` element is inspected. Links
+    whose ``href`` matches ``fachschaftarchitektur.at`` go to
+    ``article_links``, links matching ``facebook.com/events`` go to
+    ``facebook_links``. A link matching both would be listed in both.
+
+    Args:
+        o: object exposing ``raw``, ``raw_fixed`` and ``url``.
+
+    Returns:
+        dict with ``url``, ``next_page`` (always None), ``article_links``,
+        ``facebook_links`` and ``objecttype`` set to ``"index"``. Both link
+        lists are empty when the page has no ``<article>`` element.
+
+    Raises:
+        ValueError: if ``o.raw`` is None.
+    """
+    if o.raw is None:
+        # ``Error`` is not defined anywhere in this module, so raise a
+        # concrete built-in exception instead of triggering a NameError.
+        raise ValueError("fsarchindex: object has no raw content")
+    html = BeautifulSoup(o.raw_fixed)
+    h = html.find("article")
+    # Dumps the matched element to stdout (prints "None" when it is missing).
+    print(unicode(h))
+
+    al = []
+    fl = []
+    if h is not None:
+        # href=True skips anchors without an href attribute.
+        for t in h.find_all("a", href=True):
+            url = t.attrs["href"]
+            if re.search(r"fachschaftarchitektur\.at", url):
+                al.append(url)
+            if re.search(r"facebook\.com/events", url):
+                fl.append(url)
+
+    return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}
+
+
+def fsbizindex(o):
+    """Collect article links from a fsbiz index page.
+
+    Looks inside ``section#primary`` for ``h1.entry-title`` headings and
+    takes the ``href`` of the first anchor in each heading.
+
+    Args:
+        o: object exposing ``raw``, ``raw_fixed`` and ``url``.
+
+    Returns:
+        dict with ``url``, ``article_links`` and ``objecttype`` set to
+        ``"index"``. ``article_links`` is empty when the section is missing;
+        headings without a usable anchor are skipped.
+
+    Raises:
+        ValueError: if ``o.raw`` is None.
+    """
+    if o.raw is None:
+        # ``Error`` is not defined anywhere in this module, so raise a
+        # concrete built-in exception instead of triggering a NameError.
+        raise ValueError("fsbizindex: object has no raw content")
+    # Single-argument parenthesised print emits the same output on Python 2.
+    print("compile_fsbizindex")
+    html = BeautifulSoup(o.raw_fixed)
+
+    al = []
+    section = html.find("section", {"id": "primary"})
+    if section is not None:
+        for heading in section.find_all("h1", {"class": "entry-title"}):
+            anchor = heading.find("a")
+            if anchor is not None and anchor.get("href") is not None:
+                al.append(anchor.attrs["href"])
+    return {"url": o.url,"article_links": al,"objecttype": "index"}
+
+
+def _fsmb_article(block):
+    """Build one article dict from a single news ``div.block`` element."""
+    aa = {}
+    heading = block.find("h3")
+    if heading is not None:
+        aa["title"] = heading.text.strip()
+    body = block.find("div", {"class": "ce_text"})
+    if body is not None:
+        aa["text"] = body.encode_contents().strip()
+
+    aa["info"] = []
+    # Only direct-child <p class="info"> elements are considered.
+    for info in block.find_all("p", {"class": "info"}, recursive=False):
+        text = unicode(info.text)
+        aa["info"].append(text)
+        if re.search(r'von', str(info)):
+            # Reorder the three dot-separated numbers to "\3/\2/\1" and drop
+            # the surrounding non-digit text before handing it to dateutil
+            # (presumably day.month.year -> year/month/day; confirm against
+            # real pages).
+            h1 = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', text)
+            aa["published"] = parse(h1.strip())
+            # Whatever follows the last "von" on the line is the author.
+            aa["author"] = re.sub(r'^.*von(.*)$', r'\1', text).strip()
+    aa["section"] = "FSMB"
+    return aa
+
+
+def fsmbindex(o):
+    """Extract the inline articles and next-page link from a FSMB news page.
+
+    Args:
+        o: object exposing ``raw``, ``raw_fixed`` and ``url``.
+
+    Returns:
+        dict with ``url``, ``next_page`` (``href`` of ``a.next`` or None),
+        ``articles`` (one dict per ``div.block`` inside
+        ``div#main`` / ``div.inside`` / ``div.mod_newslist``; empty when any
+        of those containers is missing) and ``objecttype`` set to
+        ``"articles"``.
+
+    Raises:
+        ValueError: if ``o.raw`` is None.
+    """
+    if o.raw is None:
+        # ``Error`` is not defined anywhere in this module, so raise a
+        # concrete built-in exception instead of triggering a NameError.
+        raise ValueError("fsmbindex: object has no raw content")
+    html = BeautifulSoup(o.raw_fixed)
+
+    h = html.find("a", {"class": "next"})
+    np = h.get("href") if h is not None else None
+
+    # Descend one container at a time so a missing outer element yields an
+    # empty result instead of an AttributeError on ``None.find``.
+    container = html
+    for attrs in ({"id": "main"}, {"class": "inside"}, {"class": "mod_newslist"}):
+        container = container.find("div", attrs)
+        if container is None:
+            break
+
+    # Always bound, so the return below cannot hit an unassigned name when
+    # the news list is absent.
+    articles = []
+    if container is not None:
+        for block in container.find_all("div", {"class": "block"}):
+            articles.append(_fsmb_article(block))
+    return {"url": o.url, "next_page": np, "articles": articles,"objecttype": "articles"}