"""Site-specific compile functions for the crawler: each fet*/fsarch*/
fsbiz*/fsmb* function takes a fetched page object (with .raw, .raw_fixed
and .url) and extracts article fields or index links with BeautifulSoup."""

from bs4 import BeautifulSoup
from dateutil.parser import parse
import re

import crawler.objects.models
# from crawler.objects.models import Object


def hello():
    # Trivial smoke-test helper.
    return "hello"


def fetarticle(o):
    """Parse a FET article page into a dict of article fields."""
    sp = BeautifulSoup(o.raw_fixed, "html.parser")
    d = {}

    # Title, body, author, section and date are marked up with schema.org
    # microdata (itemprop attributes).
    h = sp.find("h1", {"itemprop": "name"})
    d["title"] = h.text.strip()

    h = sp.find("div", {"itemprop": "articleBody"})
    d["text"] = h.decode_contents().strip() if h is not None else ""

    d["url"] = o.url

    h = sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"] = h.text.strip()

    h = sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"] = "FET - " + h.text.strip()

    h = sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"] = parse(h.decode_contents().strip())

    # Preview image from the Open Graph metadata.
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]

    # Download the image linked from each media block; the last one found
    # wins.
    for m in sp.find_all("div", {"class": "media"}):
        h = m.find("div", {"class": "pull-left"})
        if h is not None:
            h = h.find("a")
            if h is not None:
                d["image2"] = crawler.objects.models.download_file(h.attrs["href"])

    return {"article": d}


def fsarcharticle(o):
    """Parse an article page from the FS Architektur archive."""
    sp = BeautifulSoup(o.raw_fixed, "html.parser")
    d = {}

    d["url"] = o.url
    d["published"] = None

    h = sp.find("article").find("div", {"class": "content"})
    d["text"] = h.decode_contents().strip()

    # The title sits inside the <article> element; fall back to an empty
    # string if it is missing.
    h = sp.find("article").find("h1", {"class": "title"})
    d["title"] = h.text.strip() if h is not None else ""

    d["image"] = ""
    d["sourcetype"] = "fsarcharticle"
    d["section"] = "fsarch"
    d["author"] = None

    return {"article": d}


def fetindex(o):
    """Parse a FET news index page: collect article links and the URL of
    the next index page, if any."""
    # if type(o) is not Object:
    #     raise TypeError
    if o.raw is None:
        raise ValueError("object has no raw content")
    print("compile_fetindex")
    html = BeautifulSoup(o.raw_fixed, "html.parser")

    # Pagination link.
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl = h.find("a")
        nl = crawler.objects.models.fix_link(o.url, nl.attrs["href"])
    else:
        nl = None

    # One link per news entry.
    h = html.find("ul", {"id": "neuigkeiten"})
    al = [t.attrs["href"] for t in h.find_all("a")]

    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index"}


def fsarchindex(o):
    """Parse the FS Architektur index: split the links found in <article>
    into on-site article links and Facebook event links."""
    if o.raw is None:
        raise ValueError("object has no raw content")
    html = BeautifulSoup(o.raw_fixed, "html.parser")
    h = html.find("article")
    print(h)  # debug output

    al = []
    fl = []
    for t in h.find_all("a"):
        url = t.attrs["href"]
        if re.search(r"fachschaftarchitektur\.at", url):
            al.append(url)
        if re.search(r"facebook\.com/events", url):
            fl.append(url)

    return {"url": o.url, "next_page": None, "article_links": al,
            "facebook_links": fl, "objecttype": "index"}


def fsbizindex(o):
    """Parse the FS BIZ index: one article link per entry title."""
    if o.raw is None:
        raise ValueError("object has no raw content")
    print("compile_fsbizindex")
    html = BeautifulSoup(o.raw_fixed, "html.parser")
    h = html.find("section", {"id": "primary"})
    al = [t.find("a").attrs["href"]
          for t in h.find_all("h1", {"class": "entry-title"})]
    return {"url": o.url, "article_links": al, "objecttype": "index"}


def fsmbindex(o):
    """Parse the FSMB news list: the articles are embedded in the index
    page itself, so full article dicts are returned instead of links."""
    if o.raw is None:
        raise ValueError("object has no raw content")
    html = BeautifulSoup(o.raw_fixed, "html.parser")

    h = html.find("a", {"class": "next"})
    np = h.attrs["href"] if h is not None else None

    articles = []
    h = html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        for a in h.find_all("div", {"class": "block"}):
            aa = {}

            t = a.find("h3")
            if t is not None:
                aa["title"] = t.text.strip()

            t = a.find("div", {"class": "ce_text"})
            if t is not None:
                aa["text"] = t.decode_contents().strip()

            aa["info"] = []
            # The info line looks like "<d>.<m>.<Y> von <author>": rewrite
            # the German date to Y/m/d so dateutil parses it unambiguously,
            # and take everything after "von" as the author.
            for t in a.find_all("p", {"class": "info"}, recursive=False):
                aa["info"].append(t.text)
                if re.search(r"von", str(t)):
                    h1 = re.sub(r"[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*", r"\3/\2/\1", t.text)
                    aa["published"] = parse(h1.strip())
                    aa["author"] = re.sub(r"^.*von(.*)$", r"\1", t.text).strip()

            aa["section"] = "FSMB"
            articles.append(aa)

    return {"url": o.url, "next_page": np, "articles": articles, "objecttype": "articles"}