init commit
compiler/README (Normal file, 10 lines)
@@ -0,0 +1,10 @@
This is the API for the compiler.
The following commands are implemented:
GET doc: This documentation!
GET initdb: Initialize the database. WARNING: existing data is deleted.
POST urls:
    Expects data in the format {"url": {"type": typ, "url": "someurl.html"}}
    Adds this URL to the set of monitored URLs.

IN PROGRESS:
GET urls: All URLs that are to be monitored.
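As a quick illustration of the API documented above, the sketch below drives the endpoints with the requests library. It is not part of the commit: the base URL, port, and blueprint prefix are placeholders for wherever compiler_pages is mounted, and the example URL and the "fetindex" source type are simply illustrative values taken from compiler.py. Note that CrawlUrlSchema in models.py names the type field "tpe", so the payload below uses that key even though the README text says "type".

# Hypothetical usage sketch; base URL, prefix, and the example URL are assumptions.
import requests

BASE = "http://localhost:5000/compiler"   # wherever compiler_pages is mounted

# GET doc: fetch this documentation
print(requests.get(BASE + "/doc").text)

# POST urls: register a URL for monitoring ("fetindex" is one source type from compiler.py)
payload = {"url": {"tpe": "fetindex", "url": "http://example.org/index.html"}}
print(requests.post(BASE + "/urls", json=payload).json())

# GET urls: list all monitored URLs (marked IN PROGRESS above)
print(requests.get(BASE + "/urls").json())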
compiler/README.html (Normal file, 1 line)
@@ -0,0 +1 @@
sdf
compiler/__init__.py (Normal file, 15 lines)
@@ -0,0 +1,15 @@
#from mprocess import do_process, process_urllist
#from compiler import do_compile
#from mworker import run_fetch, run_process, run_compile

# include models for final objects
from src.models import Article

# starting workers
from mworker import start_workers

from models import add_url, CrawlUrl
#start_workers(1,1,1)

from fetching import announce_articleid
compiler/comp/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/__init__py (Normal file, 1 line)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/rss.py (Normal file, 8 lines)
@@ -0,0 +1,8 @@
import feedparser


def rssfeed(url, raw):
    # parse an RSS feed and return every entry link as an article link
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}
compiler/compile.py (Normal file, 153 lines)
@@ -0,0 +1,153 @@
from bs4 import BeautifulSoup
import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re


def hello():
    return "hello"


def fetarticle(o):
    # extract a single FET article from an already fetched object
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=o.url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())
    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]

    hh=sp.find_all("div", {"class":"media"})
    for h in hh:
        if h is not None:
            h=h.find("div", {"class": "pull-left"})
        if h is not None:
            h=h.find("a")
        if h is not None:
            d["image2"]=crawler.objects.models.download_file(h.attrs["href"])
    return {"article": d}


def fsarcharticle(o):
    # extract a single fsarch article from an already fetched object
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=o.url
    d["published"]=None
    h=sp.find("article")
    h=h.find("div", {"class": "content"})
    d["text"]=h.encode_contents().strip()
    h=sp.find("article").find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    else:
        d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}


def fetindex(o):
    # if type(o) is not Object:
    #     raise TypeError
    if o.raw is None:
        raise ValueError("fetindex: object has no raw content")
    print "compile_fetindex"
    html=BeautifulSoup(o.raw_fixed)
    h = html.find("li", {"class": "next_page" })
    if h is not None:
        nl=h.find("a")
        nl=crawler.objects.models.fix_link(o.url,nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    links=h.find_all("a")
    al = []
    for t in links:
        al.append(t.attrs["href"])
    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index" }


def fsarchindex(o):
    if o.raw is None:
        raise ValueError("fsarchindex: object has no raw content")
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        url=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", url):
            fl.append(t.attrs["href"])

    return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(o):
    if o.raw is None:
        raise ValueError("fsbizindex: object has no raw content")
    print "compile_fsbizindex"
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": o.url, "article_links": al, "objecttype": "index"}


def fsmbindex(o):
    if o.raw is None:
        raise ValueError("fsmbindex: object has no raw content")
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("a",{"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    articles=[]
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div",{"class": "block"})
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"},recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
            aa["section"]="FSMB"
            articles.append(aa)
    return {"url": o.url, "next_page": np, "articles": articles, "objecttype": "articles"}
compiler/compiler.py (Normal file, 258 lines)
@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
import feedparser
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json


def do_compile(tpe, cont):
    if type(cont) != dict:
        clogger.error("Type Error for do compile for :"+str(cont["url"]))
    # start compiling a generic object
    if "url" not in cont:
        clogger.error("no url, can't compile "+tpe)
    else:
        clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
        if tpe in compiler:
            cont=compiler[tpe](cont["url"], cont["raw"])
    return cont


from comp import rssfeed


def dummyarticle(url, raw):
    return {"url": url, "article": {"url": url, "section": "dummysection", "sourcetype": "dummy", "title": "dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}


def htufeed(url, raw):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}


def htuarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    h=sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"]=parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h=h.find("a")
        if h is not None:
            d["author"]=h.text.strip()
    h=sp.find("div", {"class": "foswikiTopic"})
    h1=h.find("h4")
    if h1 is not None:
        d["title"]= h1.text.strip()
        h1.extract() # remove the heading from the body
    else:
        h1=sp.find("meta", {"name": "WEBTOPIC"})
        d["title"]= h1.attrs["content"]
    d["text"]=(h.encode_contents()).strip()
    d["section"]="HTU"
    d["url"]=url
    # clogger.debug(d)
    return {"article": d}


def fetarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())

    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    # hh=sp.find_all("div", {"class":"media"})
    # for h in hh:
    #     if h is not None:
    #         h=h.find("div", {"class": "pull-left"})
    #     if h is not None:
    #         h=h.find("a")
    #     if h is not None:
    #         d["image2"]=downloadfile(fix_link(url,h.attrs["href"]))
    return {"article": d}


def fsarcharticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=url
    d["published"]=None
    h=sp.find("article")
    if h is not None:
        h=h.find("div", {"class": "content"})
        d["text"]=h.encode_contents().strip()
    h=sp.find("article")
    if h is not None:
        h=h.find("h1", {"class": "title"})
        if h is not None:
            d["title"]=h.text.strip()
        else:
            d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}


def fetindex(url, raw):
    if raw is None:
        raise ValueError("fetindex: no raw content for "+str(url))
    # clogger.debug("compile_fetindex: "+str(url))
    html=BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page" })
    if h is not None:
        nl=h.find("a")
        nl=fix_link(url,nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links=h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }


def fsarchindex(url, raw):
    if raw is None:
        raise ValueError("fsarchindex: no raw content for "+str(url))
    html=BeautifulSoup(raw)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        u=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", u):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", u):
            fl.append(t.attrs["href"])

    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(url, raw):
    if raw is None:
        raise ValueError("fsbizindex: no raw content for "+str(url))
    print "compile_fsbizindex"
    html=BeautifulSoup(raw)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url, "article_links": al, "objecttype": "index"}


def fbfeed(url, raw):
    # compile a Facebook feed (fetched via the Graph API as JSON) into articles
    js = json.loads(raw)
    arts=[]
    u=urlparse.urlparse(url)
    for m in js["data"]:
        aa={}
        aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"",""))
        aa["published"] =parse(m["created_time"])
        if m.has_key("message"):
            aa["text"] = m["message"]
        else:
            try:
                h=graph.get_object(id=m["id"].split("_")[1])
                if h.has_key("description"):
                    aa["text"]=h["description"]
                else:
                    aa["text"]=json.dumps(h)
            except GraphAPIError:
                aa["text"]=""
        if m.has_key("story"):
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"]="Facebook: "+u[1]
        arts.append(aa)
    return {"url": url, "next_page": js["paging"]["next"], "articles": arts}


def fsmbindex(url, raw):
    if raw is None:
        raise ValueError("fsmbindex: no raw content for "+str(url))
    html=BeautifulSoup(raw)
    h= html.find("a",{"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    articles=[]
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div",{"class": "block"})
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"},recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
            aa["section"]="FSMB"
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles, "objecttype": "articles"}


# static dispatch table: source type -> compiler function
compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle, "htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

# the static table above is replaced by the mapping from the configuration;
# eval() resolves the configured function names to the functions defined in this module
compiler = cfg.compiler
for i in compiler:
    compiler[i]=eval(compiler[i])


# which article compiler to use for the links found by each index compiler
article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
compiler/fetching.py (Normal file, 67 lines)
@@ -0,0 +1,67 @@
from requests import session
s=session()
from src import package_directory, download_path, cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse


def announce_articleid(id):
    # notify the configured endpoints that an article was added or updated
    for u in cfg.announcearticle_url:
        s.get( u % id)


def downloadfile(url):
    # download a file into the download directory (in the background) and
    # return its path relative to download_path
    relative_name=path.join("downloads",str(md5(url).hexdigest()),url.split('/')[-1])
    local_filename = path.join(download_path,relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc: # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name


from models import CrawlCache
from datetime import datetime, timedelta


def fetch_page(furl):
    # fetch a page, reusing the CrawlCache entry if it is younger than cfg.cache_days
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u=urlparse.urlparse(furl)
    if u[0] == '':
        furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>cache_cutoff).first()
    if cc is None:
        clogger.debug("fetching url: "+ str(furl))
        if u[0]=='fb':
            # fb:// URLs are fetched through the Facebook Graph API
            tx = json.dumps(graph.get_object(id=u[1]+u[2]))
        else:
            tx=s.get(furl).text
        CrawlCache.store(furl,tx)
    else:
        #if furl is not None:
        #    clogger.debug("cache hit")
        tx=cc.raw
    return tx


def fetch_load_file(furl, local_path):
    try:
        clogger.info("Downloading "+ str(furl))
        r = s.get(furl, stream=True)
        f = open(local_path, 'wb')
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive chunks
                f.write(chunk)
        f.close()
    except Exception, e:
        #clogger.error("Error occurred during fetching: "+str(furl))
        clogger.error(e,exc_info=True)
compiler/fixing.py (Normal file, 37 lines)
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach


def fix_link(url, link):
    # make a relative link absolute against the page URL
    r= urlparse(link)
    if r.scheme is None or r.scheme == '':
        return urljoin(url,link)
    else:
        return link


def fix_file(url, link):
    # resolve the link and download the file locally
    u=fix_link(url,link)
    return downloadfile(u)


def load_file(url, link):
    return fix_file(url,link)


def fix_html(html, baseurl):
    # sanitize article HTML, localize images, absolutize links, and drop scripts
    html=bleach.clean(html, tags=['b','p','span','a','img','div','br','strong','ul','li'], strip=True)
    sp=BeautifulSoup(html)
    images=sp.find_all("img")
    for t in images:
        if "src" in t.attrs and t.attrs["src"] is not None:
            t.attrs["src"]=fix_file(baseurl,t.attrs["src"])
    links=sp.find_all("a")
    for t in links:
        if "href" in t.attrs:
            t.attrs["href"]=fix_link(baseurl, t.attrs["href"])
    for t in sp.find_all("script"):
        t.extract()
    b=sp.find("base")
    if b is not None:
        b.attrs["href"]=""
    return sp
compiler/models.py (Normal file, 75 lines)
@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema, fields, ValidationError
import json
import flask


def add_url(tpe, url):
    # find or create the CrawlUrl and schedule it for fetching
    cu=CrawlUrl.find_or_create(tpe,url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()


class CrawlUrlSchema(Schema):
    id=fields.Integer()
    tpe=fields.String()
    url=fields.String()
    last_fetched=fields.DateTime()
    fetched = fields.DateTime()


class CrawlUrl(Base2):
    __tablename__='crawlurls'
    id = Column(Integer, primary_key=True)
    tpe=Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)

    def fetched(self):
        return CrawlCache.query.filter(CrawlCache.url==self.url).first()

    @classmethod
    def find_or_create(cls, tpe, url):
        aa = cls.query.filter(cls.url==url).filter(cls.tpe==tpe).first()
        if aa is None:
            aa=cls(tpe,url)
        return aa

    def schedule(self):
        put_fetch_queue((0, self.tpe, self.url))

    def __init__(self, tpe, url):
        self.url=url
        self.tpe=tpe

    def __json__(self):
        return CrawlUrlSchema().dump(self)[0]


class CrawlCacheSchema(Schema):
    id=fields.Integer()
    raw=fields.String()
    url=fields.String()
    fetched=fields.DateTime()


class CrawlCache(Base2):
    __tablename__='crawlcache'
    id = Column(Integer, primary_key=True)
    url=Column(String(250))
    fetched=Column(DateTime)
    raw=Column(Text)

    def __init__(self, url, rw):
        self.url=url
        self.raw=rw
        self.fetched=datetime.utcnow()

    def __json__(self):
        return CrawlCacheSchema().dump(self)[0]

    @classmethod
    def store(cls, url, rw):
        cc=cls(url,rw)
        db_session2.add(cc)
        db_session2.commit()


#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)
compiler/mprocess.py (Normal file, 74 lines)
@@ -0,0 +1,74 @@
from src import clogger # logger for the crawler
from src.models import Article # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file

from compiler import article_types
from fixing import fix_link

# process_article expects a hash with the raw data for one article and turns it
# into an Article object stored in the database; it is intended to prevent duplicates.


def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h


def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa=None
    else:
        art["text"]=fix_html(art["text"],art["url"])
        if "image" in art:
            art["image"]=fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched=datetime.now()
        aa.sourcetype=art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/added article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
        # announce_articleid(aa.id)
    return aa


# process a single found URL: resolve it against its parent and queue it for fetching
def process_url(url, tpe, parent_url):
    #clogger.debug("process URL of type "+ tpe + ": " + url)
    if parent_url is not None:
        url=fix_link(parent_url, url)
    put_fetch_queue((0,tpe,url))


# process a list of URLs
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u,tpe, parent_url)


def do_process(tpe, cont):
    # take a compiled hash, queue the links it contains and store its articles
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])

    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"],tpe, cont["url"])

    if "article" in cont:
        art=cont["article"]
        art["sourcetype"]=tpe
        process_article(art)

    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"]=tpe
                if "url" not in a:
                    a["url"]=cont["url"]
                process_article(a)
    return
compiler/mqueues.py (Normal file, 8 lines)
@@ -0,0 +1,8 @@
from gevent.queue import Queue, JoinableQueue

fetch_queue = Queue()
compile_queue = Queue()
process_queue = Queue()


def put_fetch_queue(o):
    fetch_queue.put(o)
compiler/mworker.py (Normal file, 58 lines)
@@ -0,0 +1,58 @@
from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from src import clogger


def start_workers(f, c, p):
    # spawn f fetch workers, c compile workers and p process workers
    for _ in range(f):
        clogger.debug("spawn fetchworker")
        spawn(work_fetch)
    for _ in range(c):
        spawn(work_compile)
    for _ in range(p):
        spawn(work_process)


def work_fetch():
    while True:
        run_fetch()


def work_process():
    while True:
        run_process()


def work_compile():
    while True:
        run_compile()


def queue_url(tpe, url):
    fetch_queue.put((0,tpe,url))


# fetch one page from the fetch queue and hand it to the compile queue
def run_fetch():
    tc, tpe, url = fetch_queue.get()
    if tpe != "dummyarticle" and tpe != "dummyindex":
        rw=fetch_page(url)
    else:
        rw="<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
    return rw
    # fetch_queue.task_done()


# compile one item from the compile queue and hand it to the process queue
def run_compile():
    tc,tpe,h = compile_queue.get()
    h=do_compile(tpe,h)
    process_queue.put((0,tpe, h))
    return h
    # compile_queue.task_done()


def run_process():
    tc,tpe,h = process_queue.get()
    do_process(tpe, h)
    return h
    # process_queue.task_done()
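The three run_* functions above form the fetch, compile, and process stages of the pipeline, connected by the queues defined in mqueues.py. As a rough illustration (not part of this commit), the sketch below pushes one URL through all three stages synchronously; the source type and URL are made-up placeholders.

# Hypothetical driver, not part of this commit: run a single URL through the
# fetch -> compile -> process stages once, without spawning gevent workers.
import mworker

def crawl_once(tpe, url):
    mworker.queue_url(tpe, url)   # put the URL on the fetch queue
    mworker.run_fetch()           # fetch_queue -> compile_queue
    mworker.run_compile()         # compile_queue -> process_queue
    mworker.run_process()         # process_queue -> database (via do_process)

# example (placeholder type and URL):
# crawl_once("fetindex", "http://example.org/news")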
compiler/views.py (Normal file, 146 lines)
@@ -0,0 +1,146 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request

compiler_pages = Blueprint('compiler', __name__,
                           template_folder='.')

from src.database import db_session2, init_db, read_json, init_db2
from .models import CrawlUrl
from .models import CrawlCache, CrawlCacheSchema
from .models import CrawlUrlSchema
from src import clogger
from src.articles import Article
#import mworker
import flask
import json
import mworker

from compiler import do_compile
from fetching import fetch_page

#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)


@compiler_pages.route("/")
@compiler_pages.route("")
@compiler_pages.route(".json")
def index():
    status="For documentation go to /doc"
    return jsonify(status=status)


@compiler_pages.route("/doc")
@compiler_pages.route("/doc.json")
def doc():
    return render_template("README")
    # return jsonify(status=render_template("README"))


@compiler_pages.route("/initdb")
@compiler_pages.route("/initdb.json")
def initdb_json():
    init_db() # initialize the database
    status="Database reinitialized"
    return jsonify(status=status)


@compiler_pages.route("/initdb2")
@compiler_pages.route("/initdb2.json")
def initdb_json2():
    init_db2() # initialize the second database
    status="Database reinitialized"
    return jsonify(status=status)


@compiler_pages.route("/start")
@compiler_pages.route("/start.json")
def start_json():
    mworker.start_workers(1,1,1) # start one fetch, one compile and one process worker
    status="Workers started"
    return jsonify(status=status)


@compiler_pages.route("/urls")
@compiler_pages.route("/urls.json")
def urls_index_json():
    # load all URLs
    status=CrawlUrl.query.all()
    return jsonify(urls=status)


# show an existing CrawlUrl
@compiler_pages.route("/urls/<int:id>")
@compiler_pages.route("/urls/<int:id>.json")
def urls_json(id):
    # load the requested URL and its cache entry
    status=CrawlUrl.query.get(id)
    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
    return jsonify(urls=status, cache=cc.__json__())


# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
    # load the requested URL, queue it and start the workers
    cu=CrawlUrl.query.get(id)
    mworker.queue_url(cu.tpe, cu.url)
    cc=CrawlCache.query.filter(CrawlCache.url==cu.url).first()
    mworker.start_workers(1,1,1)
    status="Workers started"
    return jsonify(urls=cu, cache=cc)


# fetch and compile an existing CrawlUrl once, without storing the result
@compiler_pages.route("/urls/<int:id>/test")
@compiler_pages.route("/urls/<int:id>/test.json")
def urls_test_json(id):
    cu=CrawlUrl.query.get(id)
    rw=fetch_page(cu.url)
    h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
    h2=do_compile(cu.tpe, h)
    return jsonify(urls=cu, hs=h2, rw=rw)


@compiler_pages.route("/debug",methods=['GET','PUT'])
def debug():
    status="did nothing"
    js=read_json(request)
    clogger.info(request.get_json())
    if js["cmd"] == "runfetch":
        mworker.run_fetch()
        status="fetched something"
    if js["cmd"] == "que":
        cu = CrawlUrl.query.get(js["id"])
        mworker.queue_url(cu.tpe, cu.url)
        status= mworker.run_fetch()
    if js["cmd"] == "comp":
        status=mworker.run_compile()
    if js["cmd"]=="process":
        status=mworker.run_process()
    return jsonify(status=status)


@compiler_pages.route("/debugurl")
def debugurl():
    s=CrawlUrlSchema()
    status=CrawlUrl.query.all()
    return jsonify(status=status)


@compiler_pages.route("/urls",methods=['POST'])
def add_urls():
    # read the posted data
    js =read_json(request)
    # clogger.info(js)
    # find or create the URL in the database
    url=CrawlUrlSchema().load(js["url"])
    clogger.info(url)
    url=CrawlUrl.find_or_create(url.data["tpe"], url.data["url"])
    db_session2.add(url)
    db_session2.commit()
    return jsonify(url=url, kk=js)


@compiler_pages.route("/urls/<int:id>",methods=['DELETE'])
@compiler_pages.route("/urls/<int:id>.json",methods=['DELETE'])
def delete(id):
    cu=CrawlUrl.query.get(id)
    if cu is not None:
        db_session2.delete(cu)
        db_session2.commit()
    return jsonify(url={})