init commit

Andreas Stephanides
2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions

10
compiler/README Normal file

@@ -0,0 +1,10 @@
This is the API for the compiler.
The following commands are implemented:
GET doc: This documentation!
GET initdb: Initialize the database. WARNING: existing data will be deleted.
POST urls:
Expects data in the format {"url": {"type": type, "url": "someurl.html"}}
Adds this URL to the list of watched URLs.
IN PROGRESS:
GET urls: All URLs that are being watched
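For illustration, a POST urls request could look like the sketch below. It assumes the compiler blueprint is mounted at http://localhost:5000/compiler (the mount point is not part of this commit) and uses the field name "tpe", since that is what CrawlUrlSchema in compiler/models.py and the /urls POST handler in compiler/views.py actually expect.

import json
import requests

BASE = "http://localhost:5000/compiler"   # assumed mount point of the compiler_pages blueprint

payload = {"url": {"tpe": "htufeed", "url": "http://example.org/feed.rss"}}
r = requests.post(BASE + "/urls", data=json.dumps(payload),
                  headers={"Content-Type": "application/json"})
print(r.json())   # echoes the stored CrawlUrl plus the submitted data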

1
compiler/README.html Normal file

@@ -0,0 +1 @@
sdf

15
compiler/__init__.py Normal file

@@ -0,0 +1,15 @@
#from mprocess import do_process, process_urllist
#from compiler import do_compile
#from mworker import run_fetch, run_process, run_compile
# include models for final objects
from src.models import Article
# starting workers
from mworker import start_workers
from models import add_url, CrawlUrl
#start_workers(1,1,1)
from fetching import announce_articleid

1
compiler/comp/__init__.py Normal file

@@ -0,0 +1 @@
from rss import rssfeed

8
compiler/comp/rss.py Normal file

@@ -0,0 +1,8 @@
import feedparser

def rssfeed(url, raw):
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}

153
compiler/compile.py Normal file

@@ -0,0 +1,153 @@
from bs4 import BeautifulSoup
import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re

def hello():
    return "hello"

def fetarticle(o):
    sp = BeautifulSoup(o.raw_fixed)
    d = {}
    h = sp.find("h1", {"itemprop": "name"})
    d["title"] = unicode(h.text).strip()
    h = sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"] = (h.encode_contents()).strip()
    else:
        d["text"] = ""
    d["url"] = o.url
    h = sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"] = h.text.strip()
    h = sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"] = "FET - " + h.text.strip()
    h = sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"] = parse(h.encode_contents().strip())
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
    hh = sp.find_all("div", {"class": "media"})
    for h in hh:
        if h is not None:
            h = h.find("div", {"class": "pull-left"})
        if h is not None:
            h = h.find("a")
        if h is not None:
            d["image2"] = crawler.objects.models.download_file(h.attrs["href"])
    return {"article": d}

def fsarcharticle(o):
    sp = BeautifulSoup(o.raw_fixed)
    d = {}
    h = sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["url"] = o.url
    d["published"] = None
    h = sp.find("article")
    h = h.find("div", {"class": "content"})
    d["text"] = h.encode_contents().strip()
    h = sp.find("article").find("h1", {"class": "title"})
    if h is not None:
        d["title"] = h.text.strip()
    else:
        d["title"] = ""
    d["image"] = ""
    d["sourcetype"] = "fsarcharticle"
    d["section"] = "fsarch"
    d["author"] = None
    return {"article": d}

def fetindex(o):
    # if type(o) is not Object:
    #     raise TypeError
    if o.raw is None:
        raise ValueError("fetindex: object has no raw content")  # was `raise Error` (undefined name)
    print "compile_fetindex"
    html = BeautifulSoup(o.raw_fixed)
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl = h.find("a")
        nl = crawler.objects.models.fix_link(o.url, nl.attrs["href"])
    else:
        nl = None
    h = html.find("ul", {"id": "neuigkeiten"})
    links = h.find_all("a")
    al = []
    for t in links:
        al.append(t.attrs["href"])
    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index"}

def fsarchindex(o):
    if o.raw is None:
        raise ValueError("fsarchindex: object has no raw content")
    html = BeautifulSoup(o.raw_fixed)
    h = html.find("article")
    print unicode(h)
    links = h.find_all("a")
    al = []
    fl = []
    for t in links:
        url = t.attrs["href"]
        if re.search(r"fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search(r"facebook\.com/events", url):
            fl.append(t.attrs["href"])
    return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}

def fsbizindex(o):
    if o.raw is None:
        raise ValueError("fsbizindex: object has no raw content")
    print "compile_fsbizindex"
    html = BeautifulSoup(o.raw_fixed)
    h = html.find("section", {"id": "primary"})
    links = h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": o.url, "article_links": al, "objecttype": "index"}

def fsmbindex(o):
    if o.raw is None:
        raise ValueError("fsmbindex: object has no raw content")
    html = BeautifulSoup(o.raw_fixed)
    h = html.find("a", {"class": "next"})
    if h is not None:
        np = h.attrs["href"]
    else:
        np = None
    h = html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    articles = []  # initialized before the check so the return below always has a list
    if h is not None:
        ats = h.find_all("div", {"class": "block"})
        for a in ats:
            aa = {}
            h = a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h = a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"] = []
            hh = a.find_all("p", {"class": "info"}, recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1 = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(h.text))
                    aa["published"] = parse(h1.strip())
                    aa["author"] = re.sub(r'^.*von(.*)$', r'\1', unicode(h.text)).strip()
            aa["section"] = "FSMB"
            articles.append(aa)
    return {"url": o.url, "next_page": np, "articles": articles, "objecttype": "articles"}

258
compiler/compiler.py Normal file

@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
import feedparser  # needed by htufeed below; was missing
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json

def do_compile(tpe, cont):
    if type(cont) != dict:
        clogger.error("Type Error in do_compile for: " + str(cont))  # cont may not be a dict here
    # start compiling a generic object
    if "url" not in cont:
        clogger.error("no url, can't compile " + tpe)
    else:
        clogger.debug("compile: type:" + str(tpe) + "| " + str(cont["url"]))
        if tpe in compiler:
            cont = compiler[tpe](cont["url"], cont["raw"])
    return cont

from comp import rssfeed

def dummyarticle(url, raw):
    return {"url": url, "article": {"url": url, "section": "dummysection", "sourcetype": "dummy", "title": "dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}

def htufeed(url, raw):
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}

def htuarticle(url, raw):
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
        d["image2"] = d["image"]
    h = sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1 = re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4', unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"] = parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h = h.find("a")
        if h is not None:
            d["author"] = h.text.strip()
    h = sp.find("div", {"class": "foswikiTopic"})
    h1 = h.find("h4")
    if h1 is not None:
        d["title"] = h1.text.strip()
        h1.extract()  # remove head
    else:
        h1 = sp.find("meta", {"name": "WEBTOPIC"})
        d["title"] = h1.attrs["content"]
    d["text"] = (h.encode_contents()).strip()
    d["section"] = "HTU"
    d["url"] = url
    # clogger.debug(d)
    return {"article": d}

def fetarticle(url, raw):
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("h1", {"itemprop": "name"})
    d["title"] = unicode(h.text).strip()
    h = sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"] = (h.encode_contents()).strip()
    else:
        d["text"] = ""
    d["url"] = url
    h = sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"] = h.text.strip()
    h = sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"] = "FET - " + h.text.strip()
    h = sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"] = parse(h.encode_contents().strip())
    h = sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"] = h.attrs["content"]
        d["image2"] = d["image"]
    # hh = sp.find_all("div", {"class": "media"})
    # for h in hh:
    #     if h is not None:
    #         h = h.find("div", {"class": "pull-left"})
    #     if h is not None:
    #         h = h.find("a")
    #     if h is not None:
    #         d["image2"] = downloadfile(fix_link(url, h.attrs["href"]))
    return {"article": d}

def fsarcharticle(url, raw):
    sp = BeautifulSoup(raw)
    d = {}
    h = sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["url"] = url
    d["published"] = None
    h = sp.find("article")
    if h is not None:
        h = h.find("div", {"class": "content"})
        d["text"] = h.encode_contents().strip()
    h = sp.find("article")
    if h is not None:
        h = h.find("h1", {"class": "title"})
    if h is not None:
        d["title"] = h.text.strip()
    else:
        d["title"] = ""
    d["image"] = ""
    d["sourcetype"] = "fsarcharticle"
    d["section"] = "fsarch"
    d["author"] = None
    return {"article": d}

def fetindex(url, raw):
    if raw is None:
        raise ValueError("fetindex: no raw content")  # was `raise Error` (undefined name)
    # clogger.debug("compile_fetindex: " + str(url))
    html = BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl = h.find("a")
        nl = fix_link(url, nl.attrs["href"])
    else:
        nl = None
    h = html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links = h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index"}

def fsarchindex(url, raw):
    if raw is None:
        raise ValueError("fsarchindex: no raw content")
    html = BeautifulSoup(raw)
    h = html.find("article")
    print unicode(h)
    links = h.find_all("a")
    al = []
    fl = []
    for t in links:
        url = t.attrs["href"]
        if re.search(r"fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search(r"facebook\.com/events", url):
            fl.append(t.attrs["href"])
    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}

def fsbizindex(url, raw):
    if raw is None:
        raise ValueError("fsbizindex: no raw content")
    print "compile_fsbizindex"
    html = BeautifulSoup(raw)
    h = html.find("section", {"id": "primary"})
    links = h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url, "article_links": al, "objecttype": "index"}

def fbfeed(url, raw):
    js = json.loads(raw)
    arts = []
    u = urlparse.urlparse(url)
    for m in js["data"]:
        aa = {}
        aa["url"] = urlparse.urlunsplit(("http", "www.facebook.at", m["id"], "", ""))
        aa["published"] = parse(m["created_time"])
        if "message" in m:
            aa["text"] = m["message"]
        else:
            try:
                h = graph.get_object(id=m["id"].split("_")[1])
                if "description" in h:
                    aa["text"] = h["description"]
                else:
                    aa["text"] = json.dumps(h)  # was json.dumps() without an argument
            except GraphAPIError:
                aa["text"] = ""
        if "story" in m:
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1] + " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"] = "Facebook: " + u[1]
        arts.append(aa)
    return {"url": url, "next_page": js["paging"]["next"], "articles": arts}

def fsmbindex(url, raw):
    if raw is None:
        raise ValueError("fsmbindex: no raw content")
    html = BeautifulSoup(raw)
    h = html.find("a", {"class": "next"})
    if h is not None:
        np = h.attrs["href"]
    else:
        np = None
    h = html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    articles = []  # initialized before the check so the return below always has a list
    if h is not None:
        ats = h.find_all("div", {"class": "block"})
        for a in ats:
            aa = {}
            h = a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h = a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"] = []
            hh = a.find_all("p", {"class": "info"}, recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1 = re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(h.text))
                    aa["published"] = parse(h1.strip())
                    aa["author"] = re.sub(r'^.*von(.*)$', r'\1', unicode(h.text)).strip()
            aa["section"] = "FSMB"
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles, "objecttype": "articles"}

compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle, "htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}
# the hard-coded mapping above is immediately replaced by the function names configured in cfg.compiler
compiler = cfg.compiler
for i in compiler:
    compiler[i] = eval(compiler[i])
article_types = {"fetindex": "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
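do_compile simply looks up the function registered for a source type in the compiler dict and calls it with (url, raw); article_types tells mprocess which type the discovered article links should be queued under. A sketch of wiring up one more source, where examplefeed and the cfg entries shown are hypothetical names, not part of this commit:

def examplefeed(url, raw):
    # an index compiler returns the article links it finds in the raw payload
    links = ["http://example.org/post1.html", "http://example.org/post2.html"]
    return {"url": url, "next_page": None, "article_links": links, "objecttype": "index"}

# registration happens by name, because cfg.compiler holds strings that are eval'd above:
# cfg.compiler["examplefeed"] = "examplefeed"
# and the type its links should be compiled as:
# article_types["examplefeed"] = "dummyarticle"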

67
compiler/fetching.py Normal file

@@ -0,0 +1,67 @@
from requests import session
s = session()
from src import package_directory, download_path, cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse

def announce_articleid(id):
    for u in cfg.announcearticle_url:
        s.get(u % id)

def downloadfile(url):
    relative_name = path.join("downloads", str(md5(url).hexdigest()), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name

from models import CrawlCache
from datetime import datetime, timedelta

def fetch_page(furl):
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)  # entries older than cfg.cache_days are refetched
    u = urlparse.urlparse(furl)
    if u[0] == '':
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
    if cc is None:
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            tx = json.dumps(graph.get_object(id=u[1] + u[2]))
        else:
            tx = s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        #if furl is not None:
        #    clogger.debug("cache hit")
        tx = cc.raw
    return tx

def fetch_load_file(furl, path):
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        f = open(path, 'wb')
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
        f.close()
    except Exception, e:
        #clogger.error("Error occurred during fetching:" + str(furl))
        clogger.error(e, exc_info=True)

37
compiler/fixing.py Normal file

@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach

def fix_link(url, link):
    r = urlparse(link)
    if r.scheme is None or r.scheme == '':
        return urljoin(url, link)
    else:
        return link

def fix_file(url, link):
    u = fix_link(url, link)
    return downloadfile(u)

def load_file(url, link):
    return fix_file(url, link)

def fix_html(html, baseurl):
    html = bleach.clean(html, tags=['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong', 'ul', 'li'], strip=True)
    sp = BeautifulSoup(html)
    images = sp.find_all("img")
    for t in images:
        if "src" in t.attrs and t.attrs["src"] is not None:
            t.attrs["src"] = fix_file(baseurl, t.attrs["src"])
    links = sp.find_all("a")
    for t in links:
        if "href" in t.attrs:
            t.attrs["href"] = fix_link(baseurl, t.attrs["href"])
    for t in sp.find_all("script"):
        t.extract()
    b = sp.find("base")
    if b is not None:
        b.attrs["href"] = ""
    return sp
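A quick sketch of the intended fix_link behaviour (the URLs are made-up examples): relative links are resolved against the page URL via urljoin, absolute links pass through unchanged.

fix_link("http://example.org/news/index.html", "article1.html")
# -> "http://example.org/news/article1.html"
fix_link("http://example.org/news/index.html", "https://other.org/a.html")
# -> "https://other.org/a.html"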

75
compiler/models.py Normal file

@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema, fields, ValidationError
import json
import flask

def add_url(tpe, url):
    cu = CrawlUrl.find_or_create(tpe, url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()

class CrawlUrlSchema(Schema):
    id = fields.Integer()
    tpe = fields.String()
    url = fields.String()
    last_fetched = fields.DateTime()
    fetched = fields.DateTime()

class CrawlUrl(Base2):
    __tablename__ = 'crawlurls'
    id = Column(Integer, primary_key=True)
    tpe = Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)

    def fetched(self):
        return CrawlCache.query.filter(CrawlCache.url == self.url).first()  # was .find(...), which Query does not provide, and the result was not returned

    @classmethod
    def find_or_create(cls, tpe, url):
        aa = CrawlUrl.query.filter(CrawlUrl.url == url).filter(CrawlUrl.tpe == tpe).first()
        if aa is None:
            aa = CrawlUrl(tpe, url)
        return aa

    def schedule(self):
        put_fetch_queue((0, self.tpe, self.url))

    def __init__(self, tpe, url):
        self.url = url
        self.tpe = tpe

    def __json__(self):
        return CrawlUrlSchema().dump(self)[0]

class CrawlCacheSchema(Schema):
    id = fields.Integer()
    raw = fields.String()
    url = fields.String()
    fetched = fields.DateTime()

class CrawlCache(Base2):
    __tablename__ = 'crawlcache'
    id = Column(Integer, primary_key=True)
    url = Column(String(250))
    fetched = Column(DateTime)
    raw = Column(Text)

    def __init__(self, url, rw):
        self.url = url
        self.raw = rw
        self.fetched = datetime.utcnow()

    def __json__(self):
        return CrawlCacheSchema().dump(self)

    @classmethod
    def store(cls, url, rw):
        cc = CrawlCache(url, rw)
        db_session2.add(cc)
        db_session2.commit()

#flask.json.JSONEncoder.default = lambda self, obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)

74
compiler/mprocess.py Normal file

@@ -0,0 +1,74 @@
from src import clogger  # logger for the crawler
from src.models import Article  # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file
from compiler import article_types
from fixing import fix_link

# process_article expects a hash with the raw data for an article and turns it into an
# Article object stored in the database; this is intended to prevent duplicates
# (a minimal example of such a hash is sketched right after this file).
def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h

def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash:" + str(art))
        aa = None
    else:
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/Added Article " + str(aa.id) + ": " + (aa.title.encode("utf-8")))
        # announce_articleid(aa.id)
    return aa

# process a single found url
def process_url(url, tpe, parent_url):
    #clogger.debug("process URL of type " + tpe + ": " + url)
    if parent_url is not None:
        url = fix_link(parent_url, url)
    put_fetch_queue((0, tpe, url))

# process a url list
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u, tpe, parent_url)

def do_process(tpe, cont):
    urllist = []
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
    return
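A minimal article hash that passes is_article_hash, with made-up values purely for illustration (the Article model referenced above lives in src.models and is not part of this commit):

example_article = {
    "url": "http://example.org/news/article1.html",   # base for resolving relative links in the text
    "sourcetype": "dummyarticle",                      # which compiler produced the hash
    "section": "dummysection",
    "title": "An example article",
    "text": "<p>Body with a <a href='article2.html'>relative link</a></p>",
    "published": None,
}
# process_article(example_article) cleans the text with fix_html and stores an Article row.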

8
compiler/mqueues.py Normal file

@@ -0,0 +1,8 @@
from gevent.queue import Queue, JoinableQueue

fetch_queue = Queue()
compile_queue = Queue()
process_queue = Queue()

def put_fetch_queue(o):
    fetch_queue.put(o)

58
compiler/mworker.py Normal file

@@ -0,0 +1,58 @@
from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from src import clogger

def start_workers(f, c, p):
    for _ in range(f):
        clogger.debug("spawn fetchworker")
        spawn(work_fetch)
    for _ in range(c):
        spawn(work_compile)
    for _ in range(p):
        spawn(work_process)

def work_fetch():
    while True:
        run_fetch()

def work_process():
    while True:
        run_process()

def work_compile():
    while True:
        run_compile()

def queue_url(tpe, url):
    fetch_queue.put((0, tpe, url))

# fetch a page from the url list
def run_fetch():
    tc, tpe, url = fetch_queue.get()
    if tpe != "dummyarticle" and tpe != "dummyindex":  # was `is not`, which compares identity, not equality
        rw = fetch_page(url)
    else:
        rw = "<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
    return rw
    # fetch_queue.task_done()

# compile something from the compile queue
def run_compile():
    tc, tpe, h = compile_queue.get()
    h = do_compile(tpe, h)
    process_queue.put((0, tpe, h))
    return h
    # compile_queue.task_done()

def run_process():
    tc, tpe, h = process_queue.get()
    do_process(tpe, h)
    return h
    # process_queue.task_done()
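The three queues form a fetch -> compile -> process pipeline. A single item can be pushed through it by hand, which is essentially what the /debug endpoint in views.py does; this sketch assumes the database is initialized and that cfg.compiler registers the dummyarticle type.

queue_url("dummyarticle", "http://example.org/dummy.html")
raw = run_fetch()        # dummy types skip the network and yield "<p> dummytext</p>"
result = run_compile()   # do_compile turns the raw payload into an article hash
run_process()            # do_process stores the resulting Article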

146
compiler/views.py Normal file

@@ -0,0 +1,146 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
compiler_pages = Blueprint('compiler', __name__,
                           template_folder='.')
from src.database import db_session2, init_db, read_json, init_db2
from .models import CrawlUrl
from .models import CrawlCache, CrawlCacheSchema
from .models import CrawlUrlSchema
from src import clogger
from src.articles import Article
#import mworker
import flask
import json
import mworker
from compiler import do_compile
from fetching import fetch_page
#flask.json.JSONEncoder.default = lambda self, obj: ((obj.__json__()) if isinstance(obj, (Article, CrawlUrl)) else None)

@compiler_pages.route("/")
@compiler_pages.route("")
@compiler_pages.route(".json")
def index():
    status = "For documentation go to /doc"
    return jsonify(status=status)

@compiler_pages.route("/doc")
@compiler_pages.route("/doc.json")
def doc():
    return render_template("README")
    # return jsonify(status=render_template("README"))

@compiler_pages.route("/initdb")
@compiler_pages.route("/initdb.json")
def initdb_json():
    init_db()  # initialize the database
    status = "Database reinitialized"
    return jsonify(status=status)

@compiler_pages.route("/initdb2")
@compiler_pages.route("/initdb2.json")
def initdb_json2():
    init_db2()  # initialize the second database
    status = "Database reinitialized"
    return jsonify(status=status)

@compiler_pages.route("/start")
@compiler_pages.route("/start.json")
def start_json():
    mworker.start_workers(1, 1, 1)  # spawn one fetch, compile and process worker each
    status = "Workers started"
    return jsonify(status=status)

@compiler_pages.route("/urls")
@compiler_pages.route("/urls.json")
def urls_index_json():
    # load all URLs
    status = CrawlUrl.query.all()
    return jsonify(urls=status)

# show an existing CrawlUrl
@compiler_pages.route("/urls/<int:id>")
@compiler_pages.route("/urls/<int:id>.json")
def urls_json(id):
    status = CrawlUrl.query.get(id)
    cc = CrawlCache.query.filter(CrawlCache.url == status.url).first()
    return jsonify(urls=status, cache=cc.__json__())

# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
    cu = CrawlUrl.query.get(id)
    mworker.queue_url(cu.tpe, cu.url)
    cc = CrawlCache.query.filter(CrawlCache.url == cu.url)
    mworker.start_workers(1, 1, 1)  # spawn one fetch, compile and process worker each
    status = "Workers started"
    return jsonify(urls=cu, cache=cc)

# fetch and compile an existing CrawlUrl directly, without the queues
@compiler_pages.route("/urls/<int:id>/test")
@compiler_pages.route("/urls/<int:id>/test.json")
def urls_test_json(id):
    cu = CrawlUrl.query.get(id)
    rw = fetch_page(cu.url)
    h = {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
    h2 = do_compile(cu.tpe, h)
    return jsonify(urls=cu, hs=h2, rw=rw)

@compiler_pages.route("/debug", methods=['GET', 'PUT'])
def debug():
    status = "did nothing"
    js = read_json(request)
    clogger.info(request.get_json())
    if js["cmd"] == "runfetch":
        mworker.run_fetch()
        status = "fetched something"
    if js["cmd"] == "que":
        cu = CrawlUrl.query.get(js["id"])
        mworker.queue_url(cu.tpe, cu.url)
        status = mworker.run_fetch()
    if js["cmd"] == "comp":
        status = mworker.run_compile()
    if js["cmd"] == "process":
        status = mworker.run_process()
    return jsonify(status=status)

@compiler_pages.route("/debugurl")
def debugurl():
    s = CrawlUrlSchema()
    status = CrawlUrl.query.all()
    return jsonify(status=status)

@compiler_pages.route("/urls", methods=['POST'])
def add_urls():
    # read the data
    js = read_json(request)
    # clogger.info(js)
    # find or create the URL in the database
    url = CrawlUrlSchema().load(js["url"])
    clogger.info(url)
    url = CrawlUrl.find_or_create(url.data["tpe"], url.data["url"])
    db_session2.add(url)
    db_session2.commit()
    return jsonify(url=url, kk=js)

@compiler_pages.route("/urls/<int:id>", methods=['DELETE'])
@compiler_pages.route("/urls/<int:id>.json", methods=['DELETE'])  # was "/urls<int:id>.json" (missing slash)
def delete(id):
    cu = CrawlUrl.query.get(id)
    if cu is not None:
        db_session2.delete(cu)
        db_session2.commit()
    return jsonify(url={})