init commit
compiler/README (Normal file, 10 lines)
@@ -0,0 +1,10 @@
This is the API for the compiler.
The following commands are implemented:
GET doc: This documentation!
GET initdb: Initialize the database. WARNING: existing data is deleted.
POST urls:
    Expects data in the format {"url": {"type": typ, "url": "someurl.html"}}
    Adds this URL to the set of monitored URLs.

IN PROGRESS:
GET urls: All URLs that are to be monitored.
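As a quick illustration of the API documented above, the sketch below drives the endpoints with the requests library. It is not part of the commit: the base URL, port, and blueprint prefix are placeholders for wherever compiler_pages is mounted, and the example URL and the "fetindex" source type are simply illustrative values taken from compiler.py. Note that CrawlUrlSchema in models.py names the type field "tpe", so the payload below uses that key even though the README text says "type".

# Hypothetical usage sketch; base URL, prefix, and the example URL are assumptions.
import requests

BASE = "http://localhost:5000/compiler"   # wherever compiler_pages is mounted

# GET doc: fetch this documentation
print(requests.get(BASE + "/doc").text)

# POST urls: register a URL for monitoring ("fetindex" is one source type from compiler.py)
payload = {"url": {"tpe": "fetindex", "url": "http://example.org/index.html"}}
print(requests.post(BASE + "/urls", json=payload).json())

# GET urls: list all monitored URLs (marked IN PROGRESS above)
print(requests.get(BASE + "/urls").json())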
compiler/README.html (Normal file, 1 line)
@@ -0,0 +1 @@
sdf
compiler/__init__.py (Normal file, 15 lines)
@@ -0,0 +1,15 @@
#from mprocess import do_process, process_urllist
#from compiler import do_compile
#from mworker import run_fetch, run_process, run_compile

# include models for final objects
from src.models import Article

# starting workers
from mworker import start_workers

from models import add_url, CrawlUrl
#start_workers(1,1,1)

from fetching import announce_articleid
compiler/comp/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/__init__py (Normal file, 1 line)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/rss.py (Normal file, 8 lines)
@@ -0,0 +1,8 @@
import feedparser


def rssfeed(url, raw):
    # parse an RSS feed and return every entry link as an article link
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}
compiler/compile.py (Normal file, 153 lines)
@@ -0,0 +1,153 @@
from bs4 import BeautifulSoup
import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re


def hello():
    return "hello"


def fetarticle(o):
    # extract a single FET article from an already fetched object
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=o.url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())
    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]

    hh=sp.find_all("div", {"class":"media"})
    for h in hh:
        if h is not None:
            h=h.find("div", {"class": "pull-left"})
        if h is not None:
            h=h.find("a")
        if h is not None:
            d["image2"]=crawler.objects.models.download_file(h.attrs["href"])
    return {"article": d}


def fsarcharticle(o):
    # extract a single fsarch article from an already fetched object
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=o.url
    d["published"]=None
    h=sp.find("article")
    h=h.find("div", {"class": "content"})
    d["text"]=h.encode_contents().strip()
    h=sp.find("article").find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    else:
        d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}


def fetindex(o):
    # if type(o) is not Object:
    #     raise TypeError
    if o.raw is None:
        raise ValueError("fetindex: object has no raw content")
    print "compile_fetindex"
    html=BeautifulSoup(o.raw_fixed)
    h = html.find("li", {"class": "next_page" })
    if h is not None:
        nl=h.find("a")
        nl=crawler.objects.models.fix_link(o.url,nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    links=h.find_all("a")
    al = []
    for t in links:
        al.append(t.attrs["href"])
    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index" }


def fsarchindex(o):
    if o.raw is None:
        raise ValueError("fsarchindex: object has no raw content")
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        url=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", url):
            fl.append(t.attrs["href"])

    return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(o):
    if o.raw is None:
        raise ValueError("fsbizindex: object has no raw content")
    print "compile_fsbizindex"
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": o.url, "article_links": al, "objecttype": "index"}


def fsmbindex(o):
    if o.raw is None:
        raise ValueError("fsmbindex: object has no raw content")
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("a",{"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    articles=[]
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div",{"class": "block"})
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"},recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
            aa["section"]="FSMB"
            articles.append(aa)
    return {"url": o.url, "next_page": np, "articles": articles, "objecttype": "articles"}
compiler/compiler.py (Normal file, 258 lines)
@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
import feedparser
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json


def do_compile(tpe, cont):
    if type(cont) != dict:
        clogger.error("Type Error for do compile for :"+str(cont["url"]))
    # start compiling a generic object
    if "url" not in cont:
        clogger.error("no url, can't compile "+tpe)
    else:
        clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
        if tpe in compiler:
            cont=compiler[tpe](cont["url"], cont["raw"])
    return cont


from comp import rssfeed


def dummyarticle(url, raw):
    return {"url": url, "article": {"url": url, "section": "dummysection", "sourcetype": "dummy", "title": "dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}


def htufeed(url, raw):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}


def htuarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    h=sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"]=parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h=h.find("a")
        if h is not None:
            d["author"]=h.text.strip()
    h=sp.find("div", {"class": "foswikiTopic"})
    h1=h.find("h4")
    if h1 is not None:
        d["title"]= h1.text.strip()
        h1.extract() # remove the heading from the body
    else:
        h1=sp.find("meta", {"name": "WEBTOPIC"})
        d["title"]= h1.attrs["content"]
    d["text"]=(h.encode_contents()).strip()
    d["section"]="HTU"
    d["url"]=url
    # clogger.debug(d)
    return {"article": d}


def fetarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())

    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    # hh=sp.find_all("div", {"class":"media"})
    # for h in hh:
    #     if h is not None:
    #         h=h.find("div", {"class": "pull-left"})
    #     if h is not None:
    #         h=h.find("a")
    #     if h is not None:
    #         d["image2"]=downloadfile(fix_link(url,h.attrs["href"]))
    return {"article": d}


def fsarcharticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=url
    d["published"]=None
    h=sp.find("article")
    if h is not None:
        h=h.find("div", {"class": "content"})
        d["text"]=h.encode_contents().strip()
    h=sp.find("article")
    if h is not None:
        h=h.find("h1", {"class": "title"})
        if h is not None:
            d["title"]=h.text.strip()
        else:
            d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}


def fetindex(url, raw):
    if raw is None:
        raise ValueError("fetindex: no raw content for "+str(url))
    # clogger.debug("compile_fetindex: "+str(url))
    html=BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page" })
    if h is not None:
        nl=h.find("a")
        nl=fix_link(url,nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links=h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }


def fsarchindex(url, raw):
    if raw is None:
        raise ValueError("fsarchindex: no raw content for "+str(url))
    html=BeautifulSoup(raw)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        u=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", u):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", u):
            fl.append(t.attrs["href"])

    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(url, raw):
    if raw is None:
        raise ValueError("fsbizindex: no raw content for "+str(url))
    print "compile_fsbizindex"
    html=BeautifulSoup(raw)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url, "article_links": al, "objecttype": "index"}


def fbfeed(url, raw):
    # compile a Facebook feed (fetched via the Graph API as JSON) into articles
    js = json.loads(raw)
    arts=[]
    u=urlparse.urlparse(url)
    for m in js["data"]:
        aa={}
        aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"",""))
        aa["published"] =parse(m["created_time"])
        if m.has_key("message"):
            aa["text"] = m["message"]
        else:
            try:
                h=graph.get_object(id=m["id"].split("_")[1])
                if h.has_key("description"):
                    aa["text"]=h["description"]
                else:
                    aa["text"]=json.dumps(h)
            except GraphAPIError:
                aa["text"]=""
        if m.has_key("story"):
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"]="Facebook: "+u[1]
        arts.append(aa)
    return {"url": url, "next_page": js["paging"]["next"], "articles": arts}


def fsmbindex(url, raw):
    if raw is None:
        raise ValueError("fsmbindex: no raw content for "+str(url))
    html=BeautifulSoup(raw)
    h= html.find("a",{"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    articles=[]
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div",{"class": "block"})
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"},recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
            aa["section"]="FSMB"
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles, "objecttype": "articles"}


# static dispatch table: source type -> compiler function
compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle, "htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

# the static table above is replaced by the mapping from the configuration;
# eval() resolves the configured function names to the functions defined in this module
compiler = cfg.compiler
for i in compiler:
    compiler[i]=eval(compiler[i])


# which article compiler to use for the links found by each index compiler
article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
compiler/fetching.py (Normal file, 67 lines)
@@ -0,0 +1,67 @@
from requests import session
s=session()
from src import package_directory, download_path, cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse


def announce_articleid(id):
    # notify the configured endpoints that an article was added or updated
    for u in cfg.announcearticle_url:
        s.get( u % id)


def downloadfile(url):
    # download a file into the download directory (in the background) and
    # return its path relative to download_path
    relative_name=path.join("downloads",str(md5(url).hexdigest()),url.split('/')[-1])
    local_filename = path.join(download_path,relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc: # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name


from models import CrawlCache
from datetime import datetime, timedelta


def fetch_page(furl):
    # fetch a page, reusing the CrawlCache entry if it is younger than cfg.cache_days
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u=urlparse.urlparse(furl)
    if u[0] == '':
        furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>cache_cutoff).first()
    if cc is None:
        clogger.debug("fetching url: "+ str(furl))
        if u[0]=='fb':
            # fb:// URLs are fetched through the Facebook Graph API
            tx = json.dumps(graph.get_object(id=u[1]+u[2]))
        else:
            tx=s.get(furl).text
        CrawlCache.store(furl,tx)
    else:
        #if furl is not None:
        #    clogger.debug("cache hit")
        tx=cc.raw
    return tx


def fetch_load_file(furl, local_path):
    try:
        clogger.info("Downloading "+ str(furl))
        r = s.get(furl, stream=True)
        f = open(local_path, 'wb')
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive chunks
                f.write(chunk)
        f.close()
    except Exception, e:
        #clogger.error("Error occurred during fetching: "+str(furl))
        clogger.error(e,exc_info=True)
compiler/fixing.py (Normal file, 37 lines)
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach


def fix_link(url, link):
    # make a relative link absolute against the page URL
    r= urlparse(link)
    if r.scheme is None or r.scheme == '':
        return urljoin(url,link)
    else:
        return link


def fix_file(url, link):
    # resolve the link and download the file locally
    u=fix_link(url,link)
    return downloadfile(u)


def load_file(url, link):
    return fix_file(url,link)


def fix_html(html, baseurl):
    # sanitize article HTML, localize images, absolutize links, and drop scripts
    html=bleach.clean(html, tags=['b','p','span','a','img','div','br','strong','ul','li'], strip=True)
    sp=BeautifulSoup(html)
    images=sp.find_all("img")
    for t in images:
        if "src" in t.attrs and t.attrs["src"] is not None:
            t.attrs["src"]=fix_file(baseurl,t.attrs["src"])
    links=sp.find_all("a")
    for t in links:
        if "href" in t.attrs:
            t.attrs["href"]=fix_link(baseurl, t.attrs["href"])
    for t in sp.find_all("script"):
        t.extract()
    b=sp.find("base")
    if b is not None:
        b.attrs["href"]=""
    return sp
compiler/models.py (Normal file, 75 lines)
@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema, fields, ValidationError
import json
import flask


def add_url(tpe, url):
    # find or create the CrawlUrl and schedule it for fetching
    cu=CrawlUrl.find_or_create(tpe,url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()


class CrawlUrlSchema(Schema):
    id=fields.Integer()
    tpe=fields.String()
    url=fields.String()
    last_fetched=fields.DateTime()
    fetched = fields.DateTime()


class CrawlUrl(Base2):
    __tablename__='crawlurls'
    id = Column(Integer, primary_key=True)
    tpe=Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)

    def fetched(self):
        return CrawlCache.query.filter(CrawlCache.url==self.url).first()

    @classmethod
    def find_or_create(cls, tpe, url):
        aa = cls.query.filter(cls.url==url).filter(cls.tpe==tpe).first()
        if aa is None:
            aa=cls(tpe,url)
        return aa

    def schedule(self):
        put_fetch_queue((0, self.tpe, self.url))

    def __init__(self, tpe, url):
        self.url=url
        self.tpe=tpe

    def __json__(self):
        return CrawlUrlSchema().dump(self)[0]


class CrawlCacheSchema(Schema):
    id=fields.Integer()
    raw=fields.String()
    url=fields.String()
    fetched=fields.DateTime()


class CrawlCache(Base2):
    __tablename__='crawlcache'
    id = Column(Integer, primary_key=True)
    url=Column(String(250))
    fetched=Column(DateTime)
    raw=Column(Text)

    def __init__(self, url, rw):
        self.url=url
        self.raw=rw
        self.fetched=datetime.utcnow()

    def __json__(self):
        return CrawlCacheSchema().dump(self)[0]

    @classmethod
    def store(cls, url, rw):
        cc=cls(url,rw)
        db_session2.add(cc)
        db_session2.commit()


#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)
compiler/mprocess.py (Normal file, 74 lines)
@@ -0,0 +1,74 @@
from src import clogger # logger for the crawler
from src.models import Article # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file

from compiler import article_types
from fixing import fix_link

# process_article expects a hash with the raw data for one article and turns it
# into an Article object stored in the database; it is intended to prevent duplicates.


def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h


def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa=None
    else:
        art["text"]=fix_html(art["text"],art["url"])
        if "image" in art:
            art["image"]=fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched=datetime.now()
        aa.sourcetype=art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/added article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
        # announce_articleid(aa.id)
    return aa


# process a single found URL: resolve it against its parent and queue it for fetching
def process_url(url, tpe, parent_url):
    #clogger.debug("process URL of type "+ tpe + ": " + url)
    if parent_url is not None:
        url=fix_link(parent_url, url)
    put_fetch_queue((0,tpe,url))


# process a list of URLs
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u,tpe, parent_url)


def do_process(tpe, cont):
    # take a compiled hash, queue the links it contains and store its articles
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])

    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"],tpe, cont["url"])

    if "article" in cont:
        art=cont["article"]
        art["sourcetype"]=tpe
        process_article(art)

    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"]=tpe
                if "url" not in a:
                    a["url"]=cont["url"]
                process_article(a)
    return
compiler/mqueues.py (Normal file, 8 lines)
@@ -0,0 +1,8 @@
from gevent.queue import Queue, JoinableQueue

fetch_queue = Queue()
compile_queue = Queue()
process_queue = Queue()


def put_fetch_queue(o):
    fetch_queue.put(o)
compiler/mworker.py (Normal file, 58 lines)
@@ -0,0 +1,58 @@
from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from src import clogger


def start_workers(f, c, p):
    # spawn f fetch workers, c compile workers and p process workers
    for _ in range(f):
        clogger.debug("spawn fetchworker")
        spawn(work_fetch)
    for _ in range(c):
        spawn(work_compile)
    for _ in range(p):
        spawn(work_process)


def work_fetch():
    while True:
        run_fetch()


def work_process():
    while True:
        run_process()


def work_compile():
    while True:
        run_compile()


def queue_url(tpe, url):
    fetch_queue.put((0,tpe,url))


# fetch one page from the fetch queue and hand it to the compile queue
def run_fetch():
    tc, tpe, url = fetch_queue.get()
    if tpe != "dummyarticle" and tpe != "dummyindex":
        rw=fetch_page(url)
    else:
        rw="<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
    return rw
    # fetch_queue.task_done()


# compile one item from the compile queue and hand it to the process queue
def run_compile():
    tc,tpe,h = compile_queue.get()
    h=do_compile(tpe,h)
    process_queue.put((0,tpe, h))
    return h
    # compile_queue.task_done()


def run_process():
    tc,tpe,h = process_queue.get()
    do_process(tpe, h)
    return h
    # process_queue.task_done()
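The three run_* functions above form the fetch, compile, and process stages of the pipeline, connected by the queues defined in mqueues.py. As a rough illustration (not part of this commit), the sketch below pushes one URL through all three stages synchronously; the source type and URL are made-up placeholders.

# Hypothetical driver, not part of this commit: run a single URL through the
# fetch -> compile -> process stages once, without spawning gevent workers.
import mworker

def crawl_once(tpe, url):
    mworker.queue_url(tpe, url)   # put the URL on the fetch queue
    mworker.run_fetch()           # fetch_queue -> compile_queue
    mworker.run_compile()         # compile_queue -> process_queue
    mworker.run_process()         # process_queue -> database (via do_process)

# example (placeholder type and URL):
# crawl_once("fetindex", "http://example.org/news")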
compiler/views.py (Normal file, 146 lines)
@@ -0,0 +1,146 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request

compiler_pages = Blueprint('compiler', __name__,
                           template_folder='.')

from src.database import db_session2, init_db, read_json, init_db2
from .models import CrawlUrl
from .models import CrawlCache, CrawlCacheSchema
from .models import CrawlUrlSchema
from src import clogger
from src.articles import Article
#import mworker
import flask
import json
import mworker

from compiler import do_compile
from fetching import fetch_page

#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)


@compiler_pages.route("/")
@compiler_pages.route("")
@compiler_pages.route(".json")
def index():
    status="For documentation go to /doc"
    return jsonify(status=status)


@compiler_pages.route("/doc")
@compiler_pages.route("/doc.json")
def doc():
    return render_template("README")
    # return jsonify(status=render_template("README"))


@compiler_pages.route("/initdb")
@compiler_pages.route("/initdb.json")
def initdb_json():
    init_db() # initialize the database
    status="Database reinitialized"
    return jsonify(status=status)


@compiler_pages.route("/initdb2")
@compiler_pages.route("/initdb2.json")
def initdb_json2():
    init_db2() # initialize the second database
    status="Database reinitialized"
    return jsonify(status=status)


@compiler_pages.route("/start")
@compiler_pages.route("/start.json")
def start_json():
    mworker.start_workers(1,1,1) # start one fetch, one compile and one process worker
    status="Workers started"
    return jsonify(status=status)


@compiler_pages.route("/urls")
@compiler_pages.route("/urls.json")
def urls_index_json():
    # load all URLs
    status=CrawlUrl.query.all()
    return jsonify(urls=status)


# show an existing CrawlUrl
@compiler_pages.route("/urls/<int:id>")
@compiler_pages.route("/urls/<int:id>.json")
def urls_json(id):
    # load the requested URL and its cache entry
    status=CrawlUrl.query.get(id)
    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
    return jsonify(urls=status, cache=cc.__json__())


# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
    # load the requested URL, queue it and start the workers
    cu=CrawlUrl.query.get(id)
    mworker.queue_url(cu.tpe, cu.url)
    cc=CrawlCache.query.filter(CrawlCache.url==cu.url).first()
    mworker.start_workers(1,1,1)
    status="Workers started"
    return jsonify(urls=cu, cache=cc)


# fetch and compile an existing CrawlUrl once, without storing the result
@compiler_pages.route("/urls/<int:id>/test")
@compiler_pages.route("/urls/<int:id>/test.json")
def urls_test_json(id):
    cu=CrawlUrl.query.get(id)
    rw=fetch_page(cu.url)
    h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
    h2=do_compile(cu.tpe, h)
    return jsonify(urls=cu, hs=h2, rw=rw)


@compiler_pages.route("/debug",methods=['GET','PUT'])
def debug():
    status="did nothing"
    js=read_json(request)
    clogger.info(request.get_json())
    if js["cmd"] == "runfetch":
        mworker.run_fetch()
        status="fetched something"
    if js["cmd"] == "que":
        cu = CrawlUrl.query.get(js["id"])
        mworker.queue_url(cu.tpe, cu.url)
        status= mworker.run_fetch()
    if js["cmd"] == "comp":
        status=mworker.run_compile()
    if js["cmd"]=="process":
        status=mworker.run_process()
    return jsonify(status=status)


@compiler_pages.route("/debugurl")
def debugurl():
    s=CrawlUrlSchema()
    status=CrawlUrl.query.all()
    return jsonify(status=status)


@compiler_pages.route("/urls",methods=['POST'])
def add_urls():
    # read the posted data
    js =read_json(request)
    # clogger.info(js)
    # find or create the URL in the database
    url=CrawlUrlSchema().load(js["url"])
    clogger.info(url)
    url=CrawlUrl.find_or_create(url.data["tpe"], url.data["url"])
    db_session2.add(url)
    db_session2.commit()
    return jsonify(url=url, kk=js)


@compiler_pages.route("/urls/<int:id>",methods=['DELETE'])
@compiler_pages.route("/urls/<int:id>.json",methods=['DELETE'])
def delete(id):
    cu=CrawlUrl.query.get(id)
    if cu is not None:
        db_session2.delete(cu)
        db_session2.commit()
    return jsonify(url={})