init commit

Andreas Stephanides
2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
/__init__.py~
/__init__.pyc
*.pyc
*~
config.cfg

101
__init__.py Normal file
View File

@@ -0,0 +1,101 @@
import os
import sys
package_directory = os.path.dirname(os.path.abspath(__file__))
from config import Config
cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
#--------------- Logging
import logging
download_path="./cdw"
file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.DEBUG)
stream_handler=logging.StreamHandler(sys.stdout)
clt=logging.getLogger('mylogger')
clt.setLevel(logging.DEBUG)
clt.addHandler(file_handler)
clt.addHandler(stream_handler)
clogger=clt
#----------------
lg=clt
from gevent import spawn, monkey
monkey.patch_all()
from .compiler import start_workers
#start_workers(1,1,1)
# Framework
from flask import Flask, jsonify, render_template, redirect, request,send_from_directory
# Cross-Origin Resource Sharing (CORS)
from flask_cors import CORS, cross_origin
#Authentication
from flask_jwt import JWT, jwt_required, current_identity
from src.models import Article,Section
from src.users import authenticate, identity
from datetime import datetime
app = Flask(__name__)
CORS(app)
app.config['LOGGER_NAME']='mylogger'
app.logger.setLevel(logging.DEBUG)
app.logger.info("Server Started")
app.config['SECRET_KEY'] = 'super-secret'
import flask
import json
from database import Base
from models import Article, CrawlUrl, CrawlCache
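# Monkey-patch the flask and stdlib JSON encoders so model instances are serialized via their __json__() method.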
flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,Section, CrawlUrl,CrawlCache)) else None)
json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,CrawlUrl,CrawlCache)) else None)
#bot.dosmth()
#lg.debug(bot.bot)
# Allow cross-origin requests (CORS headers)
@app.after_request
def after_request(response):
response.headers.add('Access-Control-Allow-Origin', '*')
if request.method == 'OPTIONS':
response.headers['Access-Control-Allow-Methods'] = 'DELETE, GET, POST, PUT'
headers = request.headers.get('Access-Control-Request-Headers')
if headers:
response.headers['Access-Control-Allow-Headers'] = headers
return response
from .articles.views import article_pages
from .sections.views import section_pages
from .compiler.views import compiler_pages
@app.route("/")
@app.route("/index")
@app.route("/home")
def home():
text="It work's, please do something"
return jsonify(text=text)
app.register_blueprint(article_pages, url_prefix='/articles')
app.register_blueprint(section_pages, url_prefix='/sections')
app.register_blueprint(compiler_pages, url_prefix='/compiler')
from src.bot import bot
if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
bot.message_loop()
# ------------ Telegram Bot
#from bot import bot_queue
#@app.route('/bot', methods=['GET', 'POST'])
#def pass_update():
# bot_queue.put(request.data) # pass update to bot
# return 'OK'

2
articles/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .model import Article
from .views import article_pages

139
articles/model.py Normal file
View File

@@ -0,0 +1,139 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields
from src.sections.model import Section
#import json
import json
import flask
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
import hashlib
#import clogger
import logging
from dateutil.parser import parse
#from crawler.compiler.mqueues import put_fetch_queue
from src import clogger
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
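# Articles are deduplicated via an md5 fingerprint computed over url, title and published date.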
def calc_fingerprint(a):
return calc_fingerprint_h({"url": a.url, "title":a.title, "published": str(a.published_date)})
def calc_fingerprint_h(a):
if a["published"] is not None and a["published"]!= "None":
# clogger.info( "published:"+str(a["published"]))
if a["published"] is str:
pp=parse(a["published"])
else:
pp=a["published"]
else:
pp=""
#clogger.info( unicode(a["url"])+ unicode(a["title"])+unicode(pp))
h=hashlib.md5()
h.update(unicode(a["url"]))
h.update(a["title"].encode("utf-8"))
h.update(unicode(pp))
return h.hexdigest()
class ArticleSchema(Schema):
id=fields.Integer()
text=fields.String()
title=fields.String()
author=fields.String()
sourcetype =fields.String()
image =fields.String()
url =fields.String()
published_date=fields.DateTime()
date=fields.DateTime()
first_fetched=fields.DateTime()
section_id=fields.Integer()
class Article(Base):
__tablename__ = 'articles'
id = Column(Integer, primary_key=True)
parent_id= Column(Integer)
url = Column(String(250))
is_primary = Column(Boolean)
fingerprint = Column(String(250),unique=True)
hash = Column(String(250))
last_fetched = Column(DateTime)
first_fetched=Column(DateTime)
published_date = Column(DateTime)
date = Column(DateTime)
text = Column(Text)
title = Column(String(250))
author = Column(String(250))
section = relationship("Section")
section_id=Column(Integer, ForeignKey('sections.id'))
sourcetype = Column(String(250))
image=Column(String(250))
def __init__(self, url=None,title=None, published_date=None):
self.url=url
self.title=title
self.published_date=published_date
self.first_fetched=datetime.now()
def __json__(self):
return ArticleSchema().dump(self)[0]
def dict(self):
return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url}
# @classmethod
# def sections(self):
# sects=db_session.query(Article.section).distinct().all()
# for i in range(len(sects)):
# sects[i]=sects[i][0]
# return sects
@classmethod
def from_hash(cls, a):
fp = calc_fingerprint_h(a)
aa = Article.query.filter(Article.fingerprint==fp).first()
if aa is None:
clogger.debug( "new Article")
if a["published"] is not None:
if a["published"] is str:
pd= parse(a["published"])
else:
pd=a["published"]
else:
pd=None
aa=Article(a["url"], a["title"],pd)
aa.fingerprint = calc_fingerprint(aa)
db_session.add(aa)
db_session.commit()
return aa
def process_hash(self, a):
self.text=a["text"].decode('utf8')
if "image" in a:
self.image=a["image"]
if "author" in a:
self.author=a["author"]
if "title" in a:
self.title=a["title"]
if "author" in a:
self.author=a["author"]
if "sourcetype" in a:
self.sourcetype=a["sourcetype"]
if "section" in a:
self.section=Section.find_or_create(a["section"])
# if "last_fetched" in a:
# self.last_fetched=a["last_fetched"]
if "published_date" in a:
self.published_date=a["published_date"]
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)

65
articles/views.py Normal file
View File

@@ -0,0 +1,65 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
article_pages = Blueprint('articles', __name__)
from .model import Article
from .model import ArticleSchema
#import flask
from datetime import datetime
import json
#flask.json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
#flask.json.JSONEncoder.default = lambda self,obj: ((obj.dict()) if isinstance(obj, Article) else None)
from src import clogger
import json
from src.database import db_session, read_json, Base
from src.models import CrawlUrl
import flask
#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,CrawlUrl)) else None)
@article_pages.route("/")
@article_pages.route("")
@article_pages.route(".json")
def index():
articles=Article.query.all()
return jsonify(articles=articles)
@article_pages.route("/<int:id>",methods=['PUT'])
@article_pages.route("/<int:id>.json",methods=['PUT'])
def update(id):
article=Article.query.get(id)
clogger.info(request.data)
a=request.get_json()
article.text=a["text"]
db_session.commit()
return jsonify(article=article)
@article_pages.route("/<int:id>",methods=['GET'])
@article_pages.route("/<int:id>.json",methods=['GET'])
def get(id):
article=Article.query.get(id)
clogger.info(article)
# article=ArticleSchema().dump(article)[0]
return jsonify(article=article)
@article_pages.route("/<int:id>",methods=['DELETE'])
@article_pages.route("/<int:id>.json",methods=['DELETE'])
def delete(id):
article=Article.query.get(id)
clogger.info(id)
if article is not None:
db_session.delete(article)
db_session.commit()
return jsonify(article={})
@article_pages.route("/",methods=['POST'])
@article_pages.route("",methods=['POST'])
@article_pages.route(".json",methods=['POST'])
def create():
article=Article()
a=read_json(request)
article.text=a["article"]["text"]
db_session.add(article)
db_session.commit()
return jsonify(article=article)

1
bot/__init__.py Normal file
View File

@@ -0,0 +1 @@
from .bot import bot

140
bot/bot.py Normal file
View File

@@ -0,0 +1,140 @@
import telepot
import datetime
import time
import json
from Queue import Queue
#import os
from src import lg,cfg
#from gevent import spawn
from telepot.namedtuple import InlineKeyboardMarkup, InlineKeyboardButton
from telepot.delegate import (
per_chat_id, pave_event_space, include_callback_query_chat_id, create_open, per_inline_from_id )
from src.compiler import CrawlUrl
from gevent import spawn, monkey, Greenlet
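# Small helpers that build telepot inline keyboards from plain dicts of the form
# {"text": ..., "callback_data": ...}.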
def IKB(h):
return InlineKeyboardButton(text=h["text"], callback_data=h["callback_data"])
def IKB2(h):
return [IKB(h)]
def IKM(h):
return InlineKeyboardMarkup(inline_keyboard=[ map(IKB,h)])
def IKM2(h):
return InlineKeyboardMarkup(inline_keyboard= map(IKB2,h))
def query_que_url(url):
print(json.dumps(url))
return {"text": url.url, "callback_data":"/urls/"+str(url.id)+"/que"}
def handle_urls(handler, cmd):
curls=CrawlUrl.query.all()
#sent=handler.sender.sendMessage(json.dumps(curls))
kb= IKM2(map(query_que_url,curls))
print json.dumps(cmd)
if len(cmd) >= 4 and cmd[3]=="que":
sent=handler.sender.sendMessage("I qued url "+str(cmd[2]), reply_markup=None)
else:
sent=handler.sender.sendMessage("que?", reply_markup=kb)
handler._edit_msg_ident = telepot.message_identifier(sent)
handler._editor = telepot.helper.Editor(handler.bot, sent)
def execute_command(handler,cmd,msg=None):
if cmd[1]=='urls':
handle_urls(handler,cmd)
def handle(handler,msg):
content_type,chat_type,chat_id = telepot.glance(msg)
if msg.has_key('text'):
if msg['text'][0]=='/':
cmd = msg['text'].split("/")
execute_command(handler, cmd, msg)
if msg.has_key('data'):
lg.debug(msg['data'])
class InlineHandler(telepot.helper.InlineUserHandler, telepot.helper.AnswererMixin):
def __init__(self, *args, **kwargs):
super(InlineHandler, self).__init__(*args, **kwargs)
def on_inline_query(self, msg):
def compute_answer():
query_id, from_id, query_string = telepot.glance(msg, flavor='inline_query')
print(self.id, ':', 'Inline Query:', query_id, from_id, query_string)
articles = [{'type': 'article',
'id': 'abc', 'title': query_string, 'message_text': query_string}]
return articles
self.answerer.answer(msg, compute_answer)
def on_chosen_inline_result(self, msg):
from pprint import pprint
pprint(msg)
result_id, from_id, query_string = telepot.glance(msg, flavor='chosen_inline_result')
print(self.id, ':', 'Chosen Inline Result:', result_id, from_id, query_string)
class FetBot(telepot.helper.ChatHandler):
def __init__(self, *args, **kwargs):
# super(FetBot,self).__init__(*args,**kwargs)
super(FetBot,self).__init__( *args,**kwargs)
_editor=None
_edit_msg_ident=None
keyboard=IKM([{"text":"START","callback_data": "start"},
{"text":"Don't Start","callback_data":"notstart"}
])
keyboard =InlineKeyboardMarkup(
inline_keyboard=[[
InlineKeyboardButton(text='START', callback_data='start'),
InlineKeyboardButton(text='START', callback_data='start')
]]
)
def on_chat_message(self,msg):
handle(self,msg)
content_type,chat_type,chat_id = telepot.glance(msg)
lg.debug(content_type)
if content_type=="photo" or content_type=="sticker":
lg.debug("try to download %s" % msg[content_type][-1]["file_id"])
f=self.bot.getFile(msg[content_type][-1]['file_id'])
lg.debug(f)
self.bot.download_file(f['file_id'], "dwn/" + f['file_path'])
# self.bot.getFile(msg['photo'][-1]['file_id']), "dwn")
#self._cancel_last()
#sent=self.sender.sendMessage("Hello World", reply_markup=self.keyboard)
#self._editor = telepot.helper.Editor(self.bot, sent)
#self._edit_msg_ident = telepot.message_identifier(sent)
def on_callback_query(self, msg):
query_id, from_id, query_data = telepot.glance(msg, flavor='callback_query')
lg.debug(json.dumps(msg))
self._cancel_last()
if query_data[0]=='/':
cmd = query_data.split("/")
execute_command(self, cmd, msg)
# self.sender.sendMessage("Danke")
self.bot.answerCallbackQuery(query_id, text='Ok. But I am going to keep asking.')
#self.bot.answerCallbackQuery(query_id)
def _cancel_last(self):
if self._editor:
self._editor.editMessageReplyMarkup(reply_markup=None)
self._editor = None
self._edit_msg_ident = None
bot=None
bot = telepot.DelegatorBot(cfg.token, [include_callback_query_chat_id(pave_event_space())(per_chat_id(),create_open,FetBot,timeout=20),
pave_event_space()(
per_inline_from_id(), create_open, InlineHandler, timeout=10),
])

10
compiler/README Normal file
View File

@@ -0,0 +1,10 @@
This is the API for the compiler
The following commands are implemented:
GET doc: This documentation!
GET initdb: Initialize the database, WARNING: existing data will be deleted
POST urls:
Expects data in the format {"url": {"type": typ, "url": "someurl.html"}}
Adds this url to the list of monitored urls
IN PROCESS:
GET urls: All urls that are to be monitored
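For illustration only, a minimal client-side sketch (not part of this commit) of registering a url via POST urls, assuming the Flask app is reachable on localhost:5000 and using the field names from CrawlUrlSchema ("tpe", "url"); the example url and type are hypothetical:

import requests
# hypothetical payload; the blueprint is mounted under /compiler in src/__init__.py
payload = {"url": {"tpe": "fetindex", "url": "http://example.org/news.html"}}
r = requests.post("http://localhost:5000/compiler/urls", json=payload)
print(r.json())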

1
compiler/README.html Normal file
View File

@@ -0,0 +1 @@
sdf

15
compiler/__init__.py Normal file
View File

@@ -0,0 +1,15 @@
#from mprocess import do_process, process_urllist
#from compiler import do_compile
#from mworker import run_fetch, run_process, run_compile
# include models for final objects
from src.models import Article
# starting workers
from mworker import start_workers
from models import add_url, CrawlUrl
#start_workers(1,1,1)
from fetching import announce_articleid

1
compiler/comp/__init__.py Normal file
View File

@@ -0,0 +1 @@
from rss import rssfeed

1
compiler/comp/__init__py Normal file
View File

@@ -0,0 +1 @@
from rss import rssfeed

8
compiler/comp/rss.py Normal file
View File

@@ -0,0 +1,8 @@
import feedparser
def rssfeed(url,raw):
al=[]
f=feedparser.parse(raw)
for e in f['entries']:
al.append(e['link'])
return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}

153
compiler/compile.py Normal file
View File

@@ -0,0 +1,153 @@
from bs4 import BeautifulSoup
import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
def hello():
return "hello"
def fetarticle(o):
sp=BeautifulSoup(o.raw_fixed)
d={}
h=sp.find("h1", {"itemprop": "name"})
d["title"]=unicode(h.text).strip()
h=sp.find("div", {"itemprop": "articleBody"})
if h is not None:
d["text"]=(h.encode_contents()).strip()
else:
d["text"]=""
d["url"]=o.url
h=sp.find("span", {"itemprop": "author"})
if h is not None:
d["author"]=h.text.strip()
h=sp.find("span", {"itemprop": "articleSection"})
if h is not None:
d["section"]= "FET - " + h.text.strip()
h=sp.find("span", {"itemprop": "datePublished"})
if h is not None:
d["published"]=parse(h.encode_contents().strip())
h=sp.find("meta", {"property": "og:image"})
if h is not None:
d["image"]=h.attrs["content"]
hh=sp.find_all("div", {"class":"media"})
for h in hh:
if h is not None:
h=h.find("div", {"class": "pull-left"})
if h is not None:
h=h.find("a")
if h is not None:
d["image2"]=crawler.objects.models.download_file(h.attrs["href"])
return {"article": d}
def fsarcharticle(o):
sp=BeautifulSoup(o.raw_fixed)
d={}
h=sp.find("h1", {"class": "title"})
if h is not None:
d["title"]=h.text.strip()
d["url"]=o.url
d["published"]=None
h=sp.find("article")
h=h.find("div", {"class": "content"})
d["text"]=h.encode_contents().strip()
h=sp.find("article").find("h1", {"class": "title"})
if h is not None:
d["title"]=h.text.strip()
else:
d["title"]=""
d["image"]=""
d["sourcetype"]="fsarcharticle"
d["section"]="fsarch"
d["author"]=None
return {"article": d}
def fetindex(o):
# if type(o) is not Object:
# raise TypeError
if o.raw is None:
raise ValueError("raw is None")
print "compile_fetindex"
html=BeautifulSoup(o.raw_fixed)
h = html.find("li", {"class": "next_page" })
if h is not None:
nl=h.find("a")
nl=crawler.objects.models.fix_link(o.url,nl.attrs["href"])
else:
nl=None
h= html.find("ul", {"id": "neuigkeiten"})
links=h.find_all("a")
al = []
for t in links:
al.append(t.attrs["href"])
return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index" }
def fsarchindex(o):
if o.raw is None:
raise ValueError("raw is None")
html=BeautifulSoup(o.raw_fixed)
h= html.find("article")
print unicode(h)
links=h.find_all("a")
al = []
fl=[]
for t in links:
url=t.attrs["href"]
if re.search("fachschaftarchitektur\.at", url):
al.append(t.attrs["href"])
if re.search("facebook\.com/events", url):
fl.append(t.attrs["href"])
return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}
def fsbizindex(o):
if o.raw is None:
raise ValueError("raw is None")
print "compile_fsbizindex"
html=BeautifulSoup(o.raw_fixed)
h= html.find("section", {"id": "primary"})
links=h.find_all("h1", {"class": "entry-title"})
al = []
for t in links:
al.append(t.find("a").attrs["href"])
return {"url": o.url,"article_links": al,"objecttype": "index"}
def fsmbindex(o):
if o.raw is None:
raise ValueError("raw is None")
html=BeautifulSoup(o.raw_fixed)
h= html.find("a",{"class": "next"})
if h is not None:
np=h.attrs["href"]
else:
np=None
h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
if h is not None:
ats=h.find_all("div",{"class": "block"})
articles=[]
for a in ats:
aa={}
h=a.find("h3")
if h is not None:
aa["title"] = h.text.strip()
h=a.find("div", {"class": "ce_text"})
if h is not None:
aa["text"] = (h.encode_contents()).strip()
aa["info"]=[]
hh=a.find_all("p", {"class": "info"},recursive=False)
for h in hh:
aa["info"].append(unicode(h.text))
if re.search(r'von', str(h)):
h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
aa["published"] =parse(h1.strip())
aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
aa["section"]="FSMB"
articles.append(aa)
return {"url": o.url, "next_page": np, "articles": articles,"objecttype": "articles"}

258
compiler/compiler.py Normal file
View File

@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json
import feedparser
def do_compile(tpe, cont):
if type(cont) != dict:
clogger.error("Type Error for do compile for :"+str(cont["url"]))
# Start compiling a generic object
if "url" not in cont:
clogger.error("no url can't compile "+tpe)
else:
clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
if tpe in compiler:
cont=compiler[tpe](cont["url"], cont["raw"])
return cont
from comp import rssfeed
def dummyarticle(url, raw):
return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}
def htufeed(url,raw):
al=[]
f=feedparser.parse(raw)
for e in f['entries']:
al.append(e['link'])
return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
def htuarticle(url,raw):
sp=BeautifulSoup(raw)
d={}
h=sp.find("meta", {"property": "og:image"})
if h is not None:
d["image"]=h.attrs["content"]
d["image2"]=d["image"]
h=sp.find("div", {"class": "patternRevInfo"})
if h is not None:
# clogger.debug(h.text.strip())
h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
# clogger.debug(h1)
d["published"]=parse(h1)
# clogger.debug(parse(h1))
# clogger.debug(d["published"])
h=h.find("a")
if h is not None:
d["author"]=h.text.strip()
h=sp.find("div", {"class": "foswikiTopic"})
h1=h.find("h4")
if h1 is not None:
d["title"]= h1.text.strip()
h1.extract() # remove head
else:
h1=sp.find("meta", {"name": "WEBTOPIC"})
d["title"]= h1.attrs["content"]
d["text"]=(h.encode_contents()).strip()
d["section"]="HTU"
d["url"]=url
# clogger.debug(d)
return {"article": d}
def fetarticle(url, raw):
sp=BeautifulSoup(raw)
d={}
h=sp.find("h1", {"itemprop": "name"})
d["title"]=unicode(h.text).strip()
h=sp.find("div", {"itemprop": "articleBody"})
if h is not None:
d["text"]=(h.encode_contents()).strip()
else:
d["text"]=""
d["url"]=url
h=sp.find("span", {"itemprop": "author"})
if h is not None:
d["author"]=h.text.strip()
h=sp.find("span", {"itemprop": "articleSection"})
if h is not None:
d["section"]= "FET - " + h.text.strip()
h=sp.find("span", {"itemprop": "datePublished"})
if h is not None:
d["published"]=parse(h.encode_contents().strip())
h=sp.find("meta", {"property": "og:image"})
if h is not None:
d["image"]=h.attrs["content"]
d["image2"]=d["image"]
# hh=sp.find_all("div", {"class":"media"})
# for h in hh:
# if h is not None:
# h=h.find("div", {"class": "pull-left"})
# if h is not None:
# h=h.find("a")
# if h is not None:
# d["image2"]=downloadfile(fix_link(url,h.attrs["href"]))
return {"article": d}
def fsarcharticle(url, raw):
sp=BeautifulSoup(raw)
d={}
h=sp.find("h1", {"class": "title"})
if h is not None:
d["title"]=h.text.strip()
d["url"]=url
d["published"]=None
h=sp.find("article")
if h is not None:
h=h.find("div", {"class": "content"})
d["text"]=h.encode_contents().strip()
h=sp.find("article")
if h is not None:
h=h.find("h1", {"class": "title"})
if h is not None:
d["title"]=h.text.strip()
else:
d["title"]=""
d["image"]=""
d["sourcetype"]="fsarcharticle"
d["section"]="fsarch"
d["author"]=None
return {"article": d}
def fetindex(url, raw):
if raw is None:
raise ValueError("raw is None")
# clogger.debug("compile_fetindex: "+str(url))
html=BeautifulSoup(raw)
h = html.find("li", {"class": "next_page" })
if h is not None:
nl=h.find("a")
nl=fix_link(url,nl.attrs["href"])
else:
nl=None
h= html.find("ul", {"id": "neuigkeiten"})
al = []
if h is not None:
links=h.find_all("a")
for t in links:
al.append(t.attrs["href"])
return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }
def fsarchindex(url, raw):
if raw is None:
raise ValueError("raw is None")
html=BeautifulSoup(raw)
h= html.find("article")
print unicode(h)
links=h.find_all("a")
al = []
fl=[]
for t in links:
url=t.attrs["href"]
if re.search("fachschaftarchitektur\.at", url):
al.append(t.attrs["href"])
if re.search("facebook\.com/events", url):
fl.append(t.attrs["href"])
return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}
def fsbizindex(url, raw):
if raw is None:
raise ValueError("raw is None")
print "compile_fsbizindex"
html=BeautifulSoup(raw)
h= html.find("section", {"id": "primary"})
links=h.find_all("h1", {"class": "entry-title"})
al = []
for t in links:
al.append(t.find("a").attrs["href"])
return {"url": url,"article_links": al,"objecttype": "index"}
def fbfeed(url, raw):
js = json.loads(raw)
arts=[]
u=urlparse.urlparse(url)
for m in js["data"]:
aa={}
aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"",""))
aa["published"] =parse(m["created_time"])
if m.has_key("message")==True:
aa["text"] = m["message"]
else:
try:
h=graph.get_object(id=m["id"].split("_")[1])
if h.has_key("description"):
aa["text"]=h["description"]
else:
aa["text"]=json.dumps()
except GraphAPIError:
aa["text"]=""
if m.has_key("story")==True:
aa["title"] = m["story"]
else:
aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
aa["section"]="Facebook: "+u[1]
arts.append(aa)
return {"url": url, "next_page": js["paging"]["next"],"articles": arts}
def fsmbindex(url, raw):
if raw is None:
raise ValueError("raw is None")
html=BeautifulSoup(raw)
h= html.find("a",{"class": "next"})
if h is not None:
np=h.attrs["href"]
else:
np=None
h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
if h is not None:
ats=h.find_all("div",{"class": "block"})
articles=[]
for a in ats:
aa={}
h=a.find("h3")
if h is not None:
aa["title"] = h.text.strip()
h=a.find("div", {"class": "ce_text"})
if h is not None:
aa["text"] = (h.encode_contents()).strip()
aa["info"]=[]
hh=a.find_all("p", {"class": "info"},recursive=False)
for h in hh:
aa["info"].append(unicode(h.text))
if re.search(r'von', str(h)):
h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
aa["published"] =parse(h1.strip())
aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
aa["section"]="FSMB"
articles.append(aa)
return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"}
compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}
compiler = cfg.compiler
for i in compiler:
compiler[i]=eval(compiler[i])
article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}

67
compiler/fetching.py Normal file
View File

@@ -0,0 +1,67 @@
from requests import session
s=session()
from src import package_directory, download_path,cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse
def announce_articleid(id):
for u in cfg.announcearticle_url:
s.get( u % id)
def downloadfile(url):
relative_name=path.join("downloads",str(md5(url).hexdigest()),url.split('/')[-1])
local_filename = path.join(download_path,relative_name)
if not os.path.exists(os.path.dirname(local_filename)):
try:
os.makedirs(os.path.dirname(local_filename))
except OSError as exc: # Guard against race condition
if exc.errno != errno.EEXIST:
raise
if not path.exists(local_filename):
spawn(fetch_load_file, url, local_filename)
return relative_name
from models import CrawlCache
from datetime import datetime, timedelta
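# fetch_page returns the raw body for a url, serving it from CrawlCache when a copy
# younger than cfg.cache_days days exists; urls with the fb scheme go through the Graph API.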
def fetch_page(furl):
current_time = datetime.utcnow()
cache_cutoff = current_time - timedelta(days=cfg.cache_days)
u=urlparse.urlparse(furl)
if u[0] == '':
furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>cache_cutoff).first()
if cc is None:
clogger.debug("fetching url: "+ str(furl))
if u[0]=='fb':
tx = json.dumps(graph.get_object(id=u[1]+u[2]))
else:
tx=s.get(furl).text
CrawlCache.store(furl,tx)
else:
#if furl is not None:
# clogger.debug("cache hit")
tx=cc.raw
return tx
def fetch_load_file(furl, path):
try:
clogger.info("Downloading "+ str(furl))
r = s.get(furl, stream=True)
f = open(path, 'wb')
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.close()
except Exception, e:
#clogger.error("Error Occured during fetching:"+str(furl))
clogger.error(e,exc_info=True)

37
compiler/fixing.py Normal file
View File

@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach
def fix_link(url, link):
r= urlparse(link)
if r.scheme is None or r.scheme == '':
return urljoin(url,link)
else:
return link
def fix_file(url, link):
u=fix_link(url,link)
return downloadfile(u)
def load_file(url, link):
return fix_file(url,link)
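# fix_html sanitizes article html with bleach, downloads referenced images via fix_file,
# rewrites relative links to absolute ones and strips script tags.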
def fix_html(html, baseurl):
html=bleach.clean(html, tags=['b','p','span','a','img','div','br','strong','ul','li'], strip=True)
sp=BeautifulSoup(html)
images=sp.find_all("img")
for t in images:
if "src" in t.attrs and t.attrs["src"] is not None:
t.attrs["src"]=fix_file(baseurl,t.attrs["src"])
links=sp.find_all("a")
for t in links:
if "href" in t.attrs:
t.attrs["href"]=fix_link(baseurl, t.attrs["href"])
for t in sp.find_all("script"):
t.extract()
b=sp.find("base")
if b is not None:
b.attrs["href"]=""
return sp

75
compiler/models.py Normal file
View File

@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema,fields,ValidationError
import json
import flask
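# add_url registers a CrawlUrl (creating it if necessary) and immediately queues it for fetching.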
def add_url(tpe, url):
cu=CrawlUrl.find_or_create(tpe,url)
db_session2.add(cu)
db_session2.commit()
cu.schedule()
class CrawlUrlSchema(Schema):
id=fields.Integer()
tpe=fields.String()
url=fields.String()
last_fetched=fields.DateTime()
fetched = fields.DateTime()
class CrawlUrl(Base2):
__tablename__='crawlurls'
id = Column(Integer, primary_key=True)
tpe=Column(String(250))
url = Column(String(250))
last_fetched = Column(DateTime)
def fetched(self):
return CrawlCache.query.filter(CrawlCache.url==self.url).first()
@classmethod
def find_or_create(self, tpe, url):
aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
if aa is None:
aa=CrawlUrl(tpe,url)
return aa
def schedule(self):
put_fetch_queue((0, self.tpe, self.url))
def __init__(self, tpe, url):
self.url=url
self.tpe=tpe
def __json__(self):
return CrawlUrlSchema().dump(self)[0]
class CrawlCacheSchema(Schema):
id=fields.Integer()
raw=fields.String()
url=fields.String()
fetched=fields.DateTime()
class CrawlCache(Base2):
__tablename__='crawlcache'
id = Column(Integer, primary_key=True)
url=Column(String(250))
fetched=Column(DateTime)
raw=Column(Text)
def __init__(self, url,rw):
self.url=url
self.raw=rw
self.fetched=datetime.utcnow()
def __json__(self):
return CrawlCacheSchema().dump(self)[0]
@classmethod
def store(cls, url, rw):
cc=CrawlCache(url,rw)
db_session2.add(cc)
db_session2.commit()
#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)

74
compiler/mprocess.py Normal file
View File

@@ -0,0 +1,74 @@
from src import clogger # Logger for crawler
from src.models import Article # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file
from compiler import article_types
from fixing import fix_link
# process_article expects a hash with raw data for the article and stores it in an
# Article object in the database; the fingerprint check is intended to prevent duplicates.
def is_article_hash(h):
return "text" in h and "url" in h and "sourcetype" in h and "section" in h
def process_article(art):
if not is_article_hash(art):
clogger.error("Invalid article hash:" + str(art))
aa=None
else:
art["text"]=fix_html(art["text"],art["url"])
if "image" in art:
art["image"]=fix_file(art["url"], art["image"])
clogger.info(art)
aa = Article.from_hash(art)
aa.process_hash(art)
aa.last_fetched=datetime.now()
aa.sourcetype=art["sourcetype"]
db_session.add(aa)
db_session.commit()
clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
# announce_articleid(aa.id)
return aa
# process a single found url
def process_url(url,tpe, parent_url):
#clogger.debug("process URL of type "+ tpe + ": " + url)
if parent_url is not None:
url=fix_link(parent_url, url)
put_fetch_queue((0,tpe,url))
# process a url list
def process_urllist(urllist, tpe, parent_url):
for u in urllist:
process_url(u,tpe, parent_url)
def do_process(tpe, cont):
urllist=[]
# clogger.debug("process :" + str(cont))
if "article_links" in cont:
process_urllist(cont["article_links"], article_types[tpe], cont["url"])
if "index_links" in cont:
process_urllist(cont["index_links"], tpe , cont["url"])
if "next_page" in cont and cont["next_page"] is not None:
process_url(cont["next_page"],tpe, cont["url"])
if "article" in cont:
art=cont["article"]
art["sourcetype"]=tpe
process_article(art)
if "articles" in cont:
clogger.debug("articles")
for a in cont["articles"]:
if "title" in a:
a["sourcetype"]=tpe
if a.has_key("url")==False:
a["url"]=cont["url"]
process_article(a)
return

8
compiler/mqueues.py Normal file
View File

@@ -0,0 +1,8 @@
from gevent.queue import Queue, JoinableQueue
fetch_queue = Queue()
compile_queue = Queue()
process_queue = Queue()
def put_fetch_queue(o):
fetch_queue.put(o)

58
compiler/mworker.py Normal file
View File

@@ -0,0 +1,58 @@
from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from src import clogger
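# start_workers spawns f fetch, c compile and p process greenlets, each looping over its queue forever.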
def start_workers(f,c,p):
for _ in range(f):
clogger.debug("spawn fetchworker")
spawn(work_fetch)
for _ in range(c):
spawn(work_compile)
for _ in range(p):
spawn(work_process)
def work_fetch():
while True:
run_fetch()
def work_process():
while True:
run_process()
def work_compile():
while True:
run_compile()
def queue_url(tpe, url):
fetch_queue.put((0,tpe,url))
# fetch a page from the url list
def run_fetch():
tc, tpe, url = fetch_queue.get()
if tpe != "dummyarticle" and tpe != "dummyindex":
rw=fetch_page(url)
else:
rw="<p> dummytext</p>"
compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
return rw
# fetch_queue.task_done()
# compile something from the compile queue
def run_compile():
tc,tpe,h = compile_queue.get()
h=do_compile(tpe,h)
process_queue.put((0,tpe, h))
return h
# compile_queue.task_done()
def run_process():
tc,tpe,h = process_queue.get()
do_process(tpe, h)
return h
# process_queue.task_done()

146
compiler/views.py Normal file
View File

@@ -0,0 +1,146 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
compiler_pages = Blueprint('compiler', __name__,
template_folder='.')
from src.database import db_session2,init_db,read_json,init_db2
from .models import CrawlUrl
from .models import CrawlCache, CrawlCacheSchema
from .models import CrawlUrlSchema
from src import clogger
from src.articles import Article
#import mworker
import flask
import json
import mworker
from compiler import do_compile
from fetching import fetch_page
#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)
@compiler_pages.route("/")
@compiler_pages.route("")
@compiler_pages.route(".json")
def index():
status="For documentation goto /doc"
return jsonify(status=status)
@compiler_pages.route("/doc")
@compiler_pages.route("/doc.json")
def doc():
return render_template("README")
# return jsonify(status=render_template("README"))
#
@compiler_pages.route("/initdb")
@compiler_pages.route("/initdb.json")
def initdb_json():
init_db() # initialize the database
status="Datenbank Neu initialisiert"
return jsonify(status=status)
@compiler_pages.route("/initdb2")
@compiler_pages.route("/initdb2.json")
def initdb_json2():
init_db2() # initialize the crawler database
status="Datenbank Neu initialisiert"
return jsonify(status=status)
@compiler_pages.route("/start")
@compiler_pages.route("/start.json")
def start_json():
mworker.start_workers(1,1,1) # start the workers
status="Worker gestartet"
return jsonify(status=status)
@compiler_pages.route("/urls")
@compiler_pages.route("/urls.json")
def urls_index_json():
# load all urls
status=CrawlUrl.query.all()
return jsonify(urls=status)
# show an existing CrawlUrl
@compiler_pages.route("/urls/<int:id>")
@compiler_pages.route("/urls/<int:id>.json")
def urls_json(id):
# load the url and its cached raw content
status=CrawlUrl.query.get(id)
cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
return jsonify(urls=status, cache=cc.__json__())
# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
# load the url and queue it
cu=CrawlUrl.query.get(id)
mworker.queue_url(cu.tpe, cu.url)
cc=CrawlCache.query.filter(CrawlCache.url==cu.url).first()
mworker.start_workers(1,1,1) # start the workers
status="Worker gestartet"
return jsonify(urls=cu, cache=cc)
# test-compile an existing CrawlUrl and return the result
@compiler_pages.route("/urls/<int:id>/test")
@compiler_pages.route("/urls/<int:id>/test.json")
def urls_test_json(id):
# load the url
cu=CrawlUrl.query.get(id)
rw=fetch_page(cu.url)
h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
h2=do_compile(cu.tpe, h)
return jsonify(urls=cu,hs=h2,rw=rw)
@compiler_pages.route("/debug",methods=['GET','PUT'])
def debug():
status="did nothing"
js=read_json(request)
clogger.info(request.get_json())
if js["cmd"] == "runfetch":
mworker.run_fetch()
status="fetched something"
if js["cmd"] == "que":
cu = CrawlUrl.query.get(js["id"])
mworker.queue_url(cu.tpe, cu.url)
status= mworker.run_fetch()
if js["cmd"] == "comp":
status=mworker.run_compile()
if js["cmd"]=="process":
status=mworker.run_process()
return jsonify(status=status)
@compiler_pages.route("/debugurl")
def debugurl():
s=CrawlUrlSchema()
status=CrawlUrl.query.all()
return jsonify(status=status)
@compiler_pages.route("/urls",methods=['POST'])
def add_urls():
# read the request data
js =read_json(request)
# clogger.info(js)
# find or create the url in the database
url=CrawlUrlSchema().load(js["url"])
clogger.info(url)
url=CrawlUrl.find_or_create(url.data["tpe"], url.data["url"])
db_session2.add(url)
db_session2.commit()
return jsonify(url=url, kk=js)
@compiler_pages.route("/urls/<int:id>",methods=['DELETE'])
@compiler_pages.route("/urls<int:id>.json",methods=['DELETE'])
def delete(id):
cu=CrawlUrl.query.get(id)
if cu is not None:
db_session2.delete(cu)
db_session2.commit()
return jsonify(url={})

4
crawler/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
def init():
return " "

55
database.py Normal file
View File

@@ -0,0 +1,55 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from src import package_directory,clogger, cfg
from os import path
import json
#engine = create_engine('sqlite:////home/andreas/www/crawler/test.db', convert_unicode=True)
if cfg.get("db_path")==None or cfg.get("db_path").strip()=="":
db_path=package_directory
else:
db_path=cfg.get("db_path")
db_mainfile=cfg.get("db_mainfile")
if db_mainfile is None or db_mainfile.strip()=="":
db_mainfile="../srctest.db"
db_urlfile=cfg.get("db_urlfile")
if db_urlfile is None or db_urlfile.strip()=="":
db_urlfile="../srctest_cu.db"
engine = create_engine('sqlite:///'+ path.join(db_path,db_mainfile), convert_unicode=True)
db_session = scoped_session(sessionmaker(autocommit=False,
autoflush=False,
bind=engine))
engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
db_session2 = scoped_session(sessionmaker(autocommit=False,
autoflush=False,
bind=engine2))
Base = declarative_base()
Base.query = db_session.query_property()
Base2 = declarative_base()
Base2.query = db_session2.query_property()
def read_json(rq):
js=rq.get_json()
clogger.info(rq.data)
if js is None:
js=rq.form.to_dict()
if js=={} and rq.data != "":
js=json.loads(rq.data)
return js
def init_db():
import src.models
Base.metadata.create_all(bind=engine)
def init_db2():
from .compiler.models import CrawlUrl, CrawlCache
Base2.metadata.create_all(bind=engine2)

4
fb.py Normal file
View File

@@ -0,0 +1,4 @@
from src import cfg
import facebook
graph = facebook.GraphAPI(access_token=cfg.fb_token, version='2.3')

21
meta.py Normal file
View File

@@ -0,0 +1,21 @@
import os
package_directory = os.path.dirname(os.path.abspath(__file__))
from config import Config
import logging
import sys
cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
#--------------- Logging
file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.INFO)
std_handler=logging.StreamHandler(stream=sys.stdout)
std_handler.setLevel(logging.DEBUG)
lg=logging.getLogger('mylogger')
lg.setLevel(logging.DEBUG)
lg.addHandler(file_handler)
lg.addHandler(std_handler)
#----------------

4
models.py Normal file
View File

@@ -0,0 +1,4 @@
from .articles.model import Article
from .sections.model import Section
from .compiler.models import CrawlUrl, CrawlCache

1
sections/__init__.py Normal file
View File

@@ -0,0 +1 @@
from .model import Section

44
sections/model.py Normal file
View File

@@ -0,0 +1,44 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base,db_session
from marshmallow import Schema, fields
import json
import flask
#from src.articles import Article
class SectionSchema(Schema):
id=fields.Integer()
foreign_name=fields.String()
name=fields.String()
class Section(Base):
__tablename__ = 'sections'
id = Column(Integer, primary_key=True)
url = Column(String(250))
crawlurl = Column(Integer)
foreign_name = Column(String(250),unique=True)
name=Column(String(250))
group = Column(String(250))
articles=relationship("Article", back_populates="section")
def __json__(self):
return SectionSchema().dump(self)[0]
def __init__(self, url=None,fname=None):
self.url=url
self.foreign_name=fname
@classmethod
def find_or_create(cls, fname):
s=Section.query.filter(Section.foreign_name==fname).first()
if s is None:
s=Section(fname=fname)
db_session.add(s)
db_session.commit()
s.foreign_name=fname
db_session.add(s)
db_session.commit()
return s

37
sections/views.py Normal file
View File

@@ -0,0 +1,37 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
section_pages = Blueprint('sections', __name__)
from .model import Section
from .model import SectionSchema
#import flask
from datetime import datetime
import json
from src import clogger
from src.database import db_session, read_json
import flask
@section_pages.route("/")
@section_pages.route("")
@section_pages.route(".json")
def index():
sections=Section.query.all()
return jsonify(sections=sections)
@section_pages.route("/<int:id>",methods=['PUT'])
@section_pages.route("/<int:id>.json",methods=['PUT'])
def update(id):
section=Section.query.get(id)
clogger.info(request.data)
a=request.get_json()
section.text=a["text"]
db_session.commit()
return jsonify(section=section)
@section_pages.route("/<int:id>",methods=['GET'])
@section_pages.route("/<int:id>.json",methods=['GET'])
def get(id):
section=Section.query.get(id)
clogger.info(section)
# section=SectionSchema().dump(section)[0]
return jsonify(section=section,articles=section.articles)

1
templates/home.html Normal file
View File

@@ -0,0 +1 @@
<h1>Hello World</h1>

19
users/users.py Normal file
View File

@@ -0,0 +1,19 @@
class User(object):
def __init__(self, id, username, password):
self.id = id
self.username = username
self.password = password
def __str__(self):
return "User(id='%s')" % self.id
user = User(1, 'user', 'password')
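# Single hard-coded demo user; authenticate() and identity() are the callbacks expected by flask_jwt.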
def authenticate(username, password):
if username == user.username and password == user.password:
return user
def identity(payload):
return user