init commit
.gitignore (vendored, Normal file, 5 lines added)
@@ -0,0 +1,5 @@
/__init__.py~
/__init__.pyc
*.pyc
*~
config.cfg
__init__.py (Normal file, 101 lines added)
@@ -0,0 +1,101 @@
import os
import sys
package_directory = os.path.dirname(os.path.abspath(__file__))
from config import Config

cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
#--------------- Logging
import logging
download_path="./cdw"
file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.DEBUG)
stream_handler=logging.StreamHandler(sys.stdout)

clt=logging.getLogger('mylogger')
clt.setLevel(logging.DEBUG)
clt.addHandler(file_handler)
clt.addHandler(stream_handler)

clogger=clt
#----------------
lg=clt
from gevent import spawn, monkey
monkey.patch_all()
from .compiler import start_workers
#start_workers(1,1,1)


# Framework
from flask import Flask, jsonify, render_template, redirect, request, send_from_directory
# Cross-Origin Resource Sharing (CORS)
from flask_cors import CORS, cross_origin
# Authentication
from flask_jwt import JWT, jwt_required, current_identity

from src.models import Article, Section
from src.users import authenticate, identity
from datetime import datetime

app = Flask(__name__)
CORS(app)
app.config['LOGGER_NAME']='mylogger'
app.logger.setLevel(logging.DEBUG)
app.logger.info("Server Started")

app.config['SECRET_KEY'] = 'super-secret'
import flask
import json
from database import Base
from models import Article, CrawlUrl, CrawlCache


flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article, Section, CrawlUrl, CrawlCache)) else None)
json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article, CrawlUrl, CrawlCache)) else None)


#bot.dosmth()
#lg.debug(bot.bot)


# Allow cross-origin requests
@app.after_request
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    if request.method == 'OPTIONS':
        response.headers['Access-Control-Allow-Methods'] = 'DELETE, GET, POST, PUT'
        headers = request.headers.get('Access-Control-Request-Headers')
        if headers:
            response.headers['Access-Control-Allow-Headers'] = headers
    return response

from .articles.views import article_pages
from .sections.views import section_pages
from .compiler.views import compiler_pages


@app.route("/")
@app.route("/index")
@app.route("/home")
def home():
    text="It works, please do something"
    return jsonify(text=text)

app.register_blueprint(article_pages, url_prefix='/articles')
app.register_blueprint(section_pages, url_prefix='/sections')
app.register_blueprint(compiler_pages, url_prefix='/compiler')


from src.bot import bot
if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
    bot.message_loop()


# ------------ Telegram Bot
#from bot import bot_queue
#@app.route('/bot', methods=['GET', 'POST'])
#def pass_update():
#    bot_queue.put(request.data)  # pass update to bot
#    return 'OK'
articles/__init__.py (Normal file, 2 lines added)
@@ -0,0 +1,2 @@
from .model import Article
from .views import article_pages
articles/model.py (Normal file, 139 lines added)
@@ -0,0 +1,139 @@

from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields
from src.sections.model import Section

#import json
import json
import flask
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
import hashlib
from dateutil.parser import parse  # needed for parsing string dates below

#import clogger
import logging
#from crawler.compiler.mqueues import put_fetch_queue
from src import clogger
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)


def calc_fingerprint(a):
    return calc_fingerprint_h({"url": a.url, "title": a.title, "published": str(a.published_date)})


def calc_fingerprint_h(a):
    if a["published"] is not None and a["published"] != "None":
        # clogger.info("published:"+str(a["published"]))
        if isinstance(a["published"], basestring):
            pp = parse(a["published"])
        else:
            pp = a["published"]
    else:
        pp = ""
    #clogger.info(unicode(a["url"]) + unicode(a["title"]) + unicode(pp))
    h = hashlib.md5()
    h.update(unicode(a["url"]))
    h.update(a["title"].encode("utf-8"))
    h.update(unicode(pp))
    return h.hexdigest()


class ArticleSchema(Schema):
    id = fields.Integer()
    text = fields.String()
    title = fields.String()
    author = fields.String()
    sourcetype = fields.String()
    image = fields.String()
    url = fields.String()
    published_date = fields.DateTime()
    date = fields.DateTime()
    first_fetched = fields.DateTime()
    section_id = fields.Integer()

class Article(Base):
    __tablename__ = 'articles'
    id = Column(Integer, primary_key=True)
    parent_id = Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched = Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id = Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image = Column(String(250))

    def __init__(self, url=None, title=None, published_date=None):
        self.url = url
        self.title = title
        self.published_date = published_date
        self.first_fetched = datetime.now()
    def __json__(self):
        return ArticleSchema().dump(self)[0]

    def dict(self):
        return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section": self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date, "image": self.image, "url": self.url}

    # @classmethod
    # def sections(self):
    #     sects=db_session.query(Article.section).distinct().all()
    #     for i in range(len(sects)):
    #         sects[i]=sects[i][0]
    #     return sects

    @classmethod
    def from_hash(cls, a):
        fp = calc_fingerprint_h(a)
        aa = Article.query.filter(Article.fingerprint == fp).first()
        if aa is None:
            clogger.debug("new Article")
            if a["published"] is not None:
                if isinstance(a["published"], basestring):
                    pd = parse(a["published"])
                else:
                    pd = a["published"]
            else:
                pd = None
            aa = Article(a["url"], a["title"], pd)
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        self.text = a["text"].decode('utf8')
        if "image" in a:
            self.image = a["image"]
        if "author" in a:
            self.author = a["author"]
        if "title" in a:
            self.title = a["title"]
        if "sourcetype" in a:
            self.sourcetype = a["sourcetype"]
        if "section" in a:
            self.section = Section.find_or_create(a["section"])
        # if "last_fetched" in a:
        #     self.last_fetched=a["last_fetched"]
        if "published_date" in a:
            self.published_date = a["published_date"]


#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)

#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
articles/views.py (Normal file, 65 lines added)
@@ -0,0 +1,65 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
article_pages = Blueprint('articles', __name__)
from .model import Article
from .model import ArticleSchema
#import flask
from datetime import datetime
import json

#flask.json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
#flask.json.JSONEncoder.default = lambda self,obj: ((obj.dict()) if isinstance(obj, Article) else None)
from src import clogger
import json
from src.database import db_session, read_json, Base  # Base is needed by the encoder below
from src.models import CrawlUrl  # CrawlUrl is needed by the encoder below
import flask

#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article, CrawlUrl)) else None)

@article_pages.route("/")
@article_pages.route("")
@article_pages.route(".json")
def index():
    articles=Article.query.all()
    return jsonify(articles=articles)

@article_pages.route("/<int:id>",methods=['PUT'])
@article_pages.route("/<int:id>.json",methods=['PUT'])
def update(id):
    article=Article.query.get(id)
    clogger.info(request.data)
    a=request.get_json()
    article.text=a["text"]
    db_session.commit()
    return jsonify(article=article)


@article_pages.route("/<int:id>",methods=['GET'])
@article_pages.route("/<int:id>.json",methods=['GET'])
def get(id):
    article=Article.query.get(id)
    clogger.info(article)
    # article=ArticleSchema().dump(article)[0]
    return jsonify(article=article)

@article_pages.route("/<int:id>",methods=['DELETE'])
@article_pages.route("/<int:id>.json",methods=['DELETE'])
def delete(id):
    article=Article.query.get(id)
    clogger.info(id)
    if article != None:
        db_session.delete(article)
        db_session.commit()
    return jsonify(article={})


@article_pages.route("/",methods=['POST'])
@article_pages.route("",methods=['POST'])
@article_pages.route(".json",methods=['POST'])
def create():
    article=Article()
    a=read_json(request)
    article.text=a["article"]["text"]
    db_session.add(article)
    db_session.commit()
    return jsonify(article=article)
bot/__init__.py (Normal file, 1 line added)
@@ -0,0 +1 @@
from .bot import bot
bot/bot.py (Normal file, 140 lines added)
@@ -0,0 +1,140 @@
import telepot
import datetime
import time
import json
from Queue import Queue
#import os
from src import lg,cfg
#from gevent import spawn
from telepot.namedtuple import InlineKeyboardMarkup, InlineKeyboardButton
from telepot.delegate import (
    per_chat_id, pave_event_space, include_callback_query_chat_id, create_open, per_inline_from_id )
from src.compiler import CrawlUrl
from gevent import spawn, monkey, Greenlet

def IKB(h):
    return InlineKeyboardButton(text=h["text"], callback_data=h["callback_data"])

def IKB2(h):
    return [IKB(h)]
def IKM(h):
    return InlineKeyboardMarkup(inline_keyboard=[ map(IKB,h)])

def IKM2(h):
    return InlineKeyboardMarkup(inline_keyboard= map(IKB2,h))


def query_que_url(url):
    print(json.dumps(url))
    return {"text": url.url, "callback_data":"/urls/"+str(url.id)+"/que"}

def handle_urls(handler, cmd):
    curls=CrawlUrl.query.all()
    #sent=handler.sender.sendMessage(json.dumps(curls))
    kb= IKM2(map(query_que_url,curls))
    print json.dumps(cmd)
    if len(cmd) >= 4 and cmd[3]=="que":
        sent=handler.sender.sendMessage("I queued url "+str(cmd[2]), reply_markup=None)
    else:
        sent=handler.sender.sendMessage("que?", reply_markup=kb)
    handler._edit_msg_ident = telepot.message_identifier(sent)
    handler._editor = telepot.helper.Editor(handler.bot, sent)

def execute_command(handler,cmd,msg=None):
    if cmd[1]=='urls':
        handle_urls(handler,cmd)


def handle(handler,msg):
    content_type,chat_type,chat_id = telepot.glance(msg)
    if msg.has_key('text'):
        if msg['text'][0]=='/':
            cmd = msg['text'].split("/")
            execute_command(handler, cmd, msg)
    if msg.has_key('data'):
        lg.debug(msg['data'])


class InlineHandler(telepot.helper.InlineUserHandler, telepot.helper.AnswererMixin):
    def __init__(self, *args, **kwargs):
        super(InlineHandler, self).__init__(*args, **kwargs)

    def on_inline_query(self, msg):
        def compute_answer():
            query_id, from_id, query_string = telepot.glance(msg, flavor='inline_query')
            print(self.id, ':', 'Inline Query:', query_id, from_id, query_string)

            articles = [{'type': 'article',
                         'id': 'abc', 'title': query_string, 'message_text': query_string}]

            return articles

        self.answerer.answer(msg, compute_answer)

    def on_chosen_inline_result(self, msg):
        from pprint import pprint
        pprint(msg)
        result_id, from_id, query_string = telepot.glance(msg, flavor='chosen_inline_result')
        print(self.id, ':', 'Chosen Inline Result:', result_id, from_id, query_string)


class FetBot(telepot.helper.ChatHandler):
    def __init__(self, *args, **kwargs):
        # super(FetBot,self).__init__(*args,**kwargs)
        super(FetBot,self).__init__( *args,**kwargs)

    _editor=None
    _edit_msg_ident=None
    keyboard=IKM([{"text":"START","callback_data": "start"},
                  {"text":"Don't Start","callback_data":"notstart"}
                  ])
    keyboard =InlineKeyboardMarkup(
        inline_keyboard=[[
            InlineKeyboardButton(text='START', callback_data='start'),
            InlineKeyboardButton(text='START', callback_data='start')
        ]]
    )
    def on_chat_message(self,msg):
        handle(self,msg)
        content_type,chat_type,chat_id = telepot.glance(msg)
        lg.debug(content_type)
        if content_type=="photo" or content_type=="sticker":
            lg.debug("try to download %s" % msg[content_type][-1]["file_id"])
            f=self.bot.getFile(msg[content_type][-1]['file_id'])
            lg.debug(f)
            self.bot.download_file(f['file_id'], "dwn/" + f['file_path'])
            # self.bot.getFile(msg['photo'][-1]['file_id']), "dwn")
        #self._cancel_last()
        #sent=self.sender.sendMessage("Hello World", reply_markup=self.keyboard)
        #self._editor = telepot.helper.Editor(self.bot, sent)
        #self._edit_msg_ident = telepot.message_identifier(sent)

    def on_callback_query(self, msg):
        query_id, from_id, query_data = telepot.glance(msg, flavor='callback_query')
        lg.debug(json.dumps(msg))
        self._cancel_last()
        if query_data[0]=='/':
            cmd = query_data.split("/")
            execute_command(self, cmd, msg)

        # self.sender.sendMessage("Danke")
        self.bot.answerCallbackQuery(query_id, text='Ok. But I am going to keep asking.')
        #self.bot.answerCallbackQuery(query_id)
    def _cancel_last(self):
        if self._editor:
            self._editor.editMessageReplyMarkup(reply_markup=None)
            self._editor = None
            self._edit_msg_ident = None




bot=None
bot = telepot.DelegatorBot(cfg.token, [include_callback_query_chat_id(pave_event_space())(per_chat_id(),create_open,FetBot,timeout=20),
    pave_event_space()(
        per_inline_from_id(), create_open, InlineHandler, timeout=10),
    ])
compiler/README (Normal file, 10 lines added)
@@ -0,0 +1,10 @@
This is the API for the compiler.
The following commands are implemented:
GET doc: this documentation!
GET initdb: initializes the database; WARNING, existing data is deleted
POST urls:
    Expects data in the format {"url": {"type": typ, "url": "someurl.html"}}
    Adds this URL to the set of watched URLs

IN PROCESS:
GET urls: all URLs that are to be watched
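For illustration only (not part of the commit), a minimal sketch of registering and queueing a URL through this API with Python's requests. The base address and the "fetindex" type value are assumptions; note that CrawlUrlSchema in compiler/models.py names the type field "tpe", so that key is used in the payload rather than the "type" key mentioned above.

import requests

BASE = "http://localhost:5000/compiler"  # assumed host/port; the blueprint is mounted at /compiler

# register a URL to watch (the schema expects the key "tpe" for the source type)
payload = {"url": {"tpe": "fetindex", "url": "http://example.org/news.html"}}
r = requests.post(BASE + "/urls", json=payload)
print r.json()  # echoes the stored CrawlUrl and the submitted data

# queue it for fetching; the id is assumed to come back in the response above
url_id = r.json()["url"]["id"]
requests.get(BASE + "/urls/%d/que" % url_id)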
compiler/README.html (Normal file, 1 line added)
@@ -0,0 +1 @@
sdf
compiler/__init__.py (Normal file, 15 lines added)
@@ -0,0 +1,15 @@


#from mprocess import do_process, process_urllist
#from compiler import do_compile
#from mworker import run_fetch, run_process, run_compile

# include models for final objects
from src.models import Article
# starting workers
from mworker import start_workers

from models import add_url, CrawlUrl
#start_workers(1,1,1)

from fetching import announce_articleid
compiler/comp/__init__.py (Normal file, 1 line added)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/__init__py (Normal file, 1 line added)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/rss.py (Normal file, 8 lines added)
@@ -0,0 +1,8 @@
import feedparser

def rssfeed(url,raw):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
compiler/compile.py (Normal file, 153 lines added)
@@ -0,0 +1,153 @@
from bs4 import BeautifulSoup
import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
def hello():
    return "hello"


def fetarticle(o):
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=o.url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())
    h=sp.find("meta", {"property": "og:image"})

    if h is not None:
        d["image"]=h.attrs["content"]

    hh=sp.find_all("div", {"class":"media"})
    for h in hh:
        if h is not None:
            h=h.find("div", {"class": "pull-left"})
            if h is not None:
                h=h.find("a")
                if h is not None:
                    d["image2"]=crawler.objects.models.download_file(h.attrs["href"])
    return {"article": d}

def fsarcharticle(o):
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=o.url
    d["published"]=None
    h=sp.find("article")
    h=h.find("div", {"class": "content"})
    d["text"]=h.encode_contents().strip()
    h=sp.find("article").find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    else:
        d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}

def fetindex(o):
    # if type(o) is not Object:
    #     raise TypeError
    if o.raw is None:
        raise ValueError("object has no raw content")
    print "compile_fetindex"
    html=BeautifulSoup(o.raw_fixed)
    h = html.find("li", {"class": "next_page" })
    if h is not None:
        nl=h.find("a")
        nl=crawler.objects.models.fix_link(o.url,nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    links=h.find_all("a")
    al = []
    for t in links:
        al.append(t.attrs["href"])
    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index" }

def fsarchindex(o):
    if o.raw is None:
        raise ValueError("object has no raw content")
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        url=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", url):
            fl.append(t.attrs["href"])

    return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}


def fsbizindex(o):
    if o.raw is None:
        raise ValueError("object has no raw content")
    print "compile_fsbizindex"
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": o.url,"article_links": al,"objecttype": "index"}


def fsmbindex(o):
    if o.raw is None:
        raise ValueError("object has no raw content")
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("a",{"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div",{"class": "block"})
        articles=[]
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"},recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip()
            aa["section"]="FSMB"
            articles.append(aa)
    return {"url": o.url, "next_page": np, "articles": articles,"objecttype": "articles"}
compiler/compiler.py (Normal file, 258 lines added)
@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
import feedparser  # needed by htufeed() below
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json
def do_compile(tpe, cont):
    if type(cont) != dict:
        clogger.error("Type Error for do compile for :"+str(cont["url"]))
    # Starting to compile a generic object
    if "url" not in cont:
        clogger.error("no url, can't compile "+tpe)
    else:
        clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
        if tpe in compiler:
            cont=compiler[tpe](cont["url"], cont["raw"])
    return cont

from comp import rssfeed

def dummyarticle(url, raw):
    return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}



def htufeed(url,raw):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}


def htuarticle(url,raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    h=sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"]=parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h=h.find("a")
        if h is not None:
            d["author"]=h.text.strip()
    h=sp.find("div", {"class": "foswikiTopic"})
    h1=h.find("h4")
    if h1 is not None:
        d["title"]= h1.text.strip()
        h1.extract() # remove head
    else:
        h1=sp.find("meta", {"name": "WEBTOPIC"})
        d["title"]= h1.attrs["content"]
    d["text"]=(h.encode_contents()).strip()
    d["section"]="HTU"
    d["url"]=url
    # clogger.debug(d)
    return {"article": d}


def fetarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())

    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    # hh=sp.find_all("div", {"class":"media"})
    # for h in hh:
    #     if h is not None:
    #         h=h.find("div", {"class": "pull-left"})
    #         if h is not None:
    #             h=h.find("a")
    #             if h is not None:
    #                 d["image2"]=downloadfile(fix_link(url,h.attrs["href"]))
    return {"article": d}


def fsarcharticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=url
    d["published"]=None
    h=sp.find("article")
    if h is not None:
        h=h.find("div", {"class": "content"})
        d["text"]=h.encode_contents().strip()
    h=sp.find("article")
    if h is not None:
        h=h.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    else:
        d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}

def fetindex(url, raw):
    if raw is None:
        raise ValueError("no raw content to compile")
    # clogger.debug("compile_fetindex: "+str(url))
    html=BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page" })
    if h is not None:
        nl=h.find("a")
        nl=fix_link(url,nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links=h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }

def fsarchindex(url, raw):
    if raw is None:
        raise ValueError("no raw content to compile")
    html=BeautifulSoup(raw)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        url=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", url):
            fl.append(t.attrs["href"])

    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}


def fsbizindex(url, raw):
    if raw is None:
        raise ValueError("no raw content to compile")
    print "compile_fsbizindex"
    html=BeautifulSoup(raw)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url,"article_links": al,"objecttype": "index"}




def fbfeed(url, raw):
    js = json.loads(raw)
    arts=[]
    u=urlparse.urlparse(url)
    for m in js["data"]:
        aa={}
        aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"",""))
        aa["published"] =parse(m["created_time"])
        if m.has_key("message")==True:
            aa["text"] = m["message"]
        else:
            try:
                h=graph.get_object(id=m["id"].split("_")[1])
                if h.has_key("description"):
                    aa["text"]=h["description"]
                else:
                    aa["text"]=json.dumps(h)
            except GraphAPIError:
                aa["text"]=""
        if m.has_key("story")==True:
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"]="Facebook: "+u[1]
        arts.append(aa)
    return {"url": url, "next_page": js["paging"]["next"],"articles": arts}

def fsmbindex(url, raw):
    if raw is None:
        raise ValueError("no raw content to compile")
    html=BeautifulSoup(raw)
    h= html.find("a",{"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div",{"class": "block"})
        articles=[]
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"},recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip()
            aa["section"]="FSMB"
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"}

compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

compiler = cfg.compiler
for i in compiler:
    compiler[i]=eval(compiler[i])



article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
compiler/fetching.py (Normal file, 67 lines added)
@@ -0,0 +1,67 @@
from requests import session
s=session()
from src import package_directory, download_path,cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse
def announce_articleid(id):
    for u in cfg.announcearticle_url:
        s.get( u % id)

def downloadfile(url):
    relative_name=path.join("downloads",str(md5(url).hexdigest()),url.split('/')[-1])
    local_filename = path.join(download_path,relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name

from models import CrawlCache
from datetime import datetime, timedelta



def fetch_page(furl):
    current_time = datetime.utcnow()
    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
    u=urlparse.urlparse(furl)
    if u[0] == '':
        furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
    if cc is None:
        clogger.debug("fetching url: "+ str(furl))
        if u[0]=='fb':
            tx = json.dumps(graph.get_object(id=u[1]+u[2]))
        else:
            tx=s.get(furl).text
        CrawlCache.store(furl,tx)
    else:
        #if furl is not None:
        #    clogger.debug("cache hit")
        tx=cc.raw
    return tx

def fetch_load_file(furl, path):
    try:
        clogger.info("Downloading "+ str(furl))
        r = s.get(furl, stream=True)
        f = open(path, 'wb')
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
        f.close()
    except Exception, e:
        #clogger.error("Error occurred during fetching:"+str(furl))
        clogger.error(e,exc_info=True)
compiler/fixing.py (Normal file, 37 lines added)
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach

def fix_link(url, link):
    r= urlparse(link)
    if r.scheme is None or r.scheme == '':
        return urljoin(url,link)
    else:
        return link

def fix_file(url, link):
    u=fix_link(url,link)
    return downloadfile(u)

def load_file(url, link):
    return fix_file(url,link)


def fix_html(html, baseurl):
    html=bleach.clean(html, tags=['b','p','span','a','img','div','br','strong','ul','li'], strip=True)
    sp=BeautifulSoup(html)
    images=sp.find_all("img")
    for t in images:
        if "src" in t.attrs and t.attrs["src"] is not None:
            t.attrs["src"]=fix_file(baseurl,t.attrs["src"])
    links=sp.find_all("a")
    for t in links:
        if "href" in t.attrs:
            t.attrs["href"]=fix_link(baseurl, t.attrs["href"])
    for t in sp.find_all("script"):
        t.extract()
    b=sp.find("base")
    if b is not None:
        b.attrs["href"]=""
    return sp
compiler/models.py (Normal file, 75 lines added)
@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema,fields,ValidationError
import json
import flask

def add_url(tpe, url):
    cu=CrawlUrl.find_or_create(tpe,url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()


class CrawlUrlSchema(Schema):
    id=fields.Integer()
    tpe=fields.String()
    url=fields.String()
    last_fetched=fields.DateTime()
    fetched = fields.DateTime()

class CrawlUrl(Base2):
    __tablename__='crawlurls'
    id = Column(Integer, primary_key=True)
    tpe=Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)
    def fetched(self):
        return CrawlCache.query.filter(CrawlCache.url==self.url).first()
    @classmethod
    def find_or_create(self, tpe, url):
        aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
        if aa is None:
            aa=CrawlUrl(tpe,url)
        return aa
    def schedule(self):
        put_fetch_queue((0, self.tpe, self.url))
    def __init__(self, tpe, url):
        self.url=url
        self.tpe=tpe
    def __json__(self):
        return CrawlUrlSchema().dump(self)[0]

class CrawlCacheSchema(Schema):
    id=fields.Integer()
    raw=fields.String()
    url=fields.String()
    fetched=fields.DateTime()

class CrawlCache(Base2):
    __tablename__='crawlcache'
    id = Column(Integer, primary_key=True)
    url=Column(String(250))
    fetched=Column(DateTime)
    raw=Column(Text)

    def __init__(self, url,rw):
        self.url=url
        self.raw=rw
        self.fetched=datetime.utcnow()
    def __json__(self):
        return CrawlCacheSchema().dump(self)

    @classmethod
    def store(cls, url, rw):
        cc=CrawlCache(url,rw)
        db_session2.add(cc)
        db_session2.commit()




#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)
compiler/mprocess.py (Normal file, 74 lines added)
@@ -0,0 +1,74 @@
from src import clogger  # logger for the crawler
from src.models import Article  # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file

from compiler import article_types
from fixing import fix_link
# process_article expects a hash with raw data for the article and turns it into an
# Article object stored in the database; the fingerprint lookup is intended to prevent duplicates

def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h

def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash:" + str(art))
        aa=None
    else:
        art["text"]=fix_html(art["text"],art["url"])
        if "image" in art:
            art["image"]=fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched=datetime.now()
        aa.sourcetype=art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
        # announce_articleid(aa.id)
    return aa

# process a single found url
def process_url(url,tpe, parent_url):
    #clogger.debug("process URL of type "+ tpe + ": " + url)
    if parent_url is not None:
        url=fix_link(parent_url, url)
    put_fetch_queue((0,tpe,url))


# process a url list
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u,tpe, parent_url)


def do_process(tpe, cont):
    urllist=[]
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe , cont["url"])

    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"],tpe, cont["url"])

    if "article" in cont:
        art=cont["article"]
        art["sourcetype"]=tpe
        process_article(art)

    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"]=tpe
                if a.has_key("url")==False:
                    a["url"]=cont["url"]
                process_article(a)
    return
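For orientation, a sketch (not in the commit) of the article hash that process_article above consumes. The required keys follow is_article_hash, the optional ones are read by Article.from_hash and Article.process_hash, and all values here are invented:

# Illustrative only: key names taken from is_article_hash()/process_hash() above, values are made up.
example_article = {
    "text": "<p>raw article body</p>",        # required; run through fix_html() before storing
    "url": "http://example.org/news/1.html",   # required; also part of the fingerprint
    "sourcetype": "fetarticle",                # required; normally set by do_process()
    "section": "FET - News",                   # required; resolved via Section.find_or_create()
    "title": "An example article",             # used for the duplicate fingerprint
    "published": None,                         # read by Article.from_hash() / calc_fingerprint_h()
    "author": "someone",                       # optional
    "image": "http://example.org/img.png",     # optional; run through fix_file()
}
# process_article(example_article) should then create or update the matching Article row.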
compiler/mqueues.py (Normal file, 8 lines added)
@@ -0,0 +1,8 @@
from gevent.queue import Queue, JoinableQueue
fetch_queue = Queue()
compile_queue = Queue()
process_queue = Queue()

def put_fetch_queue(o):
    fetch_queue.put(o)
compiler/mworker.py (Normal file, 58 lines added)
@@ -0,0 +1,58 @@

from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from src import clogger
def start_workers(f,c,p):
    for _ in range(f):
        clogger.debug("spawn fetchworker")
        spawn(work_fetch)
    for _ in range(c):
        spawn(work_compile)
    for _ in range(p):
        spawn(work_process)

def work_fetch():
    while True:
        run_fetch()

def work_process():
    while True:
        run_process()
def work_compile():
    while True:
        run_compile()


def queue_url(tpe, url):
    fetch_queue.put((0,tpe,url))


# fetch a page from the url list
def run_fetch():
    tc, tpe, url = fetch_queue.get()
    if tpe != "dummyarticle" and tpe != "dummyindex":
        rw=fetch_page(url)
    else:
        rw="<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
    return rw
    # fetch_queue.task_done()

# compile something from the compile queue
def run_compile():
    tc,tpe,h = compile_queue.get()
    h=do_compile(tpe,h)
    process_queue.put((0,tpe, h))
    return h
    # compile_queue.task_done()

def run_process():
    tc,tpe,h = process_queue.get()
    do_process(tpe, h)
    return h
    # process_queue.task_done()
compiler/views.py (Normal file, 146 lines added)
@@ -0,0 +1,146 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
compiler_pages = Blueprint('compiler', __name__,
                           template_folder='.')

from src.database import db_session2,init_db,read_json,init_db2
from .models import CrawlUrl
from .models import CrawlCache, CrawlCacheSchema
from .models import CrawlUrlSchema
from src import clogger
from src.articles import Article
#import mworker
import flask
import json
import mworker

from compiler import do_compile
from fetching import fetch_page

#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)

@compiler_pages.route("/")
@compiler_pages.route("")
@compiler_pages.route(".json")
def index():
    status="For documentation goto /doc"
    return jsonify(status=status)

@compiler_pages.route("/doc")
@compiler_pages.route("/doc.json")
def doc():
    return render_template("README")
    # return jsonify(status=render_template("README"))


@compiler_pages.route("/initdb")
@compiler_pages.route("/initdb.json")
def initdb_json():
    init_db()  # initialize the main database
    status="Database reinitialized"
    return jsonify(status=status)

@compiler_pages.route("/initdb2")
@compiler_pages.route("/initdb2.json")
def initdb_json2():
    init_db2()  # initialize the crawl-url database
    status="Database reinitialized"
    return jsonify(status=status)

@compiler_pages.route("/start")
@compiler_pages.route("/start.json")
def start_json():
    mworker.start_workers(1,1,1)  # start the worker greenlets
    status="Workers started"
    return jsonify(status=status)


@compiler_pages.route("/urls")
@compiler_pages.route("/urls.json")
def urls_index_json():
    # load all urls
    status=CrawlUrl.query.all()
    return jsonify(urls=status)

# show an existing CrawlUrl
@compiler_pages.route("/urls/<int:id>")
@compiler_pages.route("/urls/<int:id>.json")
def urls_json(id):
    # load the url
    status=CrawlUrl.query.get(id)
    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
    return jsonify(urls=status, cache=cc.__json__())

# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
    # load the url
    cu=CrawlUrl.query.get(id)
    mworker.queue_url(cu.tpe, cu.url)
    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
    mworker.start_workers(1,1,1)  # start the worker greenlets
    status="Workers started"
    return jsonify(urls=cu, cache=cc)


# fetch and compile an existing CrawlUrl and return the result (for testing)
@compiler_pages.route("/urls/<int:id>/test")
@compiler_pages.route("/urls/<int:id>/test.json")
def urls_test_json(id):
    # load the url
    cu=CrawlUrl.query.get(id)
    rw=fetch_page(cu.url)
    h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
    h2=do_compile(cu.tpe, h)
    return jsonify(urls=cu,hs=h2,rw=rw)




@compiler_pages.route("/debug",methods=['GET','PUT'])
def debug():
    status="did nothing"
    js=read_json(request)
    clogger.info(request.get_json())
    if js["cmd"] == "runfetch":
        mworker.run_fetch()
        status="fetched something"
    if js["cmd"] == "que":
        cu = CrawlUrl.query.get(js["id"])
        mworker.queue_url(cu.tpe, cu.url)
        status= mworker.run_fetch()
    if js["cmd"] == "comp":
        status=mworker.run_compile()
    if js["cmd"]=="process":
        status=mworker.run_process()
    return jsonify(status=status)

@compiler_pages.route("/debugurl")
def debugurl():
    s=CrawlUrlSchema()
    status=CrawlUrl.query.all()
    return jsonify(status=status)


@compiler_pages.route("/urls",methods=['POST'])
def add_urls():
    # read the request data
    js =read_json(request)
    # clogger.info(js)
    # find or create the URL in the database
    url=CrawlUrlSchema().load(js["url"])
    clogger.info(url)
    url=CrawlUrl.find_or_create(url.data["tpe"], url.data["url"])
    db_session2.add(url)
    db_session2.commit()
    return jsonify(url=url, kk=js)

@compiler_pages.route("/urls/<int:id>",methods=['DELETE'])
@compiler_pages.route("/urls/<int:id>.json",methods=['DELETE'])
def delete(id):
    cu=CrawlUrl.query.get(id)
    if cu != None:
        db_session2.delete(cu)
        db_session2.commit()
    return jsonify(url={})
crawler/__init__.py (Normal file, 4 lines added)
@@ -0,0 +1,4 @@


def init():
    return " "
database.py (Normal file, 55 lines added)
@@ -0,0 +1,55 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from src import package_directory,clogger, cfg
from os import path
import json
#engine = create_engine('sqlite:////home/andreas/www/crawler/test.db', convert_unicode=True)

if cfg.get("db_path")==None or cfg.get("db_path").strip()=="":
    db_path=package_directory
else:
    db_path=cfg.get("db_path")

db_mainfile=cfg.get("db_mainfile")
if db_mainfile == None or db_mainfile.strip()=="":
    db_mainfile="../srctest.db"

db_urlfile=cfg.get("db_mainfile")
if db_urlfile == None or db_urlfile.strip()=="":
    db_urlfile="../srctest_cu.db"


engine = create_engine('sqlite:///'+ path.join(db_path,db_mainfile), convert_unicode=True)

db_session = scoped_session(sessionmaker(autocommit=False,
                                         autoflush=False,
                                         bind=engine))

engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)

db_session2 = scoped_session(sessionmaker(autocommit=False,
                                          autoflush=False,
                                          bind=engine2))

Base = declarative_base()
Base.query = db_session.query_property()
Base2 = declarative_base()
Base2.query = db_session2.query_property()

def read_json(rq):
    js=rq.get_json()
    clogger.info(rq.data)
    if js is None:
        js=rq.form.to_dict()
        if js=={} and rq.data != "":
            js=json.loads(rq.data)
    return js

def init_db():
    import src.models
    Base.metadata.create_all(bind=engine)

def init_db2():
    from .compiler.models import CrawlUrl, CrawlCache
    Base2.metadata.create_all(bind=engine2)
fb.py (Normal file, 4 lines added)
@@ -0,0 +1,4 @@
from src import cfg
import facebook

graph = facebook.GraphAPI(access_token=cfg.fb_token, version='2.3')
meta.py (Normal file, 21 lines added)
@@ -0,0 +1,21 @@
import os
package_directory = os.path.dirname(os.path.abspath(__file__))
from config import Config
import logging
import sys

cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
#--------------- Logging


file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.INFO)
std_handler=logging.StreamHandler(stream=sys.stdout)
std_handler.setLevel(logging.DEBUG)

lg=logging.getLogger('mylogger')
lg.setLevel(logging.DEBUG)
lg.addHandler(file_handler)
lg.addHandler(std_handler)

#----------------
models.py (Normal file, 4 lines added)
@@ -0,0 +1,4 @@

from .articles.model import Article
from .sections.model import Section
from .compiler.models import CrawlUrl, CrawlCache
sections/__init__.py (Normal file, 1 line added)
@@ -0,0 +1 @@
from .model import Section
sections/model.py (Normal file, 44 lines added)
@@ -0,0 +1,44 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship

from datetime import datetime
from src.database import Base,db_session
from marshmallow import Schema, fields

import json
import flask

#from src.articles import Article

class SectionSchema(Schema):
    id=fields.Integer()
    foreign_name=fields.String()
    name=fields.String()

class Section(Base):
    __tablename__ = 'sections'
    id = Column(Integer, primary_key=True)
    url = Column(String(250))
    crawlurl = Column(Integer)
    foreign_name = Column(String(250),unique=True)
    name=Column(String(250))
    group = Column(String(250))
    articles=relationship("Article", back_populates="section")

    def __json__(self):
        return SectionSchema().dump(self)[0]
    def __init__(self, url=None,fname=None):
        self.url=url
        self.foreign_name=fname

    @classmethod
    def find_or_create(cls, fname):
        s=Section.query.filter(Section.foreign_name==fname).first()
        if s is None:
            s=Section(fname)
            db_session.add(s)
            db_session.commit()
            s.foreign_name=fname
            db_session.add(s)
            db_session.commit()
        return s
sections/views.py (Normal file, 37 lines added)
@@ -0,0 +1,37 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
section_pages = Blueprint('sections', __name__)
from .model import Section
from .model import SectionSchema
#import flask
from datetime import datetime
import json
from src import clogger

from src.database import db_session, read_json
import flask

@section_pages.route("/")
@section_pages.route("")
@section_pages.route(".json")
def index():
    sections=Section.query.all()
    return jsonify(sections=sections)

@section_pages.route("/<int:id>",methods=['PUT'])
@section_pages.route("/<int:id>.json",methods=['PUT'])
def update(id):
    section=Section.query.get(id)
    clogger.info(request.data)
    a=request.get_json()
    section.text=a["text"]
    db_session.commit()
    return jsonify(section=section)


@section_pages.route("/<int:id>",methods=['GET'])
@section_pages.route("/<int:id>.json",methods=['GET'])
def get(id):
    section=Section.query.get(id)
    clogger.info(section)
    # section=SectionSchema().dump(section)[0]
    return jsonify(section=section,articles=section.articles)
templates/home.html (Normal file, 1 line added)
@@ -0,0 +1 @@
<h1>Hello World</h1>
users/users.py (Normal file, 19 lines added)
@@ -0,0 +1,19 @@

class User(object):
    def __init__(self, id, username, password):
        self.id = id
        self.username = username
        self.password = password

    def __str__(self):
        return "User(id='%s')" % self.id

user = User(1, 'user', 'password')
def authenticate(username, password):
    if username == user.username and password == user.password:
        return user

def identity(payload):
    return user