init commit
.gitignore (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
/__init__.py~
/__init__.pyc
*.pyc
*~
config.cfg
__init__.py (new file, 101 lines)
@@ -0,0 +1,101 @@
import os
import sys
package_directory = os.path.dirname(os.path.abspath(__file__))
from config import Config

cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
#--------------- Logging
import logging
download_path="./cdw"
file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.DEBUG)
stream_handler=logging.StreamHandler(sys.stdout)

clt=logging.getLogger('mylogger')
clt.setLevel(logging.DEBUG)
clt.addHandler(file_handler)
clt.addHandler(stream_handler)

clogger=clt
#----------------
lg=clt
from gevent import spawn, monkey
monkey.patch_all()
from .compiler import start_workers
#start_workers(1,1,1)


# Framework
from flask import Flask, jsonify, render_template, redirect, request, send_from_directory
# Cross Site Scripting
from flask_cors import CORS, cross_origin
#Authentication
from flask_jwt import JWT, jwt_required, current_identity

from src.models import Article, Section
from src.users import authenticate, identity
from datetime import datetime

app = Flask(__name__)
CORS(app)
app.config['LOGGER_NAME']='mylogger'
app.logger.setLevel(logging.DEBUG)
app.logger.info("Server Started")

app.config['SECRET_KEY'] = 'super-secret'
import flask
import json
from database import Base
from models import Article, CrawlUrl, CrawlCache

flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article, Section, CrawlUrl, CrawlCache)) else None)
json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article, CrawlUrl, CrawlCache)) else None)

#bot.dosmth()
#lg.debug(bot.bot)


# Allow Cross Site Scripting
@app.after_request
def after_request(response):
    response.headers.add('Access-Control-Allow-Origin', '*')
    if request.method == 'OPTIONS':
        response.headers['Access-Control-Allow-Methods'] = 'DELETE, GET, POST, PUT'
        headers = request.headers.get('Access-Control-Request-Headers')
        if headers:
            response.headers['Access-Control-Allow-Headers'] = headers
    return response

from .articles.views import article_pages
from .sections.views import section_pages
from .compiler.views import compiler_pages


@app.route("/")
@app.route("/index")
@app.route("/home")
def home():
    text="It work's, please do something"
    return jsonify(text=text)

app.register_blueprint(article_pages, url_prefix='/articles')
app.register_blueprint(section_pages, url_prefix='/sections')
app.register_blueprint(compiler_pages, url_prefix='/compiler')


from src.bot import bot
if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
    bot.message_loop()


# ------------ Telegram Bot
#from bot import bot_queue
#@app.route('/bot', methods=['GET', 'POST'])
#def pass_update():
#    bot_queue.put(request.data)  # pass update to bot
#    return 'OK'
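The module above loads all of its settings through the config package (cfg = Config(file(os.path.join(package_directory, 'config.cfg')))), but config.cfg itself is git-ignored and not part of this commit. The sketch below is a hypothetical example of such a file; the key names are the ones actually read across this commit (cfg.logfile, cfg.token, cfg.fb_token, cfg.cache_days, cfg.announcearticle_url, cfg.compiler, cfg.get("db_path"), cfg.get("db_mainfile")), while every value and the exact config-package syntax are assumptions.

# hypothetical config.cfg sketch; all values are placeholders
logfile: 'crawler.log'                  # logging setup in __init__.py / meta.py
token: 'TELEGRAM-BOT-TOKEN'             # telepot.DelegatorBot in bot/bot.py
fb_token: 'FACEBOOK-GRAPH-TOKEN'        # facebook.GraphAPI in fb.py
cache_days: 70                          # CrawlCache expiry window in compiler/fetching.py
announcearticle_url: ['http://localhost:5000/articles/%s']   # announce_articleid() targets
db_path: ''                             # empty means "use the package directory" (database.py)
db_mainfile: '../srctest.db'            # SQLite file for the main models
compiler: {'fetindex': 'fetindex', 'fetarticle': 'fetarticle'}  # name-to-function map eval'd in compiler/compiler.py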
articles/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
from .model import Article
from .views import article_pages
articles/model.py (new file, 139 lines)
@@ -0,0 +1,139 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from src.database import Base
from src.database import db_session
from marshmallow import Schema, fields
from src.sections.model import Section

#import json
import json
import flask
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
import hashlib
from dateutil.parser import parse  # parse() is called in calc_fingerprint_h() and from_hash() below

#import clogger
import logging
#from crawler.compiler.mqueues import put_fetch_queue
from src import clogger
#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)


def calc_fingerprint(a):
    return calc_fingerprint_h({"url": a.url, "title": a.title, "published": str(a.published_date)})


def calc_fingerprint_h(a):
    if a["published"] is not None and a["published"] != "None":
        # clogger.info( "published:"+str(a["published"]))
        if a["published"] is str:
            pp=parse(a["published"])
        else:
            pp=a["published"]
    else:
        pp=""
    #clogger.info( unicode(a["url"])+ unicode(a["title"])+unicode(pp))
    h=hashlib.md5()
    h.update(unicode(a["url"]))
    h.update(a["title"].encode("utf-8"))
    h.update(unicode(pp))
    return h.hexdigest()


class ArticleSchema(Schema):
    id=fields.Integer()
    text=fields.String()
    title=fields.String()
    author=fields.String()
    sourcetype=fields.String()
    image=fields.String()
    url=fields.String()
    published_date=fields.DateTime()
    date=fields.DateTime()
    first_fetched=fields.DateTime()
    section_id=fields.Integer()


class Article(Base):
    __tablename__ = 'articles'
    id = Column(Integer, primary_key=True)
    parent_id= Column(Integer)
    url = Column(String(250))
    is_primary = Column(Boolean)
    fingerprint = Column(String(250), unique=True)
    hash = Column(String(250))
    last_fetched = Column(DateTime)
    first_fetched=Column(DateTime)
    published_date = Column(DateTime)
    date = Column(DateTime)
    text = Column(Text)
    title = Column(String(250))
    author = Column(String(250))
    section = relationship("Section")
    section_id=Column(Integer, ForeignKey('sections.id'))
    sourcetype = Column(String(250))
    image=Column(String(250))

    def __init__(self, url=None, title=None, published_date=None):
        self.url=url
        self.title=title
        self.published_date=published_date
        self.first_fetched=datetime.now()

    def __json__(self):
        return ArticleSchema().dump(self)[0]

    def dict(self):
        return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section": self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date, "image": self.image, "url": self.url}

    # @classmethod
    # def sections(self):
    #     sects=db_session.query(Article.section).distinct().all()
    #     for i in range(len(sects)):
    #         sects[i]=sects[i][0]
    #     return sects

    @classmethod
    def from_hash(cls, a):
        fp = calc_fingerprint_h(a)
        aa = Article.query.filter(Article.fingerprint==fp).first()
        if aa is None:
            clogger.debug("new Article")
            if a["published"] is not None:
                if a["published"] is str:
                    pd=parse(a["published"])
                else:
                    pd=a["published"]
            else:
                pd=None
            aa=Article(a["url"], a["title"], pd)
            aa.fingerprint = calc_fingerprint(aa)
            db_session.add(aa)
            db_session.commit()
        return aa

    def process_hash(self, a):
        self.text=a["text"].decode('utf8')
        if "image" in a:
            self.image=a["image"]
        if "author" in a:
            self.author=a["author"]
        if "title" in a:
            self.title=a["title"]
        if "author" in a:
            self.author=a["author"]
        if "sourcetype" in a:
            self.sourcetype=a["sourcetype"]
        if "section" in a:
            self.section=Section.find_or_create(a["section"])
        # if "last_fetched" in a:
        #     self.last_fetched=a["last_fetched"]
        if "published_date" in a:
            self.published_date=a["published_date"]


#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
articles/views.py (new file, 65 lines)
@@ -0,0 +1,65 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
article_pages = Blueprint('articles', __name__)
from .model import Article
from .model import ArticleSchema
#import flask
from datetime import datetime
import json

#flask.json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None)
#flask.json.JSONEncoder.default = lambda self,obj: ((obj.dict()) if isinstance(obj, Article) else None)
from src import clogger
import json
from src.database import db_session, read_json
import flask

#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article, CrawlUrl)) else None)


@article_pages.route("/")
@article_pages.route("")
@article_pages.route(".json")
def index():
    articles=Article.query.all()
    return jsonify(articles=articles)


@article_pages.route("/<int:id>", methods=['PUT'])
@article_pages.route("/<int:id>.json", methods=['PUT'])
def update(id):
    article=Article.query.get(id)
    clogger.info(request.data)
    a=request.get_json()
    article.text=a["text"]
    db_session.commit()
    return jsonify(article=article)


@article_pages.route("/<int:id>", methods=['GET'])
@article_pages.route("/<int:id>.json", methods=['GET'])
def get(id):
    article=Article.query.get(id)
    clogger.info(article)
    # article=ArticleSchema().dump(article)[0]
    return jsonify(article=article)


@article_pages.route("/<int:id>", methods=['DELETE'])
@article_pages.route("/<int:id>.json", methods=['DELETE'])
def delete(id):
    article=Article.query.get(id)
    clogger.info(id)
    if article != None:
        db_session.delete(article)
        db_session.commit()
    return jsonify(article={})


@article_pages.route("/", methods=['POST'])
@article_pages.route("", methods=['POST'])
@article_pages.route(".json", methods=['POST'])
def create():
    article=Article()
    a=read_json(request)
    article.text=a["article"]["text"]
    db_session.add(article)
    db_session.commit()
    return jsonify(article=article)
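A minimal client sketch for the blueprint above, assuming the app from __init__.py is running locally on Flask's default port 5000 with the blueprint mounted at /articles as in this commit; the payload keys mirror what create() and update() read via read_json()/get_json(), and the host, port and sample text are assumptions.

# hypothetical client for the articles blueprint
import requests

BASE = "http://localhost:5000/articles"

# list all articles (index)
print(requests.get(BASE).json())

# create an article; create() reads a["article"]["text"]
r = requests.post(BASE, json={"article": {"text": "<p>hello</p>"}})
article_id = r.json()["article"]["id"]

# update the text; update() reads a["text"] from the JSON body
requests.put("%s/%s" % (BASE, article_id), json={"text": "<p>edited</p>"})

# fetch it again, then delete it
print(requests.get("%s/%s" % (BASE, article_id)).json())
requests.delete("%s/%s" % (BASE, article_id))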
bot/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .bot import bot
bot/bot.py (new file, 140 lines)
@@ -0,0 +1,140 @@
import telepot
import datetime
import time
import json
from Queue import Queue
#import os
from src import lg, cfg
#from gevent import spawn
from telepot.namedtuple import InlineKeyboardMarkup, InlineKeyboardButton
from telepot.delegate import (
    per_chat_id, pave_event_space, include_callback_query_chat_id, create_open, per_inline_from_id )
from src.compiler import CrawlUrl
from gevent import spawn, monkey, Greenlet


def IKB(h):
    return InlineKeyboardButton(text=h["text"], callback_data=h["callback_data"])


def IKB2(h):
    return [IKB(h)]

def IKM(h):
    return InlineKeyboardMarkup(inline_keyboard=[map(IKB, h)])


def IKM2(h):
    return InlineKeyboardMarkup(inline_keyboard=map(IKB2, h))


def query_que_url(url):
    print(json.dumps(url))
    return {"text": url.url, "callback_data": "/urls/"+str(url.id)+"/que"}


def handle_urls(handler, cmd):
    curls=CrawlUrl.query.all()
    #sent=handler.sender.sendMessage(json.dumps(curls))
    kb= IKM2(map(query_que_url, curls))
    print json.dumps(cmd)
    if len(cmd) >= 4 and cmd[3]=="que":
        sent=handler.sender.sendMessage("I qued url "+str(cmd[2]), reply_markup=None)
    else:
        sent=handler.sender.sendMessage("que?", reply_markup=kb)
    handler._edit_msg_ident = telepot.message_identifier(sent)
    handler._editor = telepot.helper.Editor(handler.bot, sent)


def execute_command(handler, cmd, msg=None):
    if cmd[1]=='urls':
        handle_urls(handler, cmd)


def handle(handler, msg):
    content_type, chat_type, chat_id = telepot.glance(msg)
    if msg.has_key('text'):
        if msg['text'][0]=='/':
            cmd = msg['text'].split("/")
            execute_command(handler, cmd, msg)
    if msg.has_key('data'):
        lg.debug(msg['data'])


class InlineHandler(telepot.helper.InlineUserHandler, telepot.helper.AnswererMixin):
    def __init__(self, *args, **kwargs):
        super(InlineHandler, self).__init__(*args, **kwargs)

    def on_inline_query(self, msg):
        def compute_answer():
            query_id, from_id, query_string = telepot.glance(msg, flavor='inline_query')
            print(self.id, ':', 'Inline Query:', query_id, from_id, query_string)

            articles = [{'type': 'article',
                         'id': 'abc', 'title': query_string, 'message_text': query_string}]

            return articles

        self.answerer.answer(msg, compute_answer)

    def on_chosen_inline_result(self, msg):
        from pprint import pprint
        pprint(msg)
        result_id, from_id, query_string = telepot.glance(msg, flavor='chosen_inline_result')
        print(self.id, ':', 'Chosen Inline Result:', result_id, from_id, query_string)


class FetBot(telepot.helper.ChatHandler):
    def __init__(self, *args, **kwargs):
        # super(FetBot,self).__init__(*args,**kwargs)
        super(FetBot, self).__init__(*args, **kwargs)

    _editor=None
    _edit_msg_ident=None
    keyboard=IKM([{"text": "START", "callback_data": "start"},
                  {"text": "Don't Start", "callback_data": "notstart"}
                  ])
    keyboard=InlineKeyboardMarkup(
        inline_keyboard=[[
            InlineKeyboardButton(text='START', callback_data='start'),
            InlineKeyboardButton(text='START', callback_data='start')
        ]]
    )

    def on_chat_message(self, msg):
        handle(self, msg)
        content_type, chat_type, chat_id = telepot.glance(msg)
        lg.debug(content_type)
        if content_type=="photo" or content_type=="sticker":
            lg.debug("try to download %s" % msg[content_type][-1]["file_id"])
            f=self.bot.getFile(msg[content_type][-1]['file_id'])
            lg.debug(f)
            self.bot.download_file(f['file_id'], "dwn/" + f['file_path'])
            # self.bot.getFile(msg['photo'][-1]['file_id']), "dwn")
        #self._cancel_last()
        #sent=self.sender.sendMessage("Hello World", reply_markup=self.keyboard)
        #self._editor = telepot.helper.Editor(self.bot, sent)
        #self._edit_msg_ident = telepot.message_identifier(sent)

    def on_callback_query(self, msg):
        query_id, from_id, query_data = telepot.glance(msg, flavor='callback_query')
        lg.debug(json.dumps(msg))
        self._cancel_last()
        if query_data[0]=='/':
            cmd = query_data.split("/")
            execute_command(self, cmd, msg)

        # self.sender.sendMessage("Danke")
        self.bot.answerCallbackQuery(query_id, text='Ok. But I am going to keep asking.')
        #self.bot.answerCallbackQuery(query_id)

    def _cancel_last(self):
        if self._editor:
            self._editor.editMessageReplyMarkup(reply_markup=None)
            self._editor = None
            self._edit_msg_ident = None


bot=None
bot = telepot.DelegatorBot(cfg.token, [include_callback_query_chat_id(pave_event_space())(per_chat_id(), create_open, FetBot, timeout=20),
                                       pave_event_space()(
                                           per_inline_from_id(), create_open, InlineHandler, timeout=10),
                                       ])
compiler/README (new file, 10 lines)
@@ -0,0 +1,10 @@
This is the API for the compiler.
The following commands are implemented:
GET doc: This documentation!
GET initdb: Initialize the database. WARNING: data will be deleted.
POST urls:
    Expects data in the format {"url": {"type": typ, "url": "someurl.html"}}
    Adds this URL to the watch list.

IN PROCESS:
GET urls: All URLs that are to be watched.
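A hedged request sketch for "POST urls", assuming the blueprint is mounted at /compiler as in __init__.py and the server runs locally on port 5000. Note that add_urls() in compiler/views.py loads the payload through CrawlUrlSchema, whose fields are named tpe and url, so the key is written as "tpe" here even though the line above says "type"; the example URL and the type name "fetindex" (one of the keys of the compiler mapping) are assumptions.

# hypothetical client call for POST /compiler/urls
import requests

payload = {"url": {"tpe": "fetindex", "url": "http://example.org/news/index.html"}}
r = requests.post("http://localhost:5000/compiler/urls", json=payload)
print(r.json())   # echoes the stored CrawlUrl plus the parsed request body ("kk")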
compiler/README.html (new file, 1 line)
@@ -0,0 +1 @@
sdf
compiler/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@

#from mprocess import do_process, process_urllist
#from compiler import do_compile
#from mworker import run_fetch, run_process, run_compile

# include models for final objects
from src.models import Article
# starting workers
from mworker import start_workers

from models import add_url, CrawlUrl
#start_workers(1,1,1)

from fetching import announce_articleid
compiler/comp/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/__init__py (new file, 1 line)
@@ -0,0 +1 @@
from rss import rssfeed
compiler/comp/rss.py (new file, 8 lines)
@@ -0,0 +1,8 @@
import feedparser

def rssfeed(url, raw):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}
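rssfeed() only collects the entry links from an already-fetched feed body; the sketch below illustrates what it returns, redefining the same logic locally so nothing else from the package is needed. The two-item feed XML is made up.

# illustration of the rssfeed() behaviour; the feed content is invented
import feedparser

def rssfeed(url, raw):  # same logic as compiler/comp/rss.py
    al = []
    f = feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}

raw = """<rss version="2.0"><channel>
  <item><title>A</title><link>http://example.org/a</link></item>
  <item><title>B</title><link>http://example.org/b</link></item>
</channel></rss>"""

print(rssfeed("http://example.org/feed", raw))
# {'url': 'http://example.org/feed', 'next_page': None,
#  'article_links': ['http://example.org/a', 'http://example.org/b'], 'objecttype': 'index'}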
compiler/compile.py (new file, 153 lines)
@@ -0,0 +1,153 @@
from bs4 import BeautifulSoup
import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re

def hello():
    return "hello"


def fetarticle(o):
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=o.url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())
    h=sp.find("meta", {"property": "og:image"})

    if h is not None:
        d["image"]=h.attrs["content"]

    hh=sp.find_all("div", {"class": "media"})
    for h in hh:
        if h is not None:
            h=h.find("div", {"class": "pull-left"})
            if h is not None:
                h=h.find("a")
                if h is not None:
                    d["image2"]=crawler.objects.models.download_file(h.attrs["href"])
    return {"article": d}


def fsarcharticle(o):
    sp=BeautifulSoup(o.raw_fixed)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=o.url
    d["published"]=None
    h=sp.find("article")
    h=h.find("div", {"class": "content"})
    d["text"]=h.encode_contents().strip()
    h=sp.find("article").find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    else:
        d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}


def fetindex(o):
    # if type(o) is not Object:
    #     raise TypeError
    if o.raw is None:
        raise Error
    print "compile_fetindex"
    html=BeautifulSoup(o.raw_fixed)
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl=h.find("a")
        nl=crawler.objects.models.fix_link(o.url, nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    links=h.find_all("a")
    al = []
    for t in links:
        al.append(t.attrs["href"])
    return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index"}


def fsarchindex(o):
    if o.raw is None:
        raise Error
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        url=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", url):
            fl.append(t.attrs["href"])

    return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(o):
    if o.raw is None:
        raise Error
    print "compile_fsbizindex"
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": o.url, "article_links": al, "objecttype": "index"}


def fsmbindex(o):
    if o.raw is None:
        raise Error
    html=BeautifulSoup(o.raw_fixed)
    h= html.find("a", {"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div", {"class": "block"})
        articles=[]
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"}, recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1', unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
            aa["section"]="FSMB"
            articles.append(aa)
        return {"url": o.url, "next_page": np, "articles": articles, "objecttype": "articles"}
compiler/compiler.py (new file, 258 lines)
@@ -0,0 +1,258 @@
from bs4 import BeautifulSoup
#import crawler.objects.models
#from crawler.objects.models import Object
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
from src import clogger, cfg
from src.fb import graph
from fixing import fix_link
from facebook import GraphAPIError
#from fetching import downloadfile
import json
import feedparser  # used by htufeed() below

def do_compile(tpe, cont):
    if type(cont) != dict:
        clogger.error("Type Error for do compile for :"+str(cont["url"]))
    # Starting to compile a generic object
    if "url" not in cont:
        clogger.error("no url can't compile "+tpe)
    else:
        clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
        if tpe in compiler:
            cont=compiler[tpe](cont["url"], cont["raw"])
    return cont

from comp import rssfeed

def dummyarticle(url, raw):
    return {"url": url, "article": {"url": url, "section": "dummysection", "sourcetype": "dummy", "title": "dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}


def htufeed(url, raw):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
        al.append(e['link'])
    return {"url": url, "next_page": None, "article_links": al, "objecttype": "index"}


def htuarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    h=sp.find("div", {"class": "patternRevInfo"})
    if h is not None:
        # clogger.debug(h.text.strip())
        h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4', unicode(h.text.strip()))
        # clogger.debug(h1)
        d["published"]=parse(h1)
        # clogger.debug(parse(h1))
        # clogger.debug(d["published"])
        h=h.find("a")
        if h is not None:
            d["author"]=h.text.strip()
    h=sp.find("div", {"class": "foswikiTopic"})
    h1=h.find("h4")
    if h1 is not None:
        d["title"]= h1.text.strip()
        h1.extract() # remove head
    else:
        h1=sp.find("meta", {"name": "WEBTOPIC"})
        d["title"]= h1.attrs["content"]
    d["text"]=(h.encode_contents()).strip()
    d["section"]="HTU"
    d["url"]=url
    # clogger.debug(d)
    return {"article": d}


def fetarticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
    d["title"]=unicode(h.text).strip()
    h=sp.find("div", {"itemprop": "articleBody"})
    if h is not None:
        d["text"]=(h.encode_contents()).strip()
    else:
        d["text"]=""
    d["url"]=url
    h=sp.find("span", {"itemprop": "author"})
    if h is not None:
        d["author"]=h.text.strip()
    h=sp.find("span", {"itemprop": "articleSection"})
    if h is not None:
        d["section"]= "FET - " + h.text.strip()

    h=sp.find("span", {"itemprop": "datePublished"})
    if h is not None:
        d["published"]=parse(h.encode_contents().strip())

    h=sp.find("meta", {"property": "og:image"})
    if h is not None:
        d["image"]=h.attrs["content"]
        d["image2"]=d["image"]
    # hh=sp.find_all("div", {"class":"media"})
    # for h in hh:
    #     if h is not None:
    #         h=h.find("div", {"class": "pull-left"})
    #         if h is not None:
    #             h=h.find("a")
    #             if h is not None:
    #                 d["image2"]=downloadfile(fix_link(url,h.attrs["href"]))
    return {"article": d}


def fsarcharticle(url, raw):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    d["url"]=url
    d["published"]=None
    h=sp.find("article")
    if h is not None:
        h=h.find("div", {"class": "content"})
        d["text"]=h.encode_contents().strip()
    h=sp.find("article")
    if h is not None:
        h=h.find("h1", {"class": "title"})
    if h is not None:
        d["title"]=h.text.strip()
    else:
        d["title"]=""
    d["image"]=""
    d["sourcetype"]="fsarcharticle"
    d["section"]="fsarch"
    d["author"]=None
    return {"article": d}


def fetindex(url, raw):
    if raw is None:
        raise Error
    # clogger.debug("compile_fetindex: "+str(url))
    html=BeautifulSoup(raw)
    h = html.find("li", {"class": "next_page"})
    if h is not None:
        nl=h.find("a")
        nl=fix_link(url, nl.attrs["href"])
    else:
        nl=None
    h= html.find("ul", {"id": "neuigkeiten"})
    al = []
    if h is not None:
        links=h.find_all("a")
        for t in links:
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index"}


def fsarchindex(url, raw):
    if raw is None:
        raise Error
    html=BeautifulSoup(raw)
    h= html.find("article")
    print unicode(h)
    links=h.find_all("a")
    al = []
    fl=[]
    for t in links:
        url=t.attrs["href"]
        if re.search("fachschaftarchitektur\.at", url):
            al.append(t.attrs["href"])
        if re.search("facebook\.com/events", url):
            fl.append(t.attrs["href"])

    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl, "objecttype": "index"}


def fsbizindex(url, raw):
    if raw is None:
        raise Error
    print "compile_fsbizindex"
    html=BeautifulSoup(raw)
    h= html.find("section", {"id": "primary"})
    links=h.find_all("h1", {"class": "entry-title"})
    al = []
    for t in links:
        al.append(t.find("a").attrs["href"])
    return {"url": url, "article_links": al, "objecttype": "index"}


def fbfeed(url, raw):
    js = json.loads(raw)
    arts=[]
    u=urlparse.urlparse(url)
    for m in js["data"]:
        aa={}
        aa["url"]=urlparse.urlunsplit(("http", "www.facebook.at", m["id"], "", ""))
        aa["published"] =parse(m["created_time"])
        if m.has_key("message")==True:
            aa["text"] = m["message"]
        else:
            try:
                h=graph.get_object(id=m["id"].split("_")[1])
                if h.has_key("description"):
                    aa["text"]=h["description"]
                else:
                    aa["text"]=json.dumps()
            except GraphAPIError:
                aa["text"]=""
        if m.has_key("story")==True:
            aa["title"] = m["story"]
        else:
            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"]="Facebook: "+u[1]
        arts.append(aa)
    return {"url": url, "next_page": js["paging"]["next"], "articles": arts}


def fsmbindex(url, raw):
    if raw is None:
        raise Error
    html=BeautifulSoup(raw)
    h= html.find("a", {"class": "next"})
    if h is not None:
        np=h.attrs["href"]
    else:
        np=None
    h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"})
    if h is not None:
        ats=h.find_all("div", {"class": "block"})
        articles=[]
        for a in ats:
            aa={}
            h=a.find("h3")
            if h is not None:
                aa["title"] = h.text.strip()
            h=a.find("div", {"class": "ce_text"})
            if h is not None:
                aa["text"] = (h.encode_contents()).strip()
            aa["info"]=[]
            hh=a.find_all("p", {"class": "info"}, recursive=False)
            for h in hh:
                aa["info"].append(unicode(h.text))
                if re.search(r'von', str(h)):
                    h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1', unicode(h.text))
                    aa["published"] =parse(h1.strip())
                    aa["author"]=re.sub(r'^.*von(.*)$', r'\1', unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh)
            aa["section"]="FSMB"
            articles.append(aa)
        return {"url": url, "next_page": np, "articles": articles, "objecttype": "articles"}


compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle, "htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}

compiler = cfg.compiler
for i in compiler:
    compiler[i]=eval(compiler[i])


article_types={"fetindex": "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
compiler/fetching.py (new file, 67 lines)
@@ -0,0 +1,67 @@
from requests import session
s=session()
from src import package_directory, download_path, cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse

def announce_articleid(id):
    for u in cfg.announcearticle_url:
        s.get(u % id)


def downloadfile(url):
    relative_name=path.join("downloads", str(md5(url).hexdigest()), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name


from models import CrawlCache
from datetime import datetime, timedelta


def fetch_page(furl):
    current_time = datetime.utcnow()
    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
    u=urlparse.urlparse(furl)
    if u[0] == '':
        furl=urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
    if cc is None:
        clogger.debug("fetching url: "+ str(furl))
        if u[0]=='fb':
            tx = json.dumps(graph.get_object(id=u[1]+u[2]))
        else:
            tx=s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        #if furl is not None:
        #    clogger.debug("cache hit")
        tx=cc.raw
    return tx


def fetch_load_file(furl, path):
    try:
        clogger.info("Downloading "+ str(furl))
        r = s.get(furl, stream=True)
        f = open(path, 'wb')
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
        f.close()
    except Exception, e:
        #clogger.error("Error Occured during fetching:"+str(furl))
        clogger.error(e, exc_info=True)
compiler/fixing.py (new file, 37 lines)
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
from urlparse import urlparse, urlunparse, urljoin
from fetching import downloadfile
import bleach


def fix_link(url, link):
    r= urlparse(link)
    if r.scheme is None or r.scheme == '':
        return urljoin(url, link)
    else:
        return link


def fix_file(url, link):
    u=fix_link(url, link)
    return downloadfile(u)


def load_file(url, link):
    return fix_file(url, link)


def fix_html(html, baseurl):
    html=bleach.clean(html, tags=['b', 'p', 'span', 'a', 'img', 'div', 'br', 'strong', 'ul', 'li'], strip=True)
    sp=BeautifulSoup(html)
    images=sp.find_all("img")
    for t in images:
        if "src" in t.attrs and t.attrs["src"] is not None:
            t.attrs["src"]=fix_file(baseurl, t.attrs["src"])
    links=sp.find_all("a")
    for t in links:
        if "href" in t.attrs:
            t.attrs["href"]=fix_link(baseurl, t.attrs["href"])
    for t in sp.find_all("script"):
        t.extract()
    b=sp.find("base")
    if b is not None:
        b.attrs["href"]=""
    return sp
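fix_link() leaves absolute links alone and resolves relative ones against the page URL via urljoin. The sketch below copies just that link logic for illustration (importing the module itself would also pull in the fetching/config machinery); the example URLs are invented.

# standalone copy of the fix_link() logic (Python 2 urlparse)
from urlparse import urlparse, urljoin

def fix_link(url, link):
    r = urlparse(link)
    if r.scheme is None or r.scheme == '':
        return urljoin(url, link)   # relative link: resolve against the page URL
    else:
        return link                 # absolute link: keep as-is

print(fix_link("http://example.org/news/index.html", "article1.html"))
# http://example.org/news/article1.html
print(fix_link("http://example.org/news/index.html", "https://other.example/a"))
# https://other.example/a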
compiler/models.py (new file, 75 lines)
@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text
from datetime import datetime
from src.database import Base2
from src.database import db_session2
from mqueues import put_fetch_queue
from marshmallow import Schema, fields, ValidationError
import json
import flask

def add_url(tpe, url):
    cu=CrawlUrl.find_or_create(tpe, url)
    db_session2.add(cu)
    db_session2.commit()
    cu.schedule()


class CrawlUrlSchema(Schema):
    id=fields.Integer()
    tpe=fields.String()
    url=fields.String()
    last_fetched=fields.DateTime()
    fetched = fields.DateTime()

class CrawlUrl(Base2):
    __tablename__='crawlurls'
    id = Column(Integer, primary_key=True)
    tpe=Column(String(250))
    url = Column(String(250))
    last_fetched = Column(DateTime)

    def fetched(self):
        CrawlCache.query.find(CrawlCache.url==self.url).first()

    @classmethod
    def find_or_create(self, tpe, url):
        aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
        if aa is None:
            aa=CrawlUrl(tpe, url)
        return aa

    def schedule(self):
        put_fetch_queue((0, self.tpe, self.url))

    def __init__(self, tpe, url):
        self.url=url
        self.tpe=tpe

    def __json__(self):
        return CrawlUrlSchema().dump(self)[0]


class CrawlCacheSchema(Schema):
    id=fields.Integer()
    raw=fields.String()
    url=fields.String()
    fetched=fields.DateTime()

class CrawlCache(Base2):
    __tablename__='crawlcache'
    id = Column(Integer, primary_key=True)
    url=Column(String(250))
    fetched=Column(DateTime)
    raw=Column(Text)

    def __init__(self, url, rw):
        self.url=url
        self.raw=rw
        self.fetched=datetime.utcnow()

    def __json__(self):
        return CrawlCacheSchema().dump(self)

    @classmethod
    def store(cls, url, rw):
        cc=CrawlCache(url, rw)
        db_session2.add(cc)
        db_session2.commit()


#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None)
compiler/mprocess.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from src import clogger  # Logger for crawler
from src.models import Article  # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file

from compiler import article_types
from fixing import fix_link
# process_article expects a hash with raw data for the article and turns it into an
# Article object stored in the database; it is intended to prevent duplicates

def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h

def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash:" + str(art))
        aa=None
    else:
        art["text"]=fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"]=fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched=datetime.now()
        aa.sourcetype=art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
        # announce_articleid(aa.id)
    return aa


# process a single found url
def process_url(url, tpe, parent_url):
    #clogger.debug("process URL of type "+ tpe + ": " + url)
    if parent_url is not None:
        url=fix_link(parent_url, url)
    put_fetch_queue((0, tpe, url))


# process a url list
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u, tpe, parent_url)


def do_process(tpe, cont):
    urllist=[]
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])

    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"], tpe, cont["url"])

    if "article" in cont:
        art=cont["article"]
        art["sourcetype"]=tpe
        process_article(art)

    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"]=tpe
                if a.has_key("url")==False:
                    a["url"]=cont["url"]
                process_article(a)
    return
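is_article_hash() defines the minimum shape process_article() accepts; the sketch below shows such a hash with the optional keys that Article.process_hash() and Article.from_hash() also look at. Only the key names come from the code above, every value is invented, and calling process_article() on it would additionally require the configured database.

# invented example data; key names are taken from the code above
article_hash = {
    # required by is_article_hash()
    "text": "<p>raw article body</p>",
    "url": "http://example.org/news/article1.html",
    "sourcetype": "fetarticle",
    "section": "FET - Aktuelles",
    # optional keys read further downstream
    "title": "Example title",
    "author": "someone",
    "image": "http://example.org/img/teaser.jpg",
    "published": None,
}
# process_article(article_hash) would clean the HTML, download the image and
# upsert an Article row keyed by its fingerprint.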
compiler/mqueues.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from gevent.queue import Queue, JoinableQueue
fetch_queue = Queue()
compile_queue = Queue()
process_queue = Queue()

def put_fetch_queue(o):
    fetch_queue.put(o)
compiler/mworker.py (new file, 58 lines)
@@ -0,0 +1,58 @@

from mqueues import fetch_queue, compile_queue, process_queue
from compiler import do_compile
from mprocess import do_process
from fetching import fetch_page
from gevent import spawn
from itertools import repeat
from src import clogger

def start_workers(f, c, p):
    for _ in range(f):
        clogger.debug("spawn fetchworker")
        spawn(work_fetch)
    for _ in range(c):
        spawn(work_compile)
    for _ in range(p):
        spawn(work_process)


def work_fetch():
    while True:
        run_fetch()


def work_process():
    while True:
        run_process()

def work_compile():
    while True:
        run_compile()


def queue_url(tpe, url):
    fetch_queue.put((0, tpe, url))


# fetch a page from the url list
def run_fetch():
    tc, tpe, url = fetch_queue.get()
    if tpe is not "dummyarticle" and tpe is not "dummyindex":
        rw=fetch_page(url)
    else:
        rw="<p> dummytext</p>"
    compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw}))
    return rw
    # fetch_queue.task_done()


# compile something from the compile list
def run_compile():
    tc, tpe, h = compile_queue.get()
    h=do_compile(tpe, h)
    process_queue.put((0, tpe, h))
    return h
    # compile_queue.task_done()


def run_process():
    tc, tpe, h = process_queue.get()
    do_process(tpe, h)
    return h
    # process_queue.task_done()
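The /debug endpoint in compiler/views.py drives these steps one at a time; a hedged sketch of the same flow called directly is shown below. It assumes the src package is importable with a valid config.cfg, the databases created via /initdb and /initdb2, and gevent monkey-patching as done in src/__init__.py; the example URL and type are invented.

# sketch only: queue one job and run each pipeline stage once by hand
from src.compiler import mworker

mworker.queue_url("fetindex", "http://example.org/news/index.html")  # put a job on fetch_queue
raw = mworker.run_fetch()       # fetch (or read from CrawlCache) and push to compile_queue
parsed = mworker.run_compile()  # run the matching compiler function, push to process_queue
mworker.run_process()           # turn the result into Article rows / follow-up fetch jobs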
compiler/views.py (new file, 146 lines)
@@ -0,0 +1,146 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
compiler_pages = Blueprint('compiler', __name__,
                           template_folder='.')

from src.database import db_session2, init_db, read_json, init_db2
from .models import CrawlUrl
from .models import CrawlCache, CrawlCacheSchema
from .models import CrawlUrlSchema
from src import clogger
from src.articles import Article
#import mworker
import flask
import json
import mworker

from compiler import do_compile
from fetching import fetch_page

#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None)

@compiler_pages.route("/")
@compiler_pages.route("")
@compiler_pages.route(".json")
def index():
    status="For documentation goto /doc"
    return jsonify(status=status)


@compiler_pages.route("/doc")
@compiler_pages.route("/doc.json")
def doc():
    return render_template("README")
    # return jsonify(status=render_template("README"))
    #


@compiler_pages.route("/initdb")
@compiler_pages.route("/initdb.json")
def initdb_json():
    init_db()  # initialize the database
    status="Datenbank Neu initialisiert"
    return jsonify(status=status)


@compiler_pages.route("/initdb2")
@compiler_pages.route("/initdb2.json")
def initdb_json2():
    init_db2()  # initialize the second database
    status="Datenbank Neu initialisiert"
    return jsonify(status=status)


@compiler_pages.route("/start")
@compiler_pages.route("/start.json")
def start_json():
    mworker.start_workers(1, 1, 1)  # start the worker greenlets
    status="Worker gestartet"
    return jsonify(status=status)


@compiler_pages.route("/urls")
@compiler_pages.route("/urls.json")
def urls_index_json():
    # load all urls
    status=CrawlUrl.query.all()
    return jsonify(urls=status)


# show an existing CrawlUrl
@compiler_pages.route("/urls/<int:id>")
@compiler_pages.route("/urls/<int:id>.json")
def urls_json(id):
    # load the url and its cache entry
    status=CrawlUrl.query.get(id)
    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
    return jsonify(urls=status, cache=cc.__json__())


# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
@compiler_pages.route("/urls/<int:id>/que.json")
def urls_que_json(id):
    # load the url
    cu=CrawlUrl.query.get(id)
    mworker.queue_url(cu.tpe, cu.url)
    cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
    mworker.start_workers(1, 1, 1)  # start the worker greenlets
    status="Worker gestartet"
    return jsonify(urls=cu, cache=cc)


# fetch and compile an existing CrawlUrl for testing
@compiler_pages.route("/urls/<int:id>/test")
@compiler_pages.route("/urls/<int:id>/test.json")
def urls_test_json(id):
    # load the url
    cu=CrawlUrl.query.get(id)
    rw=fetch_page(cu.url)
    h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
    h2=do_compile(cu.tpe, h)
    return jsonify(urls=cu, hs=h2, rw=rw)


@compiler_pages.route("/debug", methods=['GET', 'PUT'])
def debug():
    status="did nothing"
    js=read_json(request)
    clogger.info(request.get_json())
    if js["cmd"] == "runfetch":
        mworker.run_fetch()
        status="fetched something"
    if js["cmd"] == "que":
        cu = CrawlUrl.query.get(js["id"])
        mworker.queue_url(cu.tpe, cu.url)
        status= mworker.run_fetch()
    if js["cmd"] == "comp":
        status=mworker.run_compile()
    if js["cmd"]=="process":
        status=mworker.run_process()
    return jsonify(status=status)


@compiler_pages.route("/debugurl")
def debugurl():
    s=CrawlUrlSchema()
    status=CrawlUrl.query.all()
    return jsonify(status=status)


@compiler_pages.route("/urls", methods=['POST'])
def add_urls():
    # read the data
    js =read_json(request)
    # clogger.info(js)
    # find or create the url in the database
    url=CrawlUrlSchema().load(js["url"])
    clogger.info(url)
    url=CrawlUrl.find_or_create(url.data["tpe"], url.data["url"])
    db_session2.add(url)
    db_session2.commit()
    return jsonify(url=url, kk=js)


@compiler_pages.route("/urls/<int:id>", methods=['DELETE'])
@compiler_pages.route("/urls<int:id>.json", methods=['DELETE'])
def delete(id):
    cu=CrawlUrl.query.get(id)
    if cu != None:
        db_session2.delete(cu)
        db_session2.commit()
    return jsonify(url={})
crawler/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@

def init():
    return " "
database.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from src import package_directory, clogger, cfg
from os import path
import json
#engine = create_engine('sqlite:////home/andreas/www/crawler/test.db', convert_unicode=True)

if cfg.get("db_path")==None or cfg.get("db_path").strip()=="":
    db_path=package_directory
else:
    db_path=cfg.get("db_path")

db_mainfile=cfg.get("db_mainfile")
if db_mainfile == None or db_mainfile.strip()=="":
    db_mainfile="../srctest.db"

db_urlfile=cfg.get("db_mainfile")
if db_urlfile == None or db_urlfile.strip()=="":
    db_urlfile="../srctest_cu.db"


engine = create_engine('sqlite:///'+ path.join(db_path, db_mainfile), convert_unicode=True)

db_session = scoped_session(sessionmaker(autocommit=False,
                                         autoflush=False,
                                         bind=engine))

engine2 = create_engine('sqlite:///'+ path.join(db_path, db_urlfile), convert_unicode=True)

db_session2 = scoped_session(sessionmaker(autocommit=False,
                                          autoflush=False,
                                          bind=engine2))

Base = declarative_base()
Base.query = db_session.query_property()
Base2 = declarative_base()
Base2.query = db_session2.query_property()

def read_json(rq):
    js=rq.get_json()
    clogger.info(rq.data)
    if js is None:
        js=rq.form.to_dict()
        if js=={} and rq.data != "":
            js=json.loads(rq.data)
    return js

def init_db():
    import src.models
    Base.metadata.create_all(bind=engine)

def init_db2():
    from .compiler.models import CrawlUrl, CrawlCache
    Base2.metadata.create_all(bind=engine2)
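Both declarative bases have to be materialized before anything can be stored; the /initdb and /initdb2 endpoints in compiler/views.py do this over HTTP, and the same can be done from a shell. A short sketch, assuming the src package layout used elsewhere in this commit:

# creates the two SQLite files next to the package (or under cfg db_path)
from src.database import init_db, init_db2

init_db()   # tables for Article / Section on engine
init_db2()  # tables for CrawlUrl / CrawlCache on engine2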
fb.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from src import cfg
import facebook

graph = facebook.GraphAPI(access_token=cfg.fb_token, version='2.3')
meta.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import os
package_directory = os.path.dirname(os.path.abspath(__file__))
from config import Config
import logging
import sys

cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
#--------------- Logging

file_handler=logging.FileHandler(cfg.logfile)
file_handler.setLevel(logging.INFO)
std_handler=logging.StreamHandler(stream=sys.stdout)
std_handler.setLevel(logging.DEBUG)

lg=logging.getLogger('mylogger')
lg.setLevel(logging.DEBUG)
lg.addHandler(file_handler)
lg.addHandler(std_handler)

#----------------
models.py (new file, 4 lines)
@@ -0,0 +1,4 @@

from .articles.model import Article
from .sections.model import Section
from .compiler.models import CrawlUrl, CrawlCache
sections/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .model import Section
sections/model.py (new file, 44 lines)
@@ -0,0 +1,44 @@
from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship

from datetime import datetime
from src.database import Base, db_session
from marshmallow import Schema, fields

import json
import flask

#from src.articles import Article

class SectionSchema(Schema):
    id=fields.Integer()
    foreign_name=fields.String()
    name=fields.String()

class Section(Base):
    __tablename__ = 'sections'
    id = Column(Integer, primary_key=True)
    url = Column(String(250))
    crawlurl = Column(Integer)
    foreign_name = Column(String(250), unique=True)
    name=Column(String(250))
    group = Column(String(250))
    articles=relationship("Article", back_populates="section")

    def __json__(self):
        return SectionSchema().dump(self)[0]

    def __init__(self, url=None, fname=None):
        self.url=url
        self.foreign_name=fname

    @classmethod
    def find_or_create(cls, fname):
        s=Section.query.filter(Section.foreign_name==fname).first()
        if s is None:
            s=Section(fname)
            db_session.add(s)
            db_session.commit()
        s.foreign_name=fname
        db_session.add(s)
        db_session.commit()
        return s
sections/views.py (new file, 37 lines)
@@ -0,0 +1,37 @@
from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
section_pages = Blueprint('sections', __name__)
from .model import Section
from .model import SectionSchema
#import flask
from datetime import datetime
import json
from src import clogger

from src.database import db_session, read_json
import flask

@section_pages.route("/")
@section_pages.route("")
@section_pages.route(".json")
def index():
    sections=Section.query.all()
    return jsonify(sections=sections)


@section_pages.route("/<int:id>", methods=['PUT'])
@section_pages.route("/<int:id>.json", methods=['PUT'])
def update(id):
    section=Section.query.get(id)
    clogger.info(request.data)
    a=request.get_json()
    section.text=a["text"]
    db_session.commit()
    return jsonify(section=section)


@section_pages.route("/<int:id>", methods=['GET'])
@section_pages.route("/<int:id>.json", methods=['GET'])
def get(id):
    section=Section.query.get(id)
    clogger.info(section)
    # section=SectionSchema().dump(section)[0]
    return jsonify(section=section, articles=section.articles)
templates/home.html (new file, 1 line)
@@ -0,0 +1 @@
<h1>Hello World</h1>
users/users.py (new file, 19 lines)
@@ -0,0 +1,19 @@

class User(object):
    def __init__(self, id, username, password):
        self.id = id
        self.username = username
        self.password = password

    def __str__(self):
        return "User(id='%s')" % self.id

user = User(1, 'user', 'password')

def authenticate(username, password):
    if username == user.username and password == user.password:
        return user

def identity(payload):
    return user
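src/__init__.py imports JWT together with authenticate and identity, but the JWT object itself is not constructed anywhere in this commit. With flask_jwt the usual wiring would look like the sketch below; this is an assumption about intended use, not part of the committed code.

# hypothetical wiring; flask_jwt's JWT takes the app plus these two callbacks
from flask_jwt import JWT
from src import app
from src.users import authenticate, identity

jwt = JWT(app, authenticate, identity)  # exposes POST /auth, which returns a token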