From 8955bf17f552c7e2bcbca211633329de84f566dd Mon Sep 17 00:00:00 2001 From: Andreas Stephanides Date: Sat, 14 Jan 2017 12:23:04 +0100 Subject: [PATCH] init commit --- .gitignore | 5 + __init__.py | 101 +++++++++++++++ articles/__init__.py | 2 + articles/model.py | 139 ++++++++++++++++++++ articles/views.py | 65 ++++++++++ bot/__init__.py | 1 + bot/bot.py | 140 +++++++++++++++++++++ compiler/README | 10 ++ compiler/README.html | 1 + compiler/__init__.py | 15 +++ compiler/comp/__init__.py | 1 + compiler/comp/__init__py | 1 + compiler/comp/rss.py | 8 ++ compiler/compile.py | 153 ++++++++++++++++++++++ compiler/compiler.py | 258 ++++++++++++++++++++++++++++++++++++++ compiler/fetching.py | 67 ++++++++++ compiler/fixing.py | 37 ++++++ compiler/models.py | 75 +++++++++++ compiler/mprocess.py | 74 +++++++++++ compiler/mqueues.py | 8 ++ compiler/mworker.py | 58 +++++++++ compiler/views.py | 146 +++++++++++++++++++++ crawler/__init__.py | 4 + database.py | 55 ++++++++ fb.py | 4 + meta.py | 21 ++++ models.py | 4 + sections/__init__.py | 1 + sections/model.py | 44 +++++++ sections/views.py | 37 ++++++ templates/home.html | 1 + users/users.py | 19 +++ 32 files changed, 1555 insertions(+) create mode 100644 .gitignore create mode 100644 __init__.py create mode 100644 articles/__init__.py create mode 100644 articles/model.py create mode 100644 articles/views.py create mode 100644 bot/__init__.py create mode 100644 bot/bot.py create mode 100644 compiler/README create mode 100644 compiler/README.html create mode 100644 compiler/__init__.py create mode 100644 compiler/comp/__init__.py create mode 100644 compiler/comp/__init__py create mode 100644 compiler/comp/rss.py create mode 100644 compiler/compile.py create mode 100644 compiler/compiler.py create mode 100644 compiler/fetching.py create mode 100644 compiler/fixing.py create mode 100644 compiler/models.py create mode 100644 compiler/mprocess.py create mode 100644 compiler/mqueues.py create mode 100644 compiler/mworker.py create mode 100644 compiler/views.py create mode 100644 crawler/__init__.py create mode 100644 database.py create mode 100644 fb.py create mode 100644 meta.py create mode 100644 models.py create mode 100644 sections/__init__.py create mode 100644 sections/model.py create mode 100644 sections/views.py create mode 100644 templates/home.html create mode 100644 users/users.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e6e1129 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +/__init__.py~ +/__init__.pyc +*.pyc +*~ +config.cfg \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..2198b8c --- /dev/null +++ b/__init__.py @@ -0,0 +1,101 @@ +import os +import sys +package_directory = os.path.dirname(os.path.abspath(__file__)) +from config import Config + +cfg = Config(file(os.path.join(package_directory, 'config.cfg'))) +#--------------- Logging +import logging +download_path="./cdw" +file_handler=logging.FileHandler(cfg.logfile) +file_handler.setLevel(logging.DEBUG) +stream_handler=logging.StreamHandler(sys.stdout) + +clt=logging.getLogger('mylogger') +clt.setLevel(logging.DEBUG) +clt.addHandler(file_handler) +clt.addHandler(stream_handler) + +clogger=clt +#---------------- +lg=clt +from gevent import spawn, monkey +monkey.patch_all() +from .compiler import start_workers +#start_workers(1,1,1) + + + +# Framework +from flask import Flask, jsonify, render_template, redirect, request,send_from_directory +# Cross Site Scripting +from flask_cors import CORS, cross_origin 
+#Authentication +from flask_jwt import JWT, jwt_required, current_identity + +from src.models import Article,Section +from src.users import authenticate, identity +from datetime import datetime + +app = Flask(__name__) +CORS(app) +app.config['LOGGER_NAME']='mylogger' +app.logger.setLevel(logging.DEBUG) +app.logger.info("Server Started") + +app.config['SECRET_KEY'] = 'super-secret' +import flask +import json +from database import Base +from models import Article, CrawlUrl, CrawlCache + + +flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,Section, CrawlUrl,CrawlCache)) else None) +json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,CrawlUrl,CrawlCache)) else None) + + + +#bot.dosmth() +#lg.debug(bot.bot) + + + + +# Allow Cross Site Scripting +@app.after_request +def after_request(response): + response.headers.add('Access-Control-Allow-Origin', '*') + if request.method == 'OPTIONS': + response.headers['Access-Control-Allow-Methods'] = 'DELETE, GET, POST, PUT' + headers = request.headers.get('Access-Control-Request-Headers') + if headers: + response.headers['Access-Control-Allow-Headers'] = headers + return response +from .articles.views import article_pages +from .sections.views import section_pages +from .compiler.views import compiler_pages + + +@app.route("/") +@app.route("/index") +@app.route("/home") +def home(): + text="It work's, please do something" + return jsonify(text=text) + +app.register_blueprint(article_pages, url_prefix='/articles') +app.register_blueprint(section_pages, url_prefix='/sections') +app.register_blueprint(compiler_pages, url_prefix='/compiler') + + +from src.bot import bot +if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true": + bot.message_loop() + + +# ------------ Telegram Bot +#from bot import bot_queue +#@app.route('/bot', methods=['GET', 'POST']) +#def pass_update(): +# bot_queue.put(request.data) # pass update to bot +# return 'OK' diff --git a/articles/__init__.py b/articles/__init__.py new file mode 100644 index 0000000..fdf13cf --- /dev/null +++ b/articles/__init__.py @@ -0,0 +1,2 @@ +from .model import Article +from .views import article_pages diff --git a/articles/model.py b/articles/model.py new file mode 100644 index 0000000..14b57cd --- /dev/null +++ b/articles/model.py @@ -0,0 +1,139 @@ + +from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey +from sqlalchemy.orm import relationship +from datetime import datetime +from src.database import Base +from src.database import db_session +from marshmallow import Schema, fields +from src.sections.model import Section + +#import json +import json +import flask +#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None) +import hashlib + +#import clogger +import logging +#from crawler.compiler.mqueues import put_fetch_queue +from src import clogger +#json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None) + + + + +def calc_fingerprint(a): + return calc_fingerprint_h({"url": a.url, "title":a.title, "published": str(a.published_date)}) + +def calc_fingerprint_h(a): + if a["published"] is not None and a["published"]!= "None": + # clogger.info( "published:"+str(a["published"])) + if a["published"] is str: + pp=parse(a["published"]) + else: + pp=a["published"] + else: + pp="" + #clogger.info( unicode(a["url"])+ unicode(a["title"])+unicode(pp)) + h=hashlib.md5() + h.update(unicode(a["url"])) 
+ h.update(a["title"].encode("utf-8")) + h.update(unicode(pp)) + return h.hexdigest() + + +class ArticleSchema(Schema): + id=fields.Integer() + text=fields.String() + title=fields.String() + author=fields.String() + sourcetype =fields.String() + image =fields.String() + url =fields.String() + published_date=fields.DateTime() + date=fields.DateTime() + first_fetched=fields.DateTime() + section_id=fields.Integer() + +class Article(Base): + __tablename__ = 'articles' + id = Column(Integer, primary_key=True) + parent_id= Column(Integer) + url = Column(String(250)) + is_primary = Column(Boolean) + fingerprint = Column(String(250),unique=True) + hash = Column(String(250)) + last_fetched = Column(DateTime) + first_fetched=Column(DateTime) + published_date = Column(DateTime) + date = Column(DateTime) + text = Column(Text) + title = Column(String(250)) + author = Column(String(250)) + section = relationship("Section") + section_id=Column(Integer, ForeignKey('sections.id')) + sourcetype = Column(String(250)) + image=Column(String(250)) + + def __init__(self, url=None,title=None, published_date=None): + self.url=url + self.title=title + self.published_date=published_date + self.first_fetched=datetime.now() + def __json__(self): + return ArticleSchema().dump(self)[0] + + def dict(self): + return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url} + + +# @classmethod +# def sections(self): +# sects=db_session.query(Article.section).distinct().all() +# for i in range(len(sects)): +# sects[i]=sects[i][0] +# return sects + + @classmethod + def from_hash(cls, a): + fp = calc_fingerprint_h(a) + aa = Article.query.filter(Article.fingerprint==fp).first() + if aa is None: + clogger.debug( "new Article") + if a["published"] is not None: + if a["published"] is str: + pd= parse(a["published"]) + else: + pd=a["published"] + else: + pd=None + aa=Article(a["url"], a["title"],pd) + aa.fingerprint = calc_fingerprint(aa) + db_session.add(aa) + db_session.commit() + return aa + + def process_hash(self, a): + self.text=a["text"].decode('utf8') + if "image" in a: + self.image=a["image"] + if "author" in a: + self.author=a["author"] + if "title" in a: + self.title=a["title"] + if "author" in a: + self.author=a["author"] + if "sourcetype" in a: + self.sourcetype=a["sourcetype"] + if "section" in a: + self.section=Section.find_or_create(a["section"]) +# if "last_fetched" in a: +# self.last_fetched=a["last_fetched"] + if "published_date" in a: + self.published_date=a["published_date"] + + + +#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None) + +#json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None) diff --git a/articles/views.py b/articles/views.py new file mode 100644 index 0000000..7202b4b --- /dev/null +++ b/articles/views.py @@ -0,0 +1,65 @@ +from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request +article_pages = Blueprint('articles', __name__) +from .model import Article +from .model import ArticleSchema +#import flask +from datetime import datetime +import json + +#flask.json.JSONEncoder.default = lambda self,obj: (obj.isoformat() if isinstance(obj, datetime) else None) +#flask.json.JSONEncoder.default 
= lambda self,obj: ((obj.dict()) if isinstance(obj, Article) else None) +from src import clogger +import json +from src.database import db_session, read_json +import flask + +#flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None) +flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,CrawlUrl)) else None) + +@article_pages.route("/") +@article_pages.route("") +@article_pages.route(".json") +def index(): + articles=Article.query.all() + return jsonify(articles=articles) + +@article_pages.route("/",methods=['PUT']) +@article_pages.route("/.json",methods=['PUT']) +def update(id): + article=Article.query.get(id) + clogger.info(request.data) + a=request.get_json() + article.text=a["text"] + db_session.commit() + return jsonify(article=article) + + +@article_pages.route("/",methods=['GET']) +@article_pages.route("/.json",methods=['GET']) +def get(id): + article=Article.query.get(id) + clogger.info(article) +# article=ArticleSchema().dump(article)[0] + return jsonify(article=article) + +@article_pages.route("/",methods=['DELETE']) +@article_pages.route("/.json",methods=['DELETE']) +def delete(id): + article=Article.query.get(id) + clogger.info(id) + if article != None: + db_session.delete(article) + db_session.commit() + return jsonify(article={}) + + +@article_pages.route("/",methods=['POST']) +@article_pages.route("",methods=['POST']) +@article_pages.route(".json",methods=['POST']) +def create(): + article=Article() + a=read_json(request) + article.text=a["article"]["text"] + db_session.add(article) + db_session.commit() + return jsonify(article=article) diff --git a/bot/__init__.py b/bot/__init__.py new file mode 100644 index 0000000..4cff7c7 --- /dev/null +++ b/bot/__init__.py @@ -0,0 +1 @@ +from .bot import bot diff --git a/bot/bot.py b/bot/bot.py new file mode 100644 index 0000000..7392b8f --- /dev/null +++ b/bot/bot.py @@ -0,0 +1,140 @@ +import telepot +import datetime +import time +import json +from Queue import Queue +#import os +from src import lg,cfg +#from gevent import spawn +from telepot.namedtuple import InlineKeyboardMarkup, InlineKeyboardButton +from telepot.delegate import ( + per_chat_id, pave_event_space, include_callback_query_chat_id, create_open, per_inline_from_id ) +from src.compiler import CrawlUrl +from gevent import spawn, monkey, Greenlet + +def IKB(h): + return InlineKeyboardButton(text=h["text"], callback_data=h["callback_data"]) + +def IKB2(h): + return [IKB(h)] +def IKM(h): + return InlineKeyboardMarkup(inline_keyboard=[ map(IKB,h)]) + +def IKM2(h): + return InlineKeyboardMarkup(inline_keyboard= map(IKB2,h)) + + +def query_que_url(url): + print(json.dumps(url)) + return {"text": url.url, "callback_data":"/urls/"+str(url.id)+"/que"} + +def handle_urls(handler, cmd): + curls=CrawlUrl.query.all() + #sent=handler.sender.sendMessage(json.dumps(curls)) + kb= IKM2(map(query_que_url,curls)) + print json.dumps(cmd) + if len(cmd) >= 4 and cmd[3]=="que": + sent=handler.sender.sendMessage("I qued url "+str(cmd[2]), reply_markup=None) + else: + sent=handler.sender.sendMessage("que?", reply_markup=kb) + handler._edit_msg_ident = telepot.message_identifier(sent) + handler._editor = telepot.helper.Editor(handler.bot, sent) + +def execute_command(handler,cmd,msg=None): + if cmd[1]=='urls': + handle_urls(handler,cmd) + + +def handle(handler,msg): + content_type,chat_type,chat_id = telepot.glance(msg) + if msg.has_key('text'): + if msg['text'][0]=='/': + cmd = 
msg['text'].split("/") + execute_command(handler, cmd, msg) + if msg.has_key('data'): + lg.debug(msg['data']) + + +class InlineHandler(telepot.helper.InlineUserHandler, telepot.helper.AnswererMixin): + def __init__(self, *args, **kwargs): + super(InlineHandler, self).__init__(*args, **kwargs) + + def on_inline_query(self, msg): + def compute_answer(): + query_id, from_id, query_string = telepot.glance(msg, flavor='inline_query') + print(self.id, ':', 'Inline Query:', query_id, from_id, query_string) + + articles = [{'type': 'article', + 'id': 'abc', 'title': query_string, 'message_text': query_string}] + + return articles + + self.answerer.answer(msg, compute_answer) + + def on_chosen_inline_result(self, msg): + from pprint import pprint + pprint(msg) + result_id, from_id, query_string = telepot.glance(msg, flavor='chosen_inline_result') + print(self.id, ':', 'Chosen Inline Result:', result_id, from_id, query_string) + + +class FetBot(telepot.helper.ChatHandler): + def __init__(self, *args, **kwargs): +# super(FetBot,self).__init__(*args,**kwargs) + super(FetBot,self).__init__( *args,**kwargs) + + _editor=None + _edit_msg_ident=None + keyboard=IKM([{"text":"START","callback_data": "start"}, + {"text":"Don't Start","callback_data":"notstart"} + ]) + keyboard =InlineKeyboardMarkup( + inline_keyboard=[[ + InlineKeyboardButton(text='START', callback_data='start'), + InlineKeyboardButton(text='START', callback_data='start') + ]] + ) + def on_chat_message(self,msg): + handle(self,msg) + content_type,chat_type,chat_id = telepot.glance(msg) + lg.debug(content_type) + if content_type=="photo" or content_type=="sticker": + lg.debug("try to download %s" % msg[content_type][-1]["file_id"]) + f=self.bot.getFile(msg[content_type][-1]['file_id']) + lg.debug(f) + self.bot.download_file(f['file_id'], "dwn/" + f['file_path']) + # self.bot.getFile(msg['photo'][-1]['file_id']), "dwn") + #self._cancel_last() + #sent=self.sender.sendMessage("Hello World", reply_markup=self.keyboard) + #self._editor = telepot.helper.Editor(self.bot, sent) + #self._edit_msg_ident = telepot.message_identifier(sent) + + def on_callback_query(self, msg): + query_id, from_id, query_data = telepot.glance(msg, flavor='callback_query') + lg.debug(json.dumps(msg)) + self._cancel_last() + if query_data[0]=='/': + cmd = query_data.split("/") + execute_command(self, cmd, msg) + +# self.sender.sendMessage("Danke") + self.bot.answerCallbackQuery(query_id, text='Ok. But I am going to keep asking.') + #self.bot.answerCallbackQuery(query_id) + def _cancel_last(self): + if self._editor: + self._editor.editMessageReplyMarkup(reply_markup=None) + self._editor = None + self._edit_msg_ident = None + + + + +bot=None +bot = telepot.DelegatorBot(cfg.token, [include_callback_query_chat_id(pave_event_space())(per_chat_id(),create_open,FetBot,timeout=20), + pave_event_space()( + per_inline_from_id(), create_open, InlineHandler, timeout=10), + ]) + + + + diff --git a/compiler/README b/compiler/README new file mode 100644 index 0000000..027667f --- /dev/null +++ b/compiler/README @@ -0,0 +1,10 @@ +Das ist die API für den Compiler +Folgende Befehle sind implementiert: +GET doc: Diese Dokumentation! 
+GET initdb: Initialisiere die Datenbank, ACHTUNG Daten werden gelöscht +POST urls: +Erwartet Daten im Format {"url": {"type": typ, "url": "someurl.html"}} +Fügt diese Url der Überwachung hinzu + +IN PROCESS: +GET urls: Alle Urls die überwacht werden sollen \ No newline at end of file diff --git a/compiler/README.html b/compiler/README.html new file mode 100644 index 0000000..2f4e349 --- /dev/null +++ b/compiler/README.html @@ -0,0 +1 @@ +sdf diff --git a/compiler/__init__.py b/compiler/__init__.py new file mode 100644 index 0000000..5284e55 --- /dev/null +++ b/compiler/__init__.py @@ -0,0 +1,15 @@ + + +#from mprocess import do_process, process_urllist +#from compiler import do_compile +#from mworker import run_fetch, run_process, run_compile + +# include models for final objects +from src.models import Article +# starting workers +from mworker import start_workers + +from models import add_url, CrawlUrl +#start_workers(1,1,1) + +from fetching import announce_articleid diff --git a/compiler/comp/__init__.py b/compiler/comp/__init__.py new file mode 100644 index 0000000..0a7c135 --- /dev/null +++ b/compiler/comp/__init__.py @@ -0,0 +1 @@ +from rss import rssfeed diff --git a/compiler/comp/__init__py b/compiler/comp/__init__py new file mode 100644 index 0000000..532849f --- /dev/null +++ b/compiler/comp/__init__py @@ -0,0 +1 @@ +from rss import rssfeed \ No newline at end of file diff --git a/compiler/comp/rss.py b/compiler/comp/rss.py new file mode 100644 index 0000000..cb5a95b --- /dev/null +++ b/compiler/comp/rss.py @@ -0,0 +1,8 @@ +import feedparser + +def rssfeed(url,raw): + al=[] + f=feedparser.parse(raw) + for e in f['entries']: + al.append(e['link']) + return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"} diff --git a/compiler/compile.py b/compiler/compile.py new file mode 100644 index 0000000..9cfcf37 --- /dev/null +++ b/compiler/compile.py @@ -0,0 +1,153 @@ +from bs4 import BeautifulSoup +import crawler.objects.models +#from crawler.objects.models import Object +from dateutil.parser import parse +from datetime import datetime +import re +def hello(): + return "hello" + + +def fetarticle(o): + sp=BeautifulSoup(o.raw_fixed) + d={} + h=sp.find("h1", {"itemprop": "name"}) + d["title"]=unicode(h.text).strip() + h=sp.find("div", {"itemprop": "articleBody"}) + if h is not None: + d["text"]=(h.encode_contents()).strip() + else: + d["text"]="" + d["url"]=o.url + h=sp.find("span", {"itemprop": "author"}) + if h is not None: + d["author"]=h.text.strip() + h=sp.find("span", {"itemprop": "articleSection"}) + if h is not None: + d["section"]= "FET - " + h.text.strip() + + h=sp.find("span", {"itemprop": "datePublished"}) + if h is not None: + d["published"]=parse(h.encode_contents().strip()) + h=sp.find("meta", {"property": "og:image"}) + + if h is not None: + d["image"]=h.attrs["content"] + + hh=sp.find_all("div", {"class":"media"}) + for h in hh: + if h is not None: + h=h.find("div", {"class": "pull-left"}) + if h is not None: + h=h.find("a") + if h is not None: + d["image2"]=crawler.objects.models.download_file(h.attrs["href"]) + return {"article": d} + +def fsarcharticle(o): + sp=BeautifulSoup(o.raw_fixed) + d={} + h=sp.find("h1", {"class": "title"}) + if h is not None: + d["title"]=h.text.strip() + d["url"]=o.url + d["published"]=None + h=sp.find("article") + h=h.find("div", {"class": "content"}) + d["text"]=h.encode_contents().strip() + h=sp.find("article").find("h1", {"class": "title"}) + if h is not None: + d["title"]=h.text.strip() + else: + d["title"]="" + 
d["image"]="" + d["sourcetype"]="fsarcharticle" + d["section"]="fsarch" + d["author"]=None + return {"article": d} + +def fetindex(o): +# if type(o) is not Object: +# raise TypeError + if o.raw is None: + raise Error + print "compile_fetindex" + html=BeautifulSoup(o.raw_fixed) + h = html.find("li", {"class": "next_page" }) + if h is not None: + nl=h.find("a") + nl=crawler.objects.models.fix_link(o.url,nl.attrs["href"]) + else: + nl=None + h= html.find("ul", {"id": "neuigkeiten"}) + links=h.find_all("a") + al = [] + for t in links: + al.append(t.attrs["href"]) + return {"url": o.url, "next_page": nl, "article_links": al, "objecttype": "index" } + +def fsarchindex(o): + if o.raw is None: + raise Error + html=BeautifulSoup(o.raw_fixed) + h= html.find("article") + print unicode(h) + links=h.find_all("a") + al = [] + fl=[] + for t in links: + url=t.attrs["href"] + if re.search("fachschaftarchitektur\.at", url): + al.append(t.attrs["href"]) + if re.search("facebook\.com/events", url): + fl.append(t.attrs["href"]) + + return {"url": o.url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"} + + +def fsbizindex(o): + if o.raw is None: + raise Error + print "compile_fsbizindex" + html=BeautifulSoup(o.raw_fixed) + h= html.find("section", {"id": "primary"}) + links=h.find_all("h1", {"class": "entry-title"}) + al = [] + for t in links: + + al.append(t.find("a").attrs["href"]) + return {"url": o.url,"article_links": al,"objecttype": "index"} + + +def fsmbindex(o): + if o.raw is None: + raise Error + html=BeautifulSoup(o.raw_fixed) + h= html.find("a",{"class": "next"}) + if h is not None: + np=h.attrs["href"] + else: + np=None + h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"}) + if h is not None: + ats=h.find_all("div",{"class": "block"}) + articles=[] + for a in ats: + aa={} + h=a.find("h3") + if h is not None: + aa["title"] = h.text.strip() + h=a.find("div", {"class": "ce_text"}) + if h is not None: + aa["text"] = (h.encode_contents()).strip() + aa["info"]=[] + hh=a.find_all("p", {"class": "info"},recursive=False) + for h in hh: + aa["info"].append(unicode(h.text)) + if re.search(r'von', str(h)): + h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text)) + aa["published"] =parse(h1.strip()) + aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh) + aa["section"]="FSMB" + articles.append(aa) + return {"url": o.url, "next_page": np, "articles": articles,"objecttype": "articles"} diff --git a/compiler/compiler.py b/compiler/compiler.py new file mode 100644 index 0000000..de34084 --- /dev/null +++ b/compiler/compiler.py @@ -0,0 +1,258 @@ +from bs4 import BeautifulSoup +#import crawler.objects.models +#from crawler.objects.models import Object +from dateutil.parser import parse +from datetime import datetime +import re +import urlparse +from src import clogger, cfg +from src.fb import graph +from fixing import fix_link +from facebook import GraphAPIError +#from fetching import downloadfile +import json +def do_compile(tpe, cont): + if type(cont) != dict: + clogger.error("Type Error for do compile for :"+str(cont["url"])) + # Starting to compile an generic object + if "url" not in cont: + clogger.error("no url can't compile "+tpe) + else: + clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"])) + if tpe in compiler: + cont=compiler[tpe](cont["url"], cont["raw"]) + return cont + +from comp import rssfeed 
+ +def dummyarticle(url, raw): + return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}} + + + +def htufeed(url,raw): + al=[] + f=feedparser.parse(raw) + for e in f['entries']: + al.append(e['link']) + return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"} + + +def htuarticle(url,raw): + sp=BeautifulSoup(raw) + d={} + h=sp.find("meta", {"property": "og:image"}) + if h is not None: + d["image"]=h.attrs["content"] + d["image2"]=d["image"] + h=sp.find("div", {"class": "patternRevInfo"}) + if h is not None: +# clogger.debug(h.text.strip()) + h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip())) +# clogger.debug(h1) + d["published"]=parse(h1) + # clogger.debug(parse(h1)) + # clogger.debug(d["published"]) + h=h.find("a") + if h is not None: + d["author"]=h.text.strip() + h=sp.find("div", {"class": "foswikiTopic"}) + h1=h.find("h4") + if h1 is not None: + d["title"]= h1.text.strip() + h1.extract() # remove head + else: + h1=sp.find("meta", {"name": "WEBTOPIC"}) + d["title"]= h1.attrs["content"] + d["text"]=(h.encode_contents()).strip() + d["section"]="HTU" + d["url"]=url +# clogger.debug(d) + return {"article": d} + + +def fetarticle(url, raw): + sp=BeautifulSoup(raw) + d={} + h=sp.find("h1", {"itemprop": "name"}) + d["title"]=unicode(h.text).strip() + h=sp.find("div", {"itemprop": "articleBody"}) + if h is not None: + d["text"]=(h.encode_contents()).strip() + else: + d["text"]="" + d["url"]=url + h=sp.find("span", {"itemprop": "author"}) + if h is not None: + d["author"]=h.text.strip() + h=sp.find("span", {"itemprop": "articleSection"}) + if h is not None: + d["section"]= "FET - " + h.text.strip() + + h=sp.find("span", {"itemprop": "datePublished"}) + if h is not None: + d["published"]=parse(h.encode_contents().strip()) + + h=sp.find("meta", {"property": "og:image"}) + if h is not None: + d["image"]=h.attrs["content"] + d["image2"]=d["image"] +# hh=sp.find_all("div", {"class":"media"}) +# for h in hh: +# if h is not None: +# h=h.find("div", {"class": "pull-left"}) +# if h is not None: +# h=h.find("a") +# if h is not None: +# d["image2"]=downloadfile(fix_link(url,h.attrs["href"])) + return {"article": d} + + +def fsarcharticle(url, raw): + sp=BeautifulSoup(raw) + d={} + h=sp.find("h1", {"class": "title"}) + if h is not None: + d["title"]=h.text.strip() + d["url"]=url + d["published"]=None + h=sp.find("article") + if h is not None: + h=h.find("div", {"class": "content"}) + d["text"]=h.encode_contents().strip() + h=sp.find("article") + if h is not None: + h=h.find("h1", {"class": "title"}) + if h is not None: + d["title"]=h.text.strip() + else: + d["title"]="" + d["image"]="" + d["sourcetype"]="fsarcharticle" + d["section"]="fsarch" + d["author"]=None + return {"article": d} + +def fetindex(url, raw): + if raw is None: + raise Error +# clogger.debug("compile_fetindex: "+str(url)) + html=BeautifulSoup(raw) + h = html.find("li", {"class": "next_page" }) + if h is not None: + nl=h.find("a") + nl=fix_link(url,nl.attrs["href"]) + else: + nl=None + h= html.find("ul", {"id": "neuigkeiten"}) + al = [] + if h is not None: + links=h.find_all("a") + for t in links: + al.append(t.attrs["href"]) + return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" } + +def fsarchindex(url, raw): + if raw is None: + raise Error + html=BeautifulSoup(raw) + h= html.find("article") + print unicode(h) + 
links=h.find_all("a") + al = [] + fl=[] + for t in links: + url=t.attrs["href"] + if re.search("fachschaftarchitektur\.at", url): + al.append(t.attrs["href"]) + if re.search("facebook\.com/events", url): + fl.append(t.attrs["href"]) + + return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"} + + +def fsbizindex(url, raw): + if raw is None: + raise Error + print "compile_fsbizindex" + html=BeautifulSoup(raw) + h= html.find("section", {"id": "primary"}) + links=h.find_all("h1", {"class": "entry-title"}) + al = [] + for t in links: + + al.append(t.find("a").attrs["href"]) + return {"url": url,"article_links": al,"objecttype": "index"} + + + + +def fbfeed(url, raw): + js = json.loads(raw) + arts=[] + u=urlparse.urlparse(url) + for m in js["data"]: + aa={} + aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"","")) + aa["published"] =parse(m["created_time"]) + if m.has_key("message")==True: + aa["text"] = m["message"] + else: + try: + h=graph.get_object(id=m["id"].split("_")[1]) + if h.has_key("description"): + aa["text"]=h["description"] + else: + aa["text"]=json.dumps() + except GraphAPIError: + aa["text"]="" + if m.has_key("story")==True: + aa["title"] = m["story"] + else: + aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M") + aa["section"]="Facebook: "+u[1] + arts.append(aa) + return {"url": url, "next_page": js["paging"]["next"],"articles": arts} + +def fsmbindex(url, raw): + if raw is None: + raise Error + html=BeautifulSoup(raw) + h= html.find("a",{"class": "next"}) + if h is not None: + np=h.attrs["href"] + else: + np=None + h=html.find("div", {"id": "main"}).find("div", {"class": "inside"}).find("div", {"class": "mod_newslist"}) + if h is not None: + ats=h.find_all("div",{"class": "block"}) + articles=[] + for a in ats: + aa={} + h=a.find("h3") + if h is not None: + aa["title"] = h.text.strip() + h=a.find("div", {"class": "ce_text"}) + if h is not None: + aa["text"] = (h.encode_contents()).strip() + aa["info"]=[] + hh=a.find_all("p", {"class": "info"},recursive=False) + for h in hh: + aa["info"].append(unicode(h.text)) + if re.search(r'von', str(h)): + h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text)) + aa["published"] =parse(h1.strip()) + aa["author"]=re.sub(r'^.*von(.*)$', r'\1',unicode(h.text)).strip() #h.text + "--" #+ re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',hh) + aa["section"]="FSMB" + articles.append(aa) + return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"} + +compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed} + +compiler = cfg.compiler +for i in compiler: + compiler[i]=eval(compiler[i]) + + + +article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"} diff --git a/compiler/fetching.py b/compiler/fetching.py new file mode 100644 index 0000000..522278f --- /dev/null +++ b/compiler/fetching.py @@ -0,0 +1,67 @@ +from requests import session +s=session() +from src import package_directory, download_path,cfg +from os import path, makedirs +import os +import json +from gevent import spawn +from src import clogger +from src.fb import graph +from hashlib import md5 +import errno +import urlparse +def 
announce_articleid(id): + for u in cfg.announcearticle_url: + s.get( u % id) + +def downloadfile(url): + relative_name=path.join("downloads",str(md5(url).hexdigest()),url.split('/')[-1]) + local_filename = path.join(download_path,relative_name) + if not os.path.exists(os.path.dirname(local_filename)): + try: + os.makedirs(os.path.dirname(local_filename)) + except OSError as exc: # Guard against race condition + if exc.errno != errno.EEXIST: + raise + if not path.exists(local_filename): + spawn(fetch_load_file, url, local_filename) + return relative_name + +from models import CrawlCache +from datetime import datetime, timedelta + + + + +def fetch_page(furl): + current_time = datetime.utcnow() + ten_weeks_ago = current_time - timedelta(days=cfg.cache_days) + u=urlparse.urlparse(furl) + if u[0] == '': + furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4])) + cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first() + if cc is None: + clogger.debug("fetching url: "+ str(furl)) + if u[0]=='fb': + tx = json.dumps(graph.get_object(id=u[1]+u[2])) + else: + tx=s.get(furl).text + CrawlCache.store(furl,tx) + else: + #if furl is not None: +# clogger.debug("cache hit") + tx=cc.raw + return tx + +def fetch_load_file(furl, path): + try: + clogger.info("Downloading "+ str(furl)) + r = s.get(furl, stream=True) + f = open(path, 'wb') + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + f.close() + except Exception, e: + #clogger.error("Error Occured during fetching:"+str(furl)) + clogger.error(e,exc_info=True) diff --git a/compiler/fixing.py b/compiler/fixing.py new file mode 100644 index 0000000..e835844 --- /dev/null +++ b/compiler/fixing.py @@ -0,0 +1,37 @@ +from bs4 import BeautifulSoup +from urlparse import urlparse, urlunparse, urljoin +from fetching import downloadfile +import bleach + +def fix_link(url, link): + r= urlparse(link) + if r.scheme is None or r.scheme == '': + return urljoin(url,link) + else: + return link + +def fix_file(url, link): + u=fix_link(url,link) + return downloadfile(u) + +def load_file(url, link): + return fix_file(url,link) + + +def fix_html(html, baseurl): + html=bleach.clean(html, tags=['b','p','span','a','img','div','br','strong','ul','li'], strip=True) + sp=BeautifulSoup(html) + images=sp.find_all("img") + for t in images: + if "src" in t.attrs and t.attrs["src"] is not None: + t.attrs["src"]=fix_file(baseurl,t.attrs["src"]) + links=sp.find_all("a") + for t in links: + if "href" in t.attrs: + t.attrs["href"]=fix_link(baseurl, t.attrs["href"]) + for t in sp.find_all("script"): + t.extract() + b=sp.find("base") + if b is not None: + b.attrs["href"]="" + return sp diff --git a/compiler/models.py b/compiler/models.py new file mode 100644 index 0000000..e774590 --- /dev/null +++ b/compiler/models.py @@ -0,0 +1,75 @@ +from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text +from datetime import datetime +from src.database import Base2 +from src.database import db_session2 +from mqueues import put_fetch_queue +from marshmallow import Schema,fields,ValidationError +import json +import flask + +def add_url(tpe, url): + cu=CrawlUrl.find_or_create(tpe,url) + db_session2.add(cu) + db_session2.commit() + cu.schedule() + + +class CrawlUrlSchema(Schema): + id=fields.Integer() + tpe=fields.String() + url=fields.String() + last_fetched=fields.DateTime() + fetched = fields.DateTime() + +class CrawlUrl(Base2): + __tablename__='crawlurls' + id = Column(Integer, 
primary_key=True) + tpe=Column(String(250)) + url = Column(String(250)) + last_fetched = Column(DateTime) + def fetched(self): + CrawlCache.query.find(CrawlCache.url==self.url).first() + @classmethod + def find_or_create(self, tpe, url): + aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first() + if aa is None: + aa=CrawlUrl(tpe,url) + return aa + def schedule(self): + put_fetch_queue((0, self.tpe, self.url)) + def __init__(self, tpe, url): + self.url=url + self.tpe=tpe + def __json__(self): + return CrawlUrlSchema().dump(self)[0] + +class CrawlCacheSchema(Schema): + id=fields.Integer() + raw=fields.String() + url=fields.String() + fetched=fields.DateTime() + +class CrawlCache(Base2): + __tablename__='crawlcache' + id = Column(Integer, primary_key=True) + url=Column(String(250)) + fetched=Column(DateTime) + raw=Column(Text) + + def __init__(self, url,rw): + self.url=url + self.raw=rw + self.fetched=datetime.utcnow() + def __json__(self): + return CrawlCacheSchema().dump(self) + + @classmethod + def store(cls, url, rw): + cc=CrawlCache(url,rw) + db_session2.add(cc) + db_session2.commit() + + + + +#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, CrawlUrl) else None) diff --git a/compiler/mprocess.py b/compiler/mprocess.py new file mode 100644 index 0000000..86062bb --- /dev/null +++ b/compiler/mprocess.py @@ -0,0 +1,74 @@ +from src import clogger # Logger for crawler +from src.models import Article # Article model +from datetime import datetime +from src.database import db_session +from mqueues import fetch_queue, compile_queue, put_fetch_queue +from fetching import fetch_page, downloadfile, announce_articleid +from fixing import fix_html, fix_file + +from compiler import article_types +from fixing import fix_link +# process article expects an hash with raw data for the article and puts it into an +# article object stored in the database it is intended to prevent dublicates + +def is_article_hash(h): + return "text" in h and "url" in h and "sourcetype" in h and "section" in h + +def process_article(art): + if not is_article_hash(art): + clogger.error("Invalid article hash:" + str(art)) + aa=None + else: + art["text"]=fix_html(art["text"],art["url"]) + if "image" in art: + art["image"]=fix_file(art["url"], art["image"]) + clogger.info(art) + aa = Article.from_hash(art) + aa.process_hash(art) + aa.last_fetched=datetime.now() + aa.sourcetype=art["sourcetype"] + db_session.add(aa) + db_session.commit() + clogger.debug("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8"))) +# announce_articleid(aa.id) + return aa + +# process a single found url +def process_url(url,tpe, parent_url): + #clogger.debug("process URL of type "+ tpe + ": " + url) + if parent_url is not None: + url=fix_link(parent_url, url) + put_fetch_queue((0,tpe,url)) + + +# process a url list +def process_urllist(urllist, tpe, parent_url): + for u in urllist: + process_url(u,tpe, parent_url) + + +def do_process(tpe, cont): + urllist=[] +# clogger.debug("process :" + str(cont)) + if "article_links" in cont: + process_urllist(cont["article_links"], article_types[tpe], cont["url"]) + if "index_links" in cont: + process_urllist(cont["index_links"], tpe , cont["url"]) + + if "next_page" in cont and cont["next_page"] is not None: + process_url(cont["next_page"],tpe, cont["url"]) + + if "article" in cont: + art=cont["article"] + art["sourcetype"]=tpe + process_article(art) + + if "articles" in cont: + clogger.debug("articles") + for a in cont["articles"]: + 
if "title" in a: + a["sourcetype"]=tpe + if a.has_key("url")==False: + a["url"]=cont["url"] + process_article(a) + return diff --git a/compiler/mqueues.py b/compiler/mqueues.py new file mode 100644 index 0000000..b87c4ef --- /dev/null +++ b/compiler/mqueues.py @@ -0,0 +1,8 @@ +from gevent.queue import Queue, JoinableQueue +fetch_queue = Queue() +compile_queue = Queue() +process_queue = Queue() + +def put_fetch_queue(o): + fetch_queue.put(o) + diff --git a/compiler/mworker.py b/compiler/mworker.py new file mode 100644 index 0000000..b623978 --- /dev/null +++ b/compiler/mworker.py @@ -0,0 +1,58 @@ + +from mqueues import fetch_queue, compile_queue, process_queue +from compiler import do_compile +from mprocess import do_process +from fetching import fetch_page +from gevent import spawn +from itertools import repeat +from src import clogger +def start_workers(f,c,p): + for _ in range(f): + clogger.debug("spawn fetchworker") + spawn(work_fetch) + for _ in range(c): + spawn(work_compile) + for _ in range(p): + spawn(work_process) + +def work_fetch(): + while True: + run_fetch() + +def work_process(): + while True: + run_process() +def work_compile(): + while True: + run_compile() + + +def queue_url(tpe, url): + fetch_queue.put((0,tpe,url)) + + +# fetch a page from the url list +def run_fetch(): + tc, tpe, url = fetch_queue.get() + if tpe is not "dummyarticle" and tpe is not "dummyindex": + rw=fetch_page(url) + else: + rw="
dummytext
" + compile_queue.put((0, tpe, {"url": url, "sourcetype": tpe, "raw": rw})) + return rw + # fetch_queue.task_done() + +#comile something from the compile list +def run_compile(): + tc,tpe,h = compile_queue.get() + h=do_compile(tpe,h) + process_queue.put((0,tpe, h)) + return h + # compile_queue.task_done() + +def run_process(): + tc,tpe,h = process_queue.get() + do_process(tpe, h) + return h +# process_queue.task_done() + diff --git a/compiler/views.py b/compiler/views.py new file mode 100644 index 0000000..0cfbbad --- /dev/null +++ b/compiler/views.py @@ -0,0 +1,146 @@ +from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request +compiler_pages = Blueprint('compiler', __name__, + template_folder='.') + +from src.database import db_session2,init_db,read_json,init_db2 +from .models import CrawlUrl +from .models import CrawlCache, CrawlCacheSchema +from .models import CrawlUrlSchema +from src import clogger +from src.articles import Article +#import mworker +import flask +import json +import mworker + +from compiler import do_compile +from fetching import fetch_page + +#flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Article,CrawlUrl)) else None) + +@compiler_pages.route("/") +@compiler_pages.route("") +@compiler_pages.route(".json") +def index(): + status="For documentation goto /doc" + return jsonify(status=status) + +@compiler_pages.route("/doc") +@compiler_pages.route("/doc.json") +def doc(): + return render_template("README") +# return jsonify(status=render_template("README")) +# + + +@compiler_pages.route("/initdb") +@compiler_pages.route("/initdb.json") +def initdb_json(): + init_db() # initialisiere Datenbank + status="Datenbank Neu initialisiert" + return jsonify(status=status) + +@compiler_pages.route("/initdb2") +@compiler_pages.route("/initdb2.json") +def initdb_json2(): + init_db2() # initialisiere Datenbank + status="Datenbank Neu initialisiert" + return jsonify(status=status) + +@compiler_pages.route("/start") +@compiler_pages.route("/start.json") +def start_json(): + mworker.start_workers(1,1,1) # initialisiere Datenbank + status="Worker gestartet" + return jsonify(status=status) + + +@compiler_pages.route("/urls") +@compiler_pages.route("/urls.json") +def urls_index_json(): + # Lade Alle Urls + status=CrawlUrl.query.all() + return jsonify(urls=status) + +# show an existing CrawlUrl +@compiler_pages.route("/urls/") +@compiler_pages.route("/urls/.json") +def urls_json(id): + # Lade Alle Urls + status=CrawlUrl.query.get(id) + cc=CrawlCache.query.filter(CrawlCache.url==status.url).first() + return jsonify(urls=status, cache=cc.__json__()) + +# que an existing CrawlUrl for fetching +@compiler_pages.route("/urls//que") +@compiler_pages.route("/urls//que.json") +def urls_que_json(id): + # Lade Alle Urls + cu=CrawlUrl.query.get(id) + mworker.queue_url(cu.tpe, cu.url) + cc=CrawlCache.query.filter(CrawlCache.url==cu.url) + mworker.start_workers(1,1,1) # initialisiere Datenbank + status="Worker gestartet" + return jsonify(urls=cu, cache=cc) + + +# que an existing CrawlUrl for fetching +@compiler_pages.route("/urls//test") +@compiler_pages.route("/urls//test.json") +def urls_test_json(id): + # Lade Alle Urls + cu=CrawlUrl.query.get(id) + rw=fetch_page(cu.url) + h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw} + h2=do_compile(cu.tpe, h) + return jsonify(urls=cu,hs=h2,rw=rw) + + + + +@compiler_pages.route("/debug",methods=['GET','PUT']) +def debug(): + status="did nothing" + js=read_json(request) + 
clogger.info(request.get_json()) + if js["cmd"] == "runfetch": + mworker.run_fetch() + status="fetched something" + if js["cmd"] == "que": + cu = CrawlUrl.query.get(js["id"]) + mworker.queue_url(cu.tpe, cu.url) + status= mworker.run_fetch() + if js["cmd"] == "comp": + status=mworker.run_compile() + if js["cmd"]=="process": + status=mworker.run_process() + return jsonify(status=status) + +@compiler_pages.route("/debugurl") +def debugurl(): + s=CrawlUrlSchema() + status=CrawlUrl.query.all() + return jsonify(status=status) + + +@compiler_pages.route("/urls",methods=['POST']) +def add_urls(): + # Lese Daten + js =read_json(request) + # clogger.info(js) + # Finde oder Erzeuge Url in der Datenbank + url=CrawlUrlSchema().load(js["url"]) + clogger.info(url) + url=CrawlUrl.find_or_create(url.data["tpe"], url.data["url"]) + db_session2.add(url) + db_session2.commit() + return jsonify(url=url, kk=js) + +@compiler_pages.route("/urls/",methods=['DELETE']) +@compiler_pages.route("/urls.json",methods=['DELETE']) +def delete(id): + cu=CrawlUrl.query.get(id) + if cu != None: + db_session2.delete(cu) + db_session2.commit() + return jsonify(url={}) diff --git a/crawler/__init__.py b/crawler/__init__.py new file mode 100644 index 0000000..b31d450 --- /dev/null +++ b/crawler/__init__.py @@ -0,0 +1,4 @@ + + +def init(): + return " " diff --git a/database.py b/database.py new file mode 100644 index 0000000..bae434d --- /dev/null +++ b/database.py @@ -0,0 +1,55 @@ +from sqlalchemy import create_engine +from sqlalchemy.orm import scoped_session, sessionmaker +from sqlalchemy.ext.declarative import declarative_base +from src import package_directory,clogger, cfg +from os import path +import json +#engine = create_engine('sqlite:////home/andreas/www/crawler/test.db', convert_unicode=True) + +if cfg.get("db_path")==None or cfg.get("db_path").strip()=="": + db_path=package_directory +else: + db_path=cfg.get("db_path") + +db_mainfile=cfg.get("db_mainfile") +if db_mainfile == None or db_mainfile.strip()=="": + db_mainfile="../srctest.db" + +db_urlfile=cfg.get("db_mainfile") +if db_urlfile == None or db_urlfile.strip()=="": + db_urlfile="../srctest_cu.db" + + +engine = create_engine('sqlite:///'+ path.join(db_path,db_mainfile), convert_unicode=True) + +db_session = scoped_session(sessionmaker(autocommit=False, + autoflush=False, + bind=engine)) + +engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True) + +db_session2 = scoped_session(sessionmaker(autocommit=False, + autoflush=False, + bind=engine2)) + +Base = declarative_base() +Base.query = db_session.query_property() +Base2 = declarative_base() +Base2.query = db_session2.query_property() + +def read_json(rq): + js=rq.get_json() + clogger.info(rq.data) + if js is None: + js=rq.form.to_dict() + if js=={} and rq.data != "": + js=json.loads(rq.data) + return js + +def init_db(): + import src.models + Base.metadata.create_all(bind=engine) + +def init_db2(): + from .compiler.models import CrawlUrl, CrawlCache + Base2.metadata.create_all(bind=engine2) diff --git a/fb.py b/fb.py new file mode 100644 index 0000000..30b1828 --- /dev/null +++ b/fb.py @@ -0,0 +1,4 @@ +from src import cfg +import facebook + +graph = facebook.GraphAPI(access_token=cfg.fb_token, version='2.3') diff --git a/meta.py b/meta.py new file mode 100644 index 0000000..53528c3 --- /dev/null +++ b/meta.py @@ -0,0 +1,21 @@ +import os +package_directory = os.path.dirname(os.path.abspath(__file__)) +from config import Config +import logging +import sys + +cfg = 
Config(file(os.path.join(package_directory, 'config.cfg'))) +#--------------- Logging + + +file_handler=logging.FileHandler(cfg.logfile) +file_handler.setLevel(logging.INFO) +std_handler=logging.StreamHandler(stream=sys.stdout) +std_handler.setLevel(logging.DEBUG) + +lg=logging.getLogger('mylogger') +lg.setLevel(logging.DEBUG) +lg.addHandler(file_handler) +lg.addHandler(std_handler) + +#---------------- diff --git a/models.py b/models.py new file mode 100644 index 0000000..2205562 --- /dev/null +++ b/models.py @@ -0,0 +1,4 @@ + +from .articles.model import Article +from .sections.model import Section +from .compiler.models import CrawlUrl, CrawlCache diff --git a/sections/__init__.py b/sections/__init__.py new file mode 100644 index 0000000..745c2cd --- /dev/null +++ b/sections/__init__.py @@ -0,0 +1 @@ +from .model import Section diff --git a/sections/model.py b/sections/model.py new file mode 100644 index 0000000..2d289f3 --- /dev/null +++ b/sections/model.py @@ -0,0 +1,44 @@ +from sqlalchemy import Column, Integer, String, Boolean, DateTime, Text, ForeignKey +from sqlalchemy.orm import relationship + +from datetime import datetime +from src.database import Base,db_session +from marshmallow import Schema, fields + +import json +import flask + +#from src.articles import Article + +class SectionSchema(Schema): + id=fields.Integer() + foreign_name=fields.String() + name=fields.String() + +class Section(Base): + __tablename__ = 'sections' + id = Column(Integer, primary_key=True) + url = Column(String(250)) + crawlurl = Column(Integer) + foreign_name = Column(String(250),unique=True) + name=Column(String(250)) + group = Column(String(250)) + articles=relationship("Article", back_populates="section") + + def __json__(self): + return SectionSchema().dump(self)[0] + def __init__(self, url=None,fname=None): + self.url=url + self.foreign_name=fname + + @classmethod + def find_or_create(cls, fname): + s=Section.query.filter(Section.foreign_name==fname).first() + if s is None: + s=Section(fname) + db_session.add(s) + db_session.commit() + s.foreign_name=fname + db_session.add(s) + db_session.commit() + return s diff --git a/sections/views.py b/sections/views.py new file mode 100644 index 0000000..f82929a --- /dev/null +++ b/sections/views.py @@ -0,0 +1,37 @@ +from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request +section_pages = Blueprint('sections', __name__) +from .model import Section +from .model import SectionSchema +#import flask +from datetime import datetime +import json +from src import clogger + +from src.database import db_session, read_json +import flask + +@section_pages.route("/") +@section_pages.route("") +@section_pages.route(".json") +def index(): + sections=Section.query.all() + return jsonify(sections=sections) + +@section_pages.route("/",methods=['PUT']) +@section_pages.route("/.json",methods=['PUT']) +def update(id): + section=Section.query.get(id) + clogger.info(request.data) + a=request.get_json() + section.text=a["text"] + db_session.commit() + return jsonify(section=section) + + +@section_pages.route("/",methods=['GET']) +@section_pages.route("/.json",methods=['GET']) +def get(id): + section=Section.query.get(id) + clogger.info(section) +# section=SectionSchema().dump(section)[0] + return jsonify(section=section,articles=section.articles) diff --git a/templates/home.html b/templates/home.html new file mode 100644 index 0000000..f3e333e --- /dev/null +++ b/templates/home.html @@ -0,0 +1 @@ +
Hello World
diff --git a/users/users.py b/users/users.py new file mode 100644 index 0000000..55896b3 --- /dev/null +++ b/users/users.py @@ -0,0 +1,19 @@ + +class User(object): + def __init__(self, id, username, password): + self.id = id + self.username = username + self.password = password + + def __str__(self): + return "User(id='%s')" % self.id + +user = User(1, 'user', 'password') +def authenticate(username, password): + if username == user.username and password == user.password: + return user + +def identity(payload): + return user + +
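
Usage sketch (not part of the patch): assuming the Flask app from __init__.py is served at http://localhost:5000, a crawl source could be registered and the workers started through the compiler blueprint roughly as below. The host, port and feed address are placeholders, and "fschfeed" is just one of the type names mentioned in compiler/compiler.py (the effective type-to-compiler mapping is loaded from config.cfg). Note that CrawlUrlSchema in compiler/models.py names the type field "tpe", so that key is used here rather than the "type" shown in compiler/README.

import requests

BASE = "http://localhost:5000/compiler"  # assumed host/port; blueprint prefix from __init__.py

# Register a source to monitor; the schema expects the keys "tpe" and "url".
requests.post(BASE + "/urls",
              json={"url": {"tpe": "fschfeed", "url": "http://example.org/feed.rss"}})

# Spawn one fetch, one compile and one process worker, then list the monitored urls.
requests.get(BASE + "/start")
print(requests.get(BASE + "/urls.json").json())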