fbfeed + sectionreset

Andreas Stephanides
2017-01-15 14:43:02 +01:00
parent 449a278d58
commit f475364213
7 changed files with 95 additions and 36 deletions


@@ -15,12 +15,12 @@ import flask
 #flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
 flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,CrawlUrl)) else None)
+import controller
 @article_pages.route("/")
 @article_pages.route("")
 @article_pages.route(".json")
 def index():
-    articles=Article.query.all()
+    articles=controller.get_all()
     return jsonify(articles=articles)
 @article_pages.route("/<int:id>",methods=['PUT'])


@@ -1 +1,2 @@
 from rss import rssfeed
+from fb import fbfeed

compiler/comp/fb.py (new file, 50 lines added)

@@ -0,0 +1,50 @@
+from dateutil.parser import parse
+from datetime import datetime
+import re
+import urlparse
+from src.fb import graph
+from facebook import GraphAPIError
+import json
+
+def fbfeedelement(h):
+    art={}
+    art["text"]=""
+    if h.has_key("story"):
+        art["text"]=art["text"]+h["story"]+"<br>"
+    if h.has_key("attachments") and len(h["attachments"]["data"])>0:
+        for a in h["attachments"]["data"]:
+            if a.has_key("media") and a["media"].has_key("image") and a["media"]["image"].has_key("src") and not art.has_key("image"):
+                art["image"]=a["media"]["image"]["src"]
+            if a.has_key("title"):
+                art["title"]=a["title"]
+            if a.has_key("type") and a["type"] in ["event"]:
+                art["url"]=a["url"]
+            if a.has_key("description"):
+                art["text"]=art["text"]+a["description"]+"<br>"
+    if not art.has_key("title") and h.has_key("story"):
+        art["title"]=h["story"]
+    if h.has_key("message"):
+        art["text"]=art["text"]+h["message"]
+    art["published"] = parse(h["created_time"])
+    if not art.has_key("url"):
+        art["url"]=urlparse.urlunsplit(("http","www.facebook.at",h["id"],"",""))
+    return art
+
+def fbfeed(url, raw):
+    js = json.loads(raw)
+    arts=[]
+    u=urlparse.urlparse(url)
+    for m in js["data"]:
+        aa=fbfeedelement(m)
+        if not aa.has_key("title"):
+            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
+        aa["section"]="Facebook: "+u[1]
+        arts.append(aa)
+    nx=None
+    if js.has_key("paging") and js["paging"].has_key("next"):
+        nx=js["paging"]["next"]
+    return {"url": url, "next_page": nx,"articles": arts}


@@ -6,9 +6,7 @@ from datetime import datetime
 import re
 import urlparse
 from src import clogger, cfg
-from src.fb import graph
 from fixing import fix_link
-from facebook import GraphAPIError
 import feedparser
 #from fetching import downloadfile
@@ -26,7 +24,7 @@ def do_compile(tpe, cont):
     return cont
 from comp import rssfeed
+from comp import fbfeed
 def dummyarticle(url, raw):
     return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}
@@ -213,33 +211,6 @@ def fsbizindex(url, raw):
-def fbfeed(url, raw):
-    js = json.loads(raw)
-    arts=[]
-    u=urlparse.urlparse(url)
-    for m in js["data"]:
-        aa={}
-        aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"",""))
-        aa["published"] = parse(m["created_time"])
-        if m.has_key("message")==True:
-            aa["text"] = m["message"]
-        else:
-            try:
-                h=graph.get_object(id=m["id"].split("_")[1])
-                if h.has_key("description"):
-                    aa["text"]=h["description"]
-                else:
-                    aa["text"]=json.dumps()
-            except GraphAPIError:
-                aa["text"]=""
-        if m.has_key("story")==True:
-            aa["title"] = m["story"]
-        else:
-            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
-        aa["section"]="Facebook: "+u[1]
-        arts.append(aa)
-    return {"url": url, "next_page": js["paging"]["next"],"articles": arts}
 def fsmbindex(url, raw):
     if raw is None:
         raise Error


@@ -40,10 +40,10 @@ def fetch_page(furl):
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
     cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
-    if cc is None:
+    if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
-            tx = json.dumps(graph.get_object(id=u[1]+u[2]))
+            tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"))
         else:
             tx=s.get(furl).text
         CrawlCache.store(furl,tx)
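The crawler evidently stores Facebook sources under a custom fb: scheme, so u[0] selects the Graph API branch (and now also bypasses the cache) while u[1]+u[2] becomes the Graph API object path. A rough sketch of the split, with an invented page name; how the fb: URLs are created is not shown in this commit:

    import urlparse  # Python 2 stdlib, as used throughout this codebase

    # hypothetical crawl URL for a Facebook page feed
    u = urlparse.urlparse("fb://examplepage/feed")
    # u[0] -> 'fb'           selects the Graph API branch and skips the cache
    # u[1] -> 'examplepage'  netloc
    # u[2] -> '/feed'        path
    # graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments")
    # therefore requests
    #   examplepage/feed?fields=story,created_time,id,message,attachments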


@@ -1,13 +1,14 @@
-from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request
+from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request, Response
 compiler_pages = Blueprint('compiler', __name__,
                            template_folder='.')
-from src.database import db_session2,init_db,read_json,init_db2
+from src.database import db_session2,init_db,read_json,init_db2,db_session
 from .models import CrawlUrl
 from .models import CrawlCache, CrawlCacheSchema
 from .models import CrawlUrlSchema
 from src import clogger
 from src.articles import Article
+from src.sections import Section
 #import mworker
 import flask
 import json
@@ -63,6 +64,15 @@ def urls_index_json():
     status=CrawlUrl.query.all()
     return jsonify(urls=status)
+
+@compiler_pages.route("/urls.lst")
+def urls_lst():
+    cus=CrawlUrl.query.all()
+    urls=map((lambda cu: ("id %d %s " % (cu.id, cu.url))),cus)
+    urls=map((lambda u: u+"\n"),urls)
+    return Response(urls,mimetype='text/plain')
+
 # show an existing CrawlUrl
 @compiler_pages.route("/urls/<int:id>")
 @compiler_pages.route("/urls/<int:id>.json")
@@ -82,6 +92,18 @@ def urls_que_json(id):
     return jsonify(urls=cu, cache=cc)
+
+@compiler_pages.route("/urls/que.lst")
+def urls_que_lst():
+    # load all URLs
+    # cu=urls_que(id)
+    #cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
+    cus=CrawlUrl.query.all()
+    urls=map((lambda cu: url_for('.urls_que_json',id=cu.id)),cus)
+    if request.values.has_key('url'):
+        urls=map((lambda u: request.values["url"]+ u),urls)
+    urls=map((lambda u: u+"\n"),urls)
+    return Response(urls,mimetype='text/plain')
 # que an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<int:id>/test")
@@ -137,3 +159,15 @@ def delete(id):
     db_session2.delete(cu)
     db_session2.commit()
     return jsonify(url={})
+
+@compiler_pages.route("/section/<int:id>/reset",methods=['GET'])
+@compiler_pages.route("/section/<int:id>/reset.json",methods=['GET'])
+def reset(id):
+    section=Section.query.get(id)
+    clogger.info(section)
+    for a in section.articles:
+        db_session.delete(a)
+    db_session.commit()
+    section=Section.query.get(id)
+    return jsonify(section=section,articles=section.articles)
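The new plain-text listings and the section reset route can be exercised from a small script; a sketch assuming the app listens on localhost:5000 and the compiler blueprint is mounted at /compiler (neither is visible in this diff), with a made-up section id:

    import requests

    BASE = "http://localhost:5000/compiler"   # assumed mount point of compiler_pages

    # /urls.lst: one "id <id> <url>" line per CrawlUrl, served as text/plain
    print requests.get(BASE + "/urls.lst").text

    # /urls/que.lst: que links for every CrawlUrl; an optional ?url= value is
    # prepended to each generated path, e.g. to build absolute URLs for a worker
    print requests.get(BASE + "/urls/que.lst", params={"url": BASE}).text

    # /section/<id>/reset deletes every article of the section and returns the
    # now-empty section as JSON; note it is reachable via a plain GET
    print requests.get(BASE + "/section/7/reset.json").json()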


@@ -17,6 +17,7 @@ def index():
     sections=Section.query.all()
     return jsonify(sections=sections)
 @section_pages.route("/<int:id>",methods=['PUT'])
 @section_pages.route("/<int:id>.json",methods=['PUT'])
 def update(id):
@@ -35,3 +36,5 @@ def get(id):
     clogger.info(section)
     # section=SectionSchema().dump(section)[0]
     return jsonify(section=section,articles=section.articles)