From f475364213678f204a386751b4fabfa26f1077f7 Mon Sep 17 00:00:00 2001
From: Andreas Stephanides
Date: Sun, 15 Jan 2017 14:43:02 +0100
Subject: [PATCH] fbfeed + sectionreset

---
 articles/views.py         |  4 ++--
 compiler/comp/__init__.py |  1 +
 compiler/comp/fb.py       | 50 +++++++++++++++++++++++++++++++++++++++
 compiler/compiler.py      | 31 +----------------------
 compiler/fetching.py      |  4 ++--
 compiler/views.py         | 38 +++++++++++++++++++++++++++--
 sections/views.py         |  3 +++
 7 files changed, 95 insertions(+), 36 deletions(-)
 create mode 100644 compiler/comp/fb.py

diff --git a/articles/views.py b/articles/views.py
index 7202b4b..b31772f 100644
--- a/articles/views.py
+++ b/articles/views.py
@@ -15,12 +15,12 @@ import flask
 #flask.json.JSONEncoder.default = lambda self,obj: ((ArticleSchema().dump(obj)[0]) if isinstance(obj, Article) else None)
 flask.json.JSONEncoder.default = lambda self,obj: ((obj.__json__()) if isinstance(obj, (Base, Article,CrawlUrl)) else None)
 
-
+import controller
 @article_pages.route("/")
 @article_pages.route("")
 @article_pages.route(".json")
 def index():
-    articles=Article.query.all()
+    articles=controller.get_all()
     return jsonify(articles=articles)
 
 @article_pages.route("/<int:id>",methods=['PUT'])
diff --git a/compiler/comp/__init__.py b/compiler/comp/__init__.py
index 0a7c135..0abc57b 100644
--- a/compiler/comp/__init__.py
+++ b/compiler/comp/__init__.py
@@ -1 +1,2 @@
 from rss import rssfeed
+from fb import fbfeed
diff --git a/compiler/comp/fb.py b/compiler/comp/fb.py
new file mode 100644
index 0000000..9b987dc
--- /dev/null
+++ b/compiler/comp/fb.py
@@ -0,0 +1,50 @@
+from dateutil.parser import parse
+from datetime import datetime
+import re
+import urlparse
+from src.fb import graph
+from facebook import GraphAPIError
+import json
+
+
+def fbfeedelement(h):
+    art={}
+    art["text"]=""
+    if h.has_key("story"):
+        art["text"]=art["text"]+h["story"]+"<br/>"
" + if h.has_key("attachments") and len(h["attachments"]["data"])>0: + for a in h["attachments"]["data"]: + if a.has_key("media") and a["media"].has_key("image") and a["media"]["image"].has_key("src") and not art.has_key("image"): + art["image"]=a["media"]["image"]["src"] + if a.has_key("title"): + art["title"]=a["title"] + if a.has_key("type") and a["type"] in ["event"]: + art["url"]=a["url"] + if a.has_key("description"): + art["text"]=art["text"]+a["description"]+"
" + + + if not art.has_key("title") and h.has_key("story"): + art["title"]=h["story"] + if h.has_key("message"): + art["text"]=art["text"]+h["message"] + art["published"] =parse(h["created_time"]) + if not art.has_key("url"): + art["url"]=urlparse.urlunsplit(("http","www.facebook.at",h["id"],"","")) + return art + + +def fbfeed(url, raw): + js = json.loads(raw) + arts=[] + u=urlparse.urlparse(url) + for m in js["data"]: + aa=fbfeedelement(m) + if not aa.has_key("title"): + aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M") + aa["section"]="Facebook: "+u[1] + arts.append(aa) + nx=None + if js.has_key("paging") and js["paging"].has_key("next"): + nx=js["paging"]["next"] + return {"url": url, "next_page": nx,"articles": arts} diff --git a/compiler/compiler.py b/compiler/compiler.py index d227330..69a6f91 100644 --- a/compiler/compiler.py +++ b/compiler/compiler.py @@ -6,9 +6,7 @@ from datetime import datetime import re import urlparse from src import clogger, cfg -from src.fb import graph from fixing import fix_link -from facebook import GraphAPIError import feedparser #from fetching import downloadfile @@ -26,7 +24,7 @@ def do_compile(tpe, cont): return cont from comp import rssfeed - +from comp import fbfeed def dummyarticle(url, raw): return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}} @@ -213,33 +211,6 @@ def fsbizindex(url, raw): -def fbfeed(url, raw): - js = json.loads(raw) - arts=[] - u=urlparse.urlparse(url) - for m in js["data"]: - aa={} - aa["url"]=urlparse.urlunsplit(("http","www.facebook.at",m["id"],"","")) - aa["published"] =parse(m["created_time"]) - if m.has_key("message")==True: - aa["text"] = m["message"] - else: - try: - h=graph.get_object(id=m["id"].split("_")[1]) - if h.has_key("description"): - aa["text"]=h["description"] - else: - aa["text"]=json.dumps() - except GraphAPIError: - aa["text"]="" - if m.has_key("story")==True: - aa["title"] = m["story"] - else: - aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M") - aa["section"]="Facebook: "+u[1] - arts.append(aa) - return {"url": url, "next_page": js["paging"]["next"],"articles": arts} - def fsmbindex(url, raw): if raw is None: raise Error diff --git a/compiler/fetching.py b/compiler/fetching.py index 522278f..8201de4 100644 --- a/compiler/fetching.py +++ b/compiler/fetching.py @@ -40,10 +40,10 @@ def fetch_page(furl): if u[0] == '': furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4])) cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first() - if cc is None: + if cc is None or u[0]=='fb': # no caching for Facebook clogger.debug("fetching url: "+ str(furl)) if u[0]=='fb': - tx = json.dumps(graph.get_object(id=u[1]+u[2])) + tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments")) else: tx=s.get(furl).text CrawlCache.store(furl,tx) diff --git a/compiler/views.py b/compiler/views.py index 16aab69..424be49 100644 --- a/compiler/views.py +++ b/compiler/views.py @@ -1,13 +1,14 @@ -from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request +from flask import Blueprint, jsonify, render_template, abort, redirect, url_for, request, Response compiler_pages = Blueprint('compiler', __name__, template_folder='.') -from src.database import db_session2,init_db,read_json,init_db2 +from src.database import db_session2,init_db,read_json,init_db2,db_session from 
 from .models import CrawlCache, CrawlCacheSchema
 from .models import CrawlUrlSchema
 from src import clogger
 from src.articles import Article
+from src.sections import Section
 #import mworker
 import flask
 import json
@@ -63,6 +64,15 @@ def urls_index_json():
     status=CrawlUrl.query.all()
     return jsonify(urls=status)
 
+@compiler_pages.route("/urls.lst")
+def urls_lst():
+    cus=CrawlUrl.query.all()
+    urls=map((lambda cu: ("id %d %s " % (cu.id, cu.url))),cus)
+    urls=map((lambda u: u+"\n"),urls)
+    return Response(urls,mimetype='text/plain')
+
+
+
 # show an existing CrawlUrl
 @compiler_pages.route("/urls/<int:id>")
 @compiler_pages.route("/urls/<int:id>.json")
@@ -82,6 +92,18 @@ def urls_que_json(id):
     return jsonify(urls=cu, cache=cc)
 
 
+@compiler_pages.route("/urls/que.lst")
+def urls_que_lst():
+    # Load all URLs
+    # cu=urls_que(id)
+    #cc=CrawlCache.query.filter(CrawlCache.url==cu.url)
+    cus=CrawlUrl.query.all()
+    urls=map((lambda cu: url_for('.urls_que_json',id=cu.id)),cus)
+    if request.values.has_key('url'):
+        urls=map((lambda u: request.values["url"]+ u),urls)
+    urls=map((lambda u: u+"\n"),urls)
+    return Response(urls,mimetype='text/plain')
+
 
 # que an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<int:id>/test")
@@ -137,3 +159,15 @@ def delete(id):
     db_session2.delete(cu)
     db_session2.commit()
     return jsonify(url={})
+
+
+@compiler_pages.route("/section/<int:id>/reset",methods=['GET'])
+@compiler_pages.route("/section/<int:id>/reset.json",methods=['GET'])
+def reset(id):
+    section=Section.query.get(id)
+    clogger.info(section)
+    for a in section.articles:
+        db_session.delete(a)
+    db_session.commit()
+    section=Section.query.get(id)
+    return jsonify(section=section,articles=section.articles)
diff --git a/sections/views.py b/sections/views.py
index f82929a..14c6ab7 100644
--- a/sections/views.py
+++ b/sections/views.py
@@ -17,6 +17,7 @@ def index():
     sections=Section.query.all()
     return jsonify(sections=sections)
 
+
 @section_pages.route("/<int:id>",methods=['PUT'])
 @section_pages.route("/<int:id>.json",methods=['PUT'])
 def update(id):
@@ -35,3 +36,5 @@ def get(id):
     clogger.info(section)
     # section=SectionSchema().dump(section)[0]
     return jsonify(section=section,articles=section.articles)
+
+
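
Usage notes (illustrative sketches, not part of the diff above).

The new comp.fb.fbfeed compiler is pure parsing: it receives the already
fetched Graph API response as a string, so it can be exercised offline with a
canned payload. A minimal sketch in Python 2 (matching the codebase), assuming
the compiler package is importable; the page name, post id and all field
values are invented, shaped after the ?fields= list now requested in
fetching.py:

    import json
    from comp import fbfeed

    # Hypothetical Graph API feed response; every value is made up.
    payload = {
        "data": [{
            "id": "123_456",
            "created_time": "2017-01-15T14:43:02+0100",
            "story": "Example Page shared a link.",
            "message": "Hello world",
            "attachments": {"data": [{
                "title": "An attached link",
                "description": "Attachment text",
                "media": {"image": {"src": "http://example.org/img.jpg"}}
            }]}
        }],
        "paging": {"next": "https://graph.facebook.com/v2.8/123/feed?after=abc"}
    }

    res = fbfeed("fb://examplepage/feed", json.dumps(payload))
    # fbfeedelement concatenates story, attachment descriptions and message
    # into "text"; "next_page" carries the paging cursor, or None when the
    # payload has no "paging" entry.
    for art in res["articles"]:
        print art["title"], art["published"], art["section"]

The section reset endpoint can be smoke-tested over HTTP once the app is
running. A sketch assuming the compiler blueprint is mounted at the
application root on localhost:5000 (the mount point is not visible in this
patch) and a section with id 1 exists:

    import requests

    # Deletes every article of section 1, then returns the emptied section.
    r = requests.get("http://localhost:5000/section/1/reset.json")
    print r.json()["articles"]   # expected: []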