div. updates

andis committed on 2017-02-17 10:02:20 +01:00
commit b71803c050, parent bdfa16728d
14 changed files with 224 additions and 65 deletions

View File

@@ -5,7 +5,7 @@ import urlparse
from src.fb import graph
from facebook import GraphAPIError
import json
import gevent
def fbfeedelement(h):
art={}
@@ -39,11 +39,11 @@ def fbfeed(url, raw, params={}):
arts=[]
u=urlparse.urlparse(url)
for m in js["data"]:
aa=fbfeedelement(m)
if not aa.has_key("title"):
aa=fbfeedelement(m)
if not aa.has_key("title"):
aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
aa["section"]="Facebook: "+u[1]
arts.append(aa)
aa["section"]="Facebook: "+u[1]
arts.append(aa)
nx=None
if js.has_key("paging") and js["paging"].has_key("next"):
un=urlparse.urlparse(js["paging"]["next"])
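The fbfeed() hunk keeps one shape: every entry in the Graph API response becomes an article dict, entries without a title get one synthesized from the page host and the publish time, every article is filed under a "Facebook: <host>" section, and paging.next is picked up for the next crawl step. A minimal standalone sketch of that pattern; fbfeedelement's role and the field names come from the diff, while the function name and the Python 3 urllib import are assumptions (the real module is Python 2 and uses "import urlparse"):

from urllib.parse import urlparse

def collect_feed(url, js, feed_element):
    """Turn one Graph API result page into article dicts and return (articles, next_url)."""
    host = urlparse(url).netloc
    arts = []
    for m in js.get("data", []):
        aa = feed_element(m)   # assumed to return a dict with at least a "published" datetime
        if "title" not in aa:
            # fall back to "<host> at <timestamp>", exactly as the diff does
            aa["title"] = host + " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"] = "Facebook: " + host
        arts.append(aa)
    nxt = js.get("paging", {}).get("next")  # None when Facebook reports no further page
    return arts, nxt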

View File

@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
d["published"]=parse(pi["published"])
d["pi"]=pi
d["sourcetype"]="fscharticle"
d["section"]= "Fachschaft Chemie"
return {"article": d}

View File

@@ -1,6 +1,6 @@
from requests import session
s=session()
from src import package_directory, download_path,cfg
from src import package_directory, download_path,cfg, clogger
from os import path, makedirs
import os
import json
@@ -12,6 +12,7 @@ from src.database import db_session2
from models import CrawlUrl
import errno
import urlparse
from sqlalchemy.exc import OperationalError, InvalidRequestError
def announce_articleid(id):
for u in cfg.announcearticle_url:
s.get( u % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
from datetime import datetime, timedelta
def cleanup_cache():
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
CrawlCache.query.filter(CrawlCache.fetched<ten_weeks_ago).delete()
def get_cached_page(furl):
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
try:
cc= CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
except OperationalError:
db_session2.rollback()
cc=None
return cc
def fetch_page(furl):
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
u=urlparse.urlparse(furl)
cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
current_time = datetime.utcnow()
cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
if u[0] == '':
furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
if cc is None or u[0]=='fb': # no caching for Facebook
clogger.debug("fetching url: "+ str(furl))
if u[0]=='fb':
fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
if cc is None:
tx = json.dumps(graph.get_object(id=furl))
else:
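The central addition to the fetch layer is get_cached_page(): the CrawlCache lookup is wrapped so that an OperationalError (typically a dropped database connection) rolls the session back and counts as a cache miss instead of aborting the crawl, and fetch_page() now goes through this helper both for the original URL and for the rewritten fb: Graph API URL. A self-contained sketch of that pattern, with a generic lookup callable and session standing in for the project's CrawlCache model and db_session2:

from datetime import datetime, timedelta
from sqlalchemy.exc import OperationalError

def get_cached_page(furl, lookup, db_session, cache_days):
    """Return a cached row for furl that is younger than cache_days, or None.

    lookup(furl, cutoff) stands in for the CrawlCache query in the real code; any
    OperationalError is rolled back so the caller simply refetches the page.
    """
    cutoff = datetime.utcnow() - timedelta(days=cache_days)
    try:
        return lookup(furl, cutoff)
    except OperationalError:
        db_session.rollback()   # the session is unusable until rolled back
        return None

Facebook URLs (scheme fb) additionally ignore any cached row, and cleanup_cache() deletes rows older than twice cfg.cache_days so the cache table does not grow without bound.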

View File

@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
tpe=Column(String(250))
url = Column(String(250))
last_fetched = Column(DateTime)
__schema__=CrawlUrlSchema
__jsonid__='crawlurl'
__whiteattrs__=["id","tpe","url"]
__jsonattrs__=None
def fetched(self):
CrawlCache.query.find(CrawlCache.url==self.url).first()
@classmethod
def find_or_create(self, tpe, url):
aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
if aa is None:
aa=CrawlUrl(tpe,url)
aa=CrawlUrl({"tpe":tpe,"url": url})
return aa
def schedule(self):
put_fetch_queue((0, self.tpe, self.url))
def __init__(self, tpe, url):
self.url=url
self.tpe=tpe
# def __init__(self, tpe, url):
# self.url=url
# self.tpe=tpe
def __json__(self):
return CrawlUrlSchema().dump(self)[0]
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
url=Column(String(250))
fetched=Column(DateTime)
raw=Column(Text)
__schema__=CrawlCacheSchema
__jsonattrs__=None
__jsonid__='crawlcache'
__whiteattrs__= []
def __init__(self, url,rw):
self.url=url
self.raw=rw
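In models.py the hand-written CrawlUrl.__init__(tpe, url) is commented out in favour of the dict-taking constructor that the shared Base2 apparently provides, so find_or_create() now builds the instance from {"tpe": ..., "url": ...}; CrawlUrl and CrawlCache also carry the __schema__/__jsonid__/__whiteattrs__ attributes used by the JSON layer. The find-or-create idiom itself, rewritten as a standalone sketch on plain SQLAlchemy (the declarative base, session argument and keyword constructor here are stand-ins for the project's Base2 setup):

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class CrawlUrl(Base):
    __tablename__ = "crawl_urls"
    id = Column(Integer, primary_key=True)
    tpe = Column(String(250))
    url = Column(String(250))

def find_or_create(session, tpe, url):
    """Return the existing CrawlUrl for (tpe, url), or a new, not yet persisted one."""
    row = (session.query(CrawlUrl)
                  .filter(CrawlUrl.url == url, CrawlUrl.tpe == tpe)
                  .first())
    if row is None:
        row = CrawlUrl(tpe=tpe, url=url)   # the project's Base2 takes a single dict instead
    return row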

View File

@@ -5,6 +5,7 @@ from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file
#from src import app
from compiler import article_types
from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
aa.last_fetched=datetime.now()
aa.sourcetype=art["sourcetype"]
db_session.add(aa)
db_session.commit()
try:
db_session.commit()
except InvalidRequestError,e:
db_session.rollback()
clogger.error(e)
clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
db_session.close()
return aa
# app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
# db_session.close()
# announce_articleid(aa.id)
#
return aa
# process a single found url
def process_url(url,tpe, parent_url,params={}):
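process_article() now guards the commit: an InvalidRequestError (for example a session already poisoned by an earlier failure) is rolled back and reported through clogger instead of propagating, while the hunk also shuffles which logging and cleanup calls are active (the app.logger and announce_articleid lines end up commented out). The guard itself, sketched with a plain SQLAlchemy session and the standard logging module standing in for db_session and clogger:

import logging
from sqlalchemy.exc import InvalidRequestError

log = logging.getLogger("compiler")   # stands in for the project's clogger

def save_article(session, row):
    """Add the ORM row and commit, rolling back and logging instead of raising."""
    session.add(row)
    try:
        session.commit()
    except InvalidRequestError as e:
        session.rollback()
        log.error(e)
    else:
        log.info("Updated/Added Article %s: %s", row.id, row.title)
    finally:
        session.close()
    return row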

View File

@@ -45,7 +45,7 @@ def run_fetch():
tc, tpe, url, p= fetch_queue.get()
except ValueError:
tc, tpe, url= fetch_queue.get()
clogger.debug("fetched : "+url)
if tpe is not "dummyarticle" and tpe is not "dummyindex":
rw=fetch_page(url)
else:
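run_fetch() pulls items off the queue that may or may not carry a params element, logs the URL, and only calls fetch_page() for non-dummy types. A sketch of one iteration; note the sketch compares the type with `not in`, whereas the diff's `is not` tests object identity rather than string equality:

import logging

log = logging.getLogger("crawler")   # stands in for clogger

def handle_fetch_item(item, fetch_page):
    """Unpack one fetch-queue entry (with or without params) and fetch real URLs."""
    params = {}
    try:
        tc, tpe, url, params = item
    except ValueError:                 # older queue entries have only three fields
        tc, tpe, url = item
    log.debug("fetched : %s", url)
    if tpe not in ("dummyarticle", "dummyindex"):
        raw = fetch_page(url)          # hits the cache or downloads the page
    else:
        raw = None                     # dummy entries carry no page body
    return tc, tpe, url, params, raw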

View File

@@ -79,8 +79,8 @@ def urls_lst():
def urls_json(id):
# Load all URLs
status=CrawlUrl.query.get(id)
cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
return jsonify(urls=status, cache=cc.__json__())
# cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
return jsonify(urls=status)
# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
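The /urls/<id> JSON endpoint now returns only the CrawlUrl record; the CrawlCache lookup is commented out, so a URL without a cached page can no longer crash the view on cc.__json__(). Stripped down to a runnable sketch with an in-memory dict standing in for CrawlUrl.query.get():

from flask import Flask, jsonify

app = Flask(__name__)

# stand-in for the database lookup done by CrawlUrl.query.get(id) in the real view
FAKE_URLS = {1: {"id": 1, "tpe": "fscharticle", "url": "http://example.org/"}}

@app.route("/urls/<int:id>")
def urls_json(id):
    status = FAKE_URLS.get(id)
    # before this commit the response also carried cache=cc.__json__()
    return jsonify(urls=status)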