From b71803c050a3f576590fc8f3b25779aea7ed1b51 Mon Sep 17 00:00:00 2001
From: andis
Date: Fri, 17 Feb 2017 10:02:20 +0100
Subject: [PATCH] div. updates

---
 __init__.py           | 92 +++++++++++++++++++++++++++----------------
 articles/model.py     | 15 +++++--
 articles/views.py     |  2 +-
 compiler/comp/fb.py   | 10 ++---
 compiler/comp/fsch.py |  1 +
 compiler/fetching.py  | 27 +++++++++----
 compiler/models.py    | 17 +++++---
 compiler/mprocess.py  | 14 +++++--
 compiler/mworker.py   |  2 +-
 compiler/views.py     |  4 +-
 database.py           |  9 +++--
 database_mbase.py     | 32 +++++++++++++++
 dump_urls.py          | 20 ++++++++++
 load_urls.py          | 44 +++++++++++++++++++++
 14 files changed, 224 insertions(+), 65 deletions(-)
 create mode 100644 dump_urls.py
 create mode 100644 load_urls.py

diff --git a/__init__.py b/__init__.py
index 0817fc4..5074144 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,40 +1,12 @@
 import os
 import sys
+import lockfile
+#from lockfile import LockFile
 package_directory = os.path.dirname(os.path.abspath(__file__))
 from config import Config
 cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
 #--------------- Logging
-import logging
-
-file_handler=logging.FileHandler(cfg.logfile)
-file_handler.setLevel(logging.DEBUG)
-stream_handler=logging.StreamHandler(sys.stdout)
-stream_handler.setLevel(logging.DEBUG)
-
-clt=logging.getLogger('mylogger')
-clt.setLevel(logging.DEBUG)
-clt.addHandler(file_handler)
-clt.addHandler(stream_handler)
-
-clogger=clt
-#----------------
-download_path=cfg.download_path
-
-lg=clt
-
-from gevent import spawn, monkey
-monkey.patch_all()
-from .compiler import start_workers
-#start_workers(1,1,1)
-
-
-if cfg.bot_active:
-    from src.bot import bot
-    #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
-    bot.message_loop()
-
-
 # Framework
@@ -43,16 +15,66 @@
 from flask import Flask, jsonify, render_template, redirect, request,send_from_d
 from flask_cors import CORS, cross_origin
 #Authentication
 from flask_jwt import JWT, jwt_required, current_identity
+import logging
+
+
+app = Flask(__name__)
+
+file_handler=logging.FileHandler(cfg.logfile)
+file_handler.setLevel(logging.DEBUG)
+stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
+
+CORS(app)
+clt=logging.getLogger('mylogger')
+clt.setLevel(logging.DEBUG)
+clt.addHandler(file_handler)
+clt.addHandler(stream_handler)
+lg=clt
+
+#clogger=clt
+#----------------
+
+#app.config['LOGGER_NAME']='mylogger'
+app.logger.setLevel(logging.DEBUG)
+app.logger.info("Server Started")
+app.logger.setLevel(logging.DEBUG)
+app.logger.addHandler(file_handler)
+app.logger.addHandler(stream_handler)
+
+clogger=app.logger
+
+
+
+
+download_path=cfg.download_path
+
 from src.models import Article,Section
 from src.users import authenticate, identity
 from datetime import datetime
-app = Flask(__name__)
-CORS(app)
-app.config['LOGGER_NAME']='mylogger'
-app.logger.setLevel(logging.DEBUG)
-app.logger.info("Server Started")
+
+from gevent import spawn, monkey
+monkey.patch_all()
+from .compiler import start_workers
+#start_workers(1,1,1)
+
+
+lock = lockfile.LockFile("/srv/crawlerapi/bot.lock")
+
+try:
+    if cfg.bot_active and not lock.is_locked():
+        lock.acquire()
+        from src.bot import bot
+        #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
+        bot.message_loop()
+except lockfile.Error:
+    clogger.info("Couldn't Lock the bot file")
+
+
+
 app.config['SECRET_KEY'] = 'super-secret'
 import flask
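The lockfile guard added to __init__.py above is meant to keep only one process running the bot message loop. As a rough illustration of the same pattern (not part of the patch), here is a minimal sketch built on the pylockfile API, with a hypothetical run_bot() standing in for importing src.bot and calling bot.message_loop(); unlike the patch it acquires with timeout=0 instead of checking is_locked(), and it releases the lock on exit.

# Illustrative sketch only, not part of the patch.
import lockfile

def start_bot_once(lock_path, run_bot):
    lock = lockfile.LockFile(lock_path)
    try:
        # timeout=0 fails immediately when another process holds the lock
        lock.acquire(timeout=0)
    except (lockfile.AlreadyLocked, lockfile.LockTimeout, lockfile.LockFailed):
        return False      # some other worker already runs the bot
    try:
        run_bot()         # blocking message loop
    finally:
        lock.release()    # allow a clean restart to reacquire the lock
    return True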
diff --git a/articles/model.py b/articles/model.py
index cb19507..9d6ff99 100644
--- a/articles/model.py
+++ b/articles/model.py
@@ -59,6 +59,9 @@ class FullArticleSchema(Schema):
     author=fields.String(allow_none=True)
     section_id=fields.Integer()
     sourcetype =fields.String()
+    organization_name=fields.String(dump_only=True)
+    organization_image=fields.String(dump_only=True)
+    organization_id=fields.Integer(dump_only=True)
     image =fields.String(allow_none=True)
 #    @post_load
 #    def make_article(self, data):
@@ -95,9 +98,15 @@ class Article(Base):
         self.title=title
         self.published_date=published_date
         self.first_fetched=datetime.now()
-    def __json__(self):
-        return ArticleSchema().dump(self)[0]
-
+#    def __json__(self):
+#        return ArticleSchema().dump(self)[0]
+    def organization_name(self):
+        return self.section.organization.name
+    def organization_image(self):
+        return self.section.organization.image
+    def organization_id(self):
+        return self.section.organization.id
+
 #    def dict(self):
 #        return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url}
diff --git a/articles/views.py b/articles/views.py
index b55d707..93aaba2 100644
--- a/articles/views.py
+++ b/articles/views.py
@@ -50,7 +50,7 @@ def update(id):
 @article_pages.route("/<id>.json",methods=['GET'])
 def get(id):
     article=Article.query.get(id)
-    clogger.info(article)
+#    clogger.info(article)
 #    article=ArticleSchema().dump(article)[0]
     return jsonify(article=article)
 
diff --git a/compiler/comp/fb.py b/compiler/comp/fb.py
index d884fab..afb9e50 100644
--- a/compiler/comp/fb.py
+++ b/compiler/comp/fb.py
@@ -5,7 +5,7 @@ import urlparse
 from src.fb import graph
 from facebook import GraphAPIError
 import json
-
+import gevent
 
 def fbfeedelement(h):
     art={}
@@ -39,11 +39,11 @@ def fbfeed(url, raw, params={}):
     arts=[]
     u=urlparse.urlparse(url)
     for m in js["data"]:
-	aa=fbfeedelement(m)
-	if not aa.has_key("title"):
+        aa=fbfeedelement(m)
+        if not aa.has_key("title"):
            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
-	aa["section"]="Facebook: "+u[1]
-	arts.append(aa)
+        aa["section"]="Facebook: "+u[1]
+        arts.append(aa)
     nx=None
     if js.has_key("paging") and js["paging"].has_key("next"):
         un=urlparse.urlparse(js["paging"]["next"])
diff --git a/compiler/comp/fsch.py b/compiler/comp/fsch.py
index 39b62ca..59667a6 100644
--- a/compiler/comp/fsch.py
+++ b/compiler/comp/fsch.py
@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
     d["published"]=parse(pi["published"])
     d["pi"]=pi
     d["sourcetype"]="fscharticle"
+    d["section"]= "Fachschaft Chemie"
     return {"article": d}
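The fbfeed() hunk above only re-indents code, but it documents the paging contract the crawler relies on: each Graph response carries a data list plus an optional paging.next URL that gets turned back into a fetch task. A loose sketch of that follow-the-next-link idea, with fetch_json() as a hypothetical helper for whatever actually retrieves and decodes a page (the real code routes this through fetch_page() and the fetch queue):

# Illustrative sketch only: walk a Graph-style feed by following paging.next,
# mirroring the js["data"] / js["paging"]["next"] handling in fbfeed().
def iter_feed_pages(start_url, fetch_json, max_pages=10):
    url = start_url
    for _ in range(max_pages):          # hard stop so we never loop forever
        js = fetch_json(url)
        for element in js.get("data", []):
            yield element
        paging = js.get("paging", {})
        url = paging.get("next")
        if not url:                     # last page reached
            break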
diff --git a/compiler/fetching.py b/compiler/fetching.py
index 54dfb10..3055a64 100644
--- a/compiler/fetching.py
+++ b/compiler/fetching.py
@@ -1,6 +1,6 @@
 from requests import session
 s=session()
-from src import package_directory, download_path,cfg
+from src import package_directory, download_path,cfg, clogger
 from os import path, makedirs
 import os
 import json
@@ -12,6 +12,7 @@ from src.database import db_session2
 from models import CrawlUrl
 import errno
 import urlparse
+from sqlalchemy.exc import OperationalError, InvalidRequestError
 def announce_articleid(id):
     for u in cfg.announcearticle_url:
         s.get( u % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
 from datetime import datetime, timedelta
 
+def cleanup_cache():
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
+
+def get_cached_page(furl):
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
+    try:
+        cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    except OperationalError:
+        db_session2.rollback()
+        cc=None
+    return cc
 
 def fetch_page(furl):
-    current_time = datetime.utcnow()
-    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
-    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
-
+    current_time = datetime.utcnow()
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
-    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
             fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
             furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
-            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
             if cc is None:
                 tx = json.dumps(graph.get_object(id=furl))
             else:
diff --git a/compiler/models.py b/compiler/models.py
index e774590..35e3a6c 100644
--- a/compiler/models.py
+++ b/compiler/models.py
@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
     tpe=Column(String(250))
     url = Column(String(250))
     last_fetched = Column(DateTime)
+    __schema__=CrawlUrlSchema
+    __jsonid__='crawlurl'
+    __whiteattrs__=["id","tpe","url"]
+    __jsonattrs__=None
     def fetched(self):
         CrawlCache.query.find(CrawlCache.url==self.url).first()
     @classmethod
     def find_or_create(self, tpe, url):
         aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
         if aa is None:
-            aa=CrawlUrl(tpe,url)
+            aa=CrawlUrl({"tpe":tpe,"url": url})
         return aa
     def schedule(self):
         put_fetch_queue((0, self.tpe, self.url))
-    def __init__(self, tpe, url):
-        self.url=url
-        self.tpe=tpe
+#    def __init__(self, tpe, url):
+#        self.url=url
+#        self.tpe=tpe
     def __json__(self):
         return CrawlUrlSchema().dump(self)[0]
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
     url=Column(String(250))
     fetched=Column(DateTime)
     raw=Column(Text)
-
+    __schema__=CrawlCacheSchema
+    __jsonattrs__=None
+    __jsonid__='crawlcache'
+    __whiteattrs__= []
     def __init__(self, url,rw):
         self.url=url
         self.raw=rw
diff --git a/compiler/mprocess.py b/compiler/mprocess.py
index 8164a82..3fd6944 100644
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -5,6 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+#from src import app
 from compiler import article_types
 from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
     aa.last_fetched=datetime.now()
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-    db_session.close()
+    return aa
+#    app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+#    db_session.close()
 #    announce_articleid(aa.id)
 #
-    return aa
+
+
 # process a single found url
 def process_url(url,tpe, parent_url,params={}):
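process_article() above now wraps db_session.commit() in try/except so a single failed commit no longer leaves the scoped session unusable for every later article. The same idea as a small reusable helper, sketched under the assumption of a SQLAlchemy scoped session and a standard logger; safe_commit is an illustrative name, not something this patch introduces.

# Illustrative sketch only: commit a SQLAlchemy session, rolling back on
# failure so the session stays usable instead of raising on every later flush.
from sqlalchemy.exc import InvalidRequestError, OperationalError

def safe_commit(session, logger):
    try:
        session.commit()
        return True
    except (InvalidRequestError, OperationalError) as e:
        session.rollback()    # clear the failed transaction
        logger.error(e)
        return False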
diff --git a/compiler/mworker.py b/compiler/mworker.py
index 61aea55..d401cb2 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -45,7 +45,7 @@ def run_fetch():
             tc, tpe, url, p= fetch_queue.get()
         except ValueError:
             tc, tpe, url= fetch_queue.get()
-
+        clogger.debug("fetched : "+url)
         if tpe is not "dummyarticle" and tpe is not "dummyindex":
             rw=fetch_page(url)
         else:
diff --git a/compiler/views.py b/compiler/views.py
index ad3eaac..96cb443 100644
--- a/compiler/views.py
+++ b/compiler/views.py
@@ -79,8 +79,8 @@ def urls_lst():
 def urls_json(id):
     # Load all URLs
     status=CrawlUrl.query.get(id)
-    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
-    return jsonify(urls=status, cache=cc.__json__())
+#    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
+    return jsonify(urls=status)
 
 # que an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<id>/que")
diff --git a/database.py b/database.py
index 86c8c9a..024c9f4 100644
--- a/database.py
+++ b/database.py
@@ -33,13 +33,16 @@ db_session = scoped_session(sessionmaker(autocommit=False,
 #                                        autoflush=False,
                                          bind=engine))
 
+if cfg.get("db_urls_type") == "mysql":
+    engine2 = create_engine("mysql+pymysql://%s:%s@localhost/crawler_urls?charset=utf8" % (cfg.get("db_urls_user"), cfg.get("db_urls_pw")))
+else:
+    engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
 
-engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
 db_session2 = scoped_session(sessionmaker(autocommit=False,
                                           autoflush=False,
                                           bind=engine2))
 
-from database_mbase import MyBase
+from database_mbase import MyBase,MyBase2
 
 #Base = declarative_base()
@@ -47,7 +50,7 @@ from database_mbase import MyBase
 Base=declarative_base(cls=MyBase)
 Base.query = db_session.query_property()
 
-Base2 = declarative_base()
+Base2 = declarative_base(cls=MyBase2)
 Base2.query = db_session2.query_property()
 
 def read_json(rq):
diff --git a/database_mbase.py b/database_mbase.py
index c7adc51..aee0b85 100644
--- a/database_mbase.py
+++ b/database_mbase.py
@@ -33,3 +33,35 @@ class MyBase(object):
                 setattr(a, c.key,data[c.key])
         return a
 
+class MyBase2(object):
+    id = Column(Integer, primary_key=True)
+#    created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
+#    updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    def __json__(self):
+        if self.__jsonattrs__ is None:
+            return self.__schema__().dump(self)[0]
+        else:
+            return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
+#    def __init__(self, data={}):
+#        self.update(data,False)
+
+    def update(self,data, partial=True):
+        data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
+        if len(errors)>0:
+            clogger.error(errors)
+            return (False,errors)
+        else:
+            for a in self.__whiteattrs__:
+                if data.has_key(a):
+                    setattr(self,a,data[a])
+            return (True, [])
+
+    @classmethod
+    def deserialize(cls,data):
+        data, errors=cls.__schema__().load(data,partial=True)
+        a=cls()
+        for c in cls.__table__.columns:
+            if data.has_key(c.key):
+                setattr(a, c.key,data[c.key])
+        return a
+
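MyBase2 above assumes every Base2 model declares __schema__, __jsonid__, __whiteattrs__ and __jsonattrs__, exactly as the CrawlUrl and CrawlCache hunks do. A minimal sketch of a conforming model and how update()/deserialize() would be used, assuming marshmallow 2.x (where dump()/load() return (data, errors) pairs, which is what the [0] indexing relies on); the Example names and table are made up for illustration.

# Illustrative sketch only: a hypothetical model built on Base2/MyBase2.
from marshmallow import Schema, fields
from sqlalchemy import Column, String
from src.database import Base2           # Base2 = declarative_base(cls=MyBase2)

class ExampleSchema(Schema):
    id = fields.Integer()
    name = fields.String()

class Example(Base2):
    __tablename__ = 'examples'
    name = Column(String(250))

    __schema__ = ExampleSchema            # used by __json__/update/deserialize
    __jsonid__ = 'example'
    __whiteattrs__ = ["name"]             # only these keys may be set via update()
    __jsonattrs__ = None                  # None -> __json__ dumps all schema fields

# e = Example.deserialize({"name": "foo"})   # plain dict -> instance
# ok, errors = e.update({"name": "bar"})     # whitelist-checked update
# e.__json__()                               # dict ready for jsonify()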
diff --git a/dump_urls.py b/dump_urls.py
new file mode 100644
index 0000000..2dbc3da
--- /dev/null
+++ b/dump_urls.py
@@ -0,0 +1,20 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+
+if len(sys.argv) <= 1:
+    raise ValueError("No target filename given")
+
+def dump_crawlurl(a):
+    return CrawlUrlSchema().dump(a)
+
+def dump_crawlcache(a):
+    return CrawlCacheSchema().dump(a)
+
+file = open(sys.argv[1], "w+")
+data={}
+data["crawlurls"] = map(dump_crawlurl,CrawlUrl.query.all())
+#data["crawlcache"] = map(dump_crawlcache,CrawlCache.query.all())
+json.dump (data, file)
+file.close()
diff --git a/load_urls.py b/load_urls.py
new file mode 100644
index 0000000..1d90dc6
--- /dev/null
+++ b/load_urls.py
@@ -0,0 +1,44 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+from src.database import db_session2
+from sqlalchemy.exc import IntegrityError
+
+if len(sys.argv) <= 1:
+    raise ValueError("No target filename given")
+
+
+def insert_array(array, cls, session):
+    for s in array:
+        if not isinstance(s,cls):
+            print type(s)
+        else:
+            try:
+                session.add(s)
+                session.commit()
+            except IntegrityError:
+                session.rollback()
+
+
+
+def load_crawlurl(a):
+    print a
+    return CrawlUrl.deserialize(a[0])
+def load_crawlcache(a):
+    return CrawlCache.deserialize(a[0])
+
+
+file = open(sys.argv[1], "r")
+data=json.load(file)
+file.close()
+
+if data.has_key("crawlurls"):
+    crawlurls=data["crawlurls"]
+    crawlurls = map (load_crawlurl, crawlurls)
+    insert_array(crawlurls, CrawlUrl, db_session2)
+
+if data.has_key("crawlcache"):
+    crawlcache=data["crawlcache"]
+    crawlcache = map (load_crawlcache, crawlcache)
+    insert_array(crawlcache, CrawlCache, db_session2)
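dump_urls.py and load_urls.py together form a crude export/import for the CrawlUrl table. The round trip they implement, reduced to a single row and assuming marshmallow 2.x semantics (dump() returning a (data, errors) pair, which is why load_crawlurl() reads a[0]), looks roughly like this:

# Illustrative sketch only: the per-row round trip behind dump_urls/load_urls.
from src.compiler.models import CrawlUrl, CrawlUrlSchema
from src.database import db_session2
from sqlalchemy.exc import IntegrityError

row = CrawlUrl.query.first()
dumped = CrawlUrlSchema().dump(row)        # marshmallow 2.x: (data, errors)
payload = dumped[0]                        # JSON-serializable dict

restored = CrawlUrl.deserialize(payload)   # rebuilt via MyBase2.deserialize
try:
    db_session2.add(restored)
    db_session2.commit()
except IntegrityError:                     # row already present -> skip, like insert_array()
    db_session2.rollback()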