From b71803c050a3f576590fc8f3b25779aea7ed1b51 Mon Sep 17 00:00:00 2001
From: andis
Date: Fri, 17 Feb 2017 10:02:20 +0100
Subject: [PATCH] div. updates

---
 __init__.py           | 92 +++++++++++++++++++++++++++----------------
 articles/model.py     | 15 +++++--
 articles/views.py     |  2 +-
 compiler/comp/fb.py   | 10 ++---
 compiler/comp/fsch.py |  1 +
 compiler/fetching.py  | 27 +++++++++----
 compiler/models.py    | 17 +++++---
 compiler/mprocess.py  | 14 +++++--
 compiler/mworker.py   |  2 +-
 compiler/views.py     |  4 +-
 database.py           |  9 +++--
 database_mbase.py     | 32 +++++++++++++++
 dump_urls.py          | 20 ++++++++++
 load_urls.py          | 44 +++++++++++++++++++++
 14 files changed, 224 insertions(+), 65 deletions(-)
 create mode 100644 dump_urls.py
 create mode 100644 load_urls.py

diff --git a/__init__.py b/__init__.py
index 0817fc4..5074144 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,40 +1,12 @@
 import os
 import sys
+import lockfile
+#from lockfile import LockFile
 package_directory = os.path.dirname(os.path.abspath(__file__))
 from config import Config
 cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
 #--------------- Logging
-import logging
-
-file_handler=logging.FileHandler(cfg.logfile)
-file_handler.setLevel(logging.DEBUG)
-stream_handler=logging.StreamHandler(sys.stdout)
-stream_handler.setLevel(logging.DEBUG)
-
-clt=logging.getLogger('mylogger')
-clt.setLevel(logging.DEBUG)
-clt.addHandler(file_handler)
-clt.addHandler(stream_handler)
-
-clogger=clt
-#----------------
-download_path=cfg.download_path
-
-lg=clt
-
-from gevent import spawn, monkey
-monkey.patch_all()
-from .compiler import start_workers
-#start_workers(1,1,1)
-
-
-if cfg.bot_active:
-    from src.bot import bot
-    #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
-    bot.message_loop()
-
-
 # Framework
@@ -43,16 +15,66 @@
 from flask import Flask, jsonify, render_template, redirect, request,send_from_d
 from flask_cors import CORS, cross_origin
 #Authentication
 from flask_jwt import JWT, jwt_required, current_identity
+import logging
+
+
+app = Flask(__name__)
+
+file_handler=logging.FileHandler(cfg.logfile)
+file_handler.setLevel(logging.DEBUG)
+stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
+
+CORS(app)
+clt=logging.getLogger('mylogger')
+clt.setLevel(logging.DEBUG)
+clt.addHandler(file_handler)
+clt.addHandler(stream_handler)
+lg=clt
+
+#clogger=clt
+#----------------
+
+#app.config['LOGGER_NAME']='mylogger'
+app.logger.setLevel(logging.DEBUG)
+app.logger.info("Server Started")
+app.logger.setLevel(logging.DEBUG)
+app.logger.addHandler(file_handler)
+app.logger.addHandler(stream_handler)
+
+clogger=app.logger
+
+
+
+
+download_path=cfg.download_path
+
 from src.models import Article,Section
 from src.users import authenticate, identity
 from datetime import datetime
-app = Flask(__name__)
-CORS(app)
-app.config['LOGGER_NAME']='mylogger'
-app.logger.setLevel(logging.DEBUG)
-app.logger.info("Server Started")
+
+from gevent import spawn, monkey
+monkey.patch_all()
+from .compiler import start_workers
+#start_workers(1,1,1)
+
+
+lock = lockfile.LockFile("/srv/crawlerapi/bot.lock")
+
+try:
+    if cfg.bot_active and not lock.is_locked():
+        lock.acquire()
+        from src.bot import bot
+        #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
+        bot.message_loop()
+except lockfile.Error:
+    clogger.info("Couldn't Lock the bot file")
+
+
+
 app.config['SECRET_KEY'] = 'super-secret'
 import flask
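The lockfile guard added to __init__.py above is meant to keep only one process running the bot message loop. As a rough illustration of the same pattern (not part of the patch), here is a minimal sketch built on the pylockfile API, with a hypothetical run_bot() standing in for importing src.bot and calling bot.message_loop(); unlike the patch it acquires with timeout=0 instead of checking is_locked(), and it releases the lock on exit.

# Illustrative sketch only, not part of the patch.
import lockfile

def start_bot_once(lock_path, run_bot):
    lock = lockfile.LockFile(lock_path)
    try:
        # timeout=0 fails immediately when another process holds the lock
        lock.acquire(timeout=0)
    except (lockfile.AlreadyLocked, lockfile.LockTimeout, lockfile.LockFailed):
        return False      # some other worker already runs the bot
    try:
        run_bot()         # blocking message loop
    finally:
        lock.release()    # allow a clean restart to reacquire the lock
    return True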
diff --git a/articles/model.py b/articles/model.py
index cb19507..9d6ff99 100644
--- a/articles/model.py
+++ b/articles/model.py
@@ -59,6 +59,9 @@ class FullArticleSchema(Schema):
     author=fields.String(allow_none=True)
     section_id=fields.Integer()
     sourcetype =fields.String()
+    organization_name=fields.String(dump_only=True)
+    organization_image=fields.String(dump_only=True)
+    organization_id=fields.Integer(dump_only=True)
     image =fields.String(allow_none=True)
 #    @post_load
 #    def make_article(self, data):
@@ -95,9 +98,15 @@ class Article(Base):
         self.title=title
         self.published_date=published_date
         self.first_fetched=datetime.now()
-    def __json__(self):
-        return ArticleSchema().dump(self)[0]
-
+#    def __json__(self):
+#        return ArticleSchema().dump(self)[0]
+    def organization_name(self):
+        return self.section.organization.name
+    def organization_image(self):
+        return self.section.organization.image
+    def organization_id(self):
+        return self.section.organization.id
+
 #    def dict(self):
 #        return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url}
diff --git a/articles/views.py b/articles/views.py
index b55d707..93aaba2 100644
--- a/articles/views.py
+++ b/articles/views.py
@@ -50,7 +50,7 @@ def update(id):
 @article_pages.route("/<id>.json",methods=['GET'])
 def get(id):
     article=Article.query.get(id)
-    clogger.info(article)
+#    clogger.info(article)
 #    article=ArticleSchema().dump(article)[0]
     return jsonify(article=article)
 
diff --git a/compiler/comp/fb.py b/compiler/comp/fb.py
index d884fab..afb9e50 100644
--- a/compiler/comp/fb.py
+++ b/compiler/comp/fb.py
@@ -5,7 +5,7 @@ import urlparse
 from src.fb import graph
 from facebook import GraphAPIError
 import json
-
+import gevent
 
 def fbfeedelement(h):
     art={}
@@ -39,11 +39,11 @@ def fbfeed(url, raw, params={}):
     arts=[]
     u=urlparse.urlparse(url)
     for m in js["data"]:
-	aa=fbfeedelement(m)
-	if not aa.has_key("title"):
+        aa=fbfeedelement(m)
+        if not aa.has_key("title"):
            aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
-	aa["section"]="Facebook: "+u[1]
-	arts.append(aa)
+        aa["section"]="Facebook: "+u[1]
+        arts.append(aa)
     nx=None
     if js.has_key("paging") and js["paging"].has_key("next"):
         un=urlparse.urlparse(js["paging"]["next"])
diff --git a/compiler/comp/fsch.py b/compiler/comp/fsch.py
index 39b62ca..59667a6 100644
--- a/compiler/comp/fsch.py
+++ b/compiler/comp/fsch.py
@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
     d["published"]=parse(pi["published"])
     d["pi"]=pi
     d["sourcetype"]="fscharticle"
+    d["section"]= "Fachschaft Chemie"
     return {"article": d}
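The fbfeed() hunk above only re-indents code, but it documents the paging contract the crawler relies on: each Graph response carries a data list plus an optional paging.next URL that gets turned back into a fetch task. A loose sketch of that follow-the-next-link idea, with fetch_json() as a hypothetical helper for whatever actually retrieves and decodes a page (the real code routes this through fetch_page() and the fetch queue):

# Illustrative sketch only: walk a Graph-style feed by following paging.next,
# mirroring the js["data"] / js["paging"]["next"] handling in fbfeed().
def iter_feed_pages(start_url, fetch_json, max_pages=10):
    url = start_url
    for _ in range(max_pages):          # hard stop so we never loop forever
        js = fetch_json(url)
        for element in js.get("data", []):
            yield element
        paging = js.get("paging", {})
        url = paging.get("next")
        if not url:                     # last page reached
            break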
diff --git a/compiler/fetching.py b/compiler/fetching.py
index 54dfb10..3055a64 100644
--- a/compiler/fetching.py
+++ b/compiler/fetching.py
@@ -1,6 +1,6 @@
 from requests import session
 s=session()
-from src import package_directory, download_path,cfg
+from src import package_directory, download_path,cfg, clogger
 from os import path, makedirs
 import os
 import json
@@ -12,6 +12,7 @@ from src.database import db_session2
 from models import CrawlUrl
 import errno
 import urlparse
+from sqlalchemy.exc import OperationalError, InvalidRequestError
 def announce_articleid(id):
     for u in cfg.announcearticle_url:
         s.get( u % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
 from datetime import datetime, timedelta
 
+def cleanup_cache():
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
+
+def get_cached_page(furl):
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
+    try:
+        cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    except OperationalError:
+        db_session2.rollback()
+        cc=None
+    return cc
 
 def fetch_page(furl):
-    current_time = datetime.utcnow()
-    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
-    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
-
+    current_time = datetime.utcnow()
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
-    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
             fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
             furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
-            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
             if cc is None:
                 tx = json.dumps(graph.get_object(id=furl))
             else:
diff --git a/compiler/models.py b/compiler/models.py
index e774590..35e3a6c 100644
--- a/compiler/models.py
+++ b/compiler/models.py
@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
     tpe=Column(String(250))
     url = Column(String(250))
     last_fetched = Column(DateTime)
+    __schema__=CrawlUrlSchema
+    __jsonid__='crawlurl'
+    __whiteattrs__=["id","tpe","url"]
+    __jsonattrs__=None
     def fetched(self):
         CrawlCache.query.find(CrawlCache.url==self.url).first()
     @classmethod
     def find_or_create(self, tpe, url):
         aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
         if aa is None:
-            aa=CrawlUrl(tpe,url)
+            aa=CrawlUrl({"tpe":tpe,"url": url})
         return aa
     def schedule(self):
         put_fetch_queue((0, self.tpe, self.url))
-    def __init__(self, tpe, url):
-        self.url=url
-        self.tpe=tpe
+#    def __init__(self, tpe, url):
+#        self.url=url
+#        self.tpe=tpe
     def __json__(self):
         return CrawlUrlSchema().dump(self)[0]
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
     url=Column(String(250))
     fetched=Column(DateTime)
     raw=Column(Text)
-
+    __schema__=CrawlCacheSchema
+    __jsonattrs__=None
+    __jsonid__='crawlcache'
+    __whiteattrs__= []
     def __init__(self, url,rw):
         self.url=url
         self.raw=rw
diff --git a/compiler/mprocess.py b/compiler/mprocess.py
index 8164a82..3fd6944 100644
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -5,6 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+#from src import app
 from compiler import article_types
 from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
     aa.last_fetched=datetime.now()
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-    db_session.close()
+    return aa
+#    app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+#    db_session.close()
 #    announce_articleid(aa.id)
 #
-    return aa
+
+
 # process a single found url
 def process_url(url,tpe, parent_url,params={}):
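process_article() above now wraps db_session.commit() in try/except so a single failed commit no longer leaves the scoped session unusable for every later article. The same idea as a small reusable helper, sketched under the assumption of a SQLAlchemy scoped session and a standard logger; safe_commit is an illustrative name, not something this patch introduces.

# Illustrative sketch only: commit a SQLAlchemy session, rolling back on
# failure so the session stays usable instead of raising on every later flush.
from sqlalchemy.exc import InvalidRequestError, OperationalError

def safe_commit(session, logger):
    try:
        session.commit()
        return True
    except (InvalidRequestError, OperationalError) as e:
        session.rollback()    # clear the failed transaction
        logger.error(e)
        return False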
diff --git a/compiler/mworker.py b/compiler/mworker.py
index 61aea55..d401cb2 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -45,7 +45,7 @@ def run_fetch():
             tc, tpe, url, p= fetch_queue.get()
         except ValueError:
             tc, tpe, url= fetch_queue.get()
-
+        clogger.debug("fetched : "+url)
         if tpe is not "dummyarticle" and tpe is not "dummyindex":
             rw=fetch_page(url)
         else:
diff --git a/compiler/views.py b/compiler/views.py
index ad3eaac..96cb443 100644
--- a/compiler/views.py
+++ b/compiler/views.py
@@ -79,8 +79,8 @@ def urls_lst():
 def urls_json(id):
     # Load all URLs
     status=CrawlUrl.query.get(id)
-    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
-    return jsonify(urls=status, cache=cc.__json__())
+#    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
+    return jsonify(urls=status)
 
 # que an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<id>/que")
diff --git a/database.py b/database.py
index 86c8c9a..024c9f4 100644
--- a/database.py
+++ b/database.py
@@ -33,13 +33,16 @@ db_session = scoped_session(sessionmaker(autocommit=False,
 #                                        autoflush=False,
                                          bind=engine))
 
+if cfg.get("db_urls_type") == "mysql":
+    engine2 = create_engine("mysql+pymysql://%s:%s@localhost/crawler_urls?charset=utf8" % (cfg.get("db_urls_user"), cfg.get("db_urls_pw")))
+else:
+    engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
 
-engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
 db_session2 = scoped_session(sessionmaker(autocommit=False,
                                           autoflush=False,
                                           bind=engine2))
 
-from database_mbase import MyBase
+from database_mbase import MyBase,MyBase2
 
 #Base = declarative_base()
@@ -47,7 +50,7 @@ from database_mbase import MyBase
 Base=declarative_base(cls=MyBase)
 Base.query = db_session.query_property()
 
-Base2 = declarative_base()
+Base2 = declarative_base(cls=MyBase2)
 Base2.query = db_session2.query_property()
 
 def read_json(rq):
diff --git a/database_mbase.py b/database_mbase.py
index c7adc51..aee0b85 100644
--- a/database_mbase.py
+++ b/database_mbase.py
@@ -33,3 +33,35 @@ class MyBase(object):
                 setattr(a, c.key,data[c.key])
         return a
 
+class MyBase2(object):
+    id = Column(Integer, primary_key=True)
+#    created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
+#    updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    def __json__(self):
+        if self.__jsonattrs__ is None:
+            return self.__schema__().dump(self)[0]
+        else:
+            return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
+#    def __init__(self, data={}):
+#        self.update(data,False)
+
+    def update(self,data, partial=True):
+        data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
+        if len(errors)>0:
+            clogger.error(errors)
+            return (False,errors)
+        else:
+            for a in self.__whiteattrs__:
+                if data.has_key(a):
+                    setattr(self,a,data[a])
+            return (True, [])
+
+    @classmethod
+    def deserialize(cls,data):
+        data, errors=cls.__schema__().load(data,partial=True)
+        a=cls()
+        for c in cls.__table__.columns:
+            if data.has_key(c.key):
+                setattr(a, c.key,data[c.key])
+        return a
+
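MyBase2 above assumes every Base2 model declares __schema__, __jsonid__, __whiteattrs__ and __jsonattrs__, exactly as the CrawlUrl and CrawlCache hunks do. A minimal sketch of a conforming model and how update()/deserialize() would be used, assuming marshmallow 2.x (where dump()/load() return (data, errors) pairs, which is what the [0] indexing relies on); the Example names and table are made up for illustration.

# Illustrative sketch only: a hypothetical model built on Base2/MyBase2.
from marshmallow import Schema, fields
from sqlalchemy import Column, String
from src.database import Base2           # Base2 = declarative_base(cls=MyBase2)

class ExampleSchema(Schema):
    id = fields.Integer()
    name = fields.String()

class Example(Base2):
    __tablename__ = 'examples'
    name = Column(String(250))

    __schema__ = ExampleSchema            # used by __json__/update/deserialize
    __jsonid__ = 'example'
    __whiteattrs__ = ["name"]             # only these keys may be set via update()
    __jsonattrs__ = None                  # None -> __json__ dumps all schema fields

# e = Example.deserialize({"name": "foo"})   # plain dict -> instance
# ok, errors = e.update({"name": "bar"})     # whitelist-checked update
# e.__json__()                               # dict ready for jsonify()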
diff --git a/dump_urls.py b/dump_urls.py
new file mode 100644
index 0000000..2dbc3da
--- /dev/null
+++ b/dump_urls.py
@@ -0,0 +1,20 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+
+if len(sys.argv) <= 1:
+    raise ValueError("No target filename given")
+
+def dump_crawlurl(a):
+    return CrawlUrlSchema().dump(a)
+
+def dump_crawlcache(a):
+    return CrawlCacheSchema().dump(a)
+
+file = open(sys.argv[1], "w+")
+data={}
+data["crawlurls"] = map(dump_crawlurl,CrawlUrl.query.all())
+#data["crawlcache"] = map(dump_crawlcache,CrawlCache.query.all())
+json.dump (data, file)
+file.close()
diff --git a/load_urls.py b/load_urls.py
new file mode 100644
index 0000000..1d90dc6
--- /dev/null
+++ b/load_urls.py
@@ -0,0 +1,44 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+from src.database import db_session2
+from sqlalchemy.exc import IntegrityError
+
+if len(sys.argv) <= 1:
+    raise ValueError("No target filename given")
+
+
+def insert_array(array, cls, session):
+    for s in array:
+        if not isinstance(s,cls):
+            print type(s)
+        else:
+            try:
+                session.add(s)
+                session.commit()
+            except IntegrityError:
+                session.rollback()
+
+
+
+def load_crawlurl(a):
+    print a
+    return CrawlUrl.deserialize(a[0])
+def load_crawlcache(a):
+    return CrawlCache.deserialize(a[0])
+
+
+file = open(sys.argv[1], "r")
+data=json.load(file)
+file.close()
+
+if data.has_key("crawlurls"):
+    crawlurls=data["crawlurls"]
+    crawlurls = map (load_crawlurl, crawlurls)
+    insert_array(crawlurls, CrawlUrl, db_session2)
+
+if data.has_key("crawlcache"):
+    crawlcache=data["crawlcache"]
+    crawlcache = map (load_crawlcache, crawlcache)
+    insert_array(crawlcache, CrawlCache, db_session2)
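dump_urls.py and load_urls.py together form a crude export/import for the CrawlUrl table. The round trip they implement, reduced to a single row and assuming marshmallow 2.x semantics (dump() returning a (data, errors) pair, which is why load_crawlurl() reads a[0]), looks roughly like this:

# Illustrative sketch only: the per-row round trip behind dump_urls/load_urls.
from src.compiler.models import CrawlUrl, CrawlUrlSchema
from src.database import db_session2
from sqlalchemy.exc import IntegrityError

row = CrawlUrl.query.first()
dumped = CrawlUrlSchema().dump(row)        # marshmallow 2.x: (data, errors)
payload = dumped[0]                        # JSON-serializable dict

restored = CrawlUrl.deserialize(payload)   # rebuilt via MyBase2.deserialize
try:
    db_session2.add(restored)
    db_session2.commit()
except IntegrityError:                     # row already present -> skip, like insert_array()
    db_session2.rollback()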