div. updates

This commit is contained in:
andis
2017-02-17 10:02:20 +01:00
parent bdfa16728d
commit b71803c050
14 changed files with 224 additions and 65 deletions

View File

@@ -1,40 +1,12 @@
 import os
 import sys
+import lockfile
+#from lockfile import LockFile
 package_directory = os.path.dirname(os.path.abspath(__file__))
 from config import Config
 cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
 #--------------- Logging
-import logging
-file_handler=logging.FileHandler(cfg.logfile)
-file_handler.setLevel(logging.DEBUG)
-stream_handler=logging.StreamHandler(sys.stdout)
-stream_handler.setLevel(logging.DEBUG)
-clt=logging.getLogger('mylogger')
-clt.setLevel(logging.DEBUG)
-clt.addHandler(file_handler)
-clt.addHandler(stream_handler)
-clogger=clt
-#----------------
-download_path=cfg.download_path
-lg=clt
-from gevent import spawn, monkey
-monkey.patch_all()
-from .compiler import start_workers
-#start_workers(1,1,1)
-if cfg.bot_active:
-    from src.bot import bot
-    #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
-    bot.message_loop()
 # Framework
@@ -43,16 +15,66 @@ from flask import Flask, jsonify, render_template, redirect, request,send_from_d
 from flask_cors import CORS, cross_origin
 #Authentication
 from flask_jwt import JWT, jwt_required, current_identity
+import logging
+app = Flask(__name__)
+file_handler=logging.FileHandler(cfg.logfile)
+file_handler.setLevel(logging.DEBUG)
+stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
+CORS(app)
+clt=logging.getLogger('mylogger')
+clt.setLevel(logging.DEBUG)
+clt.addHandler(file_handler)
+clt.addHandler(stream_handler)
+lg=clt
+#clogger=clt
+#----------------
+#app.config['LOGGER_NAME']='mylogger'
+app.logger.setLevel(logging.DEBUG)
+app.logger.info("Server Started")
+app.logger.setLevel(logging.DEBUG)
+app.logger.addHandler(file_handler)
+app.logger.addHandler(stream_handler)
+clogger=app.logger
+download_path=cfg.download_path
 from src.models import Article,Section
 from src.users import authenticate, identity
 from datetime import datetime
-app = Flask(__name__)
-CORS(app)
-app.config['LOGGER_NAME']='mylogger'
-app.logger.setLevel(logging.DEBUG)
-app.logger.info("Server Started")
+from gevent import spawn, monkey
+monkey.patch_all()
+from .compiler import start_workers
+#start_workers(1,1,1)
+lock = lockfile.LockFile("/srv/crawlerapi/bot.lock")
+try:
+    if cfg.bot_active and not lock.is_locked():
+        lock.acquire()
+        from src.bot import bot
+        #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
+        bot.message_loop()
+except lockfile.Error:
+    clogger.info("Couldn't Lock the bot file")
 app.config['SECRET_KEY'] = 'super-secret'
 import flask
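
Note on the bot guard added above: only the process that manages to acquire the lock file starts the polling loop, so multiple workers do not run duplicate bots. A minimal, self-contained sketch of the same single-instance pattern using the lockfile package; the lock path, the stub start_polling_loop() and the acquire(timeout=0) call are illustrative choices, not taken from the repository:

import lockfile

def start_polling_loop():
    # stand-in for the long-running loop (bot.message_loop() in the diff)
    pass

lock = lockfile.LockFile("/tmp/example-bot.lock")  # illustrative path
try:
    if not lock.is_locked():
        # timeout=0 raises lockfile.AlreadyLocked immediately if another
        # process won the race between is_locked() and acquire()
        lock.acquire(timeout=0)
        start_polling_loop()
except lockfile.Error:
    # a second worker simply skips starting the loop
    pass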

View File

@@ -59,6 +59,9 @@ class FullArticleSchema(Schema):
     author=fields.String(allow_none=True)
     section_id=fields.Integer()
     sourcetype =fields.String()
+    organization_name=fields.String(dump_only=True)
+    organization_image=fields.String(dump_only=True)
+    organization_id=fields.Integer(dump_only=True)
     image =fields.String(allow_none=True)
     # @post_load
     # def make_article(self, data):
@@ -95,8 +98,14 @@ class Article(Base):
         self.title=title
         self.published_date=published_date
         self.first_fetched=datetime.now()
-    def __json__(self):
-        return ArticleSchema().dump(self)[0]
+    # def __json__(self):
+    #     return ArticleSchema().dump(self)[0]
+    def organization_name(self):
+        return self.section.organization.name
+    def organization_image(self):
+        return self.section.organization.image
+    def organization_id(self):
+        return self.section.organization.id
     # def dict(self):
     #     return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url}

View File

@@ -50,7 +50,7 @@ def update(id):
 @article_pages.route("/<int:id>.json",methods=['GET'])
 def get(id):
     article=Article.query.get(id)
-    clogger.info(article)
+    # clogger.info(article)
     # article=ArticleSchema().dump(article)[0]
     return jsonify(article=article)

View File

@@ -5,7 +5,7 @@ import urlparse
 from src.fb import graph
 from facebook import GraphAPIError
 import json
+import gevent
 def fbfeedelement(h):
     art={}
@@ -39,11 +39,11 @@ def fbfeed(url, raw, params={}):
     arts=[]
     u=urlparse.urlparse(url)
     for m in js["data"]:
         aa=fbfeedelement(m)
         if not aa.has_key("title"):
             aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
         aa["section"]="Facebook: "+u[1]
         arts.append(aa)
     nx=None
     if js.has_key("paging") and js["paging"].has_key("next"):
         un=urlparse.urlparse(js["paging"]["next"])

View File

@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
d["published"]=parse(pi["published"]) d["published"]=parse(pi["published"])
d["pi"]=pi d["pi"]=pi
d["sourcetype"]="fscharticle" d["sourcetype"]="fscharticle"
d["section"]= "Fachschaft Chemie"
return {"article": d} return {"article": d}

View File

@@ -1,6 +1,6 @@
 from requests import session
 s=session()
-from src import package_directory, download_path,cfg
+from src import package_directory, download_path,cfg, clogger
 from os import path, makedirs
 import os
 import json
@@ -12,6 +12,7 @@ from src.database import db_session2
 from models import CrawlUrl
 import errno
 import urlparse
+from sqlalchemy.exc import OperationalError, InvalidRequestError
 def announce_articleid(id):
     for u in cfg.announcearticle_url:
         s.get( u % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
 from datetime import datetime, timedelta
+
+def cleanup_cache():
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
+    CrawlCache.query.filter(CrawlCache.fetched<ten_weeks_ago).delete()
+
+def get_cached_page(furl):
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
+    try:
+        cc= CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    except OperationalError:
+        db_session2.rollback()
+        cc=None
+    return cc
 def fetch_page(furl):
-    current_time = datetime.utcnow()
-    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
+    current_time = datetime.utcnow()
     cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
-    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
             fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
             furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
-            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
             if cc is None:
                 tx = json.dumps(graph.get_object(id=furl))
             else:
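
The new get_cached_page helper is essentially a TTL-bounded lookup that tolerates a dropped database connection by rolling the session back. A sketch of the same idea with the session, model and TTL passed in explicitly, instead of the module-level db_session2, CrawlCache and cfg.cache_days:

from datetime import datetime, timedelta
from sqlalchemy.exc import OperationalError

def get_cached_page(session, cache_model, furl, cache_days=7):
    # return a cache row younger than cache_days, or None
    cutoff = datetime.utcnow() - timedelta(days=cache_days)
    try:
        return (session.query(cache_model)
                       .filter(cache_model.url == furl)
                       .filter(cache_model.fetched > cutoff)
                       .first())
    except OperationalError:
        # e.g. a dropped MySQL connection: roll back so the scoped session stays usable
        session.rollback()
        return None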

View File

@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
     tpe=Column(String(250))
     url = Column(String(250))
     last_fetched = Column(DateTime)
+    __schema__=CrawlUrlSchema
+    __jsonid__='crawlurl'
+    __whiteattrs__=["id","tpe","url"]
+    __jsonattrs__=None
     def fetched(self):
         CrawlCache.query.find(CrawlCache.url==self.url).first()
     @classmethod
     def find_or_create(self, tpe, url):
         aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
         if aa is None:
-            aa=CrawlUrl(tpe,url)
+            aa=CrawlUrl({"tpe":tpe,"url": url})
         return aa
     def schedule(self):
         put_fetch_queue((0, self.tpe, self.url))
-    def __init__(self, tpe, url):
-        self.url=url
-        self.tpe=tpe
+    # def __init__(self, tpe, url):
+    #     self.url=url
+    #     self.tpe=tpe
     def __json__(self):
         return CrawlUrlSchema().dump(self)[0]
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
     url=Column(String(250))
     fetched=Column(DateTime)
     raw=Column(Text)
+    __schema__=CrawlCacheSchema
+    __jsonattrs__=None
+    __jsonid__='crawlcache'
+    __whiteattrs__= []
     def __init__(self, url,rw):
         self.url=url
         self.raw=rw
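
Hypothetical usage of the reworked CrawlUrl API, assuming the package imports work as in the new dump_urls.py/load_urls.py scripts; the "index" type and the URL are made-up example values:

from src.compiler.models import CrawlUrl
from src.database import db_session2

cu = CrawlUrl.find_or_create("index", "http://example.org/feed")
db_session2.add(cu)
db_session2.commit()
cu.schedule()   # enqueues (0, tpe, url) for the fetch workers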

View File

@@ -5,6 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+#from src import app
 from compiler import article_types
 from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
     aa.last_fetched=datetime.now()
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-    db_session.close()
+    return aa
+    # app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    # db_session.close()
     # announce_articleid(aa.id)
     #
-    return aa
 # process a single found url
 def process_url(url,tpe, parent_url,params={}):
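
Wrapping the commit in try/except above follows a common SQLAlchemy pattern: if the commit fails, roll the session back so later work is not stuck with a session left in a broken state. A generic sketch of that pattern with the session and logger passed in; like the diff, it only catches InvalidRequestError:

from sqlalchemy.exc import InvalidRequestError

def safe_commit(session, logger):
    # commit if possible; otherwise roll back and report instead of killing the worker
    try:
        session.commit()
        return True
    except InvalidRequestError as e:
        session.rollback()
        logger.error(e)
        return False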

View File

@@ -45,7 +45,7 @@ def run_fetch():
         try:
             tc, tpe, url, p= fetch_queue.get()
         except ValueError:
             tc, tpe, url= fetch_queue.get()
+        clogger.debug("fetched : "+url)
         if tpe is not "dummyarticle" and tpe is not "dummyindex":
             rw=fetch_page(url)
         else:

View File

@@ -79,8 +79,8 @@ def urls_lst():
 def urls_json(id):
     # Load all URLs
     status=CrawlUrl.query.get(id)
-    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
-    return jsonify(urls=status, cache=cc.__json__())
+    # cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
+    return jsonify(urls=status)
 # queue an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<int:id>/que")

View File

@@ -33,13 +33,16 @@ db_session = scoped_session(sessionmaker(autocommit=False,
                                          # autoflush=False,
                                          bind=engine))
-engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
+if cfg.get("db_urls_type") == "mysql":
+    engine2 = create_engine("mysql+pymysql://%s:%s@localhost/crawler_urls?charset=utf8" % (cfg.get("db_urls_user"), cfg.get("db_urls_pw")))
+else:
+    engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
 db_session2 = scoped_session(sessionmaker(autocommit=False,
                                           autoflush=False,
                                           bind=engine2))
-from database_mbase import MyBase
+from database_mbase import MyBase,MyBase2
 #Base = declarative_base()
@@ -47,7 +50,7 @@ from database_mbase import MyBase
 Base=declarative_base(cls=MyBase)
 Base.query = db_session.query_property()
-Base2 = declarative_base()
+Base2 = declarative_base(cls=MyBase2)
 Base2.query = db_session2.query_property()
 def read_json(rq):
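
Passing cls=MyBase2 to declarative_base means every model derived from Base2 inherits MyBase2's id column and helper methods. A self-contained sketch of that mechanism with an invented model and an in-memory SQLite database:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

class MixinBase(object):
    # shared surrogate key, as MyBase2 provides for Base2 models
    id = Column(Integer, primary_key=True)

Base2 = declarative_base(cls=MixinBase)

class DemoUrl(Base2):
    __tablename__ = "demo_urls"
    url = Column(String(250))

engine = create_engine("sqlite://")          # throwaway in-memory database
Base2.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add(DemoUrl(url="http://example.org"))
session.commit()
print session.query(DemoUrl).first().id      # id comes from the mixin base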

View File

@@ -33,3 +33,35 @@ class MyBase(object):
                 setattr(a, c.key,data[c.key])
         return a
+
+class MyBase2(object):
+    id = Column(Integer, primary_key=True)
+    # created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
+    # updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    def __json__(self):
+        if self.__jsonattrs__ is None:
+            return self.__schema__().dump(self)[0]
+        else:
+            return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
+    # def __init__(self, data={}):
+    #     self.update(data,False)
+    def update(self,data, partial=True):
+        data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
+        if len(errors)>0:
+            clogger.error(errors)
+            return (False,errors)
+        else:
+            for a in self.__whiteattrs__:
+                if data.has_key(a):
+                    setattr(self,a,data[a])
+            return (True, [])
+    @classmethod
+    def deserialize(cls,data):
+        data, errors=cls.__schema__().load(data,partial=True)
+        a=cls()
+        for c in cls.__table__.columns:
+            if data.has_key(c.key):
+                setattr(a, c.key,data[c.key])
+        return a
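
The update()/deserialize() helpers above push all mass assignment through the model's marshmallow schema, with __whiteattrs__ acting as the whitelist. A standalone sketch of the update() path, assuming marshmallow 2.x (load() returns a (data, errors) pair); Thing and ThingSchema are invented names:

from marshmallow import Schema, fields

class ThingSchema(Schema):
    id = fields.Integer()
    name = fields.String()

class Thing(object):
    __schema__ = ThingSchema
    __whiteattrs__ = ["name"]            # only these attributes may be set from user input

    def update(self, data, partial=True):
        data, errors = self.__schema__(only=self.__whiteattrs__).load(data, partial=partial)
        if errors:
            return (False, errors)
        for a in self.__whiteattrs__:
            if a in data:
                setattr(self, a, data[a])
        return (True, [])

t = Thing()
print t.update({"name": "feed", "id": 999})   # id is ignored: it is not whitelisted
print t.name                                  # "feed"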

dump_urls.py (new file, 20 lines)
View File

@@ -0,0 +1,20 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+
+if len(sys.argv) <= 1:
+    raise Error("Kein Zieldateiname angegeben")
+
+def dump_crawlurl(a):
+    return CrawlUrlSchema().dump(a)
+
+def dump_crawlcache(a):
+    return CrawlCacheSchema().dump(a)
+
+file = open(sys.argv[1], "w+")
+data={}
+data["crawlurls"] = map(dump_crawlurl,CrawlUrl.query.all())
+#data["crawlcache"] = map(dump_crawlcache,CrawlCache.query.all())
+json.dump (data, file)
+file.close()

load_urls.py (new file, 44 lines)
View File

@@ -0,0 +1,44 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+from src.database import db_session2
+from sqlalchemy.exc import IntegrityError
+
+if len(sys.argv) <= 1:
+    raise Error("Kein Zieldateiname angegeben")
+
+def insert_array(array, cls, session):
+    for s in array:
+        if not isinstance(s,cls):
+            print type(s)
+        else:
+            try:
+                session.add(s)
+                session.commit()
+            except IntegrityError:
+                session.rollback()
+
+def load_crawlurl(a):
+    print a
+    return CrawlUrl.deserialize(a[0])
+
+def load_crawlcache(a):
+    return CrawlCache.deserialize(a[0])
+
+file = open(sys.argv[1], "r")
+data=json.load(file)
+file.close()
+
+if data.has_key("crawlurls"):
+    crawlurls=data["crawlurls"]
+    crawlurls = map (load_crawlurl, crawlurls)
+    insert_array(crawlurls, CrawlUrl, db_session2)
+
+if data.has_key("crawlcache"):
+    crawlcache=data["crawlcache"]
+    crawlcache = map (load_crawlcache, crawlcache)
+    insert_array(crawlcache, CrawlCache, db_session2)
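
Taken together, dump_urls.py and load_urls.py give a crude JSON export/import for the URL database (useful when switching the URL store between backends). A sketch of the same round trip in one place; the urls.json name is arbitrary, Session.merge() stands in for the script's add()+IntegrityError rollback, and each dumped entry is a (data, errors) pair, which is why deserialize() is given entry[0]:

import json
from src.compiler.models import CrawlUrl, CrawlUrlSchema
from src.database import db_session2

# export: schema.dump() returns a (data, errors) namedtuple, serialized as a 2-element list
with open("urls.json", "w") as fh:
    json.dump({"crawlurls": [CrawlUrlSchema().dump(u) for u in CrawlUrl.query.all()]}, fh)

# import: rebuild model instances and upsert them by primary key
with open("urls.json") as fh:
    payload = json.load(fh)
for entry in payload.get("crawlurls", []):
    db_session2.merge(CrawlUrl.deserialize(entry[0]))
db_session2.commit()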