div. updates

andis committed on 2017-02-17 10:02:20 +01:00
commit b71803c050, parent bdfa16728d
14 changed files with 224 additions and 65 deletions

View File

@@ -5,7 +5,7 @@ import urlparse
from src.fb import graph
from facebook import GraphAPIError
import json
import gevent
def fbfeedelement(h):
art={}
@@ -39,11 +39,11 @@ def fbfeed(url, raw, params={}):
arts=[]
u=urlparse.urlparse(url)
for m in js["data"]:
aa=fbfeedelement(m)
if not aa.has_key("title"):
aa=fbfeedelement(m)
if not aa.has_key("title"):
aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
aa["section"]="Facebook: "+u[1]
arts.append(aa)
aa["section"]="Facebook: "+u[1]
arts.append(aa)
nx=None
if js.has_key("paging") and js["paging"].has_key("next"):
un=urlparse.urlparse(js["paging"]["next"])
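The fbfeed() hunk keeps one shape: every entry in the Graph API response becomes an article dict, entries without a title get one synthesized from the page host and the publish time, every article is filed under a "Facebook: <host>" section, and paging.next is picked up for the next crawl step. A minimal standalone sketch of that pattern; fbfeedelement's role and the field names come from the diff, while the function name and the Python 3 urllib import are assumptions (the real module is Python 2 and uses "import urlparse"):

from urllib.parse import urlparse

def collect_feed(url, js, feed_element):
    """Turn one Graph API result page into article dicts and return (articles, next_url)."""
    host = urlparse(url).netloc
    arts = []
    for m in js.get("data", []):
        aa = feed_element(m)   # assumed to return a dict with at least a "published" datetime
        if "title" not in aa:
            # fall back to "<host> at <timestamp>", exactly as the diff does
            aa["title"] = host + " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
        aa["section"] = "Facebook: " + host
        arts.append(aa)
    nxt = js.get("paging", {}).get("next")  # None when Facebook reports no further page
    return arts, nxt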

View File

@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
d["published"]=parse(pi["published"])
d["pi"]=pi
d["sourcetype"]="fscharticle"
d["section"]= "Fachschaft Chemie"
return {"article": d}

View File

@@ -1,6 +1,6 @@
from requests import session
s=session()
from src import package_directory, download_path,cfg
from src import package_directory, download_path,cfg, clogger
from os import path, makedirs
import os
import json
@@ -12,6 +12,7 @@ from src.database import db_session2
from models import CrawlUrl
import errno
import urlparse
from sqlalchemy.exc import OperationalError, InvalidRequestError
def announce_articleid(id):
for u in cfg.announcearticle_url:
s.get( u % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
from datetime import datetime, timedelta
def cleanup_cache():
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
CrawlCache.query.filter(CrawlCache.fetched<ten_weeks_ago).delete()
def get_cached_page(furl):
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
try:
cc= CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
except OperationalError:
db_session2.rollback()
cc=None
return cc
def fetch_page(furl):
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
u=urlparse.urlparse(furl)
cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
current_time = datetime.utcnow()
cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
if u[0] == '':
furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
if cc is None or u[0]=='fb': # no caching for Facebook
clogger.debug("fetching url: "+ str(furl))
if u[0]=='fb':
fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
if cc is None:
tx = json.dumps(graph.get_object(id=furl))
else:
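The central addition to the fetch layer is get_cached_page(): the CrawlCache lookup is wrapped so that an OperationalError (typically a dropped database connection) rolls the session back and counts as a cache miss instead of aborting the crawl, and fetch_page() now goes through this helper both for the original URL and for the rewritten fb: Graph API URL. A self-contained sketch of that pattern, with a generic lookup callable and session standing in for the project's CrawlCache model and db_session2:

from datetime import datetime, timedelta
from sqlalchemy.exc import OperationalError

def get_cached_page(furl, lookup, db_session, cache_days):
    """Return a cached row for furl that is younger than cache_days, or None.

    lookup(furl, cutoff) stands in for the CrawlCache query in the real code; any
    OperationalError is rolled back so the caller simply refetches the page.
    """
    cutoff = datetime.utcnow() - timedelta(days=cache_days)
    try:
        return lookup(furl, cutoff)
    except OperationalError:
        db_session.rollback()   # the session is unusable until rolled back
        return None

Facebook URLs (scheme fb) additionally ignore any cached row, and cleanup_cache() deletes rows older than twice cfg.cache_days so the cache table does not grow without bound.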

View File

@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
tpe=Column(String(250))
url = Column(String(250))
last_fetched = Column(DateTime)
__schema__=CrawlUrlSchema
__jsonid__='crawlurl'
__whiteattrs__=["id","tpe","url"]
__jsonattrs__=None
def fetched(self):
CrawlCache.query.find(CrawlCache.url==self.url).first()
@classmethod
def find_or_create(self, tpe, url):
aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
if aa is None:
aa=CrawlUrl(tpe,url)
aa=CrawlUrl({"tpe":tpe,"url": url})
return aa
def schedule(self):
put_fetch_queue((0, self.tpe, self.url))
def __init__(self, tpe, url):
self.url=url
self.tpe=tpe
# def __init__(self, tpe, url):
# self.url=url
# self.tpe=tpe
def __json__(self):
return CrawlUrlSchema().dump(self)[0]
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
url=Column(String(250))
fetched=Column(DateTime)
raw=Column(Text)
__schema__=CrawlCacheSchema
__jsonattrs__=None
__jsonid__='crawlcache'
__whiteattrs__= []
def __init__(self, url,rw):
self.url=url
self.raw=rw
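In models.py the hand-written CrawlUrl.__init__(tpe, url) is commented out in favour of the dict-taking constructor that the shared Base2 apparently provides, so find_or_create() now builds the instance from {"tpe": ..., "url": ...}; CrawlUrl and CrawlCache also carry the __schema__/__jsonid__/__whiteattrs__ attributes used by the JSON layer. The find-or-create idiom itself, rewritten as a standalone sketch on plain SQLAlchemy (the declarative base, session argument and keyword constructor here are stand-ins for the project's Base2 setup):

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class CrawlUrl(Base):
    __tablename__ = "crawl_urls"
    id = Column(Integer, primary_key=True)
    tpe = Column(String(250))
    url = Column(String(250))

def find_or_create(session, tpe, url):
    """Return the existing CrawlUrl for (tpe, url), or a new, not yet persisted one."""
    row = (session.query(CrawlUrl)
                  .filter(CrawlUrl.url == url, CrawlUrl.tpe == tpe)
                  .first())
    if row is None:
        row = CrawlUrl(tpe=tpe, url=url)   # the project's Base2 takes a single dict instead
    return row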

View File

@@ -5,6 +5,7 @@ from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file
#from src import app
from compiler import article_types
from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
aa.last_fetched=datetime.now()
aa.sourcetype=art["sourcetype"]
db_session.add(aa)
db_session.commit()
try:
db_session.commit()
except InvalidRequestError,e:
db_session.rollback()
clogger.error(e)
clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
db_session.close()
return aa
# app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
# db_session.close()
# announce_articleid(aa.id)
#
return aa
# process a single found url
def process_url(url,tpe, parent_url,params={}):
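process_article() now guards the commit: an InvalidRequestError (for example a session already poisoned by an earlier failure) is rolled back and reported through clogger instead of propagating, while the hunk also shuffles which logging and cleanup calls are active (the app.logger and announce_articleid lines end up commented out). The guard itself, sketched with a plain SQLAlchemy session and the standard logging module standing in for db_session and clogger:

import logging
from sqlalchemy.exc import InvalidRequestError

log = logging.getLogger("compiler")   # stands in for the project's clogger

def save_article(session, row):
    """Add the ORM row and commit, rolling back and logging instead of raising."""
    session.add(row)
    try:
        session.commit()
    except InvalidRequestError as e:
        session.rollback()
        log.error(e)
    else:
        log.info("Updated/Added Article %s: %s", row.id, row.title)
    finally:
        session.close()
    return row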

View File

@@ -45,7 +45,7 @@ def run_fetch():
tc, tpe, url, p= fetch_queue.get()
except ValueError:
tc, tpe, url= fetch_queue.get()
clogger.debug("fetched : "+url)
if tpe is not "dummyarticle" and tpe is not "dummyindex":
rw=fetch_page(url)
else:
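run_fetch() pulls items off the queue that may or may not carry a params element, logs the URL, and only calls fetch_page() for non-dummy types. A sketch of one iteration; note the sketch compares the type with `not in`, whereas the diff's `is not` tests object identity rather than string equality:

import logging

log = logging.getLogger("crawler")   # stands in for clogger

def handle_fetch_item(item, fetch_page):
    """Unpack one fetch-queue entry (with or without params) and fetch real URLs."""
    params = {}
    try:
        tc, tpe, url, params = item
    except ValueError:                 # older queue entries have only three fields
        tc, tpe, url = item
    log.debug("fetched : %s", url)
    if tpe not in ("dummyarticle", "dummyindex"):
        raw = fetch_page(url)          # hits the cache or downloads the page
    else:
        raw = None                     # dummy entries carry no page body
    return tc, tpe, url, params, raw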

View File

@@ -79,8 +79,8 @@ def urls_lst():
def urls_json(id):
# Load all URLs
status=CrawlUrl.query.get(id)
cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
return jsonify(urls=status, cache=cc.__json__())
# cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
return jsonify(urls=status)
# queue an existing CrawlUrl for fetching
@compiler_pages.route("/urls/<int:id>/que")
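The /urls/<id> JSON endpoint now returns only the CrawlUrl record; the CrawlCache lookup is commented out, so a URL without a cached page can no longer crash the view on cc.__json__(). Stripped down to a runnable sketch with an in-memory dict standing in for CrawlUrl.query.get():

from flask import Flask, jsonify

app = Flask(__name__)

# stand-in for the database lookup done by CrawlUrl.query.get(id) in the real view
FAKE_URLS = {1: {"id": 1, "tpe": "fscharticle", "url": "http://example.org/"}}

@app.route("/urls/<int:id>")
def urls_json(id):
    status = FAKE_URLS.get(id)
    # before this commit the response also carried cache=cc.__json__()
    return jsonify(urls=status)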