Misc. updates
@@ -5,7 +5,7 @@ import urlparse
 from src.fb import graph
 from facebook import GraphAPIError
 import json
-
+import gevent
 
 def fbfeedelement(h):
     art={}
@@ -39,11 +39,11 @@ def fbfeed(url, raw, params={}):
     arts=[]
     u=urlparse.urlparse(url)
     for m in js["data"]:
-        aa=fbfeedelement(m)
-        if not aa.has_key("title"):
+        aa=fbfeedelement(m)
+        if not aa.has_key("title"):
             aa["title"] = u[1]+ " at " + aa["published"].strftime("%Y-%m-%d %H:%M")
-        aa["section"]="Facebook: "+u[1]
-        arts.append(aa)
+        aa["section"]="Facebook: "+u[1]
+        arts.append(aa)
     nx=None
     if js.has_key("paging") and js["paging"].has_key("next"):
         un=urlparse.urlparse(js["paging"]["next"])
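
The loop above gives every Facebook post a fallback title built from the feed's host name and its publish time whenever the Graph element has no title of its own, and files the article under a per-page section. A minimal standalone sketch of that defaulting logic, assuming the element dict carries a datetime under "published" (the sample values are illustrative, not taken from the repository):

# Sketch: default the title the way fbfeed() does; dict.has_key() and the
# "in" operator are interchangeable for this check in Python 2.
from datetime import datetime
import urlparse

def with_default_title(art, feed_url):
    host = urlparse.urlparse(feed_url)[1]   # network location of the feed URL
    if "title" not in art:
        art["title"] = host + " at " + art["published"].strftime("%Y-%m-%d %H:%M")
    art["section"] = "Facebook: " + host
    return art

sample = with_default_title({"published": datetime(2014, 5, 1, 12, 30)},
                            "http://www.facebook.com/somepage")
# sample["title"] == "www.facebook.com at 2014-05-01 12:30"
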
@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
     d["published"]=parse(pi["published"])
     d["pi"]=pi
     d["sourcetype"]="fscharticle"
+    d["section"]= "Fachschaft Chemie"
     return {"article": d}
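
With the added line, every article compiled by fscharticle() carries an explicit section label next to the existing fields. Roughly, the returned structure now has this shape (the concrete values below are made-up placeholders; only the keys come from the hunk above):

# Illustrative shape of fscharticle()'s return value after this change.
from datetime import datetime

compiled = {
    "article": {
        "published": datetime(2014, 5, 1, 10, 0),     # parse(pi["published"])
        "pi": {"published": "2014-05-01T10:00:00"},    # raw page info
        "sourcetype": "fscharticle",
        "section": "Fachschaft Chemie",                # newly added
    }
}
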
@@ -1,6 +1,6 @@
 from requests import session
 s=session()
-from src import package_directory, download_path,cfg
+from src import package_directory, download_path,cfg, clogger
 from os import path, makedirs
 import os
 import json
@@ -12,6 +12,7 @@ from src.database import db_session2
 from models import CrawlUrl
 import errno
 import urlparse
+from sqlalchemy.exc import OperationalError, InvalidRequestError
 def announce_articleid(id):
     for u in cfg.announcearticle_url:
         s.get( u % id)
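
OperationalError and InvalidRequestError are imported so the crawler can survive a dropped database connection or a session stuck in a failed transaction instead of killing the worker. A minimal sketch of the rollback pattern this enables, assuming a SQLAlchemy session such as db_session2 (the helper name is made up):

# Sketch: run a query defensively and reset the session on failure; this
# mirrors the pattern used by get_cached_page() further down.
from sqlalchemy.exc import OperationalError, InvalidRequestError

def first_or_none(session, query):
    try:
        return query.first()
    except (OperationalError, InvalidRequestError):
        # the connection went away or the session is in a failed state;
        # roll back so the next statement starts cleanly
        session.rollback()
        return None
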
@@ -34,23 +35,35 @@ from models import CrawlCache
 from datetime import datetime, timedelta
 
 
+def cleanup_cache():
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
+    CrawlCache.query.filter(CrawlCache.fetched<ten_weeks_ago).delete()
+
+def get_cached_page(furl):
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
+    try:
+        cc= CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    except OperationalError:
+        db_session2.rollback()
+        cc=None
+    return cc
 
 
 def fetch_page(furl):
-    current_time = datetime.utcnow()
-    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
-    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
+
+    current_time = datetime.utcnow()
+    cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
-    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
+        clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
             fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
             furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
-            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
             if cc is None:
                 tx = json.dumps(graph.get_object(id=furl))
             else:
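
get_cached_page() now centralizes the cache lookup that fetch_page() previously inlined twice, and cleanup_cache() drops rows older than twice the configured cache window. In the Facebook branch the since parameter is a Unix timestamp ten days in the past; a small standalone sketch of that arithmetic (the helper name and example values are illustrative):

# Sketch: the epoch-seconds value used for the Graph API "since" parameter,
# i.e. "now minus ten days" expressed as an integer Unix timestamp.
from datetime import datetime, timedelta

def since_timestamp(days_back=10, now=None):
    now = now or datetime.utcnow()
    delta = now - timedelta(days=days_back) - datetime(1970, 1, 1)
    return str(int(delta.total_seconds()))

# since_timestamp(now=datetime(2014, 5, 11)) == "1398902400"
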
@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
     tpe=Column(String(250))
     url = Column(String(250))
     last_fetched = Column(DateTime)
     __schema__=CrawlUrlSchema
     __jsonid__='crawlurl'
     __whiteattrs__=["id","tpe","url"]
     __jsonattrs__=None
     def fetched(self):
         CrawlCache.query.find(CrawlCache.url==self.url).first()
     @classmethod
     def find_or_create(self, tpe, url):
         aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
         if aa is None:
-            aa=CrawlUrl(tpe,url)
+            aa=CrawlUrl({"tpe":tpe,"url": url})
         return aa
     def schedule(self):
         put_fetch_queue((0, self.tpe, self.url))
-    def __init__(self, tpe, url):
-        self.url=url
-        self.tpe=tpe
+    # def __init__(self, tpe, url):
+    # self.url=url
+    # self.tpe=tpe
     def __json__(self):
         return CrawlUrlSchema().dump(self)[0]
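
find_or_create() now constructs the model from a dict instead of positional arguments, and the positional __init__ is commented out, which suggests the shared Base2 base class is expected to accept a mapping of column values. (As an aside, Query.find() in fetched() is not a SQLAlchemy Query method; filter() is presumably intended.) A minimal sketch of the kind of dict-consuming constructor this change appears to assume; the base-class details and sample values are guesses, not the repository's actual Base2:

# Sketch of an assumed dict-consuming base constructor, so that
# CrawlUrl({"tpe": tpe, "url": url}) works without a per-model __init__.
class DictInitBase(object):
    __whiteattrs__ = []

    def __init__(self, values=None):
        # copy only whitelisted keys onto the instance
        for key in (values or {}):
            if key in self.__whiteattrs__:
                setattr(self, key, values[key])

class CrawlUrlSketch(DictInitBase):
    __whiteattrs__ = ["id", "tpe", "url"]

cu = CrawlUrlSketch({"tpe": "somefeed", "url": "http://example.org/feed"})
# cu.tpe == "somefeed" and cu.url == "http://example.org/feed"
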
@@ -55,7 +59,10 @@ class CrawlCache(Base2):
     url=Column(String(250))
     fetched=Column(DateTime)
     raw=Column(Text)
-
+    __schema__=CrawlCacheSchema
+    __jsonattrs__=None
+    __jsonid__='crawlcache'
+    __whiteattrs__= []
     def __init__(self, url,rw):
         self.url=url
         self.raw=rw

@@ -5,6 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+#from src import app
 
 from compiler import article_types
 from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
     aa.last_fetched=datetime.now()
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
-    db_session.commit()
+    try:
+        db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
+    clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    db_session.close()
+    return aa
     # app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
     # db_session.close()
     # announce_articleid(aa.id)
     #
-    return aa
 
 
 
 # process a single found url
 def process_url(url,tpe, parent_url,params={}):
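
Wrapping the commit in try/except keeps a single bad article from leaving the session unusable for everything that follows: on InvalidRequestError the session is rolled back, the error is logged, and the function still closes the session and returns. The same pattern can be factored into a small helper, sketched below under the assumption of a SQLAlchemy session and a standard logger (the helper name is made up; except InvalidRequestError,e: is Python 2 syntax, spelled except InvalidRequestError as e: in Python 3):

# Sketch: commit, or roll back and log if the session has been invalidated;
# assumes a SQLAlchemy session object and a logging-style logger.
from sqlalchemy.exc import InvalidRequestError

def safe_commit(session, logger):
    try:
        session.commit()
        return True
    except InvalidRequestError, e:   # Python 2 syntax, matching the project
        session.rollback()
        logger.error(e)
        return False
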
@@ -45,7 +45,7 @@ def run_fetch():
         tc, tpe, url, p= fetch_queue.get()
     except ValueError:
         tc, tpe, url= fetch_queue.get()
-
+    clogger.debug("fetched : "+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
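
A caveat about the surrounding context line: tpe is not "dummyarticle" compares object identity rather than string equality, so it may only behave as intended because CPython happens to intern these literals. An equality-based membership test is the robust spelling; a sketch of the alternative (a suggestion, not something this commit changes):

# Sketch: membership test instead of identity comparison on string literals.
def needs_fetch(tpe):
    return tpe not in ("dummyarticle", "dummyindex")

# needs_fetch("somefeed") is True; needs_fetch("dummyarticle") is False
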
@@ -79,8 +79,8 @@ def urls_lst():
 def urls_json(id):
     # Load all URLs
     status=CrawlUrl.query.get(id)
-    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
-    return jsonify(urls=status, cache=cc.__json__())
+    # cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
+    return jsonify(urls=status)
 
 # queue an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<int:id>/que")
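
The cache payload is dropped from the JSON response here, most likely because cc can be None when a URL has never been fetched or its cache row has been cleaned up, in which case cc.__json__() raises an AttributeError. If the cache entry is wanted in the response, a guarded variant along these lines would avoid that; a sketch, not the repository's code (it assumes the project's models and Flask's jsonify, as used above):

# Sketch: include the cache entry only when one actually exists.
from flask import jsonify
from models import CrawlUrl, CrawlCache

def urls_json_with_cache(id):
    status = CrawlUrl.query.get(id)
    cc = CrawlCache.query.filter(CrawlCache.url == status.url).first()
    if cc is None:
        return jsonify(urls=status)
    return jsonify(urls=status, cache=cc.__json__())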