div. updates

This commit is contained in:
andis
2017-02-17 10:02:20 +01:00
parent bdfa16728d
commit b71803c050
14 changed files with 224 additions and 65 deletions

View File

@@ -1,6 +1,6 @@
from requests import session
s=session()
from src import package_directory, download_path,cfg
from src import package_directory, download_path,cfg, clogger
from os import path, makedirs
import os
import json
@@ -12,6 +12,7 @@ from src.database import db_session2
from models import CrawlUrl
import errno
import urlparse
from sqlalchemy.exc import OperationalError, InvalidRequestError
def announce_articleid(id):
    """Notify every configured announce endpoint about an article.

    Each entry in ``cfg.announcearticle_url`` is a URL template with a
    single ``%``-placeholder that receives the article id. Uses the
    module-level requests session ``s``; responses are ignored.
    """
    for url_template in cfg.announcearticle_url:
        s.get(url_template % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
from datetime import datetime, timedelta
def cleanup_cache():
    """Purge crawl-cache rows that fell out of the retention window.

    Rows fetched more than ``cfg.cache_days * 2`` days ago are deleted
    (twice the lookup window used by ``get_cached_page``).
    NOTE(review): whether this delete is committed depends on the
    session configuration elsewhere — confirm against db setup.
    """
    expiry_cutoff = datetime.utcnow() - timedelta(days=cfg.cache_days * 2)
    CrawlCache.query.filter(CrawlCache.fetched < expiry_cutoff).delete()
def get_cached_page(furl):
    """Look up a fresh cache entry for *furl*.

    Returns the first ``CrawlCache`` row for the URL fetched within the
    last ``cfg.cache_days`` days, or ``None`` on a miss. If the query
    raises ``OperationalError`` (e.g. a dropped DB connection), the
    session is rolled back and the call degrades to a cache miss.
    """
    freshness_cutoff = datetime.utcnow() - timedelta(days=cfg.cache_days)
    try:
        hit = (CrawlCache.query
               .filter(CrawlCache.url == furl)
               .filter(CrawlCache.fetched > freshness_cutoff)
               .first())
    except OperationalError:
        # Reset the broken session so later queries can proceed.
        db_session2.rollback()
        hit = None
    return hit
def fetch_page(furl):
current_time = datetime.utcnow()
ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
u=urlparse.urlparse(furl)
cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
current_time = datetime.utcnow()
cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()
if u[0] == '':
furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
if cc is None or u[0]=='fb': # no caching for Facebook
clogger.debug("fetching url: "+ str(furl))
if u[0]=='fb':
fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
if cc is None:
tx = json.dumps(graph.get_object(id=furl))
else: