# fachschaften/compiler/fetching.py

from requests import session
# Module-wide requests session; every HTTP request issued by the helpers in
# this module goes through it.
s=session()
from src import package_directory, download_path,cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
import errno
import urlparse
def announce_articleid(id):
    """Send a GET request to every configured announce URL for an article.

    Each entry of ``cfg.announcearticle_url`` is treated as a ``%``-format
    template into which ``id`` is interpolated. The responses are discarded.
    """
    for url_template in cfg.announcearticle_url:
        announce_url = url_template % id
        s.get(announce_url)

def downloadfile(url):
    """Schedule a background download of ``url`` and return its relative path.

    The target is ``downloads/<md5 of url>/<last path segment of url>``
    below ``download_path``. The directory is created if needed, and a
    greenlet running ``fetch_load_file`` is spawned only when the target file
    does not exist yet. The function returns immediately; the file may not be
    present (or complete) when the caller receives the name.

    :param url: URL of the file to download (byte or unicode string).
    :returns: path of the target file relative to ``download_path``.
    :raises OSError: if the target directory cannot be created for a reason
        other than it already existing.
    """
    # md5() needs bytes: hashing a unicode URL with non-ASCII characters
    # would otherwise raise. ASCII URLs hash to the same digest as before.
    url_bytes = url if isinstance(url, bytes) else url.encode("utf-8")
    # NOTE(review): a URL ending in '/' yields an empty file name, so the
    # target equals the freshly created directory and nothing is fetched.
    relative_name = path.join("downloads", md5(url_bytes).hexdigest(),
                              url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    try:
        os.makedirs(os.path.dirname(local_filename))
    except OSError as exc:
        # An already existing directory is fine (also guards against a
        # concurrent creator); anything else is a real error.
        if exc.errno != errno.EEXIST:
            raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name

from models import CrawlCache
from datetime import datetime, timedelta


def fetch_page(furl):
    """Return the raw text for ``furl``, using the crawl cache when possible.

    A URL without a scheme is fetched as ``http``. For ordinary URLs a
    ``CrawlCache`` row for the URL that was fetched within the last
    ``cfg.cache_days`` days is returned as-is; otherwise the page is
    requested through the shared session and stored via ``CrawlCache.store``.

    URLs with the ``fb`` scheme are never served from the cache: the object
    named by netloc + path is requested through ``graph.get_object`` and the
    result is serialized to JSON (and still stored in the cache).

    :param furl: URL to fetch.
    :returns: the page text, or a JSON string for ``fb`` URLs.
    """
    cache_cutoff = datetime.utcnow() - timedelta(days=cfg.cache_days)
    u = urlparse.urlparse(furl)
    if u[0] == '':
        # urlparse() yields six fields, so it must be reassembled with
        # urlunparse(); urlunsplit() takes five and would shift the query
        # string into the fragment position.
        furl = urlparse.urlunparse(("http", u[1], u[2], u[3], u[4], u[5]))
    is_facebook = u[0] == 'fb'

    cc = None
    if not is_facebook:  # no caching for Facebook, so skip the lookup
        cc = (CrawlCache.query
              .filter(CrawlCache.url == furl)
              .filter(CrawlCache.fetched > cache_cutoff)
              .first())
    if cc is not None:
        return cc.raw

    clogger.debug("fetching url:  "+ str(furl))
    if is_facebook:
        tx = json.dumps(graph.get_object(id=u[1]+u[2]+"?fields=story,created_time,id,message,attachments"))
    else:
        tx = s.get(furl).text
    CrawlCache.store(furl, tx)
    return tx

def fetch_load_file(furl, path):
    """Stream the content of ``furl`` into the local file ``path``.

    Errors are logged through ``clogger`` and never propagated, so the
    function is safe to run as a fire-and-forget greenlet. If the transfer
    fails after the file was opened, the partial file is removed so that an
    existence check on ``path`` (as done by ``downloadfile``) does not
    mistake it for a finished download.

    :param furl: URL to download.
    :param path: filesystem path of the file to write.
    """
    file_opened = False
    try:
        clogger.info("Downloading "+ str(furl))
        r = s.get(furl, stream=True)
        # The context manager closes the file even if streaming raises.
        with open(path, 'wb') as f:
            file_opened = True
            for chunk in r.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error(e,exc_info=True)
        if file_opened:
            try:
                os.remove(path)
            except OSError:
                # Best effort: the file may already be gone or not removable.
                pass