from requests import session

from src import package_directory, download_path, cfg
from os import path, makedirs
import os
import json
from gevent import spawn
from src import clogger
from src.fb import graph
from hashlib import md5
from src.database import db_session2
from models import CrawlUrl
import errno
import urlparse

# Single shared HTTP session, reused for every outgoing request.
s = session()

def announce_articleid(id):
    # Ping every configured announce endpoint with the id of a newly crawled article.
    for u in cfg.announcearticle_url:
        s.get(u % id)


def downloadfile(url):
    # Map the url to downloads/<md5 of url>/<basename>, make sure that directory
    # exists and, if the file is not on disk yet, download it in the background.
    u = urlparse.urlparse(url)
    relative_name = path.join("downloads", md5(url).hexdigest(), u[2].split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name

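# Usage sketch (hypothetical URL): the relative path is returned immediately while
# fetch_load_file runs in a background greenlet, e.g.
#
#   rel = downloadfile("http://example.com/images/photo.jpg")
#   # rel == "downloads/<md5 of the url>/photo.jpg"
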
from models import CrawlCache
from datetime import datetime, timedelta

def fetch_page(furl, p={}):
    # Return the body of furl as text, served from CrawlCache when a copy newer
    # than cfg.cache_days exists; "fb" URLs are fetched through the Graph client.
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u = urlparse.urlparse(furl)
    cu = CrawlUrl.query.filter(CrawlUrl.url == furl).first()

    if u[0] == '':
        # No scheme given: assume plain http.
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
    if cc is None or u[0] == 'fb':  # fb URLs skip this first lookup; their cache key is built below
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # Only ask the Graph API for posts from the last ten days.
            fb_time_since = str(int((current_time - timedelta(days=10) - datetime(1970, 1, 1)).total_seconds()))
            if "nofollow" in p and p["nofollow"] == False:
                furl = u[1] + u[2] + "?fields=story,created_time,id,message,attachments"
            else:
                furl = u[1] + u[2] + "?since=" + fb_time_since + "&fields=story,created_time,id,message,attachments"
            # Re-check the cache against the rewritten Graph URL.
            cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
            if cc is None:
                tx = json.dumps(graph.get_object(id=furl))
            else:
                tx = cc.raw
            if cu is not None:
                cu.last_fetched = datetime.utcnow()
                db_session2.add(cu)
                db_session2.commit()
        else:
            tx = s.get(furl).text
            if cu is not None:
                cu.last_fetched = datetime.utcnow()
                db_session2.add(cu)
                db_session2.commit()
        CrawlCache.store(furl, tx)
    else:
        #if furl is not None:
        #    clogger.debug("cache hit")
        tx = cc.raw
    return tx

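# Usage sketch (hypothetical URLs; the exact fb:// layout depends on how callers
# build their crawl URLs):
#
#   html = fetch_page("http://example.com/news/article-1")
#   fb_json = fetch_page("fb://<page-id>/posts", {"nofollow": False})
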
def fetch_load_file(furl, path):
    # Stream furl to disk at the given path; failures are only logged because this
    # runs inside a background greenlet with nobody waiting on the result.
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        #clogger.error("Error occurred during fetching: " + str(furl))
        clogger.error(e, exc_info=True)
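# Note: spawn()ing fetch_load_file only downloads concurrently if the blocking
# socket calls in requests have been made cooperative. This module assumes that is
# done elsewhere at application startup, e.g. (hypothetical entry point):
#
#   from gevent import monkey
#   monkey.patch_all()
#   from src.crawl import fetch_page, downloadfile   # hypothetical module path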