from requests import session
from os import path, makedirs
from hashlib import md5
from datetime import datetime, timedelta
from gevent import spawn

import os
import json
import errno
import urlparse

from src import package_directory, download_path, cfg
from src import clogger
from src.fb import graph

# One shared HTTP session so connections are reused across requests.
s = session()


def announce_articleid(article_id):
    """Ping every configured announce URL with the new article id."""
    for u in cfg.announcearticle_url:
        s.get(u % article_id)


def downloadfile(url):
    """Schedule a background download of `url` and return its relative path.

    The file is stored under downloads/<md5-of-url>/<basename> so that
    different URLs sharing a basename do not collide.
    """
    relative_name = path.join("downloads", md5(url).hexdigest(), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)

    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:
            # Guard against the race where another greenlet created the
            # directory between the exists() check and makedirs().
            if exc.errno != errno.EEXIST:
                raise

    if not path.exists(local_filename):
        # Fetch asynchronously; callers only need the relative path.
        spawn(fetch_load_file, url, local_filename)
    return relative_name


from models import CrawlCache


def fetch_page(furl):
    """Return the body of `furl`, using CrawlCache as a fetch cache.

    Entries older than cfg.cache_days days are refetched.  URLs with the
    custom "fb" scheme are resolved through the Facebook Graph API and
    cached as JSON text.
    """
    cache_cutoff = datetime.utcnow() - timedelta(days=cfg.cache_days)

    u = urlparse.urlsplit(furl)
    if u[0] == '':
        # No scheme given: assume plain HTTP.
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))

    cc = (CrawlCache.query
          .filter(CrawlCache.url == furl)
          .filter(CrawlCache.fetched > cache_cutoff)
          .first())
    if cc is None:
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # fb://<id> -> Graph API object, stored as JSON text.
            tx = json.dumps(graph.get_object(id=u[1] + u[2]))
        else:
            tx = s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        # Cache hit: reuse the stored response body.
        tx = cc.raw
    return tx


def fetch_load_file(furl, local_path):
    """Stream `furl` to `local_path` on disk, logging any failure."""
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error(e, exc_info=True)