# Crawl helpers: cached page fetching and asynchronous file downloads.
from requests import session
from os import path
import os
import json
import errno
import urlparse
from hashlib import md5
from datetime import datetime, timedelta

from gevent import spawn

from src import package_directory, download_path, cfg
from src import clogger
from src.fb import graph
from src.database import db_session2
from models import CrawlUrl, CrawlCache

# Shared HTTP session so connections are reused across requests.
s = session()


def announce_articleid(article_id):
    """Ping every configured announce URL with the newly crawled article id."""
    for u in cfg.announcearticle_url:
        s.get(u % article_id)


def downloadfile(url):
    """Schedule a background download of url and return its relative storage path."""
    u = urlparse.urlparse(url)
    relative_name = path.join("downloads", str(md5(url).hexdigest()), u[2].split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # guard against a race with another greenlet
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name


def fetch_page(furl, p={}):
    """Return the raw content of furl, served from CrawlCache while still fresh."""
    current_time = datetime.utcnow()
    # Oldest timestamp that still counts as a cache hit (cfg.cache_days ago).
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u = urlparse.urlparse(furl)
    cu = CrawlUrl.query.filter(CrawlUrl.url == furl).first()
    if u[0] == '':
        # No scheme given: assume plain HTTP.
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
    if cc is None or u[0] == 'fb':  # the plain cache lookup is never used for Facebook
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # Facebook Graph API: rebuild the request URL and cache under the rewritten form.
            fb_time_since = str(int((current_time - timedelta(days=10) - datetime(1970, 1, 1)).total_seconds()))
            if "nofollow" in p and p["nofollow"] == False:
                furl = u[1] + u[2] + "?fields=story,created_time,id,message,attachments"
            else:
                furl = u[1] + u[2] + "?since=" + fb_time_since + "&fields=story,created_time,id,message,attachments"
            cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
            if cc is None:
                tx = json.dumps(graph.get_object(id=furl))
            else:
                tx = cc.raw
            if cu is not None:
                cu.last_fetched = datetime.utcnow()
                db_session2.add(cu)
                db_session2.commit()
        else:
            tx = s.get(furl).text
            if cu is not None:
                cu.last_fetched = datetime.utcnow()
                db_session2.add(cu)
                db_session2.commit()
        CrawlCache.store(furl, tx)
    else:
        # clogger.debug("cache hit")
        tx = cc.raw
    return tx


def fetch_load_file(furl, local_path):
    """Stream furl to local_path; runs inside a gevent greenlet."""
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        # clogger.error("Error occurred during fetching: " + str(furl))
        clogger.error(e, exc_info=True)
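

# --- Usage sketch (illustrative only, not part of the crawler) --------------
# A minimal example of how these helpers could be exercised, assuming the src
# package (cfg, download_path, clogger, db_session2) and the models are
# configured as imported above. The example.com URLs are placeholders.
if __name__ == "__main__":
    # Fetch through the crawl cache; a URL without a scheme defaults to http.
    body = fetch_page("http://example.com/articles/latest")
    clogger.info("fetched %d bytes" % len(body))

    # Queue a background download; the relative storage path is returned
    # immediately while fetch_load_file streams the file in a greenlet.
    rel = downloadfile("http://example.com/media/picture.jpg")
    clogger.info("download queued, will be stored under %s" % rel)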