init commit
compiler/fetching.py · Normal file · 67 lines added
@@ -0,0 +1,67 @@
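# compiler/fetching.py -- HTTP fetching helpers (Python 2):
#  * announce_articleid() pings the configured announce URLs for a new article id
#  * downloadfile()      maps a URL to a local cache path and downloads it in the background
#  * fetch_page()        returns page text, served from the CrawlCache table when fresh
#  * fetch_load_file()   streams a URL to disk through the shared requests session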
from requests import session
from src import package_directory, download_path, cfg
from src import clogger
from src.fb import graph
from os import path
import os
import errno
import json
import urlparse
from hashlib import md5
from gevent import spawn

# Shared HTTP session, reused for every outgoing request.
s = session()


def announce_articleid(id):
    # Ping every configured announce URL with the new article id.
    for u in cfg.announcearticle_url:
        s.get(u % id)

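
# downloadfile() only schedules the transfer (via gevent) and returns the
# relative cache path immediately; the file shows up on disk once
# fetch_load_file() has finished. Files are keyed by the md5 of the source URL
# so that different URLs sharing a basename do not collide.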
def downloadfile(url):
    # Cache layout: downloads/<md5(url)>/<basename of the URL>
    relative_name = path.join("downloads", md5(url).hexdigest(), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # guard against a concurrent mkdir race
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name


from models import CrawlCache
from datetime import datetime, timedelta

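
# fetch_page() consults the CrawlCache table first: a copy fetched within the
# last cfg.cache_days days is returned as-is, otherwise the page is fetched
# again and stored via CrawlCache.store(). URLs with the "fb" scheme are
# resolved through the Facebook Graph API (src.fb.graph) instead of plain HTTP.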
def fetch_page(furl):
    current_time = datetime.utcnow()
    # Cached entries older than this cutoff are treated as stale.
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    # urlsplit (not urlparse) so the tuple indices line up with urlunsplit below.
    u = urlparse.urlsplit(furl)
    if u[0] == '':
        # No scheme given: default to plain http.
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
    if cc is None:
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            tx = json.dumps(graph.get_object(id=u[1] + u[2]))
        else:
            tx = s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        # Cache hit: return the stored raw text.
        tx = cc.raw
    return tx

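
# fetch_load_file() does the actual transfer for downloadfile(): it streams the
# response to disk in 1 KiB chunks. It is spawned as a gevent greenlet, so any
# failure is logged rather than raised.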
def fetch_load_file(furl, local_path):
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error(e, exc_info=True)