fachschaften/compiler/fetching.py

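# fetching.py -- HTTP helpers for the crawler: announcing new articles to
# configured endpoints, downloading files asynchronously via gevent, and
# fetching pages through a database-backed cache (CrawlCache).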
import errno
import json
import os
import urlparse
from datetime import datetime, timedelta
from hashlib import md5
from os import path

from gevent import spawn
from requests import session

from models import CrawlCache
from src import package_directory, download_path, cfg
from src import clogger
from src.fb import graph

# one shared requests session so connections are pooled across fetches
s = session()
def announce_articleid(article_id):
    """Notify every configured endpoint that a new article is available."""
    for u in cfg.announcearticle_url:
        s.get(u % article_id)
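# Usage sketch: each entry in cfg.announcearticle_url is assumed to be a
# format string with one placeholder for the article id, e.g. (hypothetical
# value) "http://example.com/notify?article=%s", so that
#
#   announce_articleid(42)
#
# issues one GET request per configured endpoint.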
def downloadfile(url):
    """Queue a background download of url and return its cache-relative path."""
    relative_name = path.join("downloads", md5(url).hexdigest(), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not path.exists(path.dirname(local_filename)):
        try:
            os.makedirs(path.dirname(local_filename))
        except OSError as exc:  # guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        # fetch asynchronously so the caller is not blocked by the transfer
        spawn(fetch_load_file, url, local_filename)
    return relative_name
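# Usage sketch (hypothetical URL): downloadfile() returns immediately with the
# cache-relative path while the transfer runs in a background greenlet:
#
#   rel = downloadfile("http://example.com/poster.pdf")
#   # rel == "downloads/<md5 of url>/poster.pdf"; the file appears under
#   # download_path once fetch_load_file() has completed.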
def fetch_page(furl):
    """Return the body of furl, served from CrawlCache while it is fresh."""
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u = urlparse.urlparse(furl)
    if u[0] == '':
        # no scheme given: assume plain http
        furl = urlparse.urlunparse(("http",) + u[1:])
    cc = (CrawlCache.query
          .filter(CrawlCache.url == furl)
          .filter(CrawlCache.fetched > cache_cutoff)
          .first())
    if cc is None or u[0] == 'fb':  # no caching for Facebook
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # fb:// pseudo-URLs are resolved through the Graph API
            tx = json.dumps(graph.get_object(
                id=u[1] + u[2],
                fields="story,created_time,id,message,attachments"))
        else:
            tx = s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        tx = cc.raw  # cache hit
    return tx
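# Usage sketch (hypothetical ids/URLs): fetch_page() handles three kinds of
# input -- full http(s) URLs, scheme-less URLs, and fb:// pseudo-URLs that are
# resolved through the Graph API:
#
#   html = fetch_page("//www.example.com/news")  # scheme-less, rewritten to http://
#   post = json.loads(fetch_page("fb://12345"))  # Graph API object as JSON text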
def fetch_load_file(furl, local_path):
    """Stream furl to local_path on disk in 1 KiB chunks."""
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error("download of %s failed: %s" % (furl, e), exc_info=True)
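# Sketch of a direct, blocking call (hypothetical paths); normally this runs
# inside a greenlet spawned by downloadfile() above:
#
#   fetch_load_file("http://example.com/logo.png", "/tmp/logo.png")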