fachschaften/compiler/fetching.py

# Standard library
import errno
import json
import os
import urlparse
from datetime import datetime, timedelta
from hashlib import md5
from os import path

# Third-party
from gevent import spawn
from requests import session

# Project modules
from src import cfg, clogger, download_path
from src.database import db_session2
from src.fb import graph
from models import CrawlCache, CrawlUrl

# One shared HTTP session so connections are reused across all fetches
s = session()

def announce_articleid(id):
    """Notify each configured endpoint that a new article id exists."""
    for u in cfg.announcearticle_url:
        s.get(u % id)
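
# Illustration only (not part of the original module): each entry in
# cfg.announcearticle_url is expected to carry a printf-style placeholder
# that the article id is substituted into, e.g.
#
#   announcearticle_url = ["http://example.org/announce?article=%s"]
#
# so announce_articleid(42) would GET http://example.org/announce?article=42.
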
def downloadfile(url):
    """Schedule a background download of url and return its relative target path.

    Files land under downloads/<md5(url)>/<basename>, so different URLs that
    happen to share a basename cannot overwrite each other.
    """
    u = urlparse.urlsplit(url)
    relative_name = path.join("downloads", md5(url).hexdigest(), u[2].split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not path.exists(path.dirname(local_filename)):
        try:
            os.makedirs(path.dirname(local_filename))
        except OSError as exc:  # guard against a concurrent greenlet winning the race
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        spawn(fetch_load_file, url, local_filename)
    return relative_name
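
# Sketch of the resulting layout, assuming download_path = "/var/data"
# (a hypothetical value; the real one is imported from src):
#
#   downloadfile("http://example.org/docs/report.pdf")
#   -> returns "downloads/<md5 of the url>/report.pdf"
#   -> fetch_load_file() writes /var/data/downloads/<md5 of the url>/report.pdf
#      in the background while the caller can already store the relative name.
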
def fetch_page(furl):
    """Return the raw content behind furl, served from CrawlCache whenever
    the cached copy is younger than cfg.cache_days days."""
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u = urlparse.urlsplit(furl)
    cu = CrawlUrl.query.filter(CrawlUrl.url == furl).first()
    if u[0] == '':
        # No scheme given: default to plain HTTP
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = (CrawlCache.query
          .filter(CrawlCache.url == furl)
          .filter(CrawlCache.fetched > cache_cutoff)
          .first())
    if cc is None or u[0] == 'fb':
        # fb:// URLs are rewritten into Graph API requests below, so the
        # lookup above can never match them; they get their own lookup.
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # Only ask the Graph API for posts from the last 10 days
            # (seconds since the Unix epoch, as the API expects)
            fb_time_since = str(int((current_time - timedelta(days=10) - datetime(1970, 1, 1)).total_seconds()))
            furl = u[1] + u[2] + "?since=" + fb_time_since + "&fields=story,created_time,id,message,attachments"
            # Retry the cache lookup with the rewritten Graph API request
            cc = (CrawlCache.query
                  .filter(CrawlCache.url == furl)
                  .filter(CrawlCache.fetched > cache_cutoff)
                  .first())
            if cc is None:
                tx = json.dumps(graph.get_object(id=furl))
            else:
                tx = cc.raw
        else:
            tx = s.get(furl).text
        if cu is not None:
            cu.last_fetched = datetime.utcnow()
            db_session2.add(cu)
            db_session2.commit()
        CrawlCache.store(furl, tx)
    else:
        # Fresh cache entry: serve the stored copy
        tx = cc.raw
    return tx
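
# Usage sketch (URLs are illustrative): a plain page is cached for
# cfg.cache_days days, while an fb:// pseudo-URL is expanded into a Graph
# API request first:
#
#   fetch_page("http://example.org/news.html")   # -> cached HTML text
#   fetch_page("fb://somepage/posts")            # -> JSON dump of
#       graph.get_object(id="somepage/posts?since=<10 days ago>&fields=...")
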
def fetch_load_file(furl, target_path):
    """Stream furl to target_path on disk; runs inside a gevent greenlet."""
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(target_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error("Error occurred while fetching: " + str(furl))
        clogger.error(e, exc_info=True)
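
# Minimal manual smoke test (hypothetical URL; needs a configured src
# package, database session, and network access), kept commented out so
# importing this module stays side-effect free:
#
#   if __name__ == '__main__':
#       print fetch_page("http://example.org/")[:200]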