init commit

commit 8955bf17f5
Author: Andreas Stephanides
Date: 2017-01-14 12:23:04 +01:00
32 changed files with 1555 additions and 0 deletions

compiler/fetching.py (new file, 67 lines)

@@ -0,0 +1,67 @@
# Standard library
import os
import json
import errno
import urlparse
from os import path, makedirs
from hashlib import md5
from datetime import datetime, timedelta

# Third-party
from requests import session
from gevent import spawn

# Project
from src import package_directory, download_path, cfg
from src import clogger
from src.fb import graph
from models import CrawlCache

# One shared HTTP session, reused for every outgoing request
s = session()
def announce_articleid(id):
    # Notify every configured announce endpoint about the new article id.
    for u in cfg.announcearticle_url:
        s.get(u % id)
def downloadfile(url):
    # Derive a stable local path from the url hash, keeping the original filename.
    relative_name = path.join("downloads", str(md5(url).hexdigest()), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # guard against a race with another worker creating the directory
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        # Download in the background; callers only need the relative name.
        spawn(fetch_load_file, url, local_filename)
    return relative_name
def fetch_page(furl):
    # Return the page body, served from CrawlCache unless the cached copy
    # is older than cfg.cache_days days.
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u = urlparse.urlsplit(furl)
    if u[0] == '':
        # No scheme given: default to plain http.
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
    if cc is None:
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # fb: urls are resolved through the Facebook Graph API instead of HTTP.
            tx = json.dumps(graph.get_object(id=u[1] + u[2]))
        else:
            tx = s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        tx = cc.raw
    return tx
def fetch_load_file(furl, local_path):
    # Stream a remote file to local_path in 1 KiB chunks (runs in a gevent greenlet).
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error("Error occurred while fetching: " + str(furl))
        clogger.error(e, exc_info=True)
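
A minimal caller sketch, not part of the commit, showing how the three helpers are expected to fit together. The module path `compiler.fetching`, the `crawl_article` function, and the `article` dict with "url", "image", and "id" keys are assumptions for illustration; `cfg.cache_days`, `cfg.announcearticle_url`, and the `CrawlCache` model come from the surrounding project configuration.

# Hypothetical usage sketch
from compiler.fetching import fetch_page, downloadfile, announce_articleid

def crawl_article(article):
    html = fetch_page(article["url"])           # served from CrawlCache when the entry is fresh enough
    image_rel = downloadfile(article["image"])  # returns the relative path; the download runs via gevent
    announce_articleid(article["id"])           # ping every cfg.announcearticle_url endpoint
    return html, image_rel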