init commit

commit 8955bf17f5
Author: Andreas Stephanides
Date: 2017-01-14 12:23:04 +01:00
32 changed files with 1555 additions and 0 deletions

compiler/fetching.py (new file, 67 lines)

@@ -0,0 +1,67 @@
# Standard library
import os
import json
import errno
import urlparse
from os import path, makedirs
from hashlib import md5
from datetime import datetime, timedelta

# Third-party
from requests import session
from gevent import spawn

# Project
from src import package_directory, download_path, cfg
from src import clogger
from src.fb import graph
from models import CrawlCache

# One shared HTTP session, reused for every outgoing request
s = session()
def announce_articleid(id):
    # Notify every configured announce endpoint about the new article id.
    for u in cfg.announcearticle_url:
        s.get(u % id)
def downloadfile(url):
    # Derive a stable local path from the url hash, keeping the original filename.
    relative_name = path.join("downloads", str(md5(url).hexdigest()), url.split('/')[-1])
    local_filename = path.join(download_path, relative_name)
    if not os.path.exists(os.path.dirname(local_filename)):
        try:
            os.makedirs(os.path.dirname(local_filename))
        except OSError as exc:  # guard against a race with another worker creating the directory
            if exc.errno != errno.EEXIST:
                raise
    if not path.exists(local_filename):
        # Download in the background; callers only need the relative name.
        spawn(fetch_load_file, url, local_filename)
    return relative_name
def fetch_page(furl):
    # Return the page body, served from CrawlCache unless the cached copy
    # is older than cfg.cache_days days.
    current_time = datetime.utcnow()
    cache_cutoff = current_time - timedelta(days=cfg.cache_days)
    u = urlparse.urlsplit(furl)
    if u[0] == '':
        # No scheme given: default to plain http.
        furl = urlparse.urlunsplit(("http", u[1], u[2], u[3], u[4]))
    cc = CrawlCache.query.filter(CrawlCache.url == furl).filter(CrawlCache.fetched > cache_cutoff).first()
    if cc is None:
        clogger.debug("fetching url: " + str(furl))
        if u[0] == 'fb':
            # fb: urls are resolved through the Facebook Graph API instead of HTTP.
            tx = json.dumps(graph.get_object(id=u[1] + u[2]))
        else:
            tx = s.get(furl).text
        CrawlCache.store(furl, tx)
    else:
        tx = cc.raw
    return tx
def fetch_load_file(furl, local_path):
    # Stream a remote file to local_path in 1 KiB chunks (runs in a gevent greenlet).
    try:
        clogger.info("Downloading " + str(furl))
        r = s.get(furl, stream=True)
        with open(local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    except Exception as e:
        clogger.error("Error occurred while fetching: " + str(furl))
        clogger.error(e, exc_info=True)
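
A minimal caller sketch, not part of the commit, showing how the three helpers are expected to fit together. The module path `compiler.fetching`, the `crawl_article` function, and the `article` dict with "url", "image", and "id" keys are assumptions for illustration; `cfg.cache_days`, `cfg.announcearticle_url`, and the `CrawlCache` model come from the surrounding project configuration.

# Hypothetical usage sketch
from compiler.fetching import fetch_page, downloadfile, announce_articleid

def crawl_article(article):
    html = fetch_page(article["url"])           # served from CrawlCache when the entry is fresh enough
    image_rel = downloadfile(article["image"])  # returns the relative path; the download runs via gevent
    announce_articleid(article["id"])           # ping every cfg.announcearticle_url endpoint
    return html, image_rel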