init commit

Andreas Stephanides
2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions

compiler/mprocess.py

@@ -0,0 +1,74 @@
from src import clogger  # logger for the crawler
from src.models import Article  # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
from compiler import article_types


# process_article expects a hash with raw data for the article and stores it
# as an Article object in the database; it is intended to prevent duplicates.
def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h


def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/added article " + str(aa.id) + ": " + aa.title)
        # announce_articleid(aa.id)
    return aa


# process a single discovered url
def process_url(url, tpe, parent_url):
    # clogger.debug("process URL of type " + tpe + ": " + url)
    if parent_url is not None:
        url = fix_link(parent_url, url)
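    # queue item shape assumed to be (priority, sourcetype, url); 0 = default priority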
    put_fetch_queue((0, tpe, url))


# process a list of urls
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u, tpe, parent_url)
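

# dispatch one crawl result hash: queue discovered article/index links,
# follow pagination, and store any article payloads it carries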
def do_process(tpe, cont):
    # clogger.debug("process: " + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
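

# Minimal usage sketch with hypothetical values; assumes article_types has a
# "news" key and that the database and fetch queue are configured.
if __name__ == "__main__":
    sample = {
        "url": "http://example.com/index",
        "article_links": ["http://example.com/a1", "/a2"],  # queued for fetching
        "next_page": "/index?page=2",  # resolved against "url" via fix_link
        "articles": [{"title": "A1", "text": "<p>body</p>", "section": "politics"}],
    }
    do_process("news", sample)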