init commit

Andreas Stephanides
2017-01-14 12:23:04 +01:00
commit 8955bf17f5
32 changed files with 1555 additions and 0 deletions

compiler/mprocess.py

@@ -0,0 +1,74 @@
from src import clogger  # logger for the crawler
from src.models import Article  # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
from compiler import article_types


# process_article expects a hash with raw data for the article and stores it
# as an Article object in the database; it is intended to prevent duplicates.
def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h


def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        db_session.commit()
        clogger.debug("Updated/added article " + str(aa.id) + ": " + aa.title)
        # announce_articleid(aa.id)
    return aa


# process a single discovered url
def process_url(url, tpe, parent_url):
    # clogger.debug("process URL of type " + tpe + ": " + url)
    if parent_url is not None:
        url = fix_link(parent_url, url)
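    # queue item shape assumed to be (priority, sourcetype, url); 0 = default priority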
    put_fetch_queue((0, tpe, url))


# process a list of urls
def process_urllist(urllist, tpe, parent_url):
    for u in urllist:
        process_url(u, tpe, parent_url)
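

# dispatch one crawl result hash: queue discovered article/index links,
# follow pagination, and store any article payloads it carries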
def do_process(tpe, cont):
    # clogger.debug("process: " + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
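

# Minimal usage sketch with hypothetical values; assumes article_types has a
# "news" key and that the database and fetch queue are configured.
if __name__ == "__main__":
    sample = {
        "url": "http://example.com/index",
        "article_links": ["http://example.com/a1", "/a2"],  # queued for fetching
        "next_page": "/index?page=2",  # resolved against "url" via fix_link
        "articles": [{"title": "A1", "text": "<p>body</p>", "section": "politics"}],
    }
    do_process("news", sample)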