init commit
This commit is contained in:
74
compiler/mprocess.py
Normal file
74
compiler/mprocess.py
Normal file
@@ -0,0 +1,74 @@
|
||||
from src import clogger # Logger for crawler
|
||||
from src.models import Article # Article model
|
||||
from datetime import datetime
|
||||
from src.database import db_session
|
||||
from mqueues import fetch_queue, compile_queue, put_fetch_queue
|
||||
from fetching import fetch_page, downloadfile, announce_articleid
|
||||
from fixing import fix_html, fix_file
|
||||
|
||||
from compiler import article_types
|
||||
from fixing import fix_link
|
||||
# process_article expects a dict (hash) with raw data for the article and stores
# it as an Article object in the database; it is intended to prevent duplicates.
|
||||
|
||||
def is_article_hash(h):
    """Return True when *h* carries every field required of a raw article dict."""
    return all(key in h for key in ("text", "url", "sourcetype", "section"))
|
||||
|
||||
def process_article(art):
    """Normalize a raw article dict and persist it as an Article.

    *art* must contain the keys checked by is_article_hash ("text", "url",
    "sourcetype", "section").  The text and optional image are cleaned up,
    then the record is stored via the shared db_session.

    Returns the stored Article, or None when *art* is missing required keys.
    """
    if not is_article_hash(art):
        clogger.error("Invalid article hash:" + str(art))
        return None

    # Clean up the raw HTML (and any attached image) before persisting.
    art["text"] = fix_html(art["text"], art["url"])
    if "image" in art:
        art["image"] = fix_file(art["url"], art["image"])
    clogger.info(art)

    aa = Article.from_hash(art)
    aa.process_hash(art)
    aa.last_fetched = datetime.now()
    aa.sourcetype = art["sourcetype"]
    db_session.add(aa)
    db_session.commit()
    # Fixed: concatenating str with aa.title.encode("utf-8") (bytes) raises
    # TypeError on Python 3; log the title without re-encoding.
    clogger.debug("Updated/Added Article " + str(aa.id) + ": " + aa.title)
    # announce_articleid(aa.id)
    return aa
|
||||
|
||||
# process a single found url
|
||||
def process_url(url, tpe, parent_url):
    """Resolve a single discovered *url* and queue it for fetching as type *tpe*.

    When *parent_url* is given, relative links are resolved against it first.
    """
    resolved = url if parent_url is None else fix_link(parent_url, url)
    put_fetch_queue((0, tpe, resolved))
|
||||
|
||||
|
||||
# process a url list
|
||||
def process_urllist(urllist, tpe, parent_url):
    """Queue every url in *urllist* for fetching as type *tpe*.

    Each entry is resolved against *parent_url* via process_url.
    """
    for entry in urllist:
        process_url(entry, tpe, parent_url)
|
||||
|
||||
|
||||
def do_process(tpe, cont):
    """Dispatch one parsed page *cont* fetched under source type *tpe*.

    Queues any discovered article/index/next-page links for fetching and
    stores any embedded article payload(s) via process_article.
    """
    # Links to individual articles are queued under the article-specific type.
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
    # Further index pages stay under the current type.
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    if "next_page" in cont and cont["next_page"] is not None:
        process_url(cont["next_page"], tpe, cont["url"])

    # Single embedded article payload.
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)

    # Multiple embedded article payloads; entries without a title are skipped.
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                # Fixed: dict.has_key() was removed in Python 3 — use "in".
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
    return
|
||||
Reference in New Issue
Block a user