fachschaften/compiler/mprocess.py
Andreas Stephanides 80c42c04cd fetch all fb
2017-02-17 09:58:35 +01:00

from src import clogger # Logger for crawler
from src.models import Article # Article model
from datetime import datetime
from src.database import db_session
from mqueues import fetch_queue, compile_queue, put_fetch_queue
from fetching import fetch_page, downloadfile, announce_articleid
from fixing import fix_html, fix_file, fix_link
from sqlalchemy.exc import InvalidRequestError
from compiler import article_types
# process_article expects a hash with raw data for the article and stores it
# in an Article object in the database; it is intended to prevent duplicates.
def is_article_hash(h):
    return "text" in h and "url" in h and "sourcetype" in h and "section" in h
def process_article(art):
    if not is_article_hash(art):
        clogger.error("Invalid article hash: " + str(art))
        aa = None
    else:
        # normalize the raw HTML and any attached image relative to the source url
        art["text"] = fix_html(art["text"], art["url"])
        if "image" in art:
            art["image"] = fix_file(art["url"], art["image"])
        clogger.info(art)
        aa = Article.from_hash(art)
        aa.process_hash(art)
        aa.last_fetched = datetime.now()
        aa.sourcetype = art["sourcetype"]
        db_session.add(aa)
        try:
            db_session.commit()
        except InvalidRequestError as e:
            db_session.rollback()
            clogger.error(e)
        clogger.info("Updated/Added Article " + str(aa.id) + ": " + aa.title.encode("utf-8"))
    return aa
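# Hedged usage sketch for process_article: the optional "image" key and all
# values here are assumptions; the required keys mirror is_article_hash above.
#   art = {"text": raw_html, "url": page_url, "sourcetype": "news",
#          "section": "events", "image": "img/logo.png"}
#   article = process_article(art)  # returns the Article, or None for an invalid hash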
# process a single found url
def process_url(url, tpe, parent_url, params=None):
    # clogger.debug("process URL of type " + tpe + ": " + url)
    # avoid a shared mutable default argument; the dict is mutated below
    if params is None:
        params = {}
    if parent_url is not None:
        url = fix_link(parent_url, url)
    params.update({"nofollow": False})
    put_fetch_queue((0, tpe, url, params))
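# Hedged example of the resulting queue entry, assuming fix_link resolves the
# relative url against parent_url; the type name "index" is an assumption:
#   process_url("page2.html", "index", "http://example.org/news/")
#   # roughly equivalent to:
#   # put_fetch_queue((0, "index", "http://example.org/news/page2.html",
#   #                  {"nofollow": False}))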
# process a url list
def process_urllist(urllist, tpe, parent_url, params=None):
    if params is None:
        params = {}
    for u in urllist:
        if isinstance(u, basestring):
            process_url(u, tpe, parent_url, params)
        elif isinstance(u, tuple):
            # (url, parent_item) tuples carry the parent item along in params
            params.update({"parent_item": u[1]})
            process_url(u[0], tpe, parent_url, params)
        else:
            clogger.error("url has wrong type: " + str(type(u)))
def do_process(tpe, cont, params=None):
    if params is None:
        params = {}
    # clogger.debug("process :" + str(cont))
    if "article_links" in cont:
        process_urllist(cont["article_links"], article_types[tpe], cont["url"], params)
    if "index_links" in cont:
        process_urllist(cont["index_links"], tpe, cont["url"])
    nofollow = bool(params.get("nofollow"))
    if "next_page" in cont and (cont["next_page"] is not None) and (not nofollow):
        process_url(cont["next_page"], tpe, cont["url"])
    if "article" in cont:
        art = cont["article"]
        art["sourcetype"] = tpe
        process_article(art)
    if "articles" in cont:
        clogger.debug("articles")
        for a in cont["articles"]:
            if "title" in a:
                a["sourcetype"] = tpe
                if "url" not in a:
                    a["url"] = cont["url"]
                process_article(a)
    db_session.remove()
    return
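# Hedged sketch of a cont hash as consumed by do_process; which of these keys
# a given parser actually emits, and the type name "newsindex", are assumptions:
#   cont = {"url": "http://example.org/news/",
#           "article_links": ["http://example.org/news/1"],
#           "next_page": "http://example.org/news/?page=2",
#           "articles": [{"title": "t", "text": "<p>...</p>", "section": "events"}]}
#   do_process("newsindex", cont)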