diff --git a/compiler/comp/fsch.py b/compiler/comp/fsch.py
new file mode 100644
index 0000000..39b62ca
--- /dev/null
+++ b/compiler/comp/fsch.py
@@ -0,0 +1,50 @@
+from bs4 import BeautifulSoup
+from dateutil.parser import parse
+from datetime import datetime
+import re
+import urlparse
+from src import clogger, cfg
+from src.compiler.fixing import fix_link
+
+# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
+
+# h=html.find("h2", {"class":"item-page-title"})
+# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
+
+# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
+# d["published"]=parse(h1.strip())
+# d["text"]=h.encode_contents().strip()
+
+
+
+def fscharticle(url,raw,params={}):
+    # raw must be the fetched HTML; "Error" is undefined in this module, so raise a real exception
+    if raw is None:
+        raise ValueError("fscharticle: no raw content for "+url)
+    html=BeautifulSoup(raw)
+    d={}
+    h=html.find("h2", {"class":"item-page-title"})
+    if h is not None:
+        d["title"]=h.text.strip()
+    d["author"]=None
+    h=html.find("div", {"class":"item-page"})
+    if h is not None:
+        h2=h.find("h2", {"class":"item-page-title"})
+        if h2 is not None:
+            h2.extract()
+        #d["text"]=h.encode_contents().strip()
+        h2= h.find("img")
+        if h2 is not None:
+            d["image"]=h2.attrs["src"]
+        else:
+            d["image"]=""
+    # fall back on feed metadata handed down from the index compiler
+    if params.has_key("parent_item"):
+        pi=params["parent_item"]
+        if pi.has_key("author_detail"):
+            d["author"]=pi["author_detail"]
+        if pi.has_key("published"):
+            d["published"]=parse(pi["published"])
+        d["pi"]=pi
+    d["sourcetype"]="fscharticle"
+    return {"article": d}
diff --git a/compiler/comp/rss.py b/compiler/comp/rss.py
index cb5a95b..9eee0aa 100644
--- a/compiler/comp/rss.py
+++ b/compiler/comp/rss.py
@@ -1,8 +1,8 @@
 import feedparser
 
-def rssfeed(url,raw):
+def rssfeed(url,raw,params={}):
     al=[]
     f=feedparser.parse(raw)
     for e in f['entries']:
-        al.append(e['link'])
+        al.append((e['link'],e))
     return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
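With this change rssfeed hands each article link downstream as a (link, entry) tuple instead of a bare URL string, so the feedparser entry travels with the link and fscharticle can fall back on feed metadata (author_detail, published) when the page markup lacks it. A minimal hand-off sketch; raw_feed and raw_article stand in for already-fetched payloads and are placeholders, not names from this patch:

    from comp import rssfeed
    from comp.fsch import fscharticle

    idx = rssfeed("http://example.org/feed", raw_feed)
    link, entry = idx["article_links"][0]   # links are (url, entry) tuples now
    art = fscharticle(link, raw_article, params={"parent_item": entry})
    # the feed entry fills the gaps the page markup leaves open
    author = art["article"]["author"]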
diff --git a/compiler/compiler.py b/compiler/compiler.py
index 4543cfa..aaf93ad 100644
--- a/compiler/compiler.py
+++ b/compiler/compiler.py
@@ -6,12 +6,12 @@ from datetime import datetime
 import re
 import urlparse
 from src import clogger, cfg
-from fixing import fix_link
+from src.compiler.fixing import fix_link
 import feedparser
 #from fetching import downloadfile
 import json
 
-def do_compile(tpe, cont):
+def do_compile(tpe, cont, params={}):
     if type(cont) != dict:
         clogger.error("Type Error for do compile for :"+str(cont["url"]))
# Starting to compile an generic object
@@ -20,17 +20,20 @@ def do_compile(tpe, cont):
     else:
         clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
     if tpe in compiler:
-        cont=compiler[tpe](cont["url"], cont["raw"])
+        cont=compiler[tpe](cont["url"], cont["raw"],params)
+    else:
+        clogger.error("Compiler for "+tpe+" not found.")
     return cont
 
 from comp import rssfeed
 from comp import fbfeed
+from comp.fsch import fscharticle
 
 def dummyarticle(url, raw):
     return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}
 
-def htufeed(url,raw):
+def htufeed(url,raw,params={}):
     al=[]
     f=feedparser.parse(raw)
     for e in f['entries']:
@@ -38,7 +41,7 @@ def htufeed(url,raw):
 
-def htuarticle(url,raw):
+def htuarticle(url,raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("meta", {"property": "og:image"})
@@ -71,7 +74,7 @@ def htuarticle(url,raw):
     return {"article": d}
 
-def fetarticle(url, raw):
+def fetarticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"itemprop": "name"})
@@ -110,7 +113,7 @@ def fetarticle(url, raw):
     return {"article": d}
 
-def fsarcharticle(url, raw):
+def fsarcharticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"class": "title"})
@@ -135,7 +138,7 @@ def fsarcharticle(url, raw):
         d["author"]=None
     return {"article": d}
 
-def fsbizarticle(url, raw):
+def fsbizarticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"class": "entry-title"})
@@ -159,7 +162,7 @@ def fsbizarticle(url, raw):
         d["author"]=h.find("a").text.strip()
     return {"article": d}
 
-def fetindex(url, raw):
+def fetindex(url, raw,params={}):
     if raw is None:
         raise Error
#    clogger.debug("compile_fetindex: "+str(url))
@@ -178,7 +181,7 @@ def fetindex(url, raw):
             al.append(t.attrs["href"])
     return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }
 
-def fsarchindex(url, raw):
+def fsarchindex(url, raw,params={}):
     if raw is None:
         raise Error
     html=BeautifulSoup(raw)
@@ -197,7 +200,7 @@ def fsarchindex(url, raw):
     return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}
 
-def fsbizindex(url, raw):
+def fsbizindex(url, raw,params={}):
     if raw is None:
         raise Error
     print "compile_fsbizindex"
@@ -212,8 +215,7 @@ def fsbizindex(url, raw):
 
 
-
-def fsmbindex(url, raw):
+def fsmbindex(url, raw,params={}):
     if raw is None:
         raise Error
     html=BeautifulSoup(raw)
@@ -246,7 +248,7 @@ def fsmbindex(url, raw):
         articles.append(aa)
     return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"}
 
-compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}
+compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed, "fscharticle": fscharticle}
 
 compiler = cfg.compiler
 for i in compiler:
@@ -254,4 +256,4 @@ for i in compiler:
 
 
 
-article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
+article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle", "fschfeed": "fscharticle"}
diff --git a/compiler/controller.py b/compiler/controller.py
index a95946b..73cbaa8 100644
--- a/compiler/controller.py
+++ b/compiler/controller.py
@@ -10,7 +10,8 @@ def urls_test(id):
     rw=fetch_page(cu.url)
     h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
     h2=do_compile(cu.tpe, h)
-    return {"rw": rw, "url": h, "comp": h2}
+    h2["raw"]="raw - html -blocked"
+    return {"comp": h2}
 
 def urls_que(id):
     cu=CrawlUrl.query.get(id)
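do_compile now threads params through to every compiler callback (hence the params={} added to each signature above) and logs a lookup failure instead of silently passing the object through. One caveat worth flagging: the literal compiler = {...} dict that gains the "fscharticle" entry is overwritten on the next line by compiler = cfg.compiler, so the new entry only takes effect if cfg.compiler carries it as well. A hedged dispatch sketch; the feed URL is a placeholder, and the dict shape mirrors what urls_test builds:

    rw = fetch_page("http://example.org/feed")
    h = {"url": "http://example.org/feed", "sourcetype": "fschfeed", "raw": rw}
    h2 = do_compile("fschfeed", h, {})   # dispatches to rssfeed via the registry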
diff --git a/compiler/mprocess.py b/compiler/mprocess.py
index faf45a9..c7b0dc7 100644
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -34,24 +34,30 @@ def process_article(art):
     return aa
 
 # process a single found url
-def process_url(url,tpe, parent_url):
+def process_url(url,tpe, parent_url,params={}):
     #clogger.debug("process URL of type "+ tpe + ": " + url)
     if parent_url is not None:
         url=fix_link(parent_url, url)
-    put_fetch_queue((0,tpe,url,{"nofollow":False}))
+    params.update({"nofollow":False})
+    put_fetch_queue((0,tpe,url,params))
 
 # process a url list
-def process_urllist(urllist, tpe, parent_url):
+def process_urllist(urllist, tpe, parent_url,params={}):
     for u in urllist:
-        process_url(u,tpe, parent_url)
-
+        if isinstance(u, basestring):
+            process_url(u,tpe, parent_url,params)
+        elif isinstance(u,tuple):
+            params.update({"parent_item": u[1]})
+            process_url(u[0], tpe ,parent_url,params)
+        else:
+            clogger.error("url has wrong type: "+str(type(u)))
 
 def do_process(tpe, cont,params={}):
     urllist=[]
#    clogger.debug("process :" + str(cont))
     if "article_links" in cont:
-        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
+        process_urllist(cont["article_links"], article_types[tpe], cont["url"], params)
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
diff --git a/compiler/mworker.py b/compiler/mworker.py
index 2ff6626..0a80e28 100644
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -55,7 +55,9 @@
 #comile something from the compile list
 def run_compile():
     tc,tpe,h, p = compile_queue.get()
-    h=do_compile(tpe,h)
+    if p.has_key('parent_item'):
+        h["parent_item"]=p["parent_item"]
+    h=do_compile(tpe,h,p)
     process_queue.put((0,tpe, h,p))
     return h
#    compile_queue.task_done()
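One hazard worth flagging in mprocess.py: process_url and process_urllist both mutate the params dict they receive via params.update(...), and both default to a shared params={}. Because process_urllist reuses a single dict across its loop, a "parent_item" set for one tuple entry leaks into every plain-string URL processed after it in the same list, and the mutable default leaks state across calls. A leak-free variant for comparison; this is a suggested rewrite, not part of the patch:

    def process_urllist(urllist, tpe, parent_url, params=None):
        for u in urllist:
            p = dict(params or {})        # fresh copy per URL
            if isinstance(u, tuple):
                p["parent_item"] = u[1]   # carry the feed entry along
                u = u[0]
            if isinstance(u, basestring):
                process_url(u, tpe, parent_url, p)
            else:
                clogger.error("url has wrong type: " + str(type(u)))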