fsch

compiler/comp/fsch.py (new file, 48 lines added)
@@ -0,0 +1,48 @@
+from bs4 import BeautifulSoup
+from dateutil.parser import parse
+from datetime import datetime
+import re
+import urlparse
+from src import clogger, cfg
+from src.compiler.fixing import fix_link
+
+# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
+
+# h=html.find("h2", {"class":"item-page-title"})
+# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
+
+# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
+# d["published"]=parse(h1.strip())
+# d["text"]=h.encode_contents().strip()
+
+
+
+def fscharticle(url,raw,params={}):
+    if raw is None:
+        raise Error
+    html=BeautifulSoup(raw)
+    d={}
+    h=html.find("h2", {"class":"item-page-title"})
+    if h is not None:
+        d["title"]=h.text.strip()
+    d["author"]=None
+    h=html.find("div", {"class":"item-page"})
+    if h is not None:
+        h2=h.find("h2", {"class":"item-page-title"})
+        if h2 is not None:
+            h2.extract()
+        #d["text"]=h.encode_contents().strip()
+        h2= h.find("img")
+        if h2 is not None:
+            d["image"]=h2.attrs["src"]
+        else:
+            d["image"]=""
+    if params.has_key("parent_item"):
+        pi=params["parent_item"]
+        if pi.has_key("author_detail"):
+            d["author"]=pi["author_detail"]
+        if pi.has_key("published"):
+            d["published"]=parse(pi["published"])
+        d["pi"]=pi
+    d["sourcetype"]="fscharticle"
+    return {"article": d}
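
The new compiler only fills in "author" and "published" when the caller hands it the originating feed entry as params["parent_item"]. A minimal sketch of exercising it on its own, assuming the repo's packages are importable and using a made-up URL, fixture file, and entry dict:

# Sketch only, not part of the commit; fixture path, URL, and entry values are assumed.
from comp.fsch import fscharticle

raw = open("fixtures/item-page.html").read()        # saved copy of an item page (assumed)
entry = {                                           # trimmed feedparser-style entry
    "author_detail": {"name": "newsroom"},
    "published": "Mon, 06 Apr 2015 10:00:00 +0200",
}

result = fscharticle("http://example.org/item/123", raw, {"parent_item": entry})
article = result["article"]
print article["title"], article["published"], article["sourcetype"]
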
@@ -1,8 +1,8 @@
 import feedparser
 
-def rssfeed(url,raw):
+def rssfeed(url,raw,params={}):
     al=[]
     f=feedparser.parse(raw)
     for e in f['entries']:
-        al.append(e['link'])
+        al.append((e['link'],e))
     return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
@@ -6,12 +6,12 @@ from datetime import datetime
 import re
 import urlparse
 from src import clogger, cfg
-from fixing import fix_link
+from src.compiler.fixing import fix_link
 import feedparser
 
 #from fetching import downloadfile
 import json
-def do_compile(tpe, cont):
+def do_compile(tpe, cont, params={}):
     if type(cont) != dict:
         clogger.error("Type Error for do compile for :"+str(cont["url"]))
     # Starting to compile an generic object
@@ -20,17 +20,20 @@ def do_compile(tpe, cont):
     else:
         clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
     if tpe in compiler:
-        cont=compiler[tpe](cont["url"], cont["raw"])
+        cont=compiler[tpe](cont["url"], cont["raw"],params)
+    else:
+        clogger.error("Compiler for "+tpe+" not found.")
     return cont
 
 from comp import rssfeed
 from comp import fbfeed
+from comp.fsch import fscharticle
 def dummyarticle(url, raw):
     return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}
 
 
 
-def htufeed(url,raw):
+def htufeed(url,raw,params={}):
     al=[]
     f=feedparser.parse(raw)
     for e in f['entries']:
@@ -38,7 +41,7 @@ def htufeed(url,raw):
     return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
 
 
-def htuarticle(url,raw):
+def htuarticle(url,raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("meta", {"property": "og:image"})
@@ -71,7 +74,7 @@ def htuarticle(url,raw):
     return {"article": d}
 
 
-def fetarticle(url, raw):
+def fetarticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"itemprop": "name"})
@@ -110,7 +113,7 @@ def fetarticle(url, raw):
     return {"article": d}
 
 
-def fsarcharticle(url, raw):
+def fsarcharticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"class": "title"})
@@ -135,7 +138,7 @@ def fsarcharticle(url, raw):
     d["author"]=None
     return {"article": d}
 
-def fsbizarticle(url, raw):
+def fsbizarticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"class": "entry-title"})
@@ -159,7 +162,7 @@ def fsbizarticle(url, raw):
     d["author"]=h.find("a").text.strip()
     return {"article": d}
 
-def fetindex(url, raw):
+def fetindex(url, raw,params={}):
     if raw is None:
         raise Error
     # clogger.debug("compile_fetindex: "+str(url))
@@ -178,7 +181,7 @@ def fetindex(url, raw):
         al.append(t.attrs["href"])
     return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }
 
-def fsarchindex(url, raw):
+def fsarchindex(url, raw,params={}):
     if raw is None:
         raise Error
     html=BeautifulSoup(raw)
@@ -197,7 +200,7 @@ def fsarchindex(url, raw):
     return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}
 
 
-def fsbizindex(url, raw):
+def fsbizindex(url, raw,params={}):
     if raw is None:
         raise Error
     print "compile_fsbizindex"
@@ -212,8 +215,7 @@ def fsbizindex(url, raw):
 
 
 
-
-def fsmbindex(url, raw):
+def fsmbindex(url, raw,params={}):
     if raw is None:
         raise Error
     html=BeautifulSoup(raw)
@@ -246,7 +248,7 @@ def fsmbindex(url, raw):
         articles.append(aa)
     return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"}
 
-compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}
+compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed, "fscharticle": fscharticle}
 
 compiler = cfg.compiler
 for i in compiler:
@@ -254,4 +256,4 @@ for i in compiler:
 
 
 
-article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
+article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle", "fschfeed": "fscharticle"}
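
Taken together, the two dictionary edits wire the new source type into the existing index-to-article chain. A lookup sketch against the literals above (note that compiler = cfg.compiler further down may replace the literal at runtime, so the effective table depends on cfg):

# Lookup sketch only, using the dict literals defined in this hunk.
compiler["fschfeed"]        # -> rssfeed: builds the index, now with (link, entry) tuples
article_types["fschfeed"]   # -> "fscharticle": the type queued for every link it finds
compiler["fscharticle"]     # -> fscharticle: builds the article dict from page and parent_item
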
@@ -10,7 +10,8 @@ def urls_test(id):
     rw=fetch_page(cu.url)
     h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
     h2=do_compile(cu.tpe, h)
-    return {"rw": rw, "url": h, "comp": h2}
+    h2["raw"]="raw - html -blocked"
+    return {"comp": h2}
 
 def urls_que(id):
     cu=CrawlUrl.query.get(id)
@@ -34,24 +34,30 @@ def process_article(art):
     return aa
 
 # process a single found url
-def process_url(url,tpe, parent_url):
+def process_url(url,tpe, parent_url,params={}):
     #clogger.debug("process URL of type "+ tpe + ": " + url)
     if parent_url is not None:
         url=fix_link(parent_url, url)
-    put_fetch_queue((0,tpe,url,{"nofollow":False}))
+    params.update({"nofollow":False})
+    put_fetch_queue((0,tpe,url,params))
 
 
 # process a url list
-def process_urllist(urllist, tpe, parent_url):
+def process_urllist(urllist, tpe, parent_url,params={}):
     for u in urllist:
-        process_url(u,tpe, parent_url)
+        if isinstance(u, basestring):
+            process_url(u,tpe, parent_url,params)
+        elif isinstance(u,tuple):
+            params.update({"parent_item": u[1]})
+            process_url(u[0], tpe ,parent_url,params)
+        else:
+            clogger.error("url has wrong type: "+ type(u))
 
 def do_process(tpe, cont,params={}):
     urllist=[]
     # clogger.debug("process :" + str(cont))
     if "article_links" in cont:
-        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
+        process_urllist(cont["article_links"], article_types[tpe], cont["url"], params)
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
 
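
The tuple branch added to process_urllist is what carries a feed entry from the rssfeed change through to the new article compiler; plain string URLs keep taking the old path. A data-shape illustration with assumed values:

# Illustration only; URL and entry values are made up.
u = ("http://example.org/item/123",
     {"published": "Mon, 06 Apr 2015 10:00:00 +0200",
      "author_detail": {"name": "newsroom"}})
params = {}
if isinstance(u, tuple):                  # the branch added above
    params.update({"parent_item": u[1]})
# process_url() then queues (0, tpe, url, params), so the entry rides along with the
# fetch job and fscharticle can read params["parent_item"] for author and published.
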
@@ -55,7 +55,9 @@ def run_fetch():
 #comile something from the compile list
 def run_compile():
     tc,tpe,h, p = compile_queue.get()
-    h=do_compile(tpe,h)
+    if p.has_key('parent_item'):
+        h["parent_item"]=p["parent_item"]
+    h=do_compile(tpe,h,p)
     process_queue.put((0,tpe, h,p))
     return h
     # compile_queue.task_done()