commit 127bc9c557 (parent 589807f5e2)
Author: Andreas Stephanides
Date: 2017-02-08 07:13:53 +01:00
6 changed files with 84 additions and 25 deletions

compiler/comp/fsch.py Normal file

@@ -0,0 +1,48 @@
from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
from src import clogger, cfg
from src.compiler.fixing import fix_link
# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
# h=html.find("h2", {"class":"item-page-title"})
# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
# d["published"]=parse(h1.strip())
# d["text"]=h.encode_contents().strip()
def fscharticle(url, raw, params={}):
    if raw is None:
        # the raw page is required; "raise Error" in the original referenced an undefined name
        raise ValueError("fscharticle: raw is None for " + str(url))
    html = BeautifulSoup(raw)
    d = {}
    h = html.find("h2", {"class": "item-page-title"})
    if h is not None:
        d["title"] = h.text.strip()
    d["author"] = None
    h = html.find("div", {"class": "item-page"})
    if h is not None:
        # drop the title heading from the article body before scanning it
        h2 = h.find("h2", {"class": "item-page-title"})
        if h2 is not None:
            h2.extract()
        #d["text"]=h.encode_contents().strip()
        h2 = h.find("img")
        if h2 is not None:
            d["image"] = h2.attrs["src"]
        else:
            d["image"] = ""
    # metadata inherited from the parent feed entry, if the caller passed one
    if params.has_key("parent_item"):
        pi = params["parent_item"]
        if pi.has_key("author_detail"):
            d["author"] = pi["author_detail"]
        if pi.has_key("published"):
            d["published"] = parse(pi["published"])
        d["pi"] = pi
    d["sourcetype"] = "fscharticle"
    return {"article": d}
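For context, a minimal usage sketch of the new compiler (the feed URL and the driver code are illustrative assumptions, not part of this commit): the feed entry travels in params as "parent_item", matching the fschfeed/fscharticle chain registered below.

    # hypothetical driver (Python 2); fscharticle and feedparser as above
    import feedparser, urllib2
    feed = feedparser.parse("http://example.org/feed")   # assumed feed URL
    entry = feed["entries"][0]
    raw = urllib2.urlopen(entry["link"]).read()
    result = fscharticle(entry["link"], raw, {"parent_item": entry})
    print result["article"]["title"], result["article"].get("published")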


@@ -1,8 +1,8 @@
 import feedparser
-def rssfeed(url,raw):
+def rssfeed(url,raw,params={}):
     al=[]
     f=feedparser.parse(raw)
     for e in f['entries']:
-        al.append(e['link'])
+        al.append((e['link'],e))
     return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
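With this change each element of article_links is a (link, feedparser entry) tuple instead of a bare link, so downstream code can inherit feed metadata such as author and publish date. A consumer now unpacks it like this (a sketch with assumed inputs, not part of the commit):

    idx = rssfeed("http://example.org/feed", raw_xml)
    for link, entry in idx["article_links"]:
        print link, entry.get("published")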


@@ -6,12 +6,12 @@ from datetime import datetime
 import re
 import urlparse
 from src import clogger, cfg
-from fixing import fix_link
+from src.compiler.fixing import fix_link
 import feedparser
 #from fetching import downloadfile
 import json
-def do_compile(tpe, cont):
+def do_compile(tpe, cont, params={}):
     if type(cont) != dict:
         clogger.error("Type Error for do compile for :"+str(cont["url"]))
     # Starting to compile a generic object
@@ -20,17 +20,20 @@ def do_compile(tpe, cont):
     else:
         clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
     if tpe in compiler:
-        cont=compiler[tpe](cont["url"], cont["raw"])
+        cont=compiler[tpe](cont["url"], cont["raw"],params)
+    else:
+        clogger.error("Compiler for "+tpe+" not found.")
     return cont
 from comp import rssfeed
 from comp import fbfeed
+from comp.fsch import fscharticle
 def dummyarticle(url, raw):
     return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}
-def htufeed(url,raw):
+def htufeed(url,raw,params={}):
     al=[]
     f=feedparser.parse(raw)
     for e in f['entries']:
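The dispatch stays table-driven: do_compile looks the source type up in the compiler dict and forwards url, raw, and the new params; unknown types now log an error instead of silently returning the input unchanged. A hedged call sketch (payload values are assumptions):

    page = {"url": "http://example.org/a", "raw": "<html></html>"}
    out = do_compile("fscharticle", page, {"parent_item": {"published": "2017-02-08"}})
    # a type missing from the dict logs: Compiler for <tpe> not found.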
@@ -38,7 +41,7 @@ def htufeed(url,raw):
     return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
-def htuarticle(url,raw):
+def htuarticle(url,raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("meta", {"property": "og:image"})
@@ -71,7 +74,7 @@ def htuarticle(url,raw):
     return {"article": d}
-def fetarticle(url, raw):
+def fetarticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"itemprop": "name"})
@@ -110,7 +113,7 @@ def fetarticle(url, raw):
     return {"article": d}
-def fsarcharticle(url, raw):
+def fsarcharticle(url, raw,params={}):
     sp=BeautifulSoup(raw)
     d={}
     h=sp.find("h1", {"class": "title"})
@@ -135,7 +138,7 @@ def fsarcharticle(url, raw):
d["author"]=None d["author"]=None
return {"article": d} return {"article": d}
def fsbizarticle(url, raw): def fsbizarticle(url, raw,params={}):
sp=BeautifulSoup(raw) sp=BeautifulSoup(raw)
d={} d={}
h=sp.find("h1", {"class": "entry-title"}) h=sp.find("h1", {"class": "entry-title"})
@@ -159,7 +162,7 @@ def fsbizarticle(url, raw):
d["author"]=h.find("a").text.strip() d["author"]=h.find("a").text.strip()
return {"article": d} return {"article": d}
def fetindex(url, raw): def fetindex(url, raw,params={}):
if raw is None: if raw is None:
raise Error raise Error
# clogger.debug("compile_fetindex: "+str(url)) # clogger.debug("compile_fetindex: "+str(url))
@@ -178,7 +181,7 @@ def fetindex(url, raw):
         al.append(t.attrs["href"])
     return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }
-def fsarchindex(url, raw):
+def fsarchindex(url, raw,params={}):
     if raw is None:
         raise Error
     html=BeautifulSoup(raw)
@@ -197,7 +200,7 @@ def fsarchindex(url, raw):
     return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}
-def fsbizindex(url, raw):
+def fsbizindex(url, raw,params={}):
     if raw is None:
         raise Error
     print "compile_fsbizindex"
@@ -212,8 +215,7 @@ def fsbizindex(url, raw):
-def fsmbindex(url, raw):
+def fsmbindex(url, raw,params={}):
     if raw is None:
         raise Error
     html=BeautifulSoup(raw)
@@ -246,7 +248,7 @@ def fsmbindex(url, raw):
         articles.append(aa)
     return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"}
-compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}
+compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed, "fscharticle": fscharticle}
 compiler = cfg.compiler
 for i in compiler:
@@ -254,4 +256,4 @@ for i in compiler:
-article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
+article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle", "fschfeed": "fscharticle"}
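Taken together, the two tables chain the new source: article_types resolves an index type to the article type queued for its links, and the compiler mapping (as populated from cfg) resolves that type to a handler, so links found by fschfeed are compiled by fscharticle. A lookup sketch using the names above:

    art_tpe = article_types["fschfeed"]   # -> "fscharticle"
    handler = compiler[art_tpe]           # -> fscharticle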


@@ -10,7 +10,8 @@ def urls_test(id):
     rw=fetch_page(cu.url)
     h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
     h2=do_compile(cu.tpe, h)
-    return {"rw": rw, "url": h, "comp": h2}
+    h2["raw"]="raw - html -blocked"
+    return {"comp": h2}
 def urls_que(id):
     cu=CrawlUrl.query.get(id)


@@ -34,24 +34,30 @@ def process_article(art):
         return aa
 # process a single found url
-def process_url(url,tpe, parent_url):
+def process_url(url,tpe, parent_url,params={}):
     #clogger.debug("process URL of type "+ tpe + ": " + url)
     if parent_url is not None:
         url=fix_link(parent_url, url)
-    put_fetch_queue((0,tpe,url,{"nofollow":False}))
+    params.update({"nofollow":False})
+    put_fetch_queue((0,tpe,url,params))
 # process a url list
-def process_urllist(urllist, tpe, parent_url):
+def process_urllist(urllist, tpe, parent_url,params={}):
     for u in urllist:
-        process_url(u,tpe, parent_url)
+        if isinstance(u, basestring):
+            process_url(u,tpe, parent_url,params)
+        elif isinstance(u,tuple):
+            params.update({"parent_item": u[1]})
+            process_url(u[0], tpe ,parent_url,params)
+        else:
+            clogger.error("url has wrong type: "+ str(type(u)))
 def do_process(tpe, cont,params={}):
     urllist=[]
     # clogger.debug("process :" + str(cont))
     if "article_links" in cont:
-        process_urllist(cont["article_links"], article_types[tpe], cont["url"])
+        process_urllist(cont["article_links"], article_types[tpe], cont["url"], params)
     if "index_links" in cont:
         process_urllist(cont["index_links"], tpe , cont["url"])
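One caveat worth noting: process_url and process_urllist mutate the shared params dict, so a "parent_item" set for one tuple entry leaks into later plain-string URLs in the same list (and the mutable {} default is shared across calls). A per-item copy avoids this; a hypothetical safer variant:

    def process_urllist(urllist, tpe, parent_url, params=None):
        for u in urllist:
            p = dict(params or {})          # fresh copy per URL
            if isinstance(u, tuple):
                p["parent_item"] = u[1]
                u = u[0]
            process_url(u, tpe, parent_url, p)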


@@ -55,7 +55,9 @@ def run_fetch():
 #compile something from the compile list
 def run_compile():
     tc,tpe,h, p = compile_queue.get()
-    h=do_compile(tpe,h)
+    if p.has_key('parent_item'):
+        h["parent_item"]=p["parent_item"]
+    h=do_compile(tpe,h,p)
     process_queue.put((0,tpe, h,p))
     return h
 #    compile_queue.task_done()
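The queue item shape implied by this hunk is (priority, tpe, payload, params): run_compile pops one item, copies any parent_item onto the payload, compiles with params, and forwards the same four-tuple to process_queue. An enqueue sketch (values assumed):

    compile_queue.put((0, "fscharticle", {"url": u, "raw": raw}, {"parent_item": entry}))
    h = run_compile()   # h now carries "parent_item" and the compiled article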