fsch

2017-02-08 07:13:53 +01:00
parent 589807f5e2
commit 127bc9c557
6 changed files with 84 additions and 25 deletions
--- a/compiler/comp/fsch.py
+++ b/compiler/comp/fsch.py
@@ -0,0 +1,48 @@
+from bs4 import BeautifulSoup
+from dateutil.parser import parse
+from datetime import datetime
+import re
+import urlparse
+from src import clogger, cfg
+from src.compiler.fixing import fix_link
+
+# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
+
+# h=html.find("h2", {"class":"item-page-title"})
+# h1=  re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
+
+# h1=  re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip()) 
+# d["published"]=parse(h1.strip())
+#   d["text"]=h.encode_contents().strip()
+ 
+
+
+def fscharticle(url,raw,params=None):
+    """Compile a fetched fsch article page into {"article": d}.
+
+    url is only used in the error message; raw is the page html; params may
+    hold the feed entry under "parent_item". Raises ValueError if raw is None.
+    """
+    if raw is None:
+        raise ValueError("fscharticle: no raw html for "+str(url))
+    html=BeautifulSoup(raw)
+    d={"author": None, "image": "", "sourcetype": "fscharticle"}  # image stays "" when nothing is found
+    h=html.find("h2", {"class":"item-page-title"})
+    if h is not None:
+        d["title"]=h.text.strip()
+    h=html.find("div", {"class":"item-page"})
+    if h is not None:
+        h2=h.find("h2", {"class":"item-page-title"})
+        if h2 is not None:
+            h2.extract()  # take the title out of the container before looking for the image
+        img=h.find("img")
+        if img is not None:
+            d["image"]=img.attrs.get("src", "")
+    pi=(params or {}).get("parent_item")
+    if pi is not None:
+        if "author_detail" in pi:
+            d["author"]=pi["author_detail"]
+        if "published" in pi:
+            d["published"]=parse(pi["published"])
+        d["pi"]=pi
+    return {"article": d}
--- a/compiler/comp/rss.py
+++ b/compiler/comp/rss.py
@@ -1,8 +1,8 @@
 import feedparser

-def rssfeed(url,raw):
+def rssfeed(url,raw,params={}):  # params is accepted so all compilers share one call signature; it is not read here
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
-        al.append(e['link'])
+        al.append((e['link'],e))  # (link, entry) pair so the feed entry travels with its link
    return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
--- a/compiler/compiler.py
+++ b/compiler/compiler.py
@@ -6,12 +6,12 @@ from datetime import datetime
 import re
 import urlparse
 from src import clogger, cfg
-from fixing import fix_link
+from src.compiler.fixing import fix_link
 import feedparser

 #from fetching import downloadfile
 import json
-def do_compile(tpe, cont):
+def do_compile(tpe, cont, params={}):
    if  type(cont) != dict:
        clogger.error("Type Error for do compile for :"+str(cont["url"]))
    # Starting to compile an generic object
@@ -20,17 +20,20 @@ def do_compile(tpe, cont):
    else:
        clogger.debug("compile: type:"+str(tpe)+ "| "+ str(cont["url"]))
        if tpe in compiler:
-            cont=compiler[tpe](cont["url"], cont["raw"])
+            cont=compiler[tpe](cont["url"], cont["raw"],params)
+        else:
+            clogger.error("Compiler for "+tpe+" not found.")
    return cont

 from comp import rssfeed
 from comp import fbfeed
+from comp.fsch import fscharticle
 def dummyarticle(url, raw):
    return {"url": url, "article":{"url": url, "section": "dummysection", "sourcetype": "dummy", "title":"dummytitle", "text": raw, "image": "fff", "author": "me", "published": None}}



-def htufeed(url,raw):
+def htufeed(url,raw,params={}):
    al=[]
    f=feedparser.parse(raw)
    for e in f['entries']:
@@ -38,7 +41,7 @@ def htufeed(url,raw):
    return {"url": url, "next_page": None, "article_links": al, "objecttype":"index"}
    

-def htuarticle(url,raw):
+def htuarticle(url,raw,params={}):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("meta", {"property": "og:image"})    
@@ -71,7 +74,7 @@ def htuarticle(url,raw):
    return {"article": d}

    
-def fetarticle(url, raw):
+def fetarticle(url, raw,params={}):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"itemprop": "name"})
@@ -110,7 +113,7 @@ def fetarticle(url, raw):
    return {"article": d}


-def fsarcharticle(url, raw):
+def fsarcharticle(url, raw,params={}):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"class": "title"})
@@ -135,7 +138,7 @@ def fsarcharticle(url, raw):
    d["author"]=None
    return {"article": d}

-def fsbizarticle(url, raw):
+def fsbizarticle(url, raw,params={}):
    sp=BeautifulSoup(raw)
    d={}
    h=sp.find("h1", {"class": "entry-title"})
@@ -159,7 +162,7 @@ def fsbizarticle(url, raw):
        d["author"]=h.find("a").text.strip()
    return {"article": d}

-def fetindex(url, raw):
+def fetindex(url, raw,params={}):
    if raw is None:
        raise Error
 #    clogger.debug("compile_fetindex: "+str(url))
@@ -178,7 +181,7 @@ def fetindex(url, raw):
            al.append(t.attrs["href"])
    return {"url": url, "next_page": nl, "article_links": al, "objecttype": "index" }

-def fsarchindex(url, raw):
+def fsarchindex(url, raw,params={}):
    if raw is None:
        raise Error
    html=BeautifulSoup(raw)
@@ -197,7 +200,7 @@ def fsarchindex(url, raw):
    return {"url": url, "next_page": None, "article_links": al, "facebook_links": fl,"objecttype":"index"}


-def fsbizindex(url, raw):
+def fsbizindex(url, raw,params={}):
    if raw is None:
        raise Error
    print "compile_fsbizindex"
@@ -212,8 +215,7 @@ def fsbizindex(url, raw):



-
-def fsmbindex(url, raw):
+def fsmbindex(url, raw,params={}):
    if raw is None:
        raise Error
    html=BeautifulSoup(raw)
@@ -246,7 +248,7 @@ def fsmbindex(url, raw):
            articles.append(aa)
    return {"url": url, "next_page": np, "articles": articles,"objecttype": "articles"}

-compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed}    
+compiler = {"fetindex": fetindex, "fetarticle": fetarticle, "fsarchindex": fsarchindex, "fsarcharticle": fsarcharticle, "fsmbindex": fsmbindex, "fsbizindex": fsbizindex, "dummyarticle": dummyarticle,"htuarticle": htuarticle, "htufeed": htufeed, "fbfeed": fbfeed, "fschfeed": rssfeed, "fscharticle": fscharticle}  # type name -> compile function; NOTE(review): the name is rebound from cfg.compiler just below - confirm which mapping do_compile sees

 compiler = cfg.compiler
 for i in compiler:
@@ -254,4 +256,4 @@ for i in compiler:



-article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle"}
+article_types={"fetindex" : "fetarticle", "fsarchindex": "fsarcharticle", "fsbizindex": "fsbizarticle", "dummyindex": "dummyarticle", "htufeed": "htuarticle", "fschfeed": "fscharticle"}  # index/feed type -> type assigned to the article links found on it
--- a/compiler/controller.py
+++ b/compiler/controller.py
@@ -10,7 +10,8 @@ def urls_test(id):
    rw=fetch_page(cu.url)
    h= {"url": cu.url, "sourcetype": cu.tpe, "raw": rw}
    h2=do_compile(cu.tpe, h)
-    return {"rw": rw, "url": h, "comp": h2}
+    h2["raw"]="raw - html -blocked"
+    return {"comp": h2}

 def urls_que(id):
    cu=CrawlUrl.query.get(id)
--- a/compiler/mprocess.py
+++ b/compiler/mprocess.py
@@ -34,24 +34,30 @@ def process_article(art):
    return aa

 # process a single found url
-def process_url(url,tpe, parent_url):
+def process_url(url,tpe, parent_url,params=None):
    #clogger.debug("process URL of type "+ tpe + ": " + url)
    if parent_url is not None:
        url=fix_link(parent_url, url)
-    put_fetch_queue((0,tpe,url,{"nofollow":False}))
+    params=dict(params or {}, nofollow=False)  # copy: each queued item gets its own dict; the caller's dict is never mutated
+    put_fetch_queue((0,tpe,url,params))


 # process a url list
-def process_urllist(urllist, tpe, parent_url):
+def process_urllist(urllist, tpe, parent_url,params=None):
    for u in urllist:
-        process_url(u,tpe, parent_url)
-        
+        if isinstance(u, basestring):
+            process_url(u,tpe, parent_url,dict(params or {}))  # private copy per url
+        elif isinstance(u,tuple):
+            # (link, item) pair: pass the item on in a fresh dict so it cannot leak to other urls
+            process_url(u[0], tpe ,parent_url,dict(params or {}, parent_item=u[1]))
+        else:
+            clogger.error("url has wrong type: "+ str(type(u)))
        
 def do_process(tpe, cont,params={}):
    urllist=[]
 #    clogger.debug("process :" + str(cont))
    if "article_links" in cont:
-        process_urllist(cont["article_links"],  article_types[tpe], cont["url"])
+        process_urllist(cont["article_links"],  article_types[tpe], cont["url"], params)
    if "index_links" in cont:
        process_urllist(cont["index_links"],  tpe , cont["url"])
    
--- a/compiler/mworker.py
+++ b/compiler/mworker.py
@@ -55,7 +55,9 @@ def run_fetch():
 #comile something from the compile list
 def run_compile():
    tc,tpe,h, p = compile_queue.get()
-    h=do_compile(tpe,h)
+    if p.has_key('parent_item'):  # copy the parent item from the queue params onto the content dict
+        h["parent_item"]=p["parent_item"]
+    h=do_compile(tpe,h,p)  # the queue params are handed to do_compile as its params argument
    process_queue.put((0,tpe, h,p))
    return h
    #    compile_queue.task_done()