fsch

2017-02-08 07:13:53 +01:00
parent 589807f5e2
commit 127bc9c557
6 changed files with 84 additions and 25 deletions
--- a/compiler/comp/fsch.py
+++ b/compiler/comp/fsch.py
@@ -0,0 +1,48 @@
+from bs4 import BeautifulSoup
+from dateutil.parser import parse
+from datetime import datetime
+import re
+import urlparse
+from src import clogger, cfg
+from src.compiler.fixing import fix_link
+
+# Article dict keys, for reference: d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
+
+# h=html.find("h2", {"class":"item-page-title"})
+# h1=  re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
+
+# h1=  re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip()) 
+# d["published"]=parse(h1.strip())
+#   d["text"]=h.encode_contents().strip()
+ 
+
+
+def fscharticle(url,raw,params={}):
+    if raw is None:
+        raise Error
+    html=BeautifulSoup(raw)
+    d={}
+    h=html.find("h2", {"class":"item-page-title"})
+    if h is not None:
+        d["title"]=h.text.strip()
+    d["author"]=None
+    h=html.find("div", {"class":"item-page"})
+    if h is not None:
+        h2=h.find("h2", {"class":"item-page-title"})
+        if h2 is not None:
+            h2.extract()
+        #d["text"]=h.encode_contents().strip()
+        h2= h.find("img")
+        if h2 is not None:
+            d["image"]=h2.attrs["src"]
+        else:
+            d["image"]=""
+    if params.has_key("parent_item"):
+        pi=params["parent_item"]
+        if pi.has_key("author_detail"):
+            d["author"]=pi["author_detail"]
+        if pi.has_key("published"):
+            d["published"]=parse(pi["published"])
+        d["pi"]=pi
+    d["sourcetype"]="fscharticle"
+    return {"article": d}