fsch
This commit is contained in:
48
compiler/comp/fsch.py
Normal file
48
compiler/comp/fsch.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from bs4 import BeautifulSoup
|
||||
from dateutil.parser import parse
|
||||
from datetime import datetime
|
||||
import re
|
||||
import urlparse
|
||||
from src import clogger, cfg
|
||||
from src.compiler.fixing import fix_link
|
||||
|
||||
# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
|
||||
|
||||
# h=html.find("h2", {"class":"item-page-title"})
|
||||
# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
|
||||
|
||||
# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
|
||||
# d["published"]=parse(h1.strip())
|
||||
# d["text"]=h.encode_contents().strip()
|
||||
|
||||
|
||||
|
||||
def fscharticle(url, raw, params=None):
    """Build the article dict for a page handled as an "fscharticle" source.

    Args:
        url: Address of the page. Not read inside this function.
        raw: HTML markup of the page, passed to BeautifulSoup.
        params: Optional mapping. If it has a "parent_item" entry, that
            item's "author_detail" and "published" values are copied into
            the result and the item itself is stored under "pi".

    Returns:
        A dict ``{"article": d}``. ``d`` always has "author" (None unless
        the parent item supplies one) and "sourcetype". "title", "image",
        "published" and "pi" are present only when the matching markup or
        parent-item field was found.

    Raises:
        ValueError: If ``raw`` is None.
    """
    if raw is None:
        # Fail with an explicit built-in exception; there is nothing to parse.
        raise ValueError("fscharticle: no raw HTML supplied for %r" % (url,))
    if params is None:
        # Fresh dict per call instead of a shared mutable default argument.
        params = {}

    html = BeautifulSoup(raw)
    d = {}

    title_tag = html.find("h2", {"class": "item-page-title"})
    if title_tag is not None:
        d["title"] = title_tag.text.strip()
    d["author"] = None

    body = html.find("div", {"class": "item-page"})
    if body is not None:
        # Detach the title heading from the body container before the
        # container is inspected any further.
        heading = body.find("h2", {"class": "item-page-title"})
        if heading is not None:
            heading.extract()
        # NOTE(review): body text extraction is currently disabled:
        # d["text"]=body.encode_contents().strip()
        img = body.find("img")
        # An <img> that lacks a src attribute is treated like a missing
        # image ("") rather than aborting the whole article with KeyError.
        d["image"] = img.attrs.get("src", "") if img is not None else ""

    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if "parent_item" in params:
        pi = params["parent_item"]
        if "author_detail" in pi:
            d["author"] = pi["author_detail"]
        if "published" in pi:
            # dateutil's parse() turns the published string into a datetime.
            d["published"] = parse(pi["published"])
        d["pi"] = pi

    d["sourcetype"] = "fscharticle"
    return {"article": d}
|
||||
Reference in New Issue
Block a user