This commit is contained in:
Andreas Stephanides
2017-02-08 07:13:53 +01:00
parent 589807f5e2
commit 127bc9c557
6 changed files with 84 additions and 25 deletions

48
compiler/comp/fsch.py Normal file
View File

@@ -0,0 +1,48 @@
from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime
import re
import urlparse
from src import clogger, cfg
from src.compiler.fixing import fix_link
# d["title"], d["image"], d["published"], d["text"], d["section"], d["url"]
# h=html.find("h2", {"class":"item-page-title"})
# h1= re.sub(r'.*- (\d+) ([a-zA-Z]+) (\d+) - ([:\d]+)[^\d]*', r'\3/\2/\1 \4',unicode(h.text.strip()))
# h1= re.sub(r'[^\d]*(\d+)\.(\d+)\.(\d+)[^\d]*', r'\3/\2/\1',unicode(h.text).strip())
# d["published"]=parse(h1.strip())
# d["text"]=h.encode_contents().strip()
def fscharticle(url, raw, params=None):
    """Build an article dict from the raw HTML of an article page.

    The title is read from the ``h2`` element with class
    ``item-page-title``; the image is the ``src`` of the first ``img``
    found inside the ``div`` with class ``item-page``. Author and
    publication date are not read from the page: they are copied from
    ``params["parent_item"]`` when that entry exists. The article body
    text is not extracted.

    Args:
        url: Address of the page. Only used in the error message.
        raw: HTML markup of the page.
        params: Optional dict. When it has a ``"parent_item"`` entry,
            that mapping's ``"author_detail"`` and ``"published"`` values
            are used and the mapping itself is stored under ``"pi"``.

    Returns:
        ``{"article": d}``. ``d`` always has ``"author"`` (``None`` unless
        the parent item supplies one) and ``"sourcetype"``. ``"title"`` is
        set only when the heading is found, ``"image"`` only when the
        ``item-page`` container is found (``""`` if it holds no usable
        image), ``"published"`` and ``"pi"`` only when a parent item is
        given.

    Raises:
        ValueError: If ``raw`` is ``None``.
    """
    if raw is None:
        # Nothing to parse: fail with a real, catchable exception type.
        raise ValueError("fscharticle: no raw HTML given for %s" % (url,))
    if params is None:
        params = {}

    # No parser is named, so BeautifulSoup picks one of those installed.
    html = BeautifulSoup(raw)
    article = {"author": None}

    heading = html.find("h2", {"class": "item-page-title"})
    if heading is not None:
        article["title"] = heading.text.strip()

    container = html.find("div", {"class": "item-page"})
    if container is not None:
        # Detach the heading from the container first, so the image lookup
        # below does not search inside it.
        inner_heading = container.find("h2", {"class": "item-page-title"})
        if inner_heading is not None:
            inner_heading.extract()

        image = container.find("img")
        if image is not None:
            # An <img> without a src attribute counts as "no image".
            article["image"] = image.attrs.get("src", "")
        else:
            article["image"] = ""

    if "parent_item" in params:
        parent = params["parent_item"]
        if "author_detail" in parent:
            article["author"] = parent["author_detail"]
        if "published" in parent:
            article["published"] = parse(parent["published"])
        article["pi"] = parent

    article["sourcetype"] = "fscharticle"
    return {"article": article}