35 lines
1.2 KiB
Python
35 lines
1.2 KiB
Python
from lxml.html.clean import clean_html, Cleaner
|
|
from bs4 import BeautifulSoup
|
|
class PostKeyError(KeyError):
    """Raised when a post dict is missing a key required for Solr conversion.

    Subclasses ``KeyError`` so callers that already catch ``KeyError`` also
    catch this error.
    """
|
|
|
|
def post_to_solr(p):
    """Convert a post dict, or a list of post dicts, into Solr document form.

    Args:
        p: A dict describing one post, or a list of such dicts (lists are
            converted element by element, recursively). The dict must
            contain the keys ``"url"``, ``"body"``, ``"slug"`` and
            ``"public_date"``; ``"agenda_html"`` is optional.

    Returns:
        For a dict: a dict with the keys ``"id"`` (the post's ``"slug"``),
        ``"date_dt"`` (the post's ``"public_date"``) and ``"text_txt"``
        (the plain text of ``"body"`` and ``"agenda_html"`` joined by a
        single space). For a list: a list of such results.

    Raises:
        AssertionError: If ``p`` is neither a dict nor a list.
        PostKeyError: If a required key is missing from the post.
    """
    # A list is converted element by element.
    if isinstance(p, list):
        return [post_to_solr(item) for item in p]

    def _plain_text(markup):
        """Return the text content of *markup* with line breaks and tabs
        replaced by spaces; falsy input yields an empty string."""
        if not markup:
            return ""
        soup = BeautifulSoup(markup, features="lxml")
        return (
            soup.get_text()
            .replace("\n", " ")
            .replace("\r", " ")
            .replace("\t", " ")
        )

    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type is kept for existing callers.
    if not isinstance(p, dict):
        raise AssertionError(
            "Argument for post_to_solr needs to be a dict or list of dicts"
        )

    # "slug" and "public_date" are read unconditionally below, so validate
    # them up front together with "url" and "body".
    for key in ("url", "body", "slug", "public_date"):
        if key not in p:
            raise PostKeyError("Post needs to have key '%s'" % key)

    body_text = _plain_text(p.get("body", "")) or ""
    # ``or ""`` guards against an explicit ``None`` stored under the key.
    agenda_text = _plain_text(p.get("agenda_html", "") or "")

    # Return the solr structure
    return {
        "id": p["slug"],
        "date_dt": p["public_date"],
        "text_txt": body_text + " " + agenda_text,
    }
|