# Files
# intern2020/solrfet2020/convert.py
# 2020-12-14 20:16:49 +00:00
#
# 39 lines
# 1.3 KiB
# Python

from lxml.html.clean import clean_html, Cleaner
from bs4 import BeautifulSoup
class PostKeyError(KeyError):
    """Error for a post dict that lacks a key required for the Solr conversion.

    It derives from ``KeyError`` so that code which already handles
    ``KeyError`` also handles this error.  It has to be a class: a plain
    function returning ``None`` cannot be used in a ``raise`` statement.
    """
def _html_to_text(markup):
    """Return the text content of an HTML fragment on a single line.

    Falsy input (``None``, ``""``) yields ``""`` without invoking the parser.
    Every newline, carriage return and tab in the extracted text is replaced
    by one space.
    """
    if not markup:
        return ""
    text = BeautifulSoup(markup, features="lxml").get_text()
    for char in ("\n", "\r", "\t"):
        text = text.replace(char, " ")
    return text


def post_to_solr(p):
    """Convert a post dict, or a list of post dicts, into Solr documents.

    Args:
        p: A dict describing one post, or a list of such dicts.  A list is
            converted element by element and a list is returned.

    Returns:
        A dict with the keys ``id``, ``date_dt``, ``tags_tkn``, ``title_txt``
        and ``text_txt`` (or a list of such dicts).  ``text_txt`` is the text
        of ``body`` followed by a space and the text of the optional
        ``agenda_html`` entry, both stripped of HTML markup.

    Raises:
        AssertionError: If ``p`` is neither a dict nor a list.
        KeyError: If ``public_date``, ``tag_string`` or ``title`` is missing.
            A missing ``url`` or ``body`` is reported through
            ``PostKeyError``.
    """
    # A list is converted element by element.
    if isinstance(p, list):
        return [post_to_solr(post) for post in p]

    # Raised explicitly instead of using ``assert`` so the check is not
    # removed when Python runs with -O; the exception type and message are
    # the ones the former assert statement produced.
    if not isinstance(p, dict):
        raise AssertionError(
            "Argument for post_to_solr needs to be a dict or list of dicts"
        )

    for key in ("url", "body"):
        if key not in p:
            raise PostKeyError("Post needs to have key '%s'" % key)

    # Return the solr structure
    return {
        "id": p["url"],
        "date_dt": p["public_date"],
        "tags_tkn": p["tag_string"],
        "title_txt": p["title"],
        # "agenda_html" is optional; a missing or None value contributes "".
        "text_txt": _html_to_text(p["body"])
        + " "
        + _html_to_text(p.get("agenda_html")),
    }
def member_to_solr(m):
    """Convert a member dict, or a list of member dicts, into Solr documents.

    Args:
        m: A dict with the keys ``id``, ``nickname``, ``firstname``,
            ``surname`` and ``description``, or a list of such dicts.  A list
            is converted element by element and a list is returned.

    Returns:
        A dict with the keys ``id``, ``date_dt``, ``tags_tkn``, ``title_txt``
        and ``text_txt`` (or a list of such dicts).  ``id`` is
        ``"/member/"`` followed by the member id, and ``date_dt`` is always
        ``None``.

    Raises:
        KeyError: If one of the member keys listed above is missing.
    """
    # isinstance (rather than an exact type comparison) so list subclasses
    # are converted element by element instead of being indexed like a dict.
    if isinstance(m, list):
        return [member_to_solr(member) for member in m]

    return {
        "id": "/member/" + str(m["id"]),
        "date_dt": None,
        "tags_tkn": m["nickname"],
        "title_txt": m["firstname"] + " " + m["surname"] + "(" + m["nickname"] + ")",
        "text_txt": m["description"],
    }