from lxml.html.clean import clean_html, Cleaner
from bs4 import BeautifulSoup


class PostKeyError(KeyError):
    """Raised when a post dict lacks a key required for Solr conversion.

    Subclasses KeyError so callers that already catch KeyError keep working.
    """


def _html_to_text(markup):
    """Return the text content of an HTML fragment on a single line.

    Falsy input (None, "") yields "". Every newline, carriage return and
    tab in the extracted text is replaced by one space character.
    """
    if not markup:
        return ""
    soup = BeautifulSoup(markup, features="lxml")
    text = soup.get_text()
    return text.replace("\n", " ").replace("\r", " ").replace("\t", " ")


def post_to_solr(p):
    """Convert a post dict (or a list of post dicts) into Solr documents.

    Args:
        p: A dict describing one post, or a list of such dicts. A list is
            converted element by element (nested lists are handled
            recursively).

    Returns:
        For a dict, a dict with the keys "id", "date_dt", "tags_tkn",
        "title_txt" and "text_txt". For a list, a list of such results.

    Raises:
        TypeError: If p is neither a dict nor a list.
        PostKeyError: If "url" or "body" is missing from the post.
        KeyError: If "public_date", "tag_string" or "title" is missing;
            these keys are read directly without a prior check.
    """
    # If a list is given, convert each element.
    if isinstance(p, list):
        return [post_to_solr(o) for o in p]

    if not isinstance(p, dict):
        raise TypeError("Argument for post_to_solr needs to be a dict or list of dicts")

    for k in ("url", "body"):
        if k not in p:
            raise PostKeyError("Post needs to have key '%s'" % k)

    # _html_to_text maps None/"" to "", so a missing or empty "agenda_html"
    # simply contributes nothing after the separating space.
    body_text = _html_to_text(p.get("body"))
    agenda_text = _html_to_text(p.get("agenda_html"))

    # Return the solr structure
    return {
        "id": p["url"],
        "date_dt": p["public_date"],
        "tags_tkn": p["tag_string"],
        "title_txt": p["title"],
        "text_txt": body_text + " " + agenda_text,
    }


def member_to_solr(m):
    """Convert a member dict (or a list of member dicts) into Solr documents.

    Args:
        m: A dict with the keys "id", "nickname", "firstname", "surname"
            and "description", or a list of such dicts.

    Returns:
        For a dict, a dict with the keys "id", "date_dt" (always None),
        "tags_tkn", "title_txt" and "text_txt". For a list, a list of such
        results.

    Raises:
        KeyError: If one of the keys read from the member is missing.
    """
    if isinstance(m, list):
        return [member_to_solr(o) for o in m]

    return {
        "id": "/member/" + str(m["id"]),
        "date_dt": None,
        "tags_tkn": m["nickname"],
        "title_txt": m["firstname"] + " " + m["surname"] + "(" + m["nickname"] + ")",
        "text_txt": m["description"],
    }