from lxml.html.clean import clean_html, Cleaner


class PostKeyError(KeyError):
    """Raised when a post dict lacks a key that post_to_solr requires."""


# Keys a post must carry. "url" and "body" were already checked by earlier
# revisions; "slug" and "public_date" are read unconditionally when the Solr
# document is built, so they are validated up front as well.
_REQUIRED_POST_KEYS = ("url", "body", "slug", "public_date")

# Shared HTML cleaner for post bodies.
# remove_unknown_tags=False is needed because lxml refuses (ValueError at
# clean time) to combine allow_tags with remove_unknown_tags, which
# defaults to True.
_BODY_CLEANER = Cleaner(
    allow_tags=['i', 'em', 'p'],
    remove_tags=['p', 'div'],
    remove_unknown_tags=False,
)


def _get_text(html):
    """Return the cleaned markup of *html*, or "" when *html* is empty/None.

    Newlines, carriage returns, tabs and backslashes are each replaced by a
    single space before the markup is passed through the shared cleaner.
    """
    if not html:
        return ""
    flattened = (
        html.replace("\n", " ")
        .replace("\r", " ")
        .replace("\t", " ")
        .replace("\\", " ")
    )
    return _BODY_CLEANER.clean_html(flattened)


def post_to_solr(p):
    """Convert a post dict (or a list of them) into a Solr document dict.

    Args:
        p: A dict describing one post, or a list of such dicts. A list is
            converted element by element (nested lists are handled
            recursively).

    Returns:
        For a dict: a dict with the keys "id" (the post's "slug"),
        "date_dt" (the post's "public_date") and "text_txt" (the cleaned
        "body" followed by the raw "agenda_html", if any).
        For a list: a list of such dicts in the same order.

    Raises:
        AssertionError: If *p* is neither a dict nor a list.
        PostKeyError: If a required key is missing from the post. This is a
            KeyError subclass, so ``except KeyError`` still catches it.
    """
    # If a list is given, convert each element.
    if isinstance(p, list):
        return [post_to_solr(item) for item in p]

    # Raised explicitly (rather than via `assert`) so the check survives
    # `python -O`, while keeping the exception type callers already see.
    if not isinstance(p, dict):
        raise AssertionError(
            "Argument for post_to_solr needs to be a dict or list of dicts"
        )

    for k in _REQUIRED_POST_KEYS:
        if k not in p:
            raise PostKeyError("Post needs to have key '%s'" % k)

    # Return the solr structure. `or ""` guards against None values so the
    # concatenation never fails on a missing/empty part.
    body_text = _get_text(p.get("body", "")) or ""
    agenda_html = p.get("agenda_html", "") or ""
    return {
        "id": p["slug"],
        "date_dt": p["public_date"],
        "text_txt": body_text + agenda_html,
    }