refactor major

This commit is contained in:
www
2020-12-02 00:04:39 +00:00
parent 58be5c779a
commit 894a99cdef
110 changed files with 20639 additions and 155 deletions

30
solrfet2020/convert.py Normal file
View File

@@ -0,0 +1,30 @@
from lxml.html.clean import clean_html, Cleaner
class PostKeyError(KeyError):
    """Raised when a post dict is missing a key that is required for conversion.

    Derives from KeyError so callers that already catch KeyError keep working.
    """
def post_to_solr(p):
    """Convert a post dict (or a list of post dicts) into the Solr structure.

    Args:
        p: A dict describing one post, or a list of such dicts. A list is
            converted element by element (recursively).

    Returns:
        For a dict: a new dict with the keys ``"id"`` (taken from
        ``p["slug"]``), ``"date_dt"`` (taken from ``p["public_date"]``) and
        ``"text_txt"`` (the cleaned ``"body"`` followed by the unmodified
        ``"agenda_html"``; either part counts as ``""`` when missing or
        falsy). For a list: a list of such dicts.

    Raises:
        TypeError: If ``p`` is neither a dict nor a list.
        PostKeyError: If ``p`` has no ``"url"`` or no ``"body"`` key.
        KeyError: If ``p`` has no ``"slug"`` or no ``"public_date"`` key;
            these are read with plain subscripting and are not part of the
            up-front key check.
    """
    # If a list is given, convert each element on its own.
    if isinstance(p, list):
        return [post_to_solr(item) for item in p]

    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if not isinstance(p, dict):
        raise TypeError(
            "Argument for post_to_solr needs to be a dict or list of dicts"
        )

    for k in ["url", "body"]:
        if k not in p:
            raise PostKeyError("Post needs to have key '%s'" % k)

    def get_text(html):
        """Return the cleaned markup for *html*, or "" when it is empty/None."""
        if not html:
            return ""
        # NOTE(review): 'p' appears in both allow_tags and remove_tags, and
        # lxml's Cleaner may reject allow_tags unless remove_unknown_tags is
        # False - confirm against the installed lxml version.
        c = Cleaner(allow_tags=['i', 'em', 'p'], remove_tags=['p', 'div'])
        # Newlines, carriage returns, tabs and backslashes become spaces
        # before the markup is cleaned.
        flattened = (
            html.replace("\n", " ")
            .replace("\r", " ")
            .replace("\t", " ")
            .replace("\\", " ")
        )
        return c.clean_html(flattened)

    # Return the solr structure
    return {
        "id": p["slug"],
        "date_dt": p["public_date"],
        "text_txt": (get_text(p.get("body", "")) or "")
        + (p.get("agenda_html", "") or ""),
    }