introduce search interface

This commit is contained in:
www
2020-12-07 09:57:30 +00:00
parent 894a99cdef
commit ad36db964e
21 changed files with 32004 additions and 52 deletions

View File

@@ -1,5 +1,5 @@
from lxml.html.clean import clean_html, Cleaner
from bs4 import BeautifulSoup
class PostKeyError(KeyError):
    """KeyError subclass for key failures specific to post data.

    This must be a class, not a function: only classes deriving from
    BaseException can be used with ``raise`` and ``except``. Because it
    subclasses KeyError, handlers written as ``except KeyError`` also catch it.

    NOTE(review): presumably raised when a post dict lacks a required key --
    confirm against the raise sites, which are not visible here.
    """
@@ -12,9 +12,13 @@ def post_to_solr(p):
return ""
if len(l)<1:
return ""
c=Cleaner(allow_tags=['i','em','p'], remove_tags=['p','div'])
h=c.clean_html(l.replace("\n"," ").replace("\r"," ").replace("\t"," ").replace("\\"," "))
c=Cleaner(allow_tags=['i','em'], remove_tags=['p','div','ul','li']) #
h=c.clean_html(l.replace("\n"," ").replace("\r"," ").replace("\t"," ").replace("\\"," ")).text_content()
return h
def get_text2(l):
    """Return the text content of the HTML markup *l* on a single line.

    Falsy input (``None``, ``""``) yields ``""`` without invoking the parser.
    Otherwise the markup is parsed with BeautifulSoup (lxml backend), its text
    is extracted, and every newline, carriage return and tab is replaced by a
    space.
    """
    if l:
        plain = BeautifulSoup(l, features="lxml").get_text()
        # Flatten line breaks and tabs so the result is one line of text.
        for ws in ("\n", "\r", "\t"):
            plain = plain.replace(ws, " ")
        return plain
    return ""
if type(p) is list:
return [post_to_solr(pp) for pp in p]
# Check Dict and keys
@@ -26,5 +30,5 @@ def post_to_solr(p):
return {
"id": p["slug"],
"date_dt": p["public_date"],
"text_txt": (get_text(p.get("body","")) or "")+(p.get("agenda_html","") or "")
"text_txt": (get_text2(p.get("body","")) or "")+" "+get_text2(p.get("agenda_html","") or "")
}