introduce search interface

www
2020-12-07 09:57:30 +00:00
parent 894a99cdef
commit ad36db964e
21 changed files with 32004 additions and 52 deletions

View File

@@ -2,7 +2,7 @@ from lxml.html.clean import clean_html, Cleaner
 import environ
 import pysolr
 from .convert import post_to_solr
-from urllib.parse import urljoin
+from urllib.parse import urljoin,urlparse
 from fet2020api import fet2020postapi
 import yaml
 env=environ.Env(
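
Note for context: result_to_posts below reads per-slug snippets out of a highlights mapping. A minimal sketch of how the pysolr client imported above typically produces that mapping with standard Solr highlighting follows; the core URL, query string, and parameters are illustrative assumptions, not taken from this repo.

import pysolr

# Hypothetical core URL; only the pysolr import is confirmed by the diff.
solr = pysolr.Solr("http://localhost:8983/solr/fet2020", timeout=10)

# Ask Solr to highlight matches in the text_txt field.
results = solr.search("open call", **{"hl": "true", "hl.fl": "text_txt"})

# pysolr exposes the highlighting section keyed by document id (the slug here),
# e.g. {"some-post": {"text_txt": ["...<em>open call</em>..."]}}
highlights = results.highlighting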
@@ -43,9 +43,17 @@ def result_to_posts(result):
             urljoin(env('TARGET'),
             p["url"]).rstrip("/")+" "+\
             str(strip_html(highlights[p["slug"]]["text_txt"]))
+    def create_highlights(p):
+        return str(strip_html(highlights[p["slug"]]["text_txt"]))
     for post in posts:
-        post["text"] = create_text(post)
+        if post:
+            post["text"] = create_text(post)
+            post["highlights"]=create_highlights(post)
+            if post["url"]: post["url"]= urljoin(env('TARGET'),post["url"]).rstrip("/")
+            if post["image"]:post["image"]=urljoin(env('TARGET'),urlparse(post["image"]).path)
     return posts
 class SolrFet2020():
     def __init__(self):
         self.solr=pysolr.Solr(
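
A rough illustration of what the new URL handling above does, with made-up example values (the target string stands in for env('TARGET'), the public base URL posts should be served from):

from urllib.parse import urljoin, urlparse

target = "https://www.example.org"   # stand-in for env('TARGET')

# Post URLs: re-root relative paths on TARGET and drop the trailing slash.
urljoin(target, "/posts/open-call/").rstrip("/")
# -> 'https://www.example.org/posts/open-call'

# Image URLs: keep only the path component, so an image that the backend
# reports under its own host ends up served from TARGET instead.
urljoin(target, urlparse("http://backend.internal/media/banner.png").path)
# -> 'https://www.example.org/media/banner.png'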

View File

@@ -1,5 +1,5 @@
 from lxml.html.clean import clean_html, Cleaner
+from bs4 import BeautifulSoup
 class PostKeyError(KeyError):
     pass
@@ -12,9 +12,13 @@ def post_to_solr(p):
             return ""
         if len(l)<1:
             return ""
-        c=Cleaner(allow_tags=['i','em','p'], remove_tags=['p','div'])
-        h=c.clean_html(l.replace("\n"," ").replace("\r"," ").replace("\t"," ").replace("\\"," "))
+        c=Cleaner(allow_tags=['i','em'], remove_tags=['p','div','ul','li']) #
+        h=c.clean_html(l.replace("\n"," ").replace("\r"," ").replace("\t"," ").replace("\\"," ")).text_content()
         return h
+    def get_text2(l):
+        if not l: return ""
+        soup=BeautifulSoup(l,features="lxml")
+        return soup.get_text().replace("\n"," ").replace("\r"," ").replace("\t"," ")
     if type(p) is list:
         return [post_to_solr(pp) for pp in p]
     # Check Dict and keys
@@ -26,5 +30,5 @@ def post_to_solr(p):
     return {
         "id": p["slug"],
         "date_dt": p["public_date"],
-        "text_txt": (get_text(p.get("body","")) or "")+(p.get("agenda_html","") or "")
+        "text_txt": (get_text2(p.get("body","")) or "")+" "+get_text2(p.get("agenda_html","") or "")
     }
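
A small sketch of what the new get_text2 helper yields for a typical HTML body (the example markup is made up):

from bs4 import BeautifulSoup

body = "<div><p>Open call:\n<em>FET</em> proposals</p></div>"
soup = BeautifulSoup(body, features="lxml")
soup.get_text().replace("\n", " ").replace("\r", " ").replace("\t", " ")
# -> 'Open call: FET proposals'  (tags dropped, newlines/tabs flattened to spaces)

Compared with the Cleaner-based get_text above, which whitelists a few inline tags before flattening, get_text2 simply drops all markup and returns the text nodes, which is what the text_txt field now receives.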