refactor major

2020-12-02 00:04:39 +00:00
parent 58be5c779a
commit 894a99cdef
110 changed files with 20639 additions and 155 deletions
--- a/solrfet2020/init.py
+++ b/solrfet2020/init.py
@@ -0,0 +1,69 @@
+from lxml.html.clean import clean_html, Cleaner
+import environ
+import pysolr
+from .convert import post_to_solr
+from urllib.parse import urljoin
+from fet2020api import fet2020postapi
+import yaml
+# Settings read through environ.Env; each entry is declared as a
+# (cast, default) pair.
+env = environ.Env(
+    SOLR_HOST=(str, "http://localhost:8980"),
+    TARGET=(str, "https://alpha.2020.fet.at"),
+)
+
+
+# Module-wide API client pointed at "api/posts/" below TARGET; used by
+# result_to_posts and SolrFet2020.reindex.
+fet = fet2020postapi(urljoin(env('TARGET'), "api/posts/"))
+def search_post(text=""):
+    """Placeholder: accepts a search text, does nothing and returns None."""
+    return None
+def reindex():
+    """Placeholder: does nothing and returns None."""
+    return None
+
+def replace_special(t):
+    """Return *t* with newline, carriage return, tab and backslash removed."""
+    stripped = t
+    # Same characters, same order as a chained series of replace() calls.
+    for unwanted in ("\n", "\r", "\t", "\\"):
+        stripped = stripped.replace(unwanted, "")
+    return stripped
+
+def strip_html(text):
+    """Clean the markup of a string, or of every entry of a list.
+
+    A list is processed entry by entry (recursively); every cleaned entry
+    is rendered as " <entry>;" and the pieces are concatenated, so an empty
+    list yields "". Any other value is passed through replace_special and
+    the Cleaner, and the result is returned without its first five and last
+    six characters.
+    """
+    cleaner = Cleaner(
+        allow_tags=['i', 'em', 'p'],
+        remove_tags=['p', 'div'])
+    if type(text) is not list:
+        flattened = replace_special(text)
+        # NOTE(review): the slice presumably drops a wrapper element that
+        # clean_html puts around the fragment -- confirm against lxml.
+        return cleaner.clean_html(flattened)[5:-6]
+    return "".join(" " + strip_html(entry) + ";" for entry in text)
+
+
+def result_to_posts(result):
+    """Look up the post behind every search hit and attach a summary text.
+
+    For each entry of ``result.docs`` the post is fetched through
+    ``fet.find_one`` using the document's "id" as slug. Each fetched post
+    then gets a "text" entry made of its bold title, its public date in
+    parentheses, its URL joined onto TARGET (trailing "/" removed) and the
+    cleaned "text_txt" highlighting stored under the post's slug.
+
+    Returns the list of posts, modified in place.
+    """
+    hits = result.docs
+    snippets = result.highlighting
+    found = [fet.find_one({"slug": hit["id"]}) for hit in hits]
+
+    def describe(entry):
+        # Pieces are built in the order they appear in the final string.
+        heading = "<b>" + entry["title"] + "</b>: "
+        dated = "(%s) " % entry["public_date"]
+        link = urljoin(env('TARGET'), entry["url"]).rstrip("/")
+        excerpt = str(strip_html(snippets[entry["slug"]]["text_txt"]))
+        return heading + dated + link + "  " + excerpt
+
+    for entry in found:
+        entry["text"] = describe(entry)
+    return found
+class SolrFet2020:
+    """Wrapper around a pysolr client for indexing and searching posts."""
+
+    def __init__(self):
+        """Create the pysolr client for '/solr/core' on SOLR_HOST."""
+        core_url = urljoin(env('SOLR_HOST'), '/solr/core')
+        self.solr = pysolr.Solr(core_url, always_commit=True)
+
+    def reindex(self):
+        """Delete every indexed document, then add the converted posts."""
+        self.solr.delete(q='*:*')
+        # NOTE(review): presumably an empty slug filter matches all posts --
+        # confirm against the fet2020postapi client.
+        documents = post_to_solr(fet.find({"slug": ""}))
+        self.solr.add(documents)
+
+    def search(self, query):
+        """Search "text_txt" for *query* (wildcards on both sides).
+
+        Results are sorted by "date_dt" descending with highlighting turned
+        on. Returns a tuple of the posts produced by result_to_posts and
+        the hit count reported on the result.
+        """
+        highlight_options = {
+            'hl': 'true',
+            'hl.fragsize': 100,
+            'hl.fl': '*',
+            'hl.maxAnalyzedChars': -1,
+            'hl.snippets': 100,
+        }
+        # NOTE(review): query is interpolated into the Solr query unescaped.
+        response = self.solr.search(
+            "text_txt:*%s*" % query,
+            sort="date_dt desc",
+            **highlight_options)
+        matched_posts = result_to_posts(response)
+        return matched_posts, response.hits
--- a/solrfet2020/convert.py
+++ b/solrfet2020/convert.py
@@ -0,0 +1,30 @@
+from lxml.html.clean import clean_html, Cleaner
+
+class PostKeyError(KeyError):
+    """Raised when a post dict lacks a key that post_to_solr requires.
+
+    Declared as a class deriving from KeyError so that it can actually be
+    raised and is caught by ``except KeyError`` handlers.
+    """
+
+def post_to_solr(p):
+    """Convert a post dict, or a list of post dicts, into Solr documents.
+
+    A list is converted element by element. A single post must be a dict
+    (checked with an assert) and must contain the keys "url" and "body";
+    a missing one is reported through PostKeyError.
+
+    Returns a dict with "id" (the post's slug), "date_dt" (its public date)
+    and "text_txt" (the cleaned body followed by the agenda HTML, each
+    falling back to "" when missing or empty).
+    """
+    if type(p) is list:
+        return [post_to_solr(entry) for entry in p]
+
+    def cleaned_body(markup):
+        # Nothing to clean for a missing or empty body.
+        if not markup or len(markup) < 1:
+            return ""
+        cleaner = Cleaner(allow_tags=['i', 'em', 'p'], remove_tags=['p', 'div'])
+        flattened = markup
+        # Control characters and backslashes become single spaces.
+        for unwanted in ("\n", "\r", "\t", "\\"):
+            flattened = flattened.replace(unwanted, " ")
+        return cleaner.clean_html(flattened)
+
+    # Check dict and keys
+    assert type(p) is dict, "Argument for post_to_solr needs to be a dict or list of dicts"
+    for required in ("url", "body"):
+        if required not in p:
+            raise PostKeyError("Post needs to have key '%s'" % required)
+
+    # Build the Solr structure; lookups stay in id, date, text order.
+    slug = p["slug"]
+    published = p["public_date"]
+    body_text = cleaned_body(p.get("body", "")) or ""
+    agenda_text = p.get("agenda_html", "") or ""
+    return {
+        "id": slug,
+        "date_dt": published,
+        "text_txt": body_text + agenda_text,
+    }