From 14f641ec085eef1260bd71415592b88b1c870d54 Mon Sep 17 00:00:00 2001 From: www Date: Mon, 14 Dec 2020 20:16:49 +0000 Subject: [PATCH] index member --- bot1/chats.py | 10 ++++++ fet2020api/djangoapi.py | 2 +- solrfet2020/__init__.py | 64 +++++++++------------------------- solrfet2020/convert.py | 19 +++++----- solrfet2020/solr_to_objects.py | 60 +++++++++++++++++++++++++++++++ solrfet2020/utils.py | 28 +++++++++++++++ 6 files changed, 126 insertions(+), 57 deletions(-) create mode 100644 solrfet2020/solr_to_objects.py create mode 100644 solrfet2020/utils.py diff --git a/bot1/chats.py b/bot1/chats.py index e87b240..9828e7b 100644 --- a/bot1/chats.py +++ b/bot1/chats.py @@ -179,6 +179,7 @@ class Chat(BaseChat): self.mode=txt self.reply_msg("Mode: %s" % txt) return True + elif cmd == "/debug": if not u.fet_user: self.reply_msg("bitte vorher /auth ausführen wenn du ein FET Mitglied bist") @@ -196,6 +197,7 @@ class Chat(BaseChat): return True self.workflows[get_from_id(update)]=CreatePostWorkflow(chat=self) return True + elif cmd == "/reindex": if not u.fet_user: self.reply_msg("bitte vorher /auth ausführen wenn du ein FET Mitglied bist") @@ -204,6 +206,14 @@ class Chat(BaseChat): solr.reindex() self.send_msg("Fertig mit dem neuen Index") return True + elif cmd == "/reindextest": + if not u.fet_user: + self.reply_msg("bitte vorher /auth ausführen wenn du ein FET Mitglied bist") + return True + self.reply_msg("Das kann ein bissl dauern...") + solr.reindextest() + self.send_msg("Fertig mit dem neuen Index") + return True elif cmd == "/auth": if u.fet_user: self.reply_msg("Du bist schon authentifiziert...") diff --git a/fet2020api/djangoapi.py b/fet2020api/djangoapi.py index 83ff9cb..bf35533 100644 --- a/fet2020api/djangoapi.py +++ b/fet2020api/djangoapi.py @@ -20,7 +20,7 @@ class django_crud_api(): if r is None: return None if len(r)>1: - raise LookupError("Mehr als ein Objekt von der API zurückgegeben") + raise LookupError("Mehr als ein Objekt von der API zurückgegeben filter: %s" % str(filter)) if len(r)==0: return None return r[0] diff --git a/solrfet2020/__init__.py b/solrfet2020/__init__.py index 72c85e7..79cedc2 100644 --- a/solrfet2020/__init__.py +++ b/solrfet2020/__init__.py @@ -1,74 +1,41 @@ from lxml.html.clean import clean_html, Cleaner import environ -import pysolr -from .convert import post_to_solr -from urllib.parse import urljoin,urlparse -from fet2020api import fet2020postapi, fet2020memberapi -import yaml + env=environ.Env( SOLR_HOST=(str,"http://localhost:8980"), TARGET=(str,"https://alpha.2020.fet.at") ) +import pysolr +from .convert import post_to_solr, member_to_solr +from urllib.parse import urljoin +from fet2020api import fet2020postapi, fet2020memberapi +import yaml +from .solr_to_objects import result_to_object fet=fet2020postapi(urljoin(env('TARGET'),"api/posts/")) fetmember=fet2020memberapi(urljoin(env('TARGET'),"api/members/")) -def search_post(text=""): - pass -def reindex(): - pass - -def replace_special(t): - return t.replace("\n","").replace("\r","").replace("\t","").replace("\\","") - -def strip_html(text): - if text=="": return "" - c=Cleaner( - allow_tags=['i','em','p'], - remove_tags=['p','div']) - if type(text) is list: - h="" - for item in text: - h=h+" "+strip_html(item)+";" - return h - return c.clean_html(replace_special(text))[5:-6] -def result_to_posts(result): - docs = result.docs - highlights =result.highlighting - posts = [fet.find_one({"slug": rr["id"].split("/")[2]}) for rr in docs ] - def create_text(p): - return "" + \ - p["title"]+ ": "+ \ - "(%s) " % p["public_date"] + \ - urljoin(env('TARGET'), - p["url"]).rstrip("/")+" "+\ - str(strip_html(highlights["/posts/"+p["slug"]].get("text_txt",""))) - def create_highlights(p): - print(highlights["/posts/"+p["slug"]]) - return str(strip_html(highlights["/posts/"+p["slug"]].get("text_txt",""))) - - for post in posts: - if post: - post["text"] = create_text(post) - post["highlights"]=create_highlights(post) - if post["url"]: post["url"]= urljoin(env('TARGET'),post["url"]).rstrip("/") - if post["imageurl"]:post["image"]=urljoin(env('TARGET'),urlparse(post["imageurl"]).path) - return posts - class SolrFet2020(): def __init__(self): self.solr=pysolr.Solr( urljoin(env('SOLR_HOST'),'/solr/core'), always_commit=True ) + def reindex(self): self.solr.delete(q='*:*') p=post_to_solr(fet.find({"slug":""})) self.solr.add(p) + m=member_to_solr(fetmember.find({"nickname":""})) # search all members + self.solr.add(m) + #print(m) + + def reindextest(self): m=fetmember.find({"nickname":""}) print(m) + def search(self,query): querystring="text_txt:*%s* title_txt:*%s*^2 tags_tkn:*%s*^2" % (query,query,query) @@ -78,6 +45,7 @@ class SolrFet2020(): 'hl.fl': '*', 'hl.maxAnalyzedChars': -1, 'hl.snippets': 100, }) - links = result_to_posts(r) + + links = result_to_object(r) #print(yaml.dump(r)) return links, r.hits \ No newline at end of file diff --git a/solrfet2020/convert.py b/solrfet2020/convert.py index 314268c..337554e 100644 --- a/solrfet2020/convert.py +++ b/solrfet2020/convert.py @@ -7,14 +7,6 @@ def post_to_solr(p): # if a list is given call for each element if type(p) is list: return [post_to_solr(o) for o in p] - def get_text(l): - if not l: - return "" - if len(l)<1: - return "" - c=Cleaner(allow_tags=['i','em'], remove_tags=['p','div','ul','li']) # - h=c.clean_html(l.replace("\n"," ").replace("\r"," ").replace("\t"," ").replace("\\"," ")).text_content() - return h def get_text2(l): if not l: return "" soup=BeautifulSoup(l,features="lxml") @@ -34,3 +26,14 @@ def post_to_solr(p): "title_txt": p["title"], "text_txt": (get_text2(p.get("body","")) or "")+" "+get_text2(p.get("agenda_html","") or "") } +def member_to_solr(m): + if type(m) is list: + return [member_to_solr(o) for o in m] + return { + "id": "/member/"+str(m["id"]), + "date_dt": None, + "tags_tkn": m["nickname"], + "title_txt": m["firstname"]+" "+m["surname"]+"("+m["nickname"]+")", + "text_txt": m["description"] + } + \ No newline at end of file diff --git a/solrfet2020/solr_to_objects.py b/solrfet2020/solr_to_objects.py new file mode 100644 index 0000000..f3f247b --- /dev/null +++ b/solrfet2020/solr_to_objects.py @@ -0,0 +1,60 @@ +from .utils import fet, fetmember +from .utils import strip_html +from urllib.parse import urljoin,urlparse +from . import env + +def pull_post(slug, o): + #docs = result.docs + highlights =o["highlights"] + post = fet.find_one({"slug": slug}) + + def create_text(p): + return "" + \ + p["title"]+ ": "+ \ + "(%s) " % p["public_date"] + \ + urljoin(env('TARGET'), + p["url"]).rstrip("/")+" "+\ + p["highlights"] + + def create_highlights(h): + return str(strip_html(h.get("text_txt",""))) + + if post: + post["typ"] = "posts" + post["highlights"]=create_highlights(o["highlights"]) + post["text"] = create_text(post) + if post["url"]: post["url"]= urljoin(env('TARGET'),post["url"]).rstrip("/") + if post["imageurl"]:post["image"]=urljoin(env('TARGET'),urlparse(post["imageurl"]).path) + return post + +def pull_member(id, o): + member = fetmember.get(id) + if not member: return None + member["url"]=urljoin(env('TARGET'), "/member/%s" % id) + member["text"]=""+member["firstname"]+" "+member["surname"]+""+" "+member["url"] + member["imageurl"]=member["image"] + member["title"] =member["firstname"]+" "+member["surname"] + return member + +pull_original={"posts": pull_post,"member": pull_member} + + +def result_to_object(result): + docs=result.docs + highlights =result.highlighting + #objects=[(rr["id"],rr["id"].split("/")[1],rr["id"].) for rr in docs] + + def split_id(url): + "Split the id into its parts // -> , " + return { + "id": url["id"] , + "typ": url["id"] .split("/")[1], + "term": url["id"] .split("/")[2] + } + + def doc_to_object(r): + o=split_id(r) + o["highlights"]=highlights.get(o["id"]) + o=pull_original[o["typ"]](o["term"],o) + return o + return [doc_to_object(d) for d in docs] \ No newline at end of file diff --git a/solrfet2020/utils.py b/solrfet2020/utils.py new file mode 100644 index 0000000..5a05eb3 --- /dev/null +++ b/solrfet2020/utils.py @@ -0,0 +1,28 @@ +from lxml.html.clean import clean_html, Cleaner +from fet2020api import fet2020postapi, fet2020memberapi +from urllib.parse import urljoin,urlparse +from . import env + + +fet=fet2020postapi(urljoin(env('TARGET'),"api/posts/")) +fetmember=fet2020memberapi(urljoin(env('TARGET'),"api/members/")) + + +def replace_special(t): + if type(t)is dict: raise TypeError("String needed git a %s"% str(type(t))) + return t.replace("\n","").replace("\r","").replace("\t","").replace("\\","") + +def strip_html(text): + if text=="": return "" + c=Cleaner( + allow_tags=['i','em','p'], + remove_tags=['p','div']) + if type(text) is list: + h="" + for item in text: + h=h+" "+strip_html(item)+";" + return h + print(text) + return c.clean_html(replace_special(text))[5:-6] + +