From c42b80ec3f696dc30b40d09631ede45ece21dd4f Mon Sep 17 00:00:00 2001 From: www Date: Mon, 28 Dec 2020 09:29:46 +0000 Subject: [PATCH] format solrfet --- solrfet2020/__init__.py | 60 +++++++++++++++++++++++------------------ solrfet2020/convert.py | 36 +++++++++++++++---------- 2 files changed, 56 insertions(+), 40 deletions(-) diff --git a/solrfet2020/__init__.py b/solrfet2020/__init__.py index cce1871..c21658e 100644 --- a/solrfet2020/__init__.py +++ b/solrfet2020/__init__.py @@ -1,50 +1,58 @@ from lxml.html.clean import clean_html, Cleaner import environ +import settings -env=environ.Env( - SOLR_HOST=(str,"http://localhost:8980"), - TARGET=(str,"https://alpha.2020.fet.at") - ) +env = environ.Env( + SOLR_HOST=(str, "http://localhost:8980"), TARGET=(str, "https://alpha.2020.fet.at") +) import pysolr from .convert import post_to_solr, member_to_solr from urllib.parse import urljoin -from fet2020api import fet2020postapi, fet2020memberapi +from fet2020api import fet2020memberapi, fet2020api import yaml from .solr_to_objects import result_to_object -fet=fet2020postapi(urljoin(env('TARGET'),"api/posts/")) -fetmember=fet2020memberapi(urljoin(env('TARGET'),"api/members/")) +fet = fet2020api(urljoin(env("TARGET"), "api/posts/"), pk="slug") +fetmember = fet2020memberapi(urljoin(env("TARGET"), "api/members/")) -class SolrFet2020(): +class SolrFet2020: def __init__(self): - self.solr=pysolr.Solr( - urljoin(env('SOLR_HOST'),'/solr/core'), - always_commit=True - ) + self.solr = pysolr.Solr( + urljoin(env("SOLR_HOST"), "/solr/core"), always_commit=True + ) def reindex(self): - self.solr.delete(q='*:*') - p=post_to_solr(fet.find({"slug":""})) + self.solr.delete(q="*:*") + p = post_to_solr(fet.find({"slug": ""})) self.solr.add(p) - m=member_to_solr(fetmember.find({"nickname":""})) # search all members + m = member_to_solr(fetmember.find({"nickname": ""})) # search all members self.solr.add(m) def reindextest(self): - m=fetmember.find({"nickname":""}) + m = fetmember.find({"nickname": ""}) print(m) - def search(self,query): - querystring="text_txt:*%s* title_txt:*%s*^2 tags_tkn:*%s*^2" % (query,query,query) + def search(self, query): + querystring = "text_txt:*%s* title_txt:*%s*^2 tags_tkn:*%s*^2" % ( + query, + query, + query, + ) - r=self.solr.search(querystring,sort="score desc, date_dt desc",**{ - 'hl':'true', - 'hl.fragsize': 100, - 'hl.fl': '*', - 'hl.maxAnalyzedChars': -1, - 'hl.snippets': 100, }) + r = self.solr.search( + querystring, + sort="score desc, date_dt desc", + **{ + "hl": "true", + "hl.fragsize": 100, + "hl.fl": "*", + "hl.maxAnalyzedChars": -1, + "hl.snippets": 10, + } + ) links = result_to_object(r) - #print(yaml.dump(r)) - return links, r.hits \ No newline at end of file + # print(yaml.dump(r)) + return links, r.hits diff --git a/solrfet2020/convert.py b/solrfet2020/convert.py index 337554e..bf3d342 100644 --- a/solrfet2020/convert.py +++ b/solrfet2020/convert.py @@ -1,39 +1,47 @@ from lxml.html.clean import clean_html, Cleaner from bs4 import BeautifulSoup + + def PostKeyError(KeyError): pass + def post_to_solr(p): # if a list is given call for each element if type(p) is list: return [post_to_solr(o) for o in p] - def get_text2(l): - if not l: return "" - soup=BeautifulSoup(l,features="lxml") - return soup.get_text().replace("\n"," ").replace("\r"," ").replace("\t"," ") - if type(p) is list: - return [post_to_solr(pp) for pp in p] - # Check Dict and keys - assert type(p) is dict, "Argument for post_to_solr needs to be a dict or list of dicts" + if not type(p) is dict: + raise TypeError("Argument for post_to_solr needs to be a dict or list of dicts") for k in ["url", "body"]: - if not k in p: + if not k in p: raise PostKeyError("Post needs to have key '%s'" % k) + + def get_text2(l): + if not l: + return "" + soup = BeautifulSoup(l, features="lxml") + return soup.get_text().replace("\n", " ").replace("\r", " ").replace("\t", " ") + # Return the solr structure return { "id": p["url"], "date_dt": p["public_date"], "tags_tkn": p["tag_string"], "title_txt": p["title"], - "text_txt": (get_text2(p.get("body","")) or "")+" "+get_text2(p.get("agenda_html","") or "") + "text_txt": (get_text2(p.get("body", "")) or "") + + " " + + get_text2(p.get("agenda_html", "") or ""), } + + def member_to_solr(m): if type(m) is list: return [member_to_solr(o) for o in m] + return { - "id": "/member/"+str(m["id"]), + "id": "/member/" + str(m["id"]), "date_dt": None, "tags_tkn": m["nickname"], - "title_txt": m["firstname"]+" "+m["surname"]+"("+m["nickname"]+")", - "text_txt": m["description"] + "title_txt": m["firstname"] + " " + m["surname"] + "(" + m["nickname"] + ")", + "text_txt": m["description"], } - \ No newline at end of file