upgrade scraping and searching

This commit is contained in:
2023-05-14 18:15:10 +02:00
parent 38a428eb52
commit fc84fdf4f6
12 changed files with 239 additions and 120 deletions

View File

@@ -1,69 +1,94 @@
import os
from elasticsearch import Elasticsearch, helpers
import contextlib
import logging
# Connection and query settings. Each value is read from the environment
# variable of the same name and falls back to the literal default given here.
ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
# NOTE(review): the fallback below is a credential committed to source control.
# It is kept so existing deployments keep working, but it should be rotated and
# the default removed once every environment sets ELASTIC_PASSWORD explicitly.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
# Search term used when this module is executed as a script.
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
# Name of the index that es_search() queries.
ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "legacy")
# Connect to the Elasticsearch cluster.
@contextlib.contextmanager
def es_client():
    """Yield an Elasticsearch client and close it when the ``with`` block ends.

    The client connects to ``ELASTIC_HOST`` and authenticates as the
    ``elastic`` user with ``ELASTIC_PASSWORD``.

    Yields:
        The connected ``Elasticsearch`` client instance.
    """
    # Lazy %-style arguments: the message is only formatted if DEBUG is enabled.
    logging.debug("ELASTIC HOST:%s", ELASTIC_HOST)
    # NOTE(review): verify_certs=False turns off TLS certificate verification.
    # Kept as-is to preserve behaviour; confirm this is intended outside of
    # local development.
    client = Elasticsearch(
        ELASTIC_HOST, verify_certs=False, basic_auth=("elastic", ELASTIC_PASSWORD)
    )
    try:
        yield client
    finally:
        # Runs even when the body of the ``with`` block raises, so the
        # underlying connections are always released.
        client.close()
def es_query(query: str) -> dict:
    """Build the ``multi_match`` query body for a search term.

    The term is matched against the title and text fields and their
    ``.ngrams`` sub-fields; the ``^N`` suffixes are per-field boosts, with
    the plain title weighted highest.

    Args:
        query: The search term entered by the user.

    Returns:
        A dict suitable for the ``query`` argument of ``Elasticsearch.search``.
    """
    return {
        "multi_match": {
            "query": query,
            "fields": ["title^20", "title.ngrams^10", "text^5", "text.ngrams"],
            "tie_breaker": 0.3,
            # Disabled alternative, "type" is left unset:
            # "type": "most_fields"
        }
    }
def es_highlight() -> dict:
    """Return the highlight configuration used for search requests.

    Highlighting is requested for the title and text fields and their
    ``.ngrams`` sub-fields. The two text variants set ``fragment_size`` to
    150; the title variants use an empty (default) configuration.

    Returns:
        A dict suitable for the ``highlight`` argument of
        ``Elasticsearch.search``.
    """
    return {
        "fields": {
            "title": {},
            "text": {"fragment_size": 150},
            "title.ngrams": {},
            "text.ngrams": {"fragment_size": 150},
        }
    }
def es_sorting() -> dict:
    """Return the script-based sort specification for search requests.

    Hits are ordered descending by a Painless script that adds three
    log10-damped terms built from the document's ``updated_at`` timestamp,
    the relevance ``_score`` and the document's ``prio`` field.

    Returns:
        A dict suitable for the ``sort`` argument of ``Elasticsearch.search``.
    """
    # NOTE(review): the divisions inside the script are integer divisions if
    # the operands are integral in Painless (e.g. prio/1000 would be 0 for
    # prio < 1000) - confirm the field types and whether that is intended.
    # The script text, including its trailing space, is unchanged.
    script_source = (
        "Math.log10(1+doc['updated_at'].value.toInstant().toEpochMilli()/1000000000/100)"
        " + Math.log10(1+_score)/10"
        " + Math.log10(1+doc['prio'].value/1000) "
    )
    return {
        "_script": {
            "type": "number",
            "script": {
                "lang": "painless",
                "source": script_source,
            },
            "order": "desc",
        }
    }
def es_search(query: str, size: int = 30):
    """Run a search for *query* against ``ELASTIC_INDEX``.

    Opens a client through ``es_client()``, then issues one search request
    built from ``es_query()``, ``es_sorting()`` and ``es_highlight()``.

    Args:
        query: The search term.
        size: Maximum number of hits to request. Defaults to 30.

    Returns:
        The response object returned by ``Elasticsearch.search``; the hits are
        available under ``response["hits"]["hits"]``.
    """
    with es_client() as client:
        return client.search(
            index=ELASTIC_INDEX,
            size=size,
            query=es_query(query),
            sort=es_sorting(),
            highlight=es_highlight(),
        )
if __name__ == "__main__":
    # Script entry point: search for ELASTIC_QUERY and print, for every hit,
    # its URL, its title and the highlighted title/text fragments.
    resp = es_search(ELASTIC_QUERY)
    hits = resp["hits"]["hits"]
    # %d needs a number; the hit list itself would raise TypeError here.
    logging.info("Found %d records in hits", len(hits))
    for hit in hits:
        source = hit.get("_source", {})
        highlight = hit.get("highlight", {})
        url = source.get("url", "")
        title = source.get("title", "")
        # Highlights are lists of fragments; join them into a single line.
        title_fragments = " ".join(highlight.get("title", [""]))
        text_fragments = " ".join(highlight.get("text", [""]))
        print(f"\n\n{url}\n{title}\n{title_fragments} - {text_fragments}")