upgrade scraping and searching
@@ -1,69 +1,94 @@
 import os
 from elasticsearch import Elasticsearch, helpers
 import contextlib
 import logging
 
-ELASTIC_HOST = os.environ.get("ELASTIC_HOST","http://localhost:9200")
-ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD","*l9qNGoojiCC4n9KcZhj")
-ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY","Anwesend")
-ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX","legacy")
+ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
+ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
+ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
+ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "legacy")
 
 
 # Connect to the client
 @contextlib.contextmanager
 def es_client():
-    logging.debug(f"ELASIC HOST:%s" % ELASTIC_HOST)
-    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
+    logging.debug("ELASTIC HOST: %s" % ELASTIC_HOST)
+    client = Elasticsearch(
+        ELASTIC_HOST, verify_certs=False, basic_auth=("elastic", ELASTIC_PASSWORD)
+    )
     yield client
     client.close()
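A minimal usage sketch for the context manager above (an illustration, not part of the commit; it assumes this module is importable and that an Elasticsearch instance is actually reachable at ELASTIC_HOST with the configured credentials):

with es_client() as client:
    # ping() returns True when the cluster accepts the connection and credentials
    print(client.ping())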
-def es_query(query:str):
-    query ={
-        "multi_match":{
-            "query": query,
-            "fields": ["title^20","title.ngrams^10","text^5","text.ngrams"],
-            "tie_breaker": 0.3
-            #"type": "most_fields"
-            }
-        }
-    return query
+def es_query(query: str):
+    query = {
+        "multi_match": {
+            "query": query,
+            "fields": ["title^20", "title.ngrams^10", "text^5", "text.ngrams"],
+            "tie_breaker": 0.3
+            # "type": "most_fields"
+        }
+    }
+    return query
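In the multi_match query above, the ^20/^10/^5 suffixes are per-field boost multipliers (title matches count far more than body text), and tie_breaker folds the scores of the non-best fields back into the result. A rough illustration of how a best_fields multi_match combines per-field scores (a sketch for intuition only; the numbers are made up):

def combined(field_scores, tie_breaker=0.3):
    # the best-scoring field wins, the other fields contribute a fraction of their score
    best = max(field_scores)
    return best + tie_breaker * (sum(field_scores) - best)

print(combined([8.0, 1.5]))  # 8.45: a strong title match plus a weak text match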
 def es_highlight():
-    highlight = {
-        "fields": {
-            "title": {},
-            "text": {"fragment_size" : 150},
-            "title.ngrams": {},
-            "text.ngrams": {"fragment_size" : 150},
-        }
-    }
-    return highlight
+    highlight = {
+        "fields": {
+            "title": {},
+            "text": {"fragment_size": 150},
+            "title.ngrams": {},
+            "text.ngrams": {"fragment_size": 150},
+        }
+    }
+    return highlight
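The highlight block asks Elasticsearch to return the matched fragments per field, with body-text fragments capped at 150 characters; by default matches come back wrapped in <em> tags. An illustrative hit, shaped the way the __main__ block further down consumes it (the field values are invented):

hit = {
    "_source": {"url": "https://example.org/page", "title": "Anwesenheitsliste"},
    "highlight": {
        "title": ["<em>Anwesend</em>heitsliste"],
        "text": ["... alle <em>Anwesend</em>en sind erfasst ..."],
    },
}
print(" ".join(hit["highlight"]["title"]))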
-def es_search(query:str):
-    with es_client() as client:
-        sorting = {
-            "updated_at": {"order": "desc"},
-            "_score": {"order": "desc"},
-            "prio": {"order": "desc"},
-        }
-        result = client.search(
-            index = ELASTIC_INDEX,
-            size=10,
-            query= es_query(query),
-            highlight = es_highlight()
-        )
-        return result
-#for hit in resp["hits"]["hits"]:
+def es_sorting():
+    return {
+        "_script": {
+            "type": "number",
+            "script": {
+                "lang": "painless",
+                "source": "Math.log10(1+doc['updated_at'].value.toInstant().toEpochMilli()/1000000000/100) + Math.log10(1+_score)/10 + Math.log10(1+doc['prio'].value/1000)",  # * Math.log10(1+) * Math.log10(doc['prio'].value/10)" #* doc['_score'].value
+            },
+            "order": "desc",
+        }
+    }
+
+
+def es_search(query: str):
+    with es_client() as client:
+        result = client.search(
+            index=ELASTIC_INDEX,
+            size=30,
+            query=es_query(query),
+            sort=es_sorting(),
+            highlight=es_highlight(),
+        )
+        return result
+
+
+# for hit in resp["hits"]["hits"]:
+#     print(hit)
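The painless script in es_sorting() replaces the plain field sort the old es_search() carried: instead of ordering by updated_at, then _score, then prio in sequence, it blends all three signals into one number. Re-expressed in Python for intuition (a sketch only, not used by the module; updated_at_ms is the document timestamp in epoch milliseconds, prio the document's priority field):

import math

def combined_sort_key(updated_at_ms: float, score: float, prio: float) -> float:
    # mirrors the painless source above: every signal is log-damped before being summed
    return (
        math.log10(1 + updated_at_ms / 1_000_000_000 / 100)
        + math.log10(1 + score) / 10
        + math.log10(1 + prio / 1000)
    )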
-if __name__ =="__main__":
-    resp = es_search(ELASTIC_QUERY)
-    logging.info(f"Found %d recorts in hits" % resp["hits"]["hits"])
-    for hit in resp["hits"]["hits"]:
-        print(f"\n\n%s\n%s\n%s - %s" % (
-            hit.get("_source",{}).get("url",""),
-            hit.get("_source",{}).get("title",""),
-            " ".join(hit.get("highlight",{}).get("title",[""])),
-            " ".join(hit.get("highlight",{}).get("text",[""]))
-        ))
+if __name__ == "__main__":
+    resp = es_search(ELASTIC_QUERY)
+    logging.info("Found %d records in hits" % len(resp["hits"]["hits"]))
+    for hit in resp["hits"]["hits"]:
+        print(
+            "\n\n%s\n%s\n%s - %s"
+            % (
+                hit.get("_source", {}).get("url", ""),
+                hit.get("_source", {}).get("title", ""),
+                " ".join(hit.get("highlight", {}).get("title", [""])),
+                " ".join(hit.get("highlight", {}).get("text", [""])),
+            )
+        )
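One practical note for running the module directly: nothing in it configures the logging module, so the logging.info() line above is swallowed by the root logger's default WARNING level and only the print() output appears. A minimal setup, if INFO output is wanted (an assumption about how one might run it, not part of the commit):

import logging

logging.basicConfig(level=logging.INFO)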