upgrade scraping and searching

2023-05-14 18:15:10 +02:00
parent 38a428eb52
commit fc84fdf4f6
12 changed files with 239 additions and 120 deletions


@@ -20,6 +20,7 @@ def check_elastic_document(element):
"text": element["text"], "text": element["text"],
"title": element["title"], "title": element["title"],
#"source": get_source(post), #"source": get_source(post),
"prio": element.get("prio", 1000),
"url": element["url"], "url": element["url"],
"updated_at": str(element["updated_at"]) "updated_at": str(element["updated_at"])
} }

html_scrapy/guess_date.py (new file)

@@ -0,0 +1,32 @@
from datetime import datetime
from dateutil import parser as dateutil_parser  # aliased so it is not shadowed by parse() below
import locale
from contextlib import suppress


def parse(s):
    """Try a few locale/format combinations and return an ISO 8601 string,
    or the input unchanged if nothing matches."""
    with suppress(BaseException):
        date_format = '%H:%M, %d. %b. %Y'
        locale.setlocale(locale.LC_TIME, 'en_US')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        date_format = '%H:%M, %d. %b. %Y'
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        date_format = '%H:%M, %d. %B %Y'
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        # last resort: let dateutil guess, with day-first (German) ordering
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = dateutil_parser.parse(s, dayfirst=True)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    return s
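A quick way to exercise the new helper (a minimal sketch; the sample timestamp strings and the availability of the de_DE locale are assumptions, not part of the commit):

from html_scrapy import guess_date

# A German MediaWiki history timestamp, as shown on wiki.fet.at page histories.
print(guess_date.parse("18:15, 14. Mai 2023"))  # "2023-05-14T18:15:00.000000Z" if de_DE is installed
# Anything the formats cannot handle is passed through unchanged.
print(guess_date.parse("not a date"))           # "not a date"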


@@ -4,11 +4,11 @@ import scrapy
import bs4
import re
import getpass
-import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals
+from html_scrapy import guess_date


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
@@ -22,6 +22,7 @@ class FetWikiSpider(SitemapSpider):
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Passwort von FET USer andis: ")
+    login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
    pages = {}
    http_auth_domain = "wiki.fet.at"
    # rules = (
@@ -36,34 +37,27 @@ class FetWikiSpider(SitemapSpider):
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
-        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider

    def closed_handler(self, spider, reason):
-        print("closing !! Handler")
-        print(reason)
-        print("found %d elements in pages" % len(spider.pages))
-        print("found %d elements in pages" % len(self.pages))
+        with open("scraped_urls.log", "a+") as f:
            for id, p in spider.pages.items():
                try:
                    output = check_elastic_document(p)
                    print(f"pushing: %s" % output["url"])
                    push_to_index(output["url"], output)
+                    f.write(output["url"] + "\n")
                except AttributeError as e:
                    print(e)
                    print(f"Error occured at id: --%s--" % id)

-    # do stuff here
-    def callback_login_done(self, response):
+    def callback_login_step2(self, response):
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
-        print(f"\nlogin callback done %s\n" % h)
-        if h == "FET-Wiki":
+        if h == "FET-Wiki":  # Login successful
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
@@ -72,12 +66,11 @@ class FetWikiSpider(SitemapSpider):
print(f"\nerrorbox %s" % h.text) print(f"\nerrorbox %s" % h.text)
return return
def callback_login(self, response): def callback_login_step1(self, response):
print(f"\nStart Login:\n") print(f"\nStart Login:\n")
html = bs4.BeautifulSoup(response.body, "lxml") html = bs4.BeautifulSoup(response.body, "lxml")
token = html.find("input", {"name": "wpLoginToken"})["value"] token = html.find("input", {"name": "wpLoginToken"})["value"]
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
formdata = { formdata = {
"wpName": self.http_user, "wpName": self.http_user,
"wpPassword": self.http_pass, "wpPassword": self.http_pass,
@@ -88,22 +81,17 @@ class FetWikiSpider(SitemapSpider):
"title": "Spezial:Anmelden" "title": "Spezial:Anmelden"
} }
yield scrapy.FormRequest( yield scrapy.FormRequest(
login_url, self.login_url,
formdata=formdata, formdata=formdata,
#meta={"cookiejar": response.meta["cookiejar"]}, callback=self.callback_login_step2,
callback=self.callback_login_done,
) )
def start_requests(self): def start_requests(self):
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
#self.cookie_jar = CookieJar()
yield scrapy.Request( yield scrapy.Request(
"https://wiki.fet.at/index.php?title=Spezial:Anmelden", self.login_url,
callback=self.callback_login, callback=self.callback_login_step1,
) )
print(f"\nLogin done Processing Sitemap:\n")
def parse_history(self, response, id): def parse_history(self, response, id):
print(f"\n\n Parsing: %s\n" % response.url) print(f"\n\n Parsing: %s\n" % response.url)
html = bs4.BeautifulSoup(response.body, "lxml") html = bs4.BeautifulSoup(response.body, "lxml")
@@ -111,10 +99,9 @@ class FetWikiSpider(SitemapSpider):
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
-        d["published"] = created
-        d["updated_at"] = last
+        d["published"] = guess_date.parse(created)
+        d["updated_at"] = guess_date.parse(last)
        self.pages[id] = d
        return

    def parse_page(self, response):
@@ -122,10 +109,9 @@ class FetWikiSpider(SitemapSpider):
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        if title == "Anmeldung erforderlich":
-            print("login erforderlich")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
-                callback=self.callback_login,
+                callback=self.callback_login_step1,
            )
            print(f"Exiting Title : %s\n\n" % title)
            return


@@ -1,30 +1,22 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
-import pickle
import scrapy
-#import redis as redis
import bs4
import re
import getpass
-#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import pickle
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))
-    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
-    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
"title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""), "title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""),
"text": d.get("text",""), "text": d.get("text",""),
"raw": d.get("raw",""), "raw": d.get("raw",""),
"prio": 100,
"url": "legacy.fet.at/documents/" + str(d["id"]), "url": "legacy.fet.at/documents/" + str(d["id"]),
"published": d["created_at"], "published": d["created_at"],
"updated_at": d["updated_at"] "updated_at": d["updated_at"]


@@ -1,26 +1,31 @@
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi import Request
# from elasticsearch import Elasticsearch
import sys
import elastic_transport
from searching import es_search, es_query
import json
import yaml

app = FastAPI(debug=True)
templates = Jinja2Templates(directory="./httpdemo")
# app.mount("/", StaticFiles(directory="/"))


@app.get("/")
def serve_home(request: Request, q: str = ""):
    try:
        resp = es_search(q)
        query = es_query(q)
-        message = "found ...?"
-    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
+        message = f"found ...? %d" % len(resp["hits"]["hits"])
+    except (
+        elastic_transport.ConnectionError,
+        elastic_transport.ConnectionTimeout,
+    ) as e:
        print(e, sys.stderr)
        results = []
        resp = {}
@@ -30,5 +35,17 @@ def serve_home(request: Request, q: str=""):
        results = resp["hits"]["hits"]

    templates.env.filters["json"] = lambda x: yaml.dump(dict(x))
-    return templates.TemplateResponse("index.html", context={
-        "request": resp, "results": results, "message": message, "query": query})
+    return templates.TemplateResponse(
+        "index.html",
+        context={
+            "request": resp,
+            "results": results,
+            "message": message,
+            "query": query,
+        },
+    )
+
+
+@app.get("/test")  # route paths must start with "/"
+def test(request: Request):
+    return {"test": "test"}


@@ -9,8 +9,12 @@
    {% for r in results %}
    <li>
        <a href="https://{{r['_source']['url']}}">
            {{r["_source"]["title"]|safe }}
        </a>
+        {{r["_score"]|safe }}
+        {{r["sort"]|safe }}
+        <br /><br />
        {% for hh in r["highlight"] %}
            {% for h in r["highlight"][hh] %}
                {{ h |safe}} <br />


@@ -1,12 +0,0 @@
# https://pypi.org/project/facebook-page-scraper/
from facebook_page_scraper import Facebook_scraper
page_name = "fsmbtu"
posts_count = 20
browser = "chrome"
#proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
timeout = 120 #600 seconds
headless = True
meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
json_data = meta_ai.scrap_to_json()
print(json_data)


@@ -1,4 +0,0 @@
posts = [
{"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
{"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetzn Fliegen Komm vorbei und lassen wir gemeinsam die Fetzn Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
]

scraped_urls.log (new file)

@@ -0,0 +1,46 @@
wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
wiki.fet.at/index.php/How_to:_Start_@_FET
wiki.fet.at/index.php/Kino
wiki.fet.at/index.php/How_to:_Kellerputz
wiki.fet.at/index.php/Schinkenfleckerl
wiki.fet.at/index.php/Waffeln
wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
wiki.fet.at/index.php/Keller:_Bestandsaufnahme
wiki.fet.at/index.php/M%C3%B6rderspiel
wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
wiki.fet.at/index.php/TU-Phone
wiki.fet.at/index.php/Bierbaron
wiki.fet.at/index.php/ET-Crashkurs
wiki.fet.at/index.php/%C3%96H-Wahl
wiki.fet.at/index.php/How-To_Festl
wiki.fet.at/index.php/Bouldern
wiki.fet.at/index.php/Spritzerstand
wiki.fet.at/index.php/Beratung
wiki.fet.at/index.php/Email-Anleitung
wiki.fet.at/index.php/LDAP
wiki.fet.at/index.php/Wiki-Anleitung
wiki.fet.at/index.php/User_anlegen
wiki.fet.at/index.php/Mailinglisten
wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
wiki.fet.at/index.php/Schinkenfleckerl
wiki.fet.at/index.php/M%C3%B6rderspiel
wiki.fet.at/index.php/How_to:_Kellerputz
wiki.fet.at/index.php/Keller:_Bestandsaufnahme
wiki.fet.at/index.php/How_to:_Start_@_FET
wiki.fet.at/index.php/Kino
wiki.fet.at/index.php/Waffeln
wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
wiki.fet.at/index.php/TU-Phone
wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
wiki.fet.at/index.php/Bierbaron
wiki.fet.at/index.php/%C3%96H-Wahl
wiki.fet.at/index.php/Bouldern
wiki.fet.at/index.php/Spritzerstand
wiki.fet.at/index.php/ET-Crashkurs
wiki.fet.at/index.php/How-To_Festl
wiki.fet.at/index.php/Beratung
wiki.fet.at/index.php/Wiki-Anleitung
wiki.fet.at/index.php/User_anlegen
wiki.fet.at/index.php/Email-Anleitung
wiki.fet.at/index.php/LDAP
wiki.fet.at/index.php/Mailinglisten

search.html (new file)

@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html>
<head>
    <title>API Request Example</title>
</head>
<body>
    <form>
        <label for="input-field">Enter some text:</label>
        <input type="text" id="input-field" name="input-field">
        <button type="button" onclick="makeRequest()">Submit</button>
    </form>
    <div id="response-container"></div>
    <script>
        function makeRequest() {
            const inputField = document.getElementById('input-field');
            const inputValue = inputField.value;
            fetch(`https://api.example.com/?q=${inputValue}`)
                .then(response => response.json())
                .then(data => {
                    const responseContainer = document.getElementById('response-container');
                    const html = `
                        <h2>API Response:</h2>
                        <pre>${JSON.stringify(data, null, 2)}</pre>
                    `;
                    responseContainer.innerHTML = html;
                });
        }
    </script>
</body>
</html>
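The page above still sends its query to a placeholder host; to serve it from the FastAPI app it would need a route that returns raw JSON rather than the rendered template. A minimal sketch of such a route (the /api path and response shape are assumptions, not part of this commit):

# Hypothetical JSON endpoint for the static search page; would be added to the FastAPI app above.
@app.get("/api")
def api_search(q: str = ""):
    resp = es_search(q)
    return {"hits": resp["hits"]["hits"]}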


@@ -1,10 +1,8 @@
import os
from elasticsearch import Elasticsearch, helpers
import contextlib
import logging

ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
@@ -15,7 +13,9 @@ ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX","legacy")
@contextlib.contextmanager
def es_client():
    logging.debug(f"ELASIC HOST:%s" % ELASTIC_HOST)
-    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
+    client = Elasticsearch(
+        ELASTIC_HOST, verify_certs=False, basic_auth=("elastic", ELASTIC_PASSWORD)
+    )
    yield client
    client.close()
@@ -39,20 +39,43 @@ def es_highlight():
"text": {"fragment_size": 150}, "text": {"fragment_size": 150},
"title.ngrams": {}, "title.ngrams": {},
"text.ngrams": {"fragment_size": 150}, "text.ngrams": {"fragment_size": 150},
} }
} }
return highlight return highlight
sorting = {
"updated_at": {"order": "desc"},
"_score": {"order": "desc"},
"prio": {"order": "desc"},
}
def es_sorting():
return {
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": "Math.log10(1+doc['updated_at'].value.toInstant().toEpochMilli()/1000000000/100) + Math.log10(1+_score)/10 + Math.log10(1+doc['prio'].value/1000) ", # * Math.log10(1+) * Math.log10(doc['prio'].value/10)" #* doc['_score'].value
},
"order": "desc",
}
}
def es_search(query: str): def es_search(query: str):
with es_client() as client: with es_client() as client:
result = client.search( result = client.search(
index=ELASTIC_INDEX, index=ELASTIC_INDEX,
size=10, size=30,
query=es_query(query), query=es_query(query),
highlight = es_highlight() sort=es_sorting(),
highlight=es_highlight(),
) )
return result return result
# for hit in resp["hits"]["hits"]: # for hit in resp["hits"]["hits"]:
# print(hit) # print(hit)
@@ -60,10 +83,12 @@ if __name__ =="__main__":
    resp = es_search(ELASTIC_QUERY)
    logging.info(f"Found %d recorts in hits" % resp["hits"]["hits"])
    for hit in resp["hits"]["hits"]:
-        print(f"\n\n%s\n%s\n%s - %s" % (
+        print(
+            f"\n\n%s\n%s\n%s - %s"
+            % (
                hit.get("_source", {}).get("url", ""),
                hit.get("_source", {}).get("title", ""),
                " ".join(hit.get("highlight", {}).get("title", [""])),
-            " ".join(hit.get("highlight",{}).get("text",[""]))
-        ))
+                " ".join(hit.get("highlight", {}).get("text", [""])),
+            )
+        )
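For intuition, the new sort script combines recency, text relevance and the per-source prio value on a log scale. A rough Python mirror of what the painless expression computes per document (a sketch only; the rank() helper and the sample values are illustrative, not part of the repo):

import math
from datetime import datetime, timezone

def rank(updated_at: datetime, es_score: float, prio: int) -> float:
    # Same three terms as the painless script: recency, relevance and prio,
    # each compressed with log10 so no single field dominates outright.
    epoch_millis = updated_at.replace(tzinfo=timezone.utc).timestamp() * 1000
    return (
        math.log10(1 + epoch_millis / 1_000_000_000 / 100)
        + math.log10(1 + es_score) / 10
        + math.log10(1 + prio / 1000)
    )

# A recently edited wiki page (default prio 1000) should outrank an old
# legacy.fet.at document (prio 100) at equal text relevance.
print(rank(datetime(2023, 5, 1), 5.0, 1000) > rank(datetime(2015, 1, 1), 5.0, 100))  # True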


@@ -54,6 +54,7 @@ mapping = {
"url": { "type": "text", "index": False}, "url": { "type": "text", "index": False},
"published": {"type": "date", "format": "date_optional_time"}, "published": {"type": "date", "format": "date_optional_time"},
"updated_at": {"type": "date", "format": "date_optional_time"}, "updated_at": {"type": "date", "format": "date_optional_time"},
"prio": {"type": "integer"},
"raw": { "raw": {
"type": "text", "type": "text",
"index": False "index": False