upgrade scraping and searching
@@ -20,6 +20,7 @@ def check_elastic_document(element):
         "text": element["text"],
         "title": element["title"],
         #"source": get_source(post),
+        "prio": element.get("prio", 1000),
         "url": element["url"],
         "updated_at": str(element["updated_at"])
     }
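check_elastic_document now carries a "prio" ranking field, defaulting to 1000 when the scraped element does not provide one; the LegacySpider hunk further down pins its documents to 100. A minimal illustration of that default, using made-up element dicts:

# Made-up elements, only to show how element.get("prio", 1000) plays out.
wiki_element = {"title": "Kino", "url": "wiki.fet.at/index.php/Kino"}
legacy_element = {"title": "Protokoll", "url": "legacy.fet.at/documents/42", "prio": 100}

print(wiki_element.get("prio", 1000))    # -> 1000: wiki pages get the default priority
print(legacy_element.get("prio", 1000))  # -> 100: legacy documents are deprioritised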
32  html_scrapy/guess_date.py  Normal file
@@ -0,0 +1,32 @@
+
+from datetime import datetime
+from dateutil.parser import parse
+import locale
+from contextlib import suppress
+
+def parse(s):
+    with suppress(BaseException):
+        date_format = '%H:%M, %d. %b. %Y'
+        locale.setlocale(locale.LC_TIME, 'en_US')
+        parsed_date = datetime.strptime(s, date_format)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    with suppress(BaseException):
+        date_format = '%H:%M, %d. %b. %Y'
+        locale.setlocale(locale.LC_TIME, 'de_DE')
+        parsed_date = datetime.strptime(s, date_format)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    with suppress(BaseException):
+        date_format = '%H:%M, %d. %B %Y'
+        locale.setlocale(locale.LC_TIME, 'de_DE')
+        parsed_date = datetime.strptime(s, date_format)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    with suppress(BaseException):
+        locale.setlocale(locale.LC_TIME, 'de_DE')
+        parsed_date = parse(s, dayfirst=True)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+
+    return s
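Note that the module-level def parse(s) shadows the from dateutil.parser import parse above it, so the last fallback calls the function itself with an unexpected dayfirst argument; the resulting TypeError is swallowed by suppress(BaseException) and the dateutil branch never parses anything. A sketch of a variant that keeps the same formats but lets the fallback actually run (the dateutil_parse alias and the formats list are my naming, not part of the commit):

# Sketch only, not the committed code: aliasing the dateutil import avoids the
# name collision with the module-level parse() defined below.
from datetime import datetime
from dateutil.parser import parse as dateutil_parse
import locale
from contextlib import suppress


def parse(s):
    formats = [
        ('en_US', '%H:%M, %d. %b. %Y'),
        ('de_DE', '%H:%M, %d. %b. %Y'),
        ('de_DE', '%H:%M, %d. %B %Y'),
    ]
    for loc, date_format in formats:
        with suppress(BaseException):
            locale.setlocale(locale.LC_TIME, loc)
            return datetime.strptime(s, date_format).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        # dateutil fallback now actually runs instead of recursing into parse()
        return dateutil_parse(s, dayfirst=True).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    return s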
@@ -4,11 +4,11 @@ import scrapy
 import bs4
 import re
 import getpass
-import urllib.parse
 from html_scrapy.elastic_publish import push_to_index, check_elastic_document
 from furl import furl
 from scrapy import signals
+from html_scrapy import guess_date
 
 def publish(response: scrapy.http.response.html.HtmlResponse):
     print("Response typ: %s, obj: %s" % (type(response), response))
     with open("scraped_urls.log", "a+") as f:
@@ -22,6 +22,7 @@ class FetWikiSpider(SitemapSpider):
     sitemap_rules = [("/", "parse_page")]
     http_user = "andis"
    http_pass = getpass.getpass("Passwort von FET USer andis: ")
+    login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
     pages = {}
     http_auth_domain = "wiki.fet.at"
     # rules = (
@@ -36,34 +37,27 @@ class FetWikiSpider(SitemapSpider):
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
         spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
-        print("connect signal")
         crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
         return spider
 
     def closed_handler(self, spider,reason):
-        print("closing !! Handler")
-        print(reason)
-        print("found %d elements in pages" % len(spider.pages))
-        print("found %d elements in pages" % len(self.pages))
+        with open("scraped_urls.log", "a+") as f:
             for id,p in spider.pages.items():
                 try:
                     output = check_elastic_document(p)
                     print(f"pushing: %s" % output["url"])
                     push_to_index(output["url"], output)
+                    f.write(output["url"] + "\n")
                 except AttributeError as e:
                     print(e)
                     print(f"Error occured at id: --%s--" %id)
 
-        # do stuff here
-
-    def callback_login_done(self, response):
-
+    def callback_login_step2(self, response):
         html = bs4.BeautifulSoup(response.body, "lxml")
         h = html.find("h1", {"id": "firstHeading"}).text
-        print(f"\nlogin callback done %s\n" % h)
-
-        if h == "FET-Wiki":
+        if h == "FET-Wiki": # Login successful
             for url in self.sitemap_urls:
                 yield scrapy.Request( url, self._parse_sitemap)
         else:
@@ -72,12 +66,11 @@ class FetWikiSpider(SitemapSpider):
             print(f"\nerrorbox %s" % h.text)
             return
 
-    def callback_login(self, response):
+    def callback_login_step1(self, response):
         print(f"\nStart Login:\n")
 
         html = bs4.BeautifulSoup(response.body, "lxml")
         token = html.find("input", {"name": "wpLoginToken"})["value"]
-        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
         formdata = {
             "wpName": self.http_user,
             "wpPassword": self.http_pass,
@@ -88,22 +81,17 @@ class FetWikiSpider(SitemapSpider):
             "title": "Spezial:Anmelden"
         }
         yield scrapy.FormRequest(
-            login_url,
+            self.login_url,
             formdata=formdata,
-            #meta={"cookiejar": response.meta["cookiejar"]},
-            callback=self.callback_login_done,
+            callback=self.callback_login_step2,
         )
 
     def start_requests(self):
-        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
-        #self.cookie_jar = CookieJar()
         yield scrapy.Request(
-            "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
-            callback=self.callback_login,
+            self.login_url,
+            callback=self.callback_login_step1,
         )
 
-        print(f"\nLogin done Processing Sitemap:\n")
-
     def parse_history(self, response, id):
         print(f"\n\n Parsing: %s\n" % response.url)
         html = bs4.BeautifulSoup(response.body, "lxml")
@@ -111,10 +99,9 @@ class FetWikiSpider(SitemapSpider):
         last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
         created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
         d = self.pages.get(id,{})
-        d["published"]= created
-        d["updated_at"]= last
+        d["published"]= guess_date.parse(created)
+        d["updated_at"]= guess_date.parse(last)
         self.pages[id]=d
 
         return
 
     def parse_page(self, response):
@@ -122,10 +109,9 @@ class FetWikiSpider(SitemapSpider):
         html = bs4.BeautifulSoup(response.body, "lxml")
         title = html.find("h1", {"id": "firstHeading"}).text.strip()
         if title == "Anmeldung erforderlich":
-            print("login erforderlich")
             yield scrapy.Request(
                 "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
-                callback=self.callback_login,
+                callback=self.callback_login_step1,
             )
             print(f"Exiting Title : %s\n\n" % title)
             return
@@ -1,30 +1,22 @@
 from scrapy.spiders import SitemapSpider,CrawlSpider, Rule, Spider
 from scrapy.linkextractors import LinkExtractor
-import pickle
 import scrapy
-#import redis as redis
 import bs4
 import re
 import getpass
-#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
 import pickle
 import json
 from html_scrapy.elastic_publish import push_to_index, check_elastic_document
 
 def publish(response: scrapy.http.response.html.HtmlResponse):
     print("Response typ: %s, obj: %s" % (type(response),response))
-
-    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
-    # r.set(response.url, response.body)
-
     with open("scraped_urls.log", "a+") as f:
         f.write(response.url+"\n")
 
 documents={}
 
 
 
 class LegacySpider(CrawlSpider):
     name = 'legacy_spider'
     allowed_domains = ['legacy.fet.at']
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
             "title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""),
             "text": d.get("text",""),
             "raw": d.get("raw",""),
+            "prio": 100,
             "url": "legacy.fet.at/documents/" + str(d["id"]),
             "published": d["created_at"],
             "updated_at": d["updated_at"]
@@ -1,26 +1,31 @@
 
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 from fastapi import Request
 
 # from elasticsearch import Elasticsearch
 import sys
 import elastic_transport
 from searching import es_search, es_query
 import json
 import yaml
 
 app = FastAPI(debug=True)
 
 templates = Jinja2Templates(directory="./httpdemo")
 # app.mount("/", StaticFiles(directory="/"))
 
 
 @app.get("/")
 def serve_home(request: Request, q: str = ""):
     try:
         resp = es_search(q)
         query = es_query(q)
-        message = "found ...?"
-    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
+        message = f"found ...? %d" % len(resp["hits"]["hits"])
+    except (
+        elastic_transport.ConnectionError,
+        elastic_transport.ConnectionTimeout,
+    ) as e:
         print(e, sys.stderr)
         results = []
         resp = {}
@@ -30,5 +35,17 @@ def serve_home(request: Request, q: str=""):
         results = resp["hits"]["hits"]
 
     templates.env.filters["json"] = lambda x: yaml.dump(dict(x))
-    return templates.TemplateResponse("index.html", context= {
-        "request": resp,"results": results,"message": message, "query": query})
+    return templates.TemplateResponse(
+        "index.html",
+        context={
+            "request": resp,
+            "results": results,
+            "message": message,
+            "query": query,
+        },
+    )
+
+
+@app.get("test")
+def test(request: Request):
+    return {"test": "test"}
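The reworked handler now reports the hit count and renders the results template. A quick way to exercise it against a locally running instance (module path and port are assumptions, adjust to the actual layout); note that @app.get("test") is registered without the leading slash FastAPI normally expects for paths like "/test":

# Hypothetical smoke test, assuming the app was started with something like:
#   uvicorn httpdemo.main:app --reload
import requests

resp = requests.get("http://localhost:8000/", params={"q": "Anwesend"})
print(resp.status_code)              # expect 200 when Elasticsearch is reachable
print(resp.headers["content-type"])  # HTML rendered from the httpdemo template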
@@ -9,8 +9,12 @@
     {% for r in results %}
     <li>
         <a href="https://{{r['_source']['url']}}">
+
             {{r["_source"]["title"]|safe }}
         </a>
+        {{r["_score"]|safe }}
+        {{r["sort"]|safe }}
+        <br /><br />
         {% for hh in r["highlight"] %}
             {% for h in r["highlight"][hh] %}
                 {{ h |safe}} <br />
@@ -1,12 +0,0 @@
-# https://pypi.org/project/facebook-page-scraper/
-from facebook_page_scraper import Facebook_scraper
-page_name = "fsmbtu"
-posts_count = 20
-browser = "chrome"
-#proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
-timeout = 120 #600 seconds
-headless = True
-
-meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
-json_data = meta_ai.scrap_to_json()
-print(json_data)
@@ -1,4 +0,0 @@
-posts = [
-    {"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
-    {"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetz’n Fliegen Komm vorbei und lassen wir gemeinsam die Fetz‘n Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
-]
46  scraped_urls.log  Normal file
@@ -0,0 +1,46 @@
+wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
+wiki.fet.at/index.php/How_to:_Start_@_FET
+wiki.fet.at/index.php/Kino
+wiki.fet.at/index.php/How_to:_Kellerputz
+wiki.fet.at/index.php/Schinkenfleckerl
+wiki.fet.at/index.php/Waffeln
+wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
+wiki.fet.at/index.php/Keller:_Bestandsaufnahme
+wiki.fet.at/index.php/M%C3%B6rderspiel
+wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
+wiki.fet.at/index.php/TU-Phone
+wiki.fet.at/index.php/Bierbaron
+wiki.fet.at/index.php/ET-Crashkurs
+wiki.fet.at/index.php/%C3%96H-Wahl
+wiki.fet.at/index.php/How-To_Festl
+wiki.fet.at/index.php/Bouldern
+wiki.fet.at/index.php/Spritzerstand
+wiki.fet.at/index.php/Beratung
+wiki.fet.at/index.php/Email-Anleitung
+wiki.fet.at/index.php/LDAP
+wiki.fet.at/index.php/Wiki-Anleitung
+wiki.fet.at/index.php/User_anlegen
+wiki.fet.at/index.php/Mailinglisten
+wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
+wiki.fet.at/index.php/Schinkenfleckerl
+wiki.fet.at/index.php/M%C3%B6rderspiel
+wiki.fet.at/index.php/How_to:_Kellerputz
+wiki.fet.at/index.php/Keller:_Bestandsaufnahme
+wiki.fet.at/index.php/How_to:_Start_@_FET
+wiki.fet.at/index.php/Kino
+wiki.fet.at/index.php/Waffeln
+wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
+wiki.fet.at/index.php/TU-Phone
+wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
+wiki.fet.at/index.php/Bierbaron
+wiki.fet.at/index.php/%C3%96H-Wahl
+wiki.fet.at/index.php/Bouldern
+wiki.fet.at/index.php/Spritzerstand
+wiki.fet.at/index.php/ET-Crashkurs
+wiki.fet.at/index.php/How-To_Festl
+wiki.fet.at/index.php/Beratung
+wiki.fet.at/index.php/Wiki-Anleitung
+wiki.fet.at/index.php/User_anlegen
+wiki.fet.at/index.php/Email-Anleitung
+wiki.fet.at/index.php/LDAP
+wiki.fet.at/index.php/Mailinglisten
30  search.html  Normal file
@@ -0,0 +1,30 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>API Request Example</title>
+</head>
+<body>
+    <form>
+        <label for="input-field">Enter some text:</label>
+        <input type="text" id="input-field" name="input-field">
+        <button type="button" onclick="makeRequest()">Submit</button>
+    </form>
+    <div id="response-container"></div>
+    <script>
+        function makeRequest() {
+            const inputField = document.getElementById('input-field');
+            const inputValue = inputField.value;
+            fetch(`https://api.example.com/?q=${inputValue}`)
+                .then(response => response.json())
+                .then(data => {
+                    const responseContainer = document.getElementById('response-container');
+                    const html = `
+                        <h2>API Response:</h2>
+                        <pre>${JSON.stringify(data, null, 2)}</pre>
+                    `;
+                    responseContainer.innerHTML = html;
+                });
+        }
+    </script>
+</body>
+</html>
@@ -1,10 +1,8 @@
-
-
 import os
 from elasticsearch import Elasticsearch, helpers
 import contextlib
 import logging
 
 ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
 ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
 ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
@@ -15,7 +13,9 @@ ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX","legacy")
 @contextlib.contextmanager
 def es_client():
     logging.debug(f"ELASIC HOST:%s" % ELASTIC_HOST)
-    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
+    client = Elasticsearch(
+        ELASTIC_HOST, verify_certs=False, basic_auth=("elastic", ELASTIC_PASSWORD)
+    )
     yield client
     client.close()
 
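es_client() wraps the Elasticsearch client in a context manager so callers never leave the transport open; es_search below uses it the same way. A minimal usage sketch, assuming the environment variables above point at a reachable cluster:

# Illustrative only: requires ELASTIC_HOST / ELASTIC_PASSWORD to reach a live cluster.
from searching import es_client

with es_client() as client:
    info = client.info()               # basic connectivity check
    print(info["version"]["number"])   # Elasticsearch server version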
@@ -39,20 +39,43 @@ def es_highlight():
             "text": {"fragment_size": 150},
             "title.ngrams": {},
             "text.ngrams": {"fragment_size": 150},
 
         }
     }
     return highlight
 
 
+sorting = {
+    "updated_at": {"order": "desc"},
+    "_score": {"order": "desc"},
+    "prio": {"order": "desc"},
+}
+
+
+def es_sorting():
+    return {
+        "_script": {
+            "type": "number",
+            "script": {
+                "lang": "painless",
+                "source": "Math.log10(1+doc['updated_at'].value.toInstant().toEpochMilli()/1000000000/100) + Math.log10(1+_score)/10 + Math.log10(1+doc['prio'].value/1000) ", # * Math.log10(1+) * Math.log10(doc['prio'].value/10)" #* doc['_score'].value
+            },
+            "order": "desc",
+        }
+    }
+
+
 def es_search(query: str):
     with es_client() as client:
         result = client.search(
             index=ELASTIC_INDEX,
-            size=10,
+            size=30,
             query=es_query(query),
-            highlight = es_highlight()
+            sort=es_sorting(),
+            highlight=es_highlight(),
         )
     return result
 
 
 # for hit in resp["hits"]["hits"]:
 #     print(hit)
 
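es_sorting() replaces plain field sorting with a script sort that blends recency, text relevance, and the new prio field (the sorting dict above it appears to be left unused). A rough Python rendition of the same formula, for intuition only; the function and argument names here are mine, and Elasticsearch evaluates the real script per document at query time:

# Rough local equivalent of the painless sort script.
import math
from datetime import datetime, timezone

def rank(updated_at: str, score: float, prio: int) -> float:
    epoch_ms = datetime.fromisoformat(updated_at).replace(tzinfo=timezone.utc).timestamp() * 1000
    recency = math.log10(1 + epoch_ms / 1_000_000_000 / 100)   # grows slowly with newer updated_at
    relevance = math.log10(1 + score) / 10                     # dampened full-text score
    priority = math.log10(1 + prio / 1000)                     # boost from the "prio" field
    return recency + relevance + priority

print(rank("2022-10-13T12:09:00", score=4.2, prio=1000))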
@@ -60,10 +83,12 @@ if __name__ =="__main__":
     resp = es_search(ELASTIC_QUERY)
     logging.info(f"Found %d recorts in hits" % resp["hits"]["hits"])
     for hit in resp["hits"]["hits"]:
-        print(f"\n\n%s\n%s\n%s - %s" % (
+        print(
+            f"\n\n%s\n%s\n%s - %s"
+            % (
                 hit.get("_source", {}).get("url", ""),
                 hit.get("_source", {}).get("title", ""),
                 " ".join(hit.get("highlight", {}).get("title", [""])),
-            " ".join(hit.get("highlight",{}).get("text",[""]))
-        ))
+                " ".join(hit.get("highlight", {}).get("text", [""])),
+            )
+        )
@@ -54,6 +54,7 @@ mapping = {
         "url": { "type": "text", "index": False},
         "published": {"type": "date", "format": "date_optional_time"},
         "updated_at": {"type": "date", "format": "date_optional_time"},
+        "prio": {"type": "integer"},
         "raw": {
             "type": "text",
             "index": False
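Because the sort script reads doc['prio'].value, the field needs to exist in the index mapping before searches use the new sorting. For an index that already exists, the integer field can be added in place; a sketch with the elasticsearch-py client (the calling code and the one-off invocation are assumptions, not part of the commit):

# Assumption: run once against the existing index so the mapping gains "prio".
import os
from elasticsearch import Elasticsearch

client = Elasticsearch(
    os.environ.get("ELASTIC_HOST", "http://localhost:9200"),
    verify_certs=False,
    basic_auth=("elastic", os.environ["ELASTIC_PASSWORD"]),
)
client.indices.put_mapping(
    index=os.environ.get("ELASTIC_INDEX", "legacy"),
    properties={"prio": {"type": "integer"}},
)
client.close()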