first_commit

5 .gitignore vendored Normal file
@@ -0,0 +1,5 @@
archiv/*
.pytest_cache
test_elastic
*.pyc
*.bak

16 Dockerfile Normal file
@@ -0,0 +1,16 @@
FROM python:3.8

WORKDIR /srv

RUN pip3 install setuptools_scm

RUN pip3 install elasticsearch
RUN pip3 install scrapy
COPY requirements.txt .
RUN pip3 install -r requirements.txt
COPY scrapy.cfg .

COPY html_scrapy ./html_scrapy
COPY searching ./searching
CMD ls && scrapy crawl legacy_spider
#spider html_scrapy/spiders/legacy_spider.py

9 Dockerfile_index Normal file
@@ -0,0 +1,9 @@
FROM python:3.8
WORKDIR /srv

RUN pip3 install setuptools_scm
RUN pip3 install elasticsearch

COPY searching ./searching
COPY reset_index.py .
CMD python3 reset_index.py

2 build_docker Normal file
@@ -0,0 +1,2 @@
docker build -t docker.fet.at/andis_scrapy:0.1.2 .
docker build -t docker.fet.at/andis_index:0.1.2 -f Dockerfile_index .

0 html_scrapy/__init__.py Normal file

25 html_scrapy/elastic_publish.py Normal file
@@ -0,0 +1,25 @@
import os
from elasticsearch import Elasticsearch
from searching import es_client, ELASTIC_INDEX


def push_to_index(id, element):
    element = check_elastic_document(element)
    try:
        with es_client() as client:
            client.index(index=ELASTIC_INDEX, id=id, document=element)
    except Exception as e:
        print(e)


def check_elastic_document(element):

    for e in ["url", "title", "text", "published", "updated_at"]:
        if not e in element:
            raise AttributeError("A %s is needed for the Elastic Element" % e)

    return {"published": str(element["published"]),
            "text": element["text"],
            "title": element["title"],
            #"source": get_source(post),
            "url": element["url"],
            "updated_at": str(element["updated_at"])
            }
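
For orientation, a short usage sketch of push_to_index (hypothetical, not part of the commit: the id and field values below are invented; the required keys mirror check_elastic_document above):

# Hypothetical example only: index one document with the fields that
# check_elastic_document() requires (url, title, text, published, updated_at).
from datetime import datetime
from html_scrapy.elastic_publish import push_to_index

doc = {
    "url": "legacy.fet.at/documents/123",    # invented document URL
    "title": "Protokoll - Beispiel",         # invented title
    "text": "Anwesend: ...",                 # plain-text body
    "published": datetime(2022, 10, 1),
    "updated_at": datetime(2022, 10, 2),
}
push_to_index(doc["url"], doc)   # uses es_client() and ELASTIC_INDEX from the searching package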

93 html_scrapy/settings.py Normal file
@@ -0,0 +1,93 @@
# Scrapy settings for t project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'html_scrapy'

SPIDER_MODULES = ['html_scrapy.spiders']
NEWSPIDER_MODULE = 'html_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 't (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    't.middlewares.TSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
#    't.middlewares.TDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 543
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    't.pipelines.TPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

0 html_scrapy/spiders/__init__.py Normal file

123 html_scrapy/spiders/legacy_spider.py Normal file
@@ -0,0 +1,123 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle
import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))

    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)

    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Module-level store of scraped documents, keyed by id; filled by parse_document()
# and enriched/pushed by parse_themengruppe().
documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]
    http_user = 'andis'
    http_pass = getpass.getpass("Passwort von FET USer andis: ")
    #http_auth_domain = 'legacy.fet.at'
    rules = (
        Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']

    custom_settings = {
        'DEPTH_LIMIT': '1',
    }

    def fix_document_links(self, links):
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def parse_document(self, response):
        global documents
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themen(self, response):
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themengruppe(self, response):
        global documents

        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')

        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print("found %d topics" % len(thema_tags))
        for thema in thema_tags:

            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print("\n\ncrawling topic %s - %s" % (t.get("id", "?"), t.get("title", "?")))
            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print("\nfound %d meetings for topic %s" % (len(meeting_tags), t["title"]))
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {'id': m_id,
                     'title': meeting.find("a").text
                     }

                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print("crawling meeting %s with %d documents" % (m.get("title"), len(docs)))
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if not doc_id in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if not doc_id in documents:
                    continue
                documents[doc_id]["thema"] = t

        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"]
            })
            push_to_index(output[k]["url"], output[k])

            print("Document added: %s" % output[k].get("title", ""))
        #print("\n\nDocuments" + json.dumps(output))
        return None
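
As an aside, the spider above can also be started from plain Python instead of `scrapy crawl legacy_spider` inside the container (a sketch only, not part of the commit; it assumes the project's scrapy.cfg is in the working directory so the settings are picked up):

# Hypothetical debugging helper, not part of the commit.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from html_scrapy.spiders.legacy_spider import LegacySpider

# Loads html_scrapy/settings.py via scrapy.cfg in the current directory.
process = CrawlerProcess(get_project_settings())
process.crawl(LegacySpider)
process.start()   # blocks until the crawl finishes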

68 httpdemo/__init__.py Normal file
@@ -0,0 +1,68 @@

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi import Request
from elasticsearch import Elasticsearch
import sys
import os
import elastic_transport

ELASTIC_HOST = "http://localhost:9200"
# Default mirrors ELASTIC_QUERY in searching/__init__.py.
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
client = Elasticsearch(ELASTIC_HOST, verify_certs=False)

app = FastAPI(debug=True)

templates = Jinja2Templates(directory="./httpdemo")
#app.mount("/", StaticFiles(directory="/"))


#@app.get("/")
#def read_root():
#    return {"Hello"}


query = {
    "multi_match": {
        "query": ELASTIC_QUERY,
        "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
        "type": "most_fields"
    }
}
highlight = {
    "fields": {
        "title": {},
        "text": {}
    }
}


@app.get("/")
def serve_home(request: Request, q: str):
    query = {
        "bool": {
            "should": [{"wildcard": {"body": {"value": "*%s*" % q, "case_insensitive": True}}},
                       {"wildcard": {"title": {"value": "*%s*" % q, "case_insensitive": True}}}],
            "minimum_should_match": 1
        }}
    # The wildcard query above is immediately replaced by a simple match query.
    query = {
        "match": {
            "body": q
        }
    }

    try:
        resp = client.search(
            index="posts2",
            size=10,
            #analyze_wildcard=True,
            #q="sdf*",
            query=query
        )
    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
        print(e, file=sys.stderr)
        results = []
    else:
        results = resp["hits"]["hits"]
    return templates.TemplateResponse("index.html", context={"request": request, "results": results})
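
A possible way to serve this demo app locally (a sketch only: uvicorn is assumed as the ASGI server here and is not pinned in requirements.txt or any Dockerfile of this commit; Elasticsearch is expected on localhost:9200):

# run_httpdemo.py - hypothetical helper, not part of the commit
import uvicorn

if __name__ == "__main__":
    # Serves the FastAPI `app` defined in httpdemo/__init__.py.
    uvicorn.run("httpdemo:app", host="127.0.0.1", port=8000, reload=True)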

10 httpdemo/index.html Normal file
@@ -0,0 +1,10 @@
<body>
<h1>Hello Index httpdemo</h1>
<p><pre>{{request}}</pre></p>
<ul>{% for r in results %}
<li><a href="{{r['url']}}">
{{r["body"]|safe}}
</a></li>
{% endfor %}
</ul>
</body>

10909 httpdemo/jquery.js vendored Normal file
File diff suppressed because it is too large

3 requirements.txt Normal file
@@ -0,0 +1,3 @@
beautifulsoup4
scrapy

4 reset_index.py Normal file
@@ -0,0 +1,4 @@
from searching.index import reset_index


reset_index()

1 run Normal file
@@ -0,0 +1 @@
docker run --rm -it --network elastic $(docker build -q .)

1 run_elastic Normal file
@@ -0,0 +1 @@
docker run --name elastic --rm --net elastic -v esdata:/usr/share/elasticsearch/data -e ES_JAVA_OPTS="-Xms1g -Xmx1g" -e xpack.security.transport.ssl.enabled=false -e discovery.type=single-node -e ELASTIC_PASSWORD=*l9qNGoojiCC4n9KcZhj -p 9200:9200 -it docker.elastic.co/elasticsearch/elasticsearch:8.6.1

1 run_index Normal file
@@ -0,0 +1 @@
docker run --rm -it -e ELASTIC_HOST=https://elastic:9200 --network elastic $(docker build -q -f Dockerfile_index .)

12 scrap_facebook.py Normal file
@@ -0,0 +1,12 @@
# https://pypi.org/project/facebook-page-scraper/
from facebook_page_scraper import Facebook_scraper
page_name = "fsmbtu"
posts_count = 20
browser = "chrome"
#proxy = "IP:PORT"  # if the proxy requires authentication: user:password@IP:PORT
timeout = 120  # seconds
headless = True

meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
json_data = meta_ai.scrap_to_json()
print(json_data)

4 scrap_facebook_data.py Normal file
@@ -0,0 +1,4 @@
posts = [
{"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
{"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetz’n Fliegen Komm vorbei und lassen wir gemeinsam die Fetz‘n Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
]

2 scrapy.cfg Normal file
@@ -0,0 +1,2 @@
[settings]
default = html_scrapy.settings

57 searching/__init__.py Normal file
@@ -0,0 +1,57 @@

import os
from elasticsearch import Elasticsearch, helpers
import contextlib

ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "legacy")


# Connect to the Elasticsearch client
@contextlib.contextmanager
def es_client():
    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
    yield client
    client.close()


def es_query(query: str):
    query = {
        "multi_match": {
            "query": query,
            "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
            "type": "most_fields"
        }
    }
    return query


def es_highlight():
    highlight = {
        "fields": {
            "title": {},
            "text": {}
        }
    }
    return highlight


def es_search(query: str):
    with es_client() as client:
        result = client.search(
            index=ELASTIC_INDEX,
            size=10,
            query=es_query(query),
            highlight=es_highlight()
        )
        return result


#for hit in resp["hits"]["hits"]:
#    print(hit)

if __name__ == "__main__":
    resp = es_search(ELASTIC_QUERY)
    for hit in resp["hits"]["hits"]:
        print("\n\n%s\n%s\n%s - %s" % (
            hit.get("_source", {}).get("url", ""),
            hit.get("_source", {}).get("title", ""),
            " ".join(hit.get("highlight", {}).get("title", [""])),
            " ".join(hit.get("highlight", {}).get("text", [""]))
        ))

71 searching/index.py Normal file
@@ -0,0 +1,71 @@
from . import ELASTIC_INDEX
from . import es_client
import elasticsearch


settings = {
    "index": {
        "max_ngram_diff": 3
    },
    "analysis": {
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "my_tokenizer",
                "filter": [
                    "lowercase",
                ]
            }
        },
        "tokenizer": {
            "my_tokenizer": {
                "type": "ngram",
                "min_gram": 3,
                "max_gram": 6,
                "token_chars": [
                    "letter",
                    "digit"
                ]
            }
        }
    }
}

mapping = {
    "properties": {
        "title": {
            "type": "text",
            "fields": {
                "ngrams": {
                    "type": "text",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "standard",
                }
            }
        },
        "text": {
            "type": "text",
            "fields": {
                "ngrams": {
                    "type": "text",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "standard",
                }
            }
        },
        "url": {"type": "text", "index": False},
        "published": {"type": "date", "format": "date_optional_time"},
        "updated_at": {"type": "date", "format": "date_optional_time"},
        "raw": {
            "type": "text",
            "index": False
        },
    }
}


def reset_index():
    with es_client() as client:
        try:
            client.indices.delete(index=ELASTIC_INDEX)
        except elasticsearch.NotFoundError:
            print("Index already removed")

        client.indices.create(index=ELASTIC_INDEX, settings=settings, mappings=mapping)
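
A quick, optional sanity check of the ngram analyzer configured above (a sketch only, not part of the commit; it assumes the Elasticsearch container from run_elastic is reachable and the index has been created with reset_index):

# Hypothetical check, not part of the commit.
from searching import es_client, ELASTIC_INDEX
from searching.index import reset_index

reset_index()
with es_client() as client:
    resp = client.indices.analyze(index=ELASTIC_INDEX, analyzer="my_analyzer", text="Anwesend")
    # Should print the lowercased 3- to 6-character ngrams produced by my_tokenizer.
    print([t["token"] for t in resp["tokens"]])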

18 test_search.py Normal file
@@ -0,0 +1,18 @@

import os
from elasticsearch import Elasticsearch, helpers
from searching import es_search, es_client

ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")

#for hit in resp["hits"]["hits"]:
#    print(hit)

if __name__ == "__main__":
    resp = es_search(ELASTIC_QUERY)
    for hit in resp["hits"]["hits"]:
        print("\n\n%s\n%s\n%s - %s" % (
            hit.get("_source", {}).get("url", ""),
            hit.get("_source", {}).get("title", ""),
            " ".join(hit.get("highlight", {}).get("title", [""])),
            " ".join(hit.get("highlight", {}).get("text", [""]))
        ))