first_commit

2023-02-19 10:04:10 +01:00
commit ed610970cb
23 changed files with 11434 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
archiv/*
.pytest_cache
test_elastic
*.pyc
*.bak

16
Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM python:3.8
WORKDIR /srv
RUN pip3 install setuptools_scm
RUN pip3 install elasticsearch
RUN pip3 install scrapy
COPY requirements.txt .
RUN pip3 install -r requirements.txt
COPY scrapy.cfg .
COPY html_scrapy ./html_scrapy
COPY searching ./searching
CMD ls && scrapy crawl legacy_spider
# spider: html_scrapy/spiders/legacy_spider.py

9
Dockerfile_index Normal file
View File

@@ -0,0 +1,9 @@
FROM python:3.8
WORKDIR /srv
RUN pip3 install setuptools_scm
RUN pip3 install elasticsearch
COPY searching ./searching
COPY reset_index.py .
CMD python3 reset_index.py

2
build_docker Normal file
View File

@@ -0,0 +1,2 @@
docker build -t docker.fet.at/andis_scrapy:0.1.2 .
docker build -t docker.fet.at/andis_index:0.1.2 -f Dockerfile_index .

0
html_scrapy/__init__.py Normal file
View File

25
html_scrapy/elastic_publish.py Normal file
View File

@@ -0,0 +1,25 @@
import os
from elasticsearch import Elasticsearch
from searching import es_client, ELASTIC_INDEX


def push_to_index(id, element):
    """Validate the element and index it under the given id."""
    element = check_elastic_document(element)
    try:
        with es_client() as client:
            client.index(index=ELASTIC_INDEX, id=id, document=element)
    except Exception as e:
        print(e)


def check_elastic_document(element):
    """Ensure all required fields are present and normalise the document."""
    for e in ["url", "title", "text", "published", "updated_at"]:
        if e not in element:
            raise AttributeError(f"A {e} is needed for the Elastic Element")
    return {
        "published": str(element["published"]),
        "text": element["text"],
        "title": element["title"],
        # "source": get_source(post),
        "url": element["url"],
        "updated_at": str(element["updated_at"]),
    }

93
html_scrapy/settings.py Normal file
View File

@@ -0,0 +1,93 @@
# Scrapy settings for the html_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'html_scrapy'
SPIDER_MODULES = ['html_scrapy.spiders']
NEWSPIDER_MODULE = 'html_scrapy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 't (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 't.middlewares.TSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 't.middlewares.TDownloaderMiddleware': 543,
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 543
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 't.pipelines.TPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

123
html_scrapy/spiders/legacy_spider.py Normal file
View File

@@ -0,0 +1,123 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle
import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response type: %s, obj: %s" % (type(response), response))
    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Documents collected across callbacks, keyed by their numeric id.
documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]
    http_user = 'andis'
    http_pass = getpass.getpass("Password for FET user andis: ")
    #http_auth_domain = 'legacy.fet.at'
    rules = (
        Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']
    custom_settings = {
        'DEPTH_LIMIT': '1',
    }

    def fix_document_links(self, links):
        # Request the JSON representation of each document instead of the HTML page.
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def parse_document(self, response):
        global documents
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themen(self, response):
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themengruppe(self, response):
        global documents
        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')
        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print("found %d topics" % len(thema_tags))
        for thema in thema_tags:
            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print("\n\ncrawling topic %s - %s" % (t.get("id", "?"), t.get("title", "?")))
            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print("\nfound %d meetings for topic %s" % (len(meeting_tags), t["title"]))
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {'id': m_id,
                     'title': meeting.find("a").text
                     }
                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print("crawling meeting %s with %d documents" % (m.get("title"), len(docs)))
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if doc_id not in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t
            # Documents attached directly to the topic, outside of a meeting.
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if doc_id not in documents:
                    continue
                documents[doc_id]["thema"] = t
        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"]
            })
            push_to_index(output[k]["url"], output[k])
            print("Document added: %s" % output[k].get("title", ""))
        #print("\n\nDocuments" + json.dumps(output))
        return None

68
httpdemo/__init__.py Normal file
View File

@@ -0,0 +1,68 @@
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi import Request
from elasticsearch import Elasticsearch
import os
import sys
import elastic_transport

ELASTIC_HOST = "http://localhost:9200"
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")

client = Elasticsearch(ELASTIC_HOST, verify_certs=False)
app = FastAPI(debug=True)
templates = Jinja2Templates(directory="./httpdemo")
#app.mount("/", StaticFiles(directory="/"))

#@app.get("/")
#def read_root():
#    return {"Hello"}

# Default query/highlight bodies (the handler below builds its own query).
query = {
    "multi_match": {
        "query": ELASTIC_QUERY,
        "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
        "type": "most_fields"
    }
}
highlight = {
    "fields": {
        "title": {},
        "text": {}
    }
}


@app.get("/")
def serve_home(request: Request, q: str):
    # Wildcard variant, kept for reference; overridden by the match query below.
    query = {
        "bool": {
            "should": [{"wildcard": {"body": {"value": "*%s*" % q, "case_insensitive": True}}},
                       {"wildcard": {"title": {"value": "*%s*" % q, "case_insensitive": True}}}],
            "minimum_should_match": 1
        }}
    query = {
        "match": {
            "body": q
        }
    }
    try:
        resp = client.search(
            index="posts2",
            size=10,
            #analyze_wildcard=True,
            #q="sdf*",
            query=query
        )
    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
        print(e, file=sys.stderr)
        results = []
    else:
        results = resp["hits"]["hits"]
    return templates.TemplateResponse("index.html", context={"request": request, "results": results})

10
httpdemo/index.html Normal file
View File

@@ -0,0 +1,10 @@
<body>
<h1>Hello Index httpdemo</h1>
<pre>{{request}}</pre>
<ul>{% for r in results %}
    <li><a href="{{r['_source']['url']}}">
        {{r["_source"]["body"]|safe}}
    </a></li>
{% endfor %}
</ul>
</body>

10909
httpdemo/jquery.js vendored Normal file

File diff suppressed because it is too large.

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
beautifulsoup4
scrapy

4
reset_index.py Normal file
View File

@@ -0,0 +1,4 @@
from searching.index import reset_index
reset_index()

1
run Normal file
View File

@@ -0,0 +1 @@
docker run --rm -it --network elastic $(docker build -q .)

1
run_elastic Normal file
View File

@@ -0,0 +1 @@
docker run --name elastic --rm --net elastic -v esdata:/usr/share/elasticsearch/data -e ES_JAVA_OPTS="-Xms1g -Xmx1g" -e xpack.security.transport.ssl.enabled=false -e discovery.type=single-node -e ELASTIC_PASSWORD=*l9qNGoojiCC4n9KcZhj -p 9200:9200 -it docker.elastic.co/elasticsearch/elasticsearch:8.6.1

1
run_index Normal file
View File

@@ -0,0 +1 @@
docker run --rm -it -e ELASTIC_HOST=https://elastic:9200 --network elastic $(docker build -q -f Dockerfile_index .)

12
scrap_facebook.py Normal file
View File

@@ -0,0 +1,12 @@
# https://pypi.org/project/facebook-page-scraper/
from facebook_page_scraper import Facebook_scraper
page_name = "fsmbtu"
posts_count = 20
browser = "chrome"
#proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
timeout = 120  # timeout in seconds
headless = True
meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
json_data = meta_ai.scrap_to_json()
print(json_data)

4
scrap_facebook_data.py Normal file
View File

@@ -0,0 +1,4 @@
posts = [
{"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
{"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetzn Fliegen Komm vorbei und lassen wir gemeinsam die Fetzn Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
]

2
scrapy.cfg Normal file
View File

@@ -0,0 +1,2 @@
[settings]
default = html_scrapy.settings

57
searching/__init__.py Normal file
View File

@@ -0,0 +1,57 @@
import os
from elasticsearch import Elasticsearch, helpers
import contextlib

ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "legacy")


# Connect to the Elasticsearch client and close it again when done.
@contextlib.contextmanager
def es_client():
    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
    yield client
    client.close()


def es_query(query: str):
    return {
        "multi_match": {
            "query": query,
            "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
            "type": "most_fields"
        }
    }


def es_highlight():
    return {
        "fields": {
            "title": {},
            "text": {}
        }
    }


def es_search(query: str):
    with es_client() as client:
        result = client.search(
            index=ELASTIC_INDEX,
            size=10,
            query=es_query(query),
            highlight=es_highlight()
        )
    return result


#for hit in resp["hits"]["hits"]:
#    print(hit)

if __name__ == "__main__":
    resp = es_search(ELASTIC_QUERY)
    for hit in resp["hits"]["hits"]:
        print("\n\n%s\n%s\n%s - %s" % (
            hit.get("_source", {}).get("url", ""),
            hit.get("_source", {}).get("title", ""),
            " ".join(hit.get("highlight", {}).get("title", [""])),
            " ".join(hit.get("highlight", {}).get("text", [""]))
        ))

71
searching/index.py Normal file
View File

@@ -0,0 +1,71 @@
from . import ELASTIC_INDEX
from . import es_client
import elasticsearch

settings = {
    "index": {
        "max_ngram_diff": 3
    },
    "analysis": {
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "my_tokenizer",
                "filter": [
                    "lowercase",
                ]
            }
        },
        "tokenizer": {
            "my_tokenizer": {
                "type": "ngram",
                "min_gram": 3,
                "max_gram": 6,
                "token_chars": [
                    "letter",
                    "digit"
                ]
            }
        }
    }
}

mapping = {
    "properties": {
        "title": {
            "type": "text",
            "fields": {
                "ngrams": {
                    "type": "text",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "standard",
                }
            }
        },
        "text": {
            "type": "text",
            "fields": {
                "ngrams": {
                    "type": "text",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "standard",
                }
            }
        },
        "url": {"type": "text", "index": False},
        "published": {"type": "date", "format": "date_optional_time"},
        "updated_at": {"type": "date", "format": "date_optional_time"},
        "raw": {
            "type": "text",
            "index": False
        },
    }
}


def reset_index():
    # Drop the index if it exists, then recreate it with the ngram settings and mapping.
    with es_client() as client:
        try:
            client.indices.delete(index=ELASTIC_INDEX)
        except elasticsearch.NotFoundError:
            print("Index already removed")
        client.indices.create(index=ELASTIC_INDEX, settings=settings, mappings=mapping)

18
test_search.py Normal file
View File

@@ -0,0 +1,18 @@
import os
from elasticsearch import Elasticsearch, helpers
from searching import es_search, es_client

ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")

#for hit in resp["hits"]["hits"]:
#    print(hit)

if __name__ == "__main__":
    resp = es_search(ELASTIC_QUERY)
    for hit in resp["hits"]["hits"]:
        print("\n\n%s\n%s\n%s - %s" % (
            hit.get("_source", {}).get("url", ""),
            hit.get("_source", {}).get("title", ""),
            " ".join(hit.get("highlight", {}).get("title", [""])),
            " ".join(hit.get("highlight", {}).get("text", [""]))
        ))