upgrade scraping and searching

2023-05-14 18:15:10 +02:00
parent 38a428eb52
commit fc84fdf4f6
12 changed files with 239 additions and 120 deletions


@@ -20,6 +20,7 @@ def check_elastic_document(element):
"text": element["text"], "text": element["text"],
"title": element["title"], "title": element["title"],
#"source": get_source(post), #"source": get_source(post),
"prio": element.get("prio", 1000),
"url": element["url"], "url": element["url"],
"updated_at": str(element["updated_at"]) "updated_at": str(element["updated_at"])
} }

html_scrapy/guess_date.py (new file)

@@ -0,0 +1,32 @@
from datetime import datetime
from dateutil import parser as dateutil_parser  # aliased so it is not shadowed by parse() below
import locale
from contextlib import suppress


def parse(s):
    """Try a few locale/format combinations and return an ISO 8601 string,
    or the input unchanged if nothing matches."""
    with suppress(BaseException):
        date_format = '%H:%M, %d. %b. %Y'
        locale.setlocale(locale.LC_TIME, 'en_US')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        date_format = '%H:%M, %d. %b. %Y'
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        date_format = '%H:%M, %d. %B %Y'
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    with suppress(BaseException):
        # last resort: let dateutil guess, with day-first (German) ordering
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = dateutil_parser.parse(s, dayfirst=True)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    return s
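A quick way to exercise the new helper (a minimal sketch; the sample timestamp strings and the availability of the de_DE locale are assumptions, not part of the commit):

from html_scrapy import guess_date

# A German MediaWiki history timestamp, as shown on wiki.fet.at page histories.
print(guess_date.parse("18:15, 14. Mai 2023"))  # "2023-05-14T18:15:00.000000Z" if de_DE is installed
# Anything the formats cannot handle is passed through unchanged.
print(guess_date.parse("not a date"))           # "not a date"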


@@ -4,11 +4,11 @@ import scrapy
import bs4
import re
import getpass
-import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals
+from html_scrapy import guess_date


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
@@ -22,6 +22,7 @@ class FetWikiSpider(SitemapSpider):
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Passwort von FET USer andis: ")
+    login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
    pages = {}
    http_auth_domain = "wiki.fet.at"
    # rules = (
@@ -36,34 +37,27 @@ class FetWikiSpider(SitemapSpider):
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
-        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider

    def closed_handler(self, spider, reason):
-        print("closing !! Handler")
-        print(reason)
-        print("found %d elements in pages" % len(spider.pages))
-        print("found %d elements in pages" % len(self.pages))
+        with open("scraped_urls.log", "a+") as f:
            for id, p in spider.pages.items():
                try:
                    output = check_elastic_document(p)
                    print(f"pushing: %s" % output["url"])
                    push_to_index(output["url"], output)
+                    f.write(output["url"] + "\n")
                except AttributeError as e:
                    print(e)
                    print(f"Error occured at id: --%s--" % id)

-    # do stuff here
-    def callback_login_done(self, response):
+    def callback_login_step2(self, response):
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
-        print(f"\nlogin callback done %s\n" % h)
-        if h == "FET-Wiki":
+        if h == "FET-Wiki":  # Login successful
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
@@ -72,12 +66,11 @@ class FetWikiSpider(SitemapSpider):
print(f"\nerrorbox %s" % h.text) print(f"\nerrorbox %s" % h.text)
return return
def callback_login(self, response): def callback_login_step1(self, response):
print(f"\nStart Login:\n") print(f"\nStart Login:\n")
html = bs4.BeautifulSoup(response.body, "lxml") html = bs4.BeautifulSoup(response.body, "lxml")
token = html.find("input", {"name": "wpLoginToken"})["value"] token = html.find("input", {"name": "wpLoginToken"})["value"]
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
formdata = { formdata = {
"wpName": self.http_user, "wpName": self.http_user,
"wpPassword": self.http_pass, "wpPassword": self.http_pass,
@@ -88,22 +81,17 @@ class FetWikiSpider(SitemapSpider):
"title": "Spezial:Anmelden" "title": "Spezial:Anmelden"
} }
yield scrapy.FormRequest( yield scrapy.FormRequest(
login_url, self.login_url,
formdata=formdata, formdata=formdata,
#meta={"cookiejar": response.meta["cookiejar"]}, callback=self.callback_login_step2,
callback=self.callback_login_done,
) )
def start_requests(self): def start_requests(self):
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
#self.cookie_jar = CookieJar()
yield scrapy.Request( yield scrapy.Request(
"https://wiki.fet.at/index.php?title=Spezial:Anmelden", self.login_url,
callback=self.callback_login, callback=self.callback_login_step1,
) )
print(f"\nLogin done Processing Sitemap:\n")
def parse_history(self, response, id): def parse_history(self, response, id):
print(f"\n\n Parsing: %s\n" % response.url) print(f"\n\n Parsing: %s\n" % response.url)
html = bs4.BeautifulSoup(response.body, "lxml") html = bs4.BeautifulSoup(response.body, "lxml")
@@ -111,10 +99,9 @@ class FetWikiSpider(SitemapSpider):
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
-        d["published"] = created
-        d["updated_at"] = last
+        d["published"] = guess_date.parse(created)
+        d["updated_at"] = guess_date.parse(last)
        self.pages[id] = d
        return

    def parse_page(self, response):
@@ -122,10 +109,9 @@ class FetWikiSpider(SitemapSpider):
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        if title == "Anmeldung erforderlich":
-            print("login erforderlich")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
-                callback=self.callback_login,
+                callback=self.callback_login_step1,
            )
            print(f"Exiting Title : %s\n\n" % title)
            return


@@ -1,30 +1,22 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
-import pickle
import scrapy
-#import redis as redis
import bs4
import re
import getpass
-#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import pickle
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))
-    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
-    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
"title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""), "title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""),
"text": d.get("text",""), "text": d.get("text",""),
"raw": d.get("raw",""), "raw": d.get("raw",""),
"prio": 100,
"url": "legacy.fet.at/documents/" + str(d["id"]), "url": "legacy.fet.at/documents/" + str(d["id"]),
"published": d["created_at"], "published": d["created_at"],
"updated_at": d["updated_at"] "updated_at": d["updated_at"]


@@ -1,26 +1,31 @@
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi import Request
# from elasticsearch import Elasticsearch
import sys
import elastic_transport
from searching import es_search, es_query
import json
import yaml

app = FastAPI(debug=True)
templates = Jinja2Templates(directory="./httpdemo")
# app.mount("/", StaticFiles(directory="/"))


@app.get("/")
def serve_home(request: Request, q: str = ""):
    try:
        resp = es_search(q)
        query = es_query(q)
-        message = "found ...?"
-    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
+        message = f"found ...? %d" % len(resp["hits"]["hits"])
+    except (
+        elastic_transport.ConnectionError,
+        elastic_transport.ConnectionTimeout,
+    ) as e:
        print(e, sys.stderr)
        results = []
        resp = {}
@@ -30,5 +35,17 @@ def serve_home(request: Request, q: str=""):
        results = resp["hits"]["hits"]

    templates.env.filters["json"] = lambda x: yaml.dump(dict(x))
-    return templates.TemplateResponse("index.html", context={
-        "request": resp, "results": results, "message": message, "query": query})
+    return templates.TemplateResponse(
+        "index.html",
+        context={
+            "request": resp,
+            "results": results,
+            "message": message,
+            "query": query,
+        },
+    )
+
+
+@app.get("/test")  # route paths must start with "/"
+def test(request: Request):
+    return {"test": "test"}


@@ -9,8 +9,12 @@
    {% for r in results %}
    <li>
        <a href="https://{{r['_source']['url']}}">
            {{r["_source"]["title"]|safe }}
        </a>
+        {{r["_score"]|safe }}
+        {{r["sort"]|safe }}
+        <br /><br />
        {% for hh in r["highlight"] %}
            {% for h in r["highlight"][hh] %}
                {{ h |safe}} <br />


@@ -1,12 +0,0 @@
# https://pypi.org/project/facebook-page-scraper/
from facebook_page_scraper import Facebook_scraper
page_name = "fsmbtu"
posts_count = 20
browser = "chrome"
#proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
timeout = 120 #600 seconds
headless = True
meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
json_data = meta_ai.scrap_to_json()
print(json_data)


@@ -1,4 +0,0 @@
posts = [
{"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
{"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetzn Fliegen Komm vorbei und lassen wir gemeinsam die Fetzn Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
]

scraped_urls.log (new file)

@@ -0,0 +1,46 @@
wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
wiki.fet.at/index.php/How_to:_Start_@_FET
wiki.fet.at/index.php/Kino
wiki.fet.at/index.php/How_to:_Kellerputz
wiki.fet.at/index.php/Schinkenfleckerl
wiki.fet.at/index.php/Waffeln
wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
wiki.fet.at/index.php/Keller:_Bestandsaufnahme
wiki.fet.at/index.php/M%C3%B6rderspiel
wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
wiki.fet.at/index.php/TU-Phone
wiki.fet.at/index.php/Bierbaron
wiki.fet.at/index.php/ET-Crashkurs
wiki.fet.at/index.php/%C3%96H-Wahl
wiki.fet.at/index.php/How-To_Festl
wiki.fet.at/index.php/Bouldern
wiki.fet.at/index.php/Spritzerstand
wiki.fet.at/index.php/Beratung
wiki.fet.at/index.php/Email-Anleitung
wiki.fet.at/index.php/LDAP
wiki.fet.at/index.php/Wiki-Anleitung
wiki.fet.at/index.php/User_anlegen
wiki.fet.at/index.php/Mailinglisten
wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
wiki.fet.at/index.php/Schinkenfleckerl
wiki.fet.at/index.php/M%C3%B6rderspiel
wiki.fet.at/index.php/How_to:_Kellerputz
wiki.fet.at/index.php/Keller:_Bestandsaufnahme
wiki.fet.at/index.php/How_to:_Start_@_FET
wiki.fet.at/index.php/Kino
wiki.fet.at/index.php/Waffeln
wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
wiki.fet.at/index.php/TU-Phone
wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
wiki.fet.at/index.php/Bierbaron
wiki.fet.at/index.php/%C3%96H-Wahl
wiki.fet.at/index.php/Bouldern
wiki.fet.at/index.php/Spritzerstand
wiki.fet.at/index.php/ET-Crashkurs
wiki.fet.at/index.php/How-To_Festl
wiki.fet.at/index.php/Beratung
wiki.fet.at/index.php/Wiki-Anleitung
wiki.fet.at/index.php/User_anlegen
wiki.fet.at/index.php/Email-Anleitung
wiki.fet.at/index.php/LDAP
wiki.fet.at/index.php/Mailinglisten

search.html (new file)

@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html>
<head>
    <title>API Request Example</title>
</head>
<body>
    <form>
        <label for="input-field">Enter some text:</label>
        <input type="text" id="input-field" name="input-field">
        <button type="button" onclick="makeRequest()">Submit</button>
    </form>
    <div id="response-container"></div>
    <script>
        function makeRequest() {
            const inputField = document.getElementById('input-field');
            const inputValue = inputField.value;
            fetch(`https://api.example.com/?q=${inputValue}`)
                .then(response => response.json())
                .then(data => {
                    const responseContainer = document.getElementById('response-container');
                    const html = `
                        <h2>API Response:</h2>
                        <pre>${JSON.stringify(data, null, 2)}</pre>
                    `;
                    responseContainer.innerHTML = html;
                });
        }
    </script>
</body>
</html>
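The page above still sends its query to a placeholder host; to serve it from the FastAPI app it would need a route that returns raw JSON rather than the rendered template. A minimal sketch of such a route (the /api path and response shape are assumptions, not part of this commit):

# Hypothetical JSON endpoint for the static search page; would be added to the FastAPI app above.
@app.get("/api")
def api_search(q: str = ""):
    resp = es_search(q)
    return {"hits": resp["hits"]["hits"]}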


@@ -1,10 +1,8 @@
import os
from elasticsearch import Elasticsearch, helpers
import contextlib
import logging

ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
@@ -15,7 +13,9 @@ ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX","legacy")
@contextlib.contextmanager
def es_client():
    logging.debug(f"ELASIC HOST:%s" % ELASTIC_HOST)
-    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
+    client = Elasticsearch(
+        ELASTIC_HOST, verify_certs=False, basic_auth=("elastic", ELASTIC_PASSWORD)
+    )
    yield client
    client.close()
@@ -39,20 +39,43 @@ def es_highlight():
"text": {"fragment_size": 150}, "text": {"fragment_size": 150},
"title.ngrams": {}, "title.ngrams": {},
"text.ngrams": {"fragment_size": 150}, "text.ngrams": {"fragment_size": 150},
} }
} }
return highlight return highlight
sorting = {
"updated_at": {"order": "desc"},
"_score": {"order": "desc"},
"prio": {"order": "desc"},
}
def es_sorting():
return {
"_script": {
"type": "number",
"script": {
"lang": "painless",
"source": "Math.log10(1+doc['updated_at'].value.toInstant().toEpochMilli()/1000000000/100) + Math.log10(1+_score)/10 + Math.log10(1+doc['prio'].value/1000) ", # * Math.log10(1+) * Math.log10(doc['prio'].value/10)" #* doc['_score'].value
},
"order": "desc",
}
}
def es_search(query: str): def es_search(query: str):
with es_client() as client: with es_client() as client:
result = client.search( result = client.search(
index=ELASTIC_INDEX, index=ELASTIC_INDEX,
size=10, size=30,
query=es_query(query), query=es_query(query),
highlight = es_highlight() sort=es_sorting(),
highlight=es_highlight(),
) )
return result return result
# for hit in resp["hits"]["hits"]: # for hit in resp["hits"]["hits"]:
# print(hit) # print(hit)
@@ -60,10 +83,12 @@ if __name__ =="__main__":
    resp = es_search(ELASTIC_QUERY)
    logging.info(f"Found %d recorts in hits" % resp["hits"]["hits"])
    for hit in resp["hits"]["hits"]:
-        print(f"\n\n%s\n%s\n%s - %s" % (
+        print(
+            f"\n\n%s\n%s\n%s - %s"
+            % (
                hit.get("_source", {}).get("url", ""),
                hit.get("_source", {}).get("title", ""),
                " ".join(hit.get("highlight", {}).get("title", [""])),
-            " ".join(hit.get("highlight",{}).get("text",[""]))
-        ))
+                " ".join(hit.get("highlight", {}).get("text", [""])),
+            )
+        )
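For intuition, the new sort script combines recency, text relevance and the per-source prio value on a log scale. A rough Python mirror of what the painless expression computes per document (a sketch only; the rank() helper and the sample values are illustrative, not part of the repo):

import math
from datetime import datetime, timezone

def rank(updated_at: datetime, es_score: float, prio: int) -> float:
    # Same three terms as the painless script: recency, relevance and prio,
    # each compressed with log10 so no single field dominates outright.
    epoch_millis = updated_at.replace(tzinfo=timezone.utc).timestamp() * 1000
    return (
        math.log10(1 + epoch_millis / 1_000_000_000 / 100)
        + math.log10(1 + es_score) / 10
        + math.log10(1 + prio / 1000)
    )

# A recently edited wiki page (default prio 1000) should outrank an old
# legacy.fet.at document (prio 100) at equal text relevance.
print(rank(datetime(2023, 5, 1), 5.0, 1000) > rank(datetime(2015, 1, 1), 5.0, 100))  # True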


@@ -54,6 +54,7 @@ mapping = {
"url": { "type": "text", "index": False}, "url": { "type": "text", "index": False},
"published": {"type": "date", "format": "date_optional_time"}, "published": {"type": "date", "format": "date_optional_time"},
"updated_at": {"type": "date", "format": "date_optional_time"}, "updated_at": {"type": "date", "format": "date_optional_time"},
"prio": {"type": "integer"},
"raw": { "raw": {
"type": "text", "type": "text",
"index": False "index": False