diff --git a/html_scrapy/elastic_publish.py b/html_scrapy/elastic_publish.py
index b27760d..2974fed 100644
--- a/html_scrapy/elastic_publish.py
+++ b/html_scrapy/elastic_publish.py
@@ -20,6 +20,7 @@ def check_elastic_document(element):
         "text": element["text"],
         "title": element["title"],
         #"source": get_source(post),
+        "prio": element.get("prio", 1000),
         "url": element["url"],
         "updated_at": str(element["updated_at"])
     }
\ No newline at end of file
diff --git a/html_scrapy/guess_date.py b/html_scrapy/guess_date.py
new file mode 100644
index 0000000..909c7a2
--- /dev/null
+++ b/html_scrapy/guess_date.py
@@ -0,0 +1,35 @@
+from datetime import datetime
+from dateutil.parser import parse as dateutil_parse
+import locale
+from contextlib import suppress
+
+
+def parse(s):
+    # Try the known wiki timestamp formats first. suppress(Exception) rather
+    # than suppress(BaseException), so KeyboardInterrupt/SystemExit propagate.
+    with suppress(Exception):
+        date_format = '%H:%M, %d. %b. %Y'
+        locale.setlocale(locale.LC_TIME, 'en_US')
+        parsed_date = datetime.strptime(s, date_format)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    with suppress(Exception):
+        date_format = '%H:%M, %d. %b. %Y'
+        locale.setlocale(locale.LC_TIME, 'de_DE')
+        parsed_date = datetime.strptime(s, date_format)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    with suppress(Exception):
+        date_format = '%H:%M, %d. %B %Y'
+        locale.setlocale(locale.LC_TIME, 'de_DE')
+        parsed_date = datetime.strptime(s, date_format)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    # Last resort: dateutil's parser. The import is aliased so this call does
+    # not recurse into this function, which shadows the name "parse".
+    with suppress(Exception):
+        locale.setlocale(locale.LC_TIME, 'de_DE')
+        parsed_date = dateutil_parse(s, dayfirst=True)
+        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
+
+    return s
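Note: guess_date.parse tries a cascade of explicit strptime formats (English and German month names, as they appear on MediaWiki history pages) before handing the string to dateutil, and returns the input unchanged when nothing matches. A quick sanity check of the chain (hypothetical inputs; each locale must be installed on the host, otherwise setlocale raises and that branch is skipped):

    from html_scrapy import guess_date

    print(guess_date.parse("18:32, 3. Okt. 2022"))     # German %b branch
    print(guess_date.parse("18:32, 3. Oktober 2022"))  # German %B branch
    print(guess_date.parse("2022-10-03 18:32"))        # dateutil fallback
    print(guess_date.parse("kein Datum"))              # no match: input returned as-is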
Handler") - print(reason) - print("found %d elements in pages" % len(spider.pages)) - print("found %d elements in pages" % len(self.pages)) + with open("scraped_urls.log", "a+") as f: for id,p in spider.pages.items(): try: output = check_elastic_document(p) print(f"pushing: %s" % output["url"]) push_to_index(output["url"], output) + f.write(output["url"] + "\n") except AttributeError as e: print(e) print(f"Error occured at id: --%s--" %id) - # do stuff here - - def callback_login_done(self, response): + + def callback_login_step2(self, response): html = bs4.BeautifulSoup(response.body, "lxml") h = html.find("h1", {"id": "firstHeading"}).text - print(f"\nlogin callback done %s\n" % h) - - - if h == "FET-Wiki": + if h == "FET-Wiki": # Login successful for url in self.sitemap_urls: yield scrapy.Request( url, self._parse_sitemap) else: @@ -72,12 +66,11 @@ class FetWikiSpider(SitemapSpider): print(f"\nerrorbox %s" % h.text) return - def callback_login(self, response): + def callback_login_step1(self, response): print(f"\nStart Login:\n") - html = bs4.BeautifulSoup(response.body, "lxml") token = html.find("input", {"name": "wpLoginToken"})["value"] - login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden" + formdata = { "wpName": self.http_user, "wpPassword": self.http_pass, @@ -88,22 +81,17 @@ class FetWikiSpider(SitemapSpider): "title": "Spezial:Anmelden" } yield scrapy.FormRequest( - login_url, + self.login_url, formdata=formdata, - #meta={"cookiejar": response.meta["cookiejar"]}, - callback=self.callback_login_done, + callback=self.callback_login_step2, ) def start_requests(self): - login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden" - #self.cookie_jar = CookieJar() yield scrapy.Request( - "https://wiki.fet.at/index.php?title=Spezial:Anmelden", - callback=self.callback_login, + self.login_url, + callback=self.callback_login_step1, ) - print(f"\nLogin done Processing Sitemap:\n") - def parse_history(self, response, id): print(f"\n\n Parsing: %s\n" % response.url) html = bs4.BeautifulSoup(response.body, "lxml") @@ -111,10 +99,9 @@ class FetWikiSpider(SitemapSpider): last = ul[0].find("a", {"class": "mw-changeslist-date"}).text created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text d = self.pages.get(id,{}) - d["published"]= created - d["updated_at"]= last + d["published"]= guess_date.parse(created) + d["updated_at"]= guess_date.parse(last) self.pages[id]=d - return def parse_page(self, response): @@ -122,10 +109,9 @@ class FetWikiSpider(SitemapSpider): html = bs4.BeautifulSoup(response.body, "lxml") title = html.find("h1", {"id": "firstHeading"}).text.strip() if title == "Anmeldung erforderlich": - print("login erforderlich") yield scrapy.Request( "https://wiki.fet.at/index.php?title=Spezial:Anmelden", - callback=self.callback_login, + callback=self.callback_login_step1, ) print(f"Exiting Title : %s\n\n" % title) return diff --git a/html_scrapy/spiders/legacy_spider.py b/html_scrapy/spiders/legacy_spider.py index 35c5f50..6d27ff2 100644 --- a/html_scrapy/spiders/legacy_spider.py +++ b/html_scrapy/spiders/legacy_spider.py @@ -1,30 +1,22 @@ from scrapy.spiders import SitemapSpider,CrawlSpider, Rule, Spider from scrapy.linkextractors import LinkExtractor -import pickle + import scrapy -#import redis as redis import bs4 import re import getpass -#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0) import pickle import json from html_scrapy.elastic_publish import push_to_index, check_elastic_document def publish(response: 
diff --git a/html_scrapy/spiders/legacy_spider.py b/html_scrapy/spiders/legacy_spider.py
index 35c5f50..6d27ff2 100644
--- a/html_scrapy/spiders/legacy_spider.py
+++ b/html_scrapy/spiders/legacy_spider.py
@@ -1,30 +1,22 @@
 from scrapy.spiders import SitemapSpider,CrawlSpider, Rule, Spider
 from scrapy.linkextractors import LinkExtractor
-import pickle
+
 import scrapy
-#import redis as redis
 import bs4
 import re
 import getpass
-#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
 import pickle
 import json
 from html_scrapy.elastic_publish import push_to_index, check_elastic_document

 def publish(response: scrapy.http.response.html.HtmlResponse):
     print("Response typ: %s, obj: %s" % (type(response),response))
-
-#    r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
-#    r.set(response.url, response.body)
-
     with open("scraped_urls.log", "a+") as f:
         f.write(response.url+"\n")

 documents={}
-
-
 class LegacySpider(CrawlSpider):
     name = 'legacy_spider'
     allowed_domains = ['legacy.fet.at']
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
             "title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""),
             "text": d.get("text",""),
             "raw": d.get("raw",""),
+            "prio": 100,
             "url": "legacy.fet.at/documents/" + str(d["id"]),
             "published": d["created_at"],
             "updated_at": d["updated_at"]
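Note: legacy documents are pinned at prio 100, below the default of 1000 that elastic_publish.py assigns when a scraped element carries no prio of its own, so wiki content receives the larger prio boost in the new sort script; see the worked numbers after the searching/__init__.py hunk below.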
diff --git a/httpdemo/__init__.py b/httpdemo/__init__.py
index e956ed0..f54e1fc 100644
--- a/httpdemo/__init__.py
+++ b/httpdemo/__init__.py
@@ -1,34 +1,51 @@
-
 from fastapi import FastAPI
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 from fastapi import Request
-#from elasticsearch import Elasticsearch
+
+# from elasticsearch import Elasticsearch
 import sys
 import elastic_transport
 from searching import es_search, es_query
 import json
 import yaml
+
 app = FastAPI(debug=True)
 templates = Jinja2Templates(directory="./httpdemo")
-#app.mount("/", StaticFiles(directory="/"))
+# app.mount("/", StaticFiles(directory="/"))
+
+
 @app.get("/")
-def serve_home(request: Request, q: str=""):
+def serve_home(request: Request, q: str = ""):
     try:
         resp = es_search(q)
-        query=es_query(q)
-        message = "found ...?"
-    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
-        print(e,sys.stderr)
-        results=[]
-        resp={}
-        query={}
+        query = es_query(q)
+        message = "found %d results" % len(resp["hits"]["hits"])
+    except (
+        elastic_transport.ConnectionError,
+        elastic_transport.ConnectionTimeout,
+    ) as e:
+        print(e, file=sys.stderr)
+        results = []
+        resp = {}
+        query = {}
         message = f"cannot reach the search server! : %s" % e
     else:
-        results=resp["hits"]["hits"]
+        results = resp["hits"]["hits"]

-    templates.env.filters["json"]=lambda x: yaml.dump(dict(x))
-    return templates.TemplateResponse("index.html", context= {
-        "request": resp,"results": results,"message": message, "query": query})
\ No newline at end of file
+    templates.env.filters["json"] = lambda x: yaml.dump(dict(x))
+    return templates.TemplateResponse(
+        "index.html",
+        context={
+            "request": request,
+            "results": results,
+            "message": message,
+            "query": query,
+        },
+    )
+
+
+@app.get("/test")
+def test(request: Request):
+    return {"test": "test"}
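Note: a minimal smoke test for the reworked endpoints, assuming the package is importable as httpdemo (FastAPI's TestClient requires the httpx package):

    from fastapi.testclient import TestClient

    from httpdemo import app

    client = TestClient(app)

    assert client.get("/test").json() == {"test": "test"}
    # "/" renders index.html; without Elasticsearch it still returns 200
    # with the "cannot reach the search server" message
    assert client.get("/", params={"q": "Waffeln"}).status_code == 200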
diff --git a/httpdemo/index.html b/httpdemo/index.html
index f54bff1..ab6d0c3 100644
--- a/httpdemo/index.html
+++ b/httpdemo/index.html
@@ -9,8 +9,12 @@
 {% for r in results %}
   <li>
+    {{r["_source"]["title"]|safe }}
+    {{r["_score"]|safe }}
+    {{r["sort"]|safe }}
+    <br>
     {% for hh in r["highlight"] %}
       {% for h in r["highlight"][hh] %}
         {{ h |safe}}
       {% endfor %}
     {% endfor %}
diff --git a/scrap_facebook.py b/scrap_facebook.py
deleted file mode 100644
index 3bd8379..0000000
--- a/scrap_facebook.py
+++ /dev/null
@@ -1,12 +0,0 @@
-# https://pypi.org/project/facebook-page-scraper/
-from facebook_page_scraper import Facebook_scraper
-page_name = "fsmbtu"
-posts_count = 20
-browser = "chrome"
-#proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
-timeout = 120 #600 seconds
-headless = True
-
-meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
-json_data = meta_ai.scrap_to_json()
-print(json_data)
\ No newline at end of file
diff --git a/scrap_facebook_data.py b/scrap_facebook_data.py
deleted file mode 100644
index 953975e..0000000
--- a/scrap_facebook_data.py
+++ /dev/null
@@ -1,4 +0,0 @@
-posts = [
-{"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
-{"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetz’n Fliegen Komm vorbei und lassen wir gemeinsam die Fetz‘n Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
-]
\ No newline at end of file
diff --git a/scraped_urls.log b/scraped_urls.log
new file mode 100644
index 0000000..8b23ad9
--- /dev/null
+++ b/scraped_urls.log
@@ -0,0 +1,46 @@
+wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
+wiki.fet.at/index.php/How_to:_Start_@_FET
+wiki.fet.at/index.php/Kino
+wiki.fet.at/index.php/How_to:_Kellerputz
+wiki.fet.at/index.php/Schinkenfleckerl
+wiki.fet.at/index.php/Waffeln
+wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
+wiki.fet.at/index.php/Keller:_Bestandsaufnahme
+wiki.fet.at/index.php/M%C3%B6rderspiel
+wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
+wiki.fet.at/index.php/TU-Phone
+wiki.fet.at/index.php/Bierbaron
+wiki.fet.at/index.php/ET-Crashkurs
+wiki.fet.at/index.php/%C3%96H-Wahl
+wiki.fet.at/index.php/How-To_Festl
+wiki.fet.at/index.php/Bouldern
+wiki.fet.at/index.php/Spritzerstand
+wiki.fet.at/index.php/Beratung
+wiki.fet.at/index.php/Email-Anleitung
+wiki.fet.at/index.php/LDAP
+wiki.fet.at/index.php/Wiki-Anleitung
+wiki.fet.at/index.php/User_anlegen
+wiki.fet.at/index.php/Mailinglisten
+wiki.fet.at/index.php/Rezept:_Nathans_Apfelstreuselkuchen
+wiki.fet.at/index.php/Schinkenfleckerl
+wiki.fet.at/index.php/M%C3%B6rderspiel
+wiki.fet.at/index.php/How_to:_Kellerputz
+wiki.fet.at/index.php/Keller:_Bestandsaufnahme
+wiki.fet.at/index.php/How_to:_Start_@_FET
+wiki.fet.at/index.php/Kino
+wiki.fet.at/index.php/Waffeln
+wiki.fet.at/index.php/%C3%96H-Wahl:_Wahlstand
+wiki.fet.at/index.php/TU-Phone
+wiki.fet.at/index.php/Schl%C3%BCssel-_und_Oimlliste
+wiki.fet.at/index.php/Bierbaron
+wiki.fet.at/index.php/%C3%96H-Wahl
+wiki.fet.at/index.php/Bouldern
+wiki.fet.at/index.php/Spritzerstand
+wiki.fet.at/index.php/ET-Crashkurs
+wiki.fet.at/index.php/How-To_Festl
+wiki.fet.at/index.php/Beratung
+wiki.fet.at/index.php/Wiki-Anleitung
+wiki.fet.at/index.php/User_anlegen
+wiki.fet.at/index.php/Email-Anleitung
+wiki.fet.at/index.php/LDAP
+wiki.fet.at/index.php/Mailinglisten
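Note: each URL appears twice in the committed log, consistent with two writers appending to the same file (the publish() helper and the new closed_handler). If the duplication ever matters, a dedup pass is a one-liner (sketch):

    with open("scraped_urls.log") as f:
        unique_urls = sorted({line.strip() for line in f if line.strip()})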
diff --git a/search.html b/search.html
new file mode 100644
index 0000000..3190210
--- /dev/null
+++ b/search.html
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>API Request Example</title>
+  </head>
+  <body>
+  </body>
+</html>
\ No newline at end of file
diff --git a/searching/__init__.py b/searching/__init__.py
index 7e60198..2ff86c1 100644
--- a/searching/__init__.py
+++ b/searching/__init__.py
@@ -1,69 +1,94 @@
-
-
-
 import os
 from elasticsearch import Elasticsearch, helpers
 import contextlib
 import logging

-ELASTIC_HOST = os.environ.get("ELASTIC_HOST","http://localhost:9200")
-ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD","*l9qNGoojiCC4n9KcZhj")
-ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY","Anwesend")
-ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX","legacy")
+
+ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
+ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
+ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
+ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "legacy")

 # Verbinde mit Client
 @contextlib.contextmanager
 def es_client():
-    logging.debug(f"ELASIC HOST:%s" % ELASTIC_HOST)
-    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
-    yield client
-    client.close()
+    logging.debug("ELASTIC HOST: %s", ELASTIC_HOST)
+    client = Elasticsearch(
+        ELASTIC_HOST, verify_certs=False, basic_auth=("elastic", ELASTIC_PASSWORD)
+    )
+    yield client
+    client.close()


-def es_query(query:str):
-    query ={
-        "multi_match":{
-            "query": query,
-            "fields": ["title^20","title.ngrams^10","text^5","text.ngrams"],
-            "tie_breaker": 0.3
-            #"type": "most_fields"
-        }
-    }
-    return query
+def es_query(query: str):
+    query = {
+        "multi_match": {
+            "query": query,
+            "fields": ["title^20", "title.ngrams^10", "text^5", "text.ngrams"],
+            "tie_breaker": 0.3,
+            # "type": "most_fields"
+        }
+    }
+    return query


 def es_highlight():
-    highlight = {
-        "fields": {
-            "title": {},
-            "text": {"fragment_size" : 150},
-            "title.ngrams": {},
-            "text.ngrams": {"fragment_size" : 150},
-
-        }
-    }
-    return highlight
+    highlight = {
+        "fields": {
+            "title": {},
+            "text": {"fragment_size": 150},
+            "title.ngrams": {},
+            "text.ngrams": {"fragment_size": 150},
+        }
+    }
+    return highlight
+
+
+sorting = {
+    "updated_at": {"order": "desc"},
+    "_score": {"order": "desc"},
+    "prio": {"order": "desc"},
+}
+
+
+def es_sorting():
+    return {
+        "_script": {
+            "type": "number",
+            "script": {
+                "lang": "painless",
+                "source": "Math.log10(1+doc['updated_at'].value.toInstant().toEpochMilli()/1000000000/100) + Math.log10(1+_score)/10 + Math.log10(1+doc['prio'].value/1000)",
+            },
+            "order": "desc",
+        }
+    }


-def es_search(query:str):
-    with es_client() as client:
+def es_search(query: str):
+    with es_client() as client:
         result = client.search(
-            index = ELASTIC_INDEX,
-            size=10,
-            query= es_query(query),
-            highlight = es_highlight()
+            index=ELASTIC_INDEX,
+            size=30,
+            query=es_query(query),
+            sort=es_sorting(),
+            highlight=es_highlight(),
         )
-        return result
+        return result


 # for hit in resp["hits"]["hits"]:
 #     print(hit)

-if __name__ =="__main__":
-    resp = es_search(ELASTIC_QUERY)
-    logging.info(f"Found %d recorts in hits" % resp["hits"]["hits"])
-    for hit in resp["hits"]["hits"]:
-        print(f"\n\n%s\n%s\n%s - %s" % (
-            hit.get("_source",{}).get("url",""),
-            hit.get("_source",{}).get("title",""),
-            " ".join(hit.get("highlight",{}).get("title",[""])),
-            " ".join(hit.get("highlight",{}).get("text",[""]))
-        ))
-
\ No newline at end of file
+if __name__ == "__main__":
+    resp = es_search(ELASTIC_QUERY)
+    logging.info("Found %d records in hits" % len(resp["hits"]["hits"]))
+    for hit in resp["hits"]["hits"]:
+        print(
+            "\n\n%s\n%s\n%s - %s"
+            % (
+                hit.get("_source", {}).get("url", ""),
+                hit.get("_source", {}).get("title", ""),
+                " ".join(hit.get("highlight", {}).get("title", [""])),
+                " ".join(hit.get("highlight", {}).get("text", [""])),
+            )
+        )
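Note: the module now sorts with a painless script that blends recency, relevance (_score), and the new prio field on a log scale (the plain sorting dict above it is defined but unused). A Python mirror of the same expression, handy for eyeballing how the three terms trade off (illustrative values; painless evaluates the divisions on longs, so its results differ slightly by truncation):

    import math

    def combined_rank(updated_at_epoch_ms: int, score: float, prio: int) -> float:
        # same shape as the painless source: epoch millis scaled by 1e9*100,
        # relevance damped by a factor of 10, prio scaled by 1000
        return (math.log10(1 + updated_at_epoch_ms / 1_000_000_000 / 100)
                + math.log10(1 + score) / 10
                + math.log10(1 + prio / 1000))

    # equally relevant hit from Oct 2022: wiki page (prio 1000) vs. legacy doc (prio 100)
    combined_rank(1_665_000_000_000, 8.0, 1000)  # ≈ 1.247 + 0.095 + 0.301
    combined_rank(1_665_000_000_000, 8.0, 100)   # ≈ 1.247 + 0.095 + 0.041

The recency term dominates as a slowly varying base, while the prio term separates wiki pages from legacy documents by roughly 0.26.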
resp["hits"]["hits"]) + for hit in resp["hits"]["hits"]: + print( + f"\n\n%s\n%s\n%s - %s" + % ( + hit.get("_source", {}).get("url", ""), + hit.get("_source", {}).get("title", ""), + " ".join(hit.get("highlight", {}).get("title", [""])), + " ".join(hit.get("highlight", {}).get("text", [""])), + ) + ) diff --git a/searching/index.py b/searching/index.py index 1927489..aaa7e30 100644 --- a/searching/index.py +++ b/searching/index.py @@ -54,6 +54,7 @@ mapping = { "url": { "type": "text", "index": False}, "published": {"type": "date", "format": "date_optional_time"}, "updated_at": {"type": "date", "format": "date_optional_time"}, + "prio": {"type": "integer"}, "raw": { "type": "text", "index": False