commit ed610970cb529a3418cbb72958546bba76f99147
Author: andis
Date: Sun Feb 19 10:04:10 2023 +0100
first_commit
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3eca2d2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+archiv/*
+.pytest_cache
+test_elastic
+*.pyc
+*.bak
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..d375949
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.8
+
+WORKDIR /srv
+
+RUN pip3 install setuptools_scm
+
+RUN pip3 install elasticsearch
+RUN pip3 install scrapy
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+COPY scrapy.cfg .
+
+COPY html_scrapy ./html_scrapy
+COPY searching ./searching
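+# Default command: list the working directory as a quick sanity check, then run the legacy spider.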
+CMD ls && scrapy crawl legacy_spider
+#spider html_scrapy/spiders/legacy_spider.py
\ No newline at end of file
diff --git a/Dockerfile_index b/Dockerfile_index
new file mode 100644
index 0000000..2a8ecb9
--- /dev/null
+++ b/Dockerfile_index
@@ -0,0 +1,9 @@
+FROM python:3.8
+WORKDIR /srv
+
+RUN pip3 install setuptools_scm
+RUN pip3 install elasticsearch
+
+COPY searching ./searching
+COPY reset_index.py .
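+# reset_index.py is expected to be present in the build context and to (re)create the Elastic index on start.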
+CMD python3 reset_index.py
\ No newline at end of file
diff --git a/build_docker b/build_docker
new file mode 100644
index 0000000..ded159f
--- /dev/null
+++ b/build_docker
@@ -0,0 +1,2 @@
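+# Build and tag the crawler image (default Dockerfile) and the index-reset image (Dockerfile_index).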
+docker build -t docker.fet.at/andis_scrapy:0.1.2 .
+docker build -t docker.fet.at/andis_index:0.1.2 -f Dockerfile_index .
\ No newline at end of file
diff --git a/html_scrapy/__init__.py b/html_scrapy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/html_scrapy/elastic_publish.py b/html_scrapy/elastic_publish.py
new file mode 100644
index 0000000..b27760d
--- /dev/null
+++ b/html_scrapy/elastic_publish.py
@@ -0,0 +1,25 @@
+from searching import es_client, ELASTIC_INDEX
+
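+# Normalise a scraped item and write it to the shared index; indexing errors are only printed
+# so one bad document does not abort the crawl.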
+def push_to_index(doc_id, element):
+    element = check_elastic_document(element)
+    try:
+        with es_client() as client:
+            client.index(index=ELASTIC_INDEX, id=doc_id, document=element)
+    except Exception as e:
+        print(e)
+
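+# Ensure the required fields are present and map the item onto the flat schema stored in
+# Elasticsearch (date fields are serialised as strings).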
+def check_elastic_document(element):
+
+    for e in ["url", "title", "text", "published", "updated_at"]:
+        if e not in element:
+            raise AttributeError("A %s is needed for the Elastic element" % e)
+
+    return {
+        "published": str(element["published"]),
+        "text": element["text"],
+        "title": element["title"],
+        #"source": get_source(post),
+        "url": element["url"],
+        "updated_at": str(element["updated_at"])
+    }
\ No newline at end of file
diff --git a/html_scrapy/settings.py b/html_scrapy/settings.py
new file mode 100644
index 0000000..e72d208
--- /dev/null
+++ b/html_scrapy/settings.py
@@ -0,0 +1,93 @@
+# Scrapy settings for the html_scrapy project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'html_scrapy'
+
+SPIDER_MODULES = ['html_scrapy.spiders']
+NEWSPIDER_MODULE = 'html_scrapy.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 't (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+# 'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+# 't.middlewares.TSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    # 't.middlewares.TDownloaderMiddleware': 543,
+    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 543,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+# 'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+# 't.pipelines.TPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
+TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
diff --git a/html_scrapy/spiders/__init__.py b/html_scrapy/spiders/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/html_scrapy/spiders/legacy_spider.py b/html_scrapy/spiders/legacy_spider.py
new file mode 100644
index 0000000..35c5f50
--- /dev/null
+++ b/html_scrapy/spiders/legacy_spider.py
@@ -0,0 +1,123 @@
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+import pickle
+import scrapy
+#import redis as redis
+import bs4
+import re
+import getpass
+#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
+import json
+from html_scrapy.elastic_publish import push_to_index, check_elastic_document
+
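+# Unused debug helper: the redis publishing is commented out; it would append each crawled URL
+# to a local scraped_urls.log file.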
+def publish(response: scrapy.http.response.html.HtmlResponse):
+    print("Response type: %s, obj: %s" % (type(response), response))
+
+#    r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
+#    r.set(response.url, response.body)
+
+    with open("scraped_urls.log", "a+") as f:
+        f.write(response.url + "\n")
+
+documents = {}
+
+
+
+
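+# Crawls topic groups on legacy.fet.at (behind HTTP basic auth), follows document links as JSON,
+# and indexes the enriched documents in Elasticsearch.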
+class LegacySpider(CrawlSpider):
+    name = 'legacy_spider'
+    allowed_domains = ['legacy.fet.at']
+    #sitemap_urls=['https://fet.at/sitemap.xml']
+    #sitemap_rules = [('/posts/', 'parse')]
+    http_user = 'andis'
+    http_pass = getpass.getpass("Password for FET user andis: ")
+    #http_auth_domain = 'legacy.fet.at'
+    rules = (
+        Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
+        Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
+    )
+    start_urls = ['https://legacy.fet.at/themengruppen/15']
+
+    custom_settings = {
+        'DEPTH_LIMIT': '1',
+    }
+
+    def fix_document_links(self, links):
+        # Rewrite document links so the JSON representation is requested instead of the HTML page.
+        for link in links:
+            if re.match(r".*documents/\d+$", link.url):
+                link.url = link.url + ".json"
+            yield link
+
+    def fix_themen_links(self, links):
+        for link in links:
+            if re.match(r".*themen/\d+$", link.url):
+                link.url = link.url + ".json"
+            yield link
+
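+    # Parsed documents are cached in the module-level `documents` dict by id until
+    # parse_themengruppe links them to their topic and meeting and pushes them to the index.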
+    def parse_document(self, response):
+        global documents
+        body = json.loads(response.body)
+        body["raw"] = body["text"]
+        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
+        documents[int(body['id'])] = body
+        return
+
+    def parse_themen(self, response):
+        body = json.loads(response.body)
+        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
+        documents[int(body['id'])] = body
+        return
+
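+    # A topic-group page lists topics, their meetings and document links; attach that metadata
+    # to the cached documents, then build the Elastic payloads and push them.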
+    def parse_themengruppe(self, response):
+        global documents
+
+        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')
+
+        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
+        print(f"found {len(thema_tags)} topics")
+        for thema in thema_tags:
+
+            t_link = thema.find("h2").find_all("a")[0]
+            t = {
+                'url': t_link["href"],
+                'title': t_link.text,
+                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
+            }
+            print(f"\n\ncrawling topic {t.get('id', '?')} - {t.get('title', '?')}")
+            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
+            print(f"\nfound {len(meeting_tags)} meetings for topic {t['title']}")
+            for meeting in meeting_tags:
+                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
+                m = {
+                    'id': m_id,
+                    'title': meeting.find("a").text
+                }
+
+                docs = meeting.find_all(id=re.compile(r'document_\d+'))
+                print(f"crawling meeting {m.get('title')} with {len(docs)} documents")
+                for d in docs:
+                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
+                    if doc_id not in documents:
+                        continue
+                    documents[doc_id]["meeting"] = m
+                    documents[doc_id]["thema"] = t
+            # Documents listed directly under the topic (outside a meeting) still get the topic attached.
+            for d in thema.find_all(id=re.compile(r'document_\d+')):
+                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
+                if doc_id not in documents:
+                    continue
+                documents[doc_id]["thema"] = t
+
+        output = {}
+        for k, d in documents.items():
+            output[k] = check_elastic_document({
+                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
+                "text": d.get("text", ""),
+                "raw": d.get("raw", ""),
+                "url": "legacy.fet.at/documents/" + str(d["id"]),
+                "published": d["created_at"],
+                "updated_at": d["updated_at"]
+            })
+            push_to_index(output[k]["url"], output[k])
+
+            print(f"Document added: {output[k].get('title', '')}")
+        #print("\n\nDocuments"+json.dumps(output))
+        return None
\ No newline at end of file
diff --git a/httpdemo/__init__.py b/httpdemo/__init__.py
new file mode 100644
index 0000000..c43ca44
--- /dev/null
+++ b/httpdemo/__init__.py
@@ -0,0 +1,68 @@
+
+from fastapi import FastAPI
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from fastapi import Request
+from elasticsearch import Elasticsearch
+import sys
+import elastic_transport
+
+ELASTIC_HOST = "http://localhost:9200"
+client = Elasticsearch(ELASTIC_HOST, verify_certs=False)
+
+app = FastAPI(debug=True)
+
+templates = Jinja2Templates(directory="./httpdemo")
+#app.mount("/", StaticFiles(directory="/"))
+
+
+#@app.get("/")
+#def read_root():
+# return {"Hello"}
+
+
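+# Ranked multi_match query builder and highlight settings (not yet wired into the endpoint below):
+# title hits and their n-gram variants score higher than plain text matches.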
+def build_ranked_query(term):
+    return {
+        "multi_match": {
+            "query": term,
+            "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
+            "type": "most_fields"
+        }
+    }
+
+highlight = {
+    "fields": {
+        "title": {},
+        "text": {}
+    }
+}
+
+
+
+
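+# Search endpoint: GET /?q=<term> runs a simple match query against the "posts2" index and
+# renders the raw hits with the Jinja template.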
+@app.get("/")
+def serve_home(request: Request, q: str):
+ query= {
+ "bool":{
+ "should": [{"wildcard": {"body": {"value": "*%s*"% q, "case_insensitive": True }}},
+ {"wildcard": {"title": {"value": "*%s*" % q, "case_insensitive": True }}}],
+ "minimum_should_match": 1
+ }}
+ query = {
+ "match": {
+ "body": q
+ }
+ }
+
+ try:
+ resp = client.search(
+ index = "posts2",
+ size=10,
+ #analyze_wildcard=True,
+ #q="sdf*",
+ query= query
+ )
+ except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
+ print(e,sys.stderr)
+ results=[]
+ else:
+ results=resp["hits"]["hits"]
+ return templates.TemplateResponse("index.html", context= {"request": request,"results": results})
\ No newline at end of file
diff --git a/httpdemo/index.html b/httpdemo/index.html
new file mode 100644
index 0000000..368b7a2
--- /dev/null
+++ b/httpdemo/index.html
@@ -0,0 +1,10 @@
+
+ Hello Index httpdemo
+{{request}}
+
+
\ No newline at end of file
diff --git a/httpdemo/jquery.js b/httpdemo/jquery.js
new file mode 100644
index 0000000..12e65d0
--- /dev/null
+++ b/httpdemo/jquery.js
@@ -0,0 +1,10909 @@
+/*!
+ * jQuery JavaScript Library v3.6.1
+ * https://jquery.com/
+ *
+ * Includes Sizzle.js
+ * https://sizzlejs.com/
+ *
+ * Copyright OpenJS Foundation and other contributors
+ * Released under the MIT license
+ * https://jquery.org/license
+ *
+ * Date: 2022-08-26T17:52Z
+ */
+( function( global, factory ) {
+
+	"use strict";
+
+	if ( typeof module === "object" && typeof module.exports === "object" ) {
+
+		// For CommonJS and CommonJS-like environments where a proper `window`
+		// is present, execute the factory and get jQuery.
+		// For environments that do not have a `window` with a `document`
+		// (such as Node.js), expose a factory as module.exports.
+		// This accentuates the need for the creation of a real `window`.
+		// e.g. var jQuery = require("jquery")(window);
+		// See ticket trac-14549 for more info.
+		module.exports = global.document ?
+			factory( global, true ) :
+			function( w ) {
+				if ( !w.document ) {
+					throw new Error( "jQuery requires a window with a document" );
+				}
+				return factory( w );
+			};
+	} else {
+		factory( global );
+	}
+
+// Pass this if window is not defined yet
+} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) {
+
+// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1
+// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode
+// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common
+// enough that all such attempts are guarded in a try block.
+"use strict";
+
+var arr = [];
+
+var getProto = Object.getPrototypeOf;
+
+var slice = arr.slice;
+
+var flat = arr.flat ? function( array ) {
+	return arr.flat.call( array );
+} : function( array ) {
+	return arr.concat.apply( [], array );
+};
+
+
+var push = arr.push;
+
+var indexOf = arr.indexOf;
+
+var class2type = {};
+
+var toString = class2type.toString;
+
+var hasOwn = class2type.hasOwnProperty;
+
+var fnToString = hasOwn.toString;
+
+var ObjectFunctionString = fnToString.call( Object );
+
+var support = {};
+
+var isFunction = function isFunction( obj ) {
+
+	// Support: Chrome <=57, Firefox <=52
+	// In some browsers, typeof returns "function" for HTML