first_commit

2023-02-19 10:04:10 +01:00
commit ed610970cb
23 changed files with 11434 additions and 0 deletions

0
html_scrapy/__init__.py Normal file
View File

25
html_scrapy/elastic_publish.py Normal file
View File

@@ -0,0 +1,25 @@
import os
from elasticsearch import Elasticsearch
from searching import es_client, ELASTIC_INDEX


def push_to_index(id, element):
    """Validate an element and index it into Elasticsearch under the given id."""
    element = check_elastic_document(element)
    try:
        with es_client() as client:
            client.index(index=ELASTIC_INDEX, id=id, document=element)
    except Exception as e:
        print(e)


def check_elastic_document(element):
    """Ensure all required fields are present and return a normalized document."""
    for e in ["url", "title", "text", "published", "updated_at"]:
        if e not in element:
            raise AttributeError(f"A {e} is needed for the Elastic Element")
    return {
        "published": str(element["published"]),
        "text": element["text"],
        "title": element["title"],
        # "source": get_source(post),
        "url": element["url"],
        "updated_at": str(element["updated_at"]),
    }
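
This module imports es_client and ELASTIC_INDEX from a searching module that is not part of this commit. A minimal sketch of what those helpers could look like, assuming an elasticsearch-py client; the environment variable names, default URL and default index name are assumptions for illustration only:

# Hypothetical searching module (not in this commit); URL, env vars and
# default index name are assumptions.
import os
from contextlib import contextmanager
from elasticsearch import Elasticsearch

ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "fet_documents")

@contextmanager
def es_client():
    # Yield a connected client and close it again afterwards, so that
    # push_to_index can use `with es_client() as client:`.
    client = Elasticsearch(os.environ.get("ELASTIC_URL", "http://localhost:9200"))
    try:
        yield client
    finally:
        client.close()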

93
html_scrapy/settings.py Normal file
View File

@@ -0,0 +1,93 @@
# Scrapy settings for the html_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'html_scrapy'

SPIDER_MODULES = ['html_scrapy.spiders']
NEWSPIDER_MODULE = 'html_scrapy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 't (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    't.middlewares.TSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 't.middlewares.TDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    't.pipelines.TPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
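
The commit adds no run helper. As a hedged sketch (not part of the commit), the crawl could be launched programmatically through Scrapy's documented CrawlerProcess API, equivalent to running `scrapy crawl legacy_spider` from the project root:

# Hypothetical runner (not in this commit); assumes it is executed from the
# project root so that get_project_settings() picks up html_scrapy/settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl("legacy_spider")  # spider name as defined on LegacySpider
    process.start()                 # blocks until the crawl finishes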

View File

@@ -0,0 +1,123 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle
import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
def publish(response: scrapy.http.response.html.HtmlResponse):
    """Log a crawled URL; the redis publishing path is kept commented out."""
    print("Response type: %s, obj: %s" % (type(response), response))
    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Collected documents, keyed by their numeric id, shared across callbacks.
documents = {}
class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]

    # Credentials picked up by the HttpAuthMiddleware enabled in settings.py
    http_user = 'andis'
    http_pass = getpass.getpass("Password for FET user andis: ")
    #http_auth_domain = 'legacy.fet.at'

    rules = (
        Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']
    custom_settings = {
        'DEPTH_LIMIT': '1',
    }
    def fix_document_links(self, links):
        """Rewrite document links so their JSON representation is fetched."""
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        """Rewrite topic links so their JSON representation is fetched."""
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link
    def parse_document(self, response):
        """Store a document's JSON body, keeping the raw HTML and a plain-text version."""
        global documents
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themen(self, response):
        """Store a topic's JSON body (currently not referenced by any Rule)."""
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return
    def parse_themengruppe(self, response):
        """Attach topic and meeting metadata to the collected documents and push them."""
        global documents
        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')
        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print(f"found {len(thema_tags)} topics")
        for thema in thema_tags:
            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print(f"\n\ncrawling topic {t.get('id', '?')} - {t.get('title', '?')}")
            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print(f"\nfound {len(meeting_tags)} meetings for topic {t['title']}")
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {'id': m_id,
                     'title': meeting.find("a").text
                     }
                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print(f"crawling meeting {m.get('title')} with {len(docs)} documents")
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if doc_id not in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t
            # Documents attached directly to the topic, outside any meeting
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if doc_id not in documents:
                    continue
                documents[doc_id]["thema"] = t
        # Normalize everything collected so far and push it to Elasticsearch
        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"]
            })
            push_to_index(output[k]["url"], output[k])
            print(f"Document added: {output[k].get('title', '')}")
        #print("\n\nDocuments" + json.dumps(output))
        return None
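
Not part of the commit, but a quick way to spot-check the result: a hedged sketch that queries the index after a crawl, reusing the assumed searching helpers sketched above and assuming an elasticsearch-py 8.x client (where search() accepts query and size keyword arguments):

# Hypothetical spot check; helpers, client version and index name are assumptions.
from searching import es_client, ELASTIC_INDEX

with es_client() as client:
    resp = client.search(index=ELASTIC_INDEX, query={"match_all": {}}, size=5)
    print("indexed documents:", resp["hits"]["total"]["value"])
    for hit in resp["hits"]["hits"]:
        print(hit["_id"], "-", hit["_source"]["title"])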