wikispider added

2023-04-13 09:44:55 +02:00
parent ef6074eaa9
commit e757290649
3 changed files with 163 additions and 3 deletions


@@ -31,8 +31,8 @@ ROBOTSTXT_OBEY = True
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = True
COOKIES_DEBUG = False
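# Cookies presumably have to stay enabled here: the wiki spider added in this
# commit logs in through a MediaWiki form and relies on the session cookie for
# every subsequent request. Setting COOKIES_DEBUG = True would additionally log
# the cookies sent and received, which helps when debugging the login flow.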
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False


@@ -0,0 +1,160 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import scrapy
import bs4
import re
import getpass
import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response type: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")
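

# FetWikiSpider flow: start_requests fetches the MediaWiki login page,
# callback_login posts the form with a fresh wpLoginToken, and on success
# callback_login_done kicks off the sitemap crawl. Parsed pages are buffered
# in `pages` and pushed to Elasticsearch in closed_handler when the spider
# shuts down.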
class FetWikiSpider(SitemapSpider):
    name = "fetwiki_spider"
    allowed_domains = ["wiki.fet.at"]
    sitemap_urls = ["https://wiki.fet.at/sitemap/sitemap-wiki-NS_0-0.xml"]
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Password for FET user andis: ")
    pages = {}
    http_auth_domain = "wiki.fet.at"

    # rules = (
    #     Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
    #     Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    # )
    # start_urls = ['https://legacy.fet.at/themengruppen/15']
    # custom_settings = {
    #     'DEPTH_LIMIT': '1',
    # }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider
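
    # Pages are buffered in self.pages during the crawl and only validated
    # (check_elastic_document) and pushed (push_to_index) in a single pass
    # here, presumably so that parse_page and parse_history results for the
    # same id are merged before anything reaches the index.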
    def closed_handler(self, spider, reason):
        print("closing !! Handler")
        print(reason)
        print("found %d elements in pages" % len(spider.pages))
        print("found %d elements in pages" % len(self.pages))
        for id, p in spider.pages.items():
            try:
                output = check_elastic_document(p)
                print("pushing: %s" % output["url"])
                push_to_index(output["url"], output)
            except AttributeError as e:
                print(e)
                print("Error occurred at id: --%s--" % id)
        # do stuff here

    def callback_login_done(self, response):
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
        print("\nlogin callback done %s\n" % h)
        if h == "FET-Wiki":
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
            h = html.find("div", {"class": "errorbox"})
            if h:
                print("\nerrorbox %s" % h.text)
            return
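
    # MediaWiki login: Spezial:Anmelden embeds a one-time wpLoginToken in its
    # form, which is extracted here and posted back together with the
    # credentials. The authAction/pluggableauthlogin fields apparently target
    # the PluggableAuth extension used by wiki.fet.at.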
    def callback_login(self, response):
        print("\nStart Login:\n")
        html = bs4.BeautifulSoup(response.body, "lxml")
        token = html.find("input", {"name": "wpLoginToken"})["value"]
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        formdata = {
            "wpName": self.http_user,
            "wpPassword": self.http_pass,
            "domain": "fet.htu.tuwien.ac.at",
            "wpLoginToken": token,
            "authAction": "login",
            "pluggableauthlogin": "Anmelden",
            "title": "Spezial:Anmelden",
        }
        yield scrapy.FormRequest(
            login_url,
            formdata=formdata,
            # meta={"cookiejar": response.meta["cookiejar"]},
            callback=self.callback_login_done,
        )

    def start_requests(self):
        # The sitemap is only requested after a successful login
        # (see callback_login_done), so start with the login page.
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        # self.cookie_jar = CookieJar()
        yield scrapy.Request(
            login_url,
            callback=self.callback_login,
        )
        print("\nLogin request queued, sitemap processing continues in the callbacks\n")

    def parse_history(self, response, id):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
        # Assumes the history list is newest-first: the first entry is the
        # latest revision, the last entry is the page creation.
        ul = html.find("ul", {"id": "pagehistory"}).find_all("li")
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
        d["published"] = created
        d["updated_at"] = last
        self.pages[id] = d
        return

    def parse_page(self, response):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        if title == "Anmeldung erforderlich":
            # Not logged in (anymore): re-run the login flow and skip this page.
            print("login required")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
                callback=self.callback_login,
            )
            print("Exiting, title: %s\n\n" % title)
            return
        print("Title: %s\n\n" % title)
        h = html.find("link", {"rel": "alternate"})
        if h:
            hist_url = furl(
                h["href"]
            )  # /index.php?title=ET-Crashkurs&action=history
            hist_url.host = "wiki.fet.at"
            hist_url.args["action"] = "history"
            hist_url.scheme = "https"
            id = hist_url.args["title"]
            yield scrapy.Request(
                hist_url.url,
                self.parse_history,
                cb_kwargs={"id": hist_url.args["title"]},
            )
        else:
            # Fall back to the response URL as page id; without this, `id`
            # would be undefined in this branch.
            id = response.url
            print("No history for this page: %s" % id)
        d = self.pages.get(id, {})
        d["title"] = title
        d["raw"] = html.find("div", {"id": "bodyContent"})
        d["text"] = d["raw"].get_text(" ")
        d["url"] = re.search(r"http[s]?://(.+)$", response.url).group(1)
        self.pages[id] = d
        return None
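
# A minimal way to run this spider, assuming a standard Scrapy project layout:
#
#   scrapy crawl fetwiki_spider
#
# Note that the getpass prompt for the wiki password fires when the module is
# imported (http_pass is a class attribute), before the crawl starts.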


@@ -1,3 +1,3 @@
beautifulsoup4
scrapy
furl
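# Note: bs4 is used with the "lxml" parser in the spider, so the lxml package
# presumably needs to be installed as well even though it is not listed here.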