from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import scrapy
import bs4
import re
import getpass
import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals
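
# Note on the project-local Elasticsearch helpers imported above: their
# interface is inferred purely from the call sites in closed_handler() below --
# check_elastic_document(page_dict) is expected to return a dict with at least
# a "url" key, and push_to_index(url, document) to index that document.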


def publish(response: scrapy.http.response.html.HtmlResponse):
    # Append every scraped URL to a local log file.
    print("Response type: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


class FetWikiSpider(SitemapSpider):
    name = "fetwiki_spider"
    allowed_domains = ["wiki.fet.at"]
    sitemap_urls = ["https://wiki.fet.at/sitemap/sitemap-wiki-NS_0-0.xml"]
    # Route every URL found in the sitemap to parse_page.
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Password for FET user andis: ")
    # Collected page data, keyed by page title; each entry gathers
    # title/raw/text/url (parse_page) and published/updated_at (parse_history).
    pages = {}
    http_auth_domain = "wiki.fet.at"

    # rules = (
    #     Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
    #     Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    # )
    # start_urls = ['https://legacy.fet.at/themengruppen/15']

    # custom_settings = {
    #     'DEPTH_LIMIT': '1',
    # }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Register a handler so the collected pages are pushed once the crawl ends.
        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider
    def closed_handler(self, spider, reason):
        print("spider_closed handler called")
        print(reason)
        print("found %d elements in pages" % len(self.pages))
        for id, p in self.pages.items():
            try:
                output = check_elastic_document(p)
                print("pushing: %s" % output["url"])
                push_to_index(output["url"], output)
            except AttributeError as e:
                print(e)
                print("Error occurred at id: --%s--" % id)

    def callback_login_done(self, response):
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
        print("\nlogin callback done %s\n" % h)

        if h == "FET-Wiki":
            # Login succeeded: feed the sitemap URLs into SitemapSpider's own
            # sitemap handling.
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
            h = html.find("div", {"class": "errorbox"})
            if h:
                print("\nerrorbox %s" % h.text)
            return

    def callback_login(self, response):
        print("\nStart Login:\n")

        html = bs4.BeautifulSoup(response.body, "lxml")
        # The MediaWiki login form carries a hidden CSRF token that must be
        # posted back together with the credentials.
        token = html.find("input", {"name": "wpLoginToken"})["value"]
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        formdata = {
            "wpName": self.http_user,
            "wpPassword": self.http_pass,
            "domain": "fet.htu.tuwien.ac.at",
            "wpLoginToken": token,
            "authAction": "login",
            "pluggableauthlogin": "Anmelden",
            "title": "Spezial:Anmelden",
        }
        yield scrapy.FormRequest(
            login_url,
            formdata=formdata,
            # meta={"cookiejar": response.meta["cookiejar"]},
            callback=self.callback_login_done,
        )
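
    # Hedged alternative, not wired into the spider: Scrapy's
    # FormRequest.from_response() can pre-fill hidden fields (including
    # wpLoginToken) straight from the login page, which would replace the
    # manual BeautifulSoup token lookup above. The method below is only a
    # sketch; whether the default formnumber=0 selects the right form on
    # wiki.fet.at is an assumption and has not been verified.
    def callback_login_via_from_response(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                "wpName": self.http_user,
                "wpPassword": self.http_pass,
            },
            callback=self.callback_login_done,
        )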

    def start_requests(self):
        # Fetch the login page first; the actual sitemap crawl is kicked off
        # from callback_login_done() once the login has gone through.
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        # self.cookie_jar = CookieJar()
        yield scrapy.Request(
            login_url,
            callback=self.callback_login,
        )

    def parse_history(self, response, id):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
        # The history list is ordered newest-first: the first entry is the
        # latest edit, the last entry is the page creation.
        ul = html.find("ul", {"id": "pagehistory"}).find_all("li")
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
        d["published"] = created
        d["updated_at"] = last
        self.pages[id] = d
        return

    def parse_page(self, response):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        # "Anmeldung erforderlich" means the wiki is asking for a login.
        if title == "Anmeldung erforderlich":
            print("login required")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
                callback=self.callback_login,
            )
            print("Exiting, title: %s\n\n" % title)
            return
        print("Title: %s\n\n" % title)

        h = html.find("link", {"rel": "alternate"})
        if h:
            # e.g. /index.php?title=ET-Crashkurs&action=history
            hist_url = furl(h["href"])
            hist_url.host = "wiki.fet.at"
            hist_url.args["action"] = "history"
            hist_url.scheme = "https"
            id = hist_url.args["title"]
            yield scrapy.Request(
                hist_url.url,
                self.parse_history,
                cb_kwargs={"id": hist_url.args["title"]},
            )
        else:
            # No alternate/history link: fall back to the response URL as the
            # page key so the entry can still be stored.
            id = response.url
            print("No history for this page: %s" % id)
        d = self.pages.get(id, {})
        d["title"] = title
        d["raw"] = html.find("div", {"id": "bodyContent"})
        d["text"] = d["raw"].get_text(" ")

        # Store the URL without its scheme.
        d["url"] = re.search(r"http[s]?://(.+)$", response.url).group(1)

        self.pages[id] = d

        return None
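

# Hedged usage sketch, not part of the original project: the spider can also be
# driven directly with CrawlerProcess instead of `scrapy crawl fetwiki_spider`.
# The settings below are illustrative assumptions, not the project's settings.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "LOG_LEVEL": "INFO",
        # Be polite to the wiki; this delay is an assumption, tune as needed.
        "DOWNLOAD_DELAY": 0.5,
    })
    process.crawl(FetWikiSpider)
    process.start()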