from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import scrapy
import bs4
import re
import getpass
import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals


def publish(response: scrapy.http.response.html.HtmlResponse):
    """Append the URL of a scraped response to a local log file."""
    print("Response type: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


class FetWikiSpider(SitemapSpider):
    """Crawls wiki.fet.at via its sitemap, collects page content and revision
    dates, and pushes the collected pages to the Elasticsearch index when the
    spider closes."""

    name = "fetwiki_spider"
    allowed_domains = ["wiki.fet.at"]
    sitemap_urls = ["https://wiki.fet.at/sitemap/sitemap-wiki-NS_0-0.xml"]
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Password for FET user andis: ")
    pages = {}  # keyed by wiki page title, filled by parse_page / parse_history
    http_auth_domain = "wiki.fet.at"

    # rules = (
    #     Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
    #     Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    # )
    # start_urls = ['https://legacy.fet.at/themengruppen/15']
    # custom_settings = {
    #     'DEPTH_LIMIT': '1',
    # }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Connect the spider_closed signal so collected pages are published at the end.
        spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider

    def closed_handler(self, spider, reason):
        # spider_closed handler: push every collected page to the Elasticsearch index.
        print("closing !! Handler")
        print(reason)
        # spider and self refer to the same object here.
        print("found %d elements in pages" % len(spider.pages))
        print("found %d elements in pages" % len(self.pages))
        for id, p in spider.pages.items():
            try:
                output = check_elastic_document(p)
                print(f"pushing: {output['url']}")
                push_to_index(output["url"], output)
            except AttributeError as e:
                print(e)
                print(f"Error occurred at id: --{id}--")

    def callback_login_done(self, response):
        # After the login form was submitted: start the sitemap crawl only if the
        # main page ("FET-Wiki") is shown, otherwise report the MediaWiki error box.
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
        print(f"\nlogin callback done {h}\n")
        if h == "FET-Wiki":
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
            h = html.find("div", {"class": "errorbox"})
            if h:
                print(f"\nerrorbox {h.text}")
        return

    def callback_login(self, response):
        # Fill in and submit the MediaWiki login form, including the CSRF token.
        print("\nStart Login:\n")
        html = bs4.BeautifulSoup(response.body, "lxml")
        token = html.find("input", {"name": "wpLoginToken"})["value"]
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        formdata = {
            "wpName": self.http_user,
            "wpPassword": self.http_pass,
            "domain": "fet.htu.tuwien.ac.at",
            "wpLoginToken": token,
            "authAction": "login",
            "pluggableauthlogin": "Anmelden",
            "title": "Spezial:Anmelden",
        }
        yield scrapy.FormRequest(
            login_url,
            formdata=formdata,
            # meta={"cookiejar": response.meta["cookiejar"]},
            callback=self.callback_login_done,
        )

    def start_requests(self):
        # Fetch the login page first; the sitemap requests are issued from
        # callback_login_done once the login has succeeded.
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        # self.cookie_jar = CookieJar()
        yield scrapy.Request(
            login_url,
            callback=self.callback_login,
        )
        # Note: this prints as soon as the login request is scheduled, not after login.
        print("\nLogin request scheduled, processing sitemap next:\n")

    def parse_history(self, response, id):
        # Read the newest and oldest revision dates from the page's history listing.
        print(f"\n\n Parsing: {response.url}\n")
        html = bs4.BeautifulSoup(response.body, "lxml")
        ul = html.find("ul", {"id": "pagehistory"}).find_all("li")
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
        d["published"] = created
        d["updated_at"] = last
        self.pages[id] = d
        return

    def parse_page(self, response):
        # Extract title, body text and URL of a wiki page and request its history.
        print(f"\n\n Parsing: {response.url}\n")
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        if title == "Anmeldung erforderlich":
            # "Login required" page: re-login and skip this response.
            print("login required")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
                callback=self.callback_login,
            )
            print(f"Exiting Title : {title}\n\n")
            return
        print(f"Title : {title}\n\n")
        id = title  # fallback key if no history link can be derived
        h = html.find("link", {"rel": "alternate"})
        if h:
            hist_url = furl(h["href"])  # /index.php?title=ET-Crashkurs&action=history
            hist_url.host = "wiki.fet.at"
            hist_url.args["action"] = "history"
            hist_url.scheme = "https"
            id = hist_url.args["title"]
            yield scrapy.Request(
                hist_url.url,
                self.parse_history,
                cb_kwargs={"id": hist_url.args["title"]},
            )
        else:
            print("No history for this page: %s" % id)
        d = self.pages.get(id, {})
        d["title"] = title
        d["raw"] = html.find("div", {"id": "bodyContent"})
        d["text"] = d["raw"].get_text(" ")
        d["url"] = re.search(r"http[s]?://(.+)$", response.url).group(1)
        self.pages[id] = d
        return
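

# Stand-alone runner sketch (assumption: normally this spider lives inside a
# Scrapy project and is started with `scrapy crawl fetwiki_spider`; the block
# below only shows the generic CrawlerProcess way of running it in-process and
# is not part of the original module).
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(FetWikiSpider)
    process.start()  # blocks until the crawl finishes and closed_handler has run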