From e757290649d864f6f1e7cee5ecf1473e1b3091d1 Mon Sep 17 00:00:00 2001
From: Andi
Date: Thu, 13 Apr 2023 09:44:55 +0200
Subject: [PATCH] wikispider added

---
 html_scrapy/settings.py               |   4 +-
 html_scrapy/spiders/fetwiki_spider.py | 160 ++++++++++++++++++++++++++
 requirements.txt                      |   2 +-
 3 files changed, 163 insertions(+), 3 deletions(-)
 create mode 100644 html_scrapy/spiders/fetwiki_spider.py

diff --git a/html_scrapy/settings.py b/html_scrapy/settings.py
index e72d208..17f320f 100644
--- a/html_scrapy/settings.py
+++ b/html_scrapy/settings.py
@@ -31,8 +31,8 @@ ROBOTSTXT_OBEY = True
 #CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
-
+COOKIES_ENABLED = True
+COOKIES_DEBUG = False
 
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
diff --git a/html_scrapy/spiders/fetwiki_spider.py b/html_scrapy/spiders/fetwiki_spider.py
new file mode 100644
index 0000000..523b732
--- /dev/null
+++ b/html_scrapy/spiders/fetwiki_spider.py
@@ -0,0 +1,160 @@
+from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
+from scrapy.linkextractors import LinkExtractor
+import scrapy
+import bs4
+import re
+import getpass
+import urllib.parse
+from html_scrapy.elastic_publish import push_to_index, check_elastic_document
+from furl import furl
+from scrapy import signals
+
+
+def publish(response: scrapy.http.response.html.HtmlResponse):
+    print("Response type: %s, obj: %s" % (type(response), response))
+    with open("scraped_urls.log", "a+") as f:
+        f.write(response.url + "\n")
+
+
+class FetWikiSpider(SitemapSpider):
+    name = "fetwiki_spider"
+    allowed_domains = ["wiki.fet.at"]
+    sitemap_urls = ["https://wiki.fet.at/sitemap/sitemap-wiki-NS_0-0.xml"]
+    sitemap_rules = [("/", "parse_page")]
+    http_user = "andis"
+    http_pass = getpass.getpass("Password for FET user andis: ")
+    pages = {}
+    http_auth_domain = "wiki.fet.at"
+    # rules = (
+    #     Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
+    #     Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
+    # )
+    # start_urls = ['https://legacy.fet.at/themengruppen/15']
+
+    # custom_settings = {
+    #     'DEPTH_LIMIT': '1',
+    # }
+
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
+        # push the collected pages to Elasticsearch once the crawl has finished
+        print("connecting spider_closed signal")
+        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
+        return spider
+
+    def closed_handler(self, spider, reason):
+        print("spider_closed handler called")
+        print(reason)
+        print("found %d elements in pages" % len(spider.pages))
+        for id, p in spider.pages.items():
+            try:
+                output = check_elastic_document(p)
+                print("pushing: %s" % output["url"])
+                push_to_index(output["url"], output)
+            except AttributeError as e:
+                print(e)
+                print("Error occurred at id: --%s--" % id)
+
+    def callback_login_done(self, response):
+        html = bs4.BeautifulSoup(response.body, "lxml")
+        h = html.find("h1", {"id": "firstHeading"}).text
+        print("\nlogin callback done %s\n" % h)
+
+        if h == "FET-Wiki":
+            # login succeeded, start the actual sitemap crawl
+            for url in self.sitemap_urls:
+                yield scrapy.Request(url, self._parse_sitemap)
+        else:
+            h = html.find("div", {"class": "errorbox"})
+            if h:
+                print("\nerrorbox %s" % h.text)
+            return
+
+    def callback_login(self, response):
+        print("\nStart login:\n")
+
+        html = bs4.BeautifulSoup(response.body, "lxml")
+        token = html.find("input", {"name": "wpLoginToken"})["value"]
+        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
+        formdata = {
+            "wpName": self.http_user,
+            "wpPassword": self.http_pass,
+            "domain": "fet.htu.tuwien.ac.at",
+            "wpLoginToken": token,
+            "authAction": "login",
+            "pluggableauthlogin": "Anmelden",
+            "title": "Spezial:Anmelden",
+        }
+        yield scrapy.FormRequest(
+            login_url,
+            formdata=formdata,
+            # meta={"cookiejar": response.meta["cookiejar"]},
+            callback=self.callback_login_done,
+        )
+
+    def start_requests(self):
+        # fetch the login page first to obtain the wpLoginToken
+        # self.cookie_jar = CookieJar()
+        yield scrapy.Request(
+            "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
+            callback=self.callback_login,
+        )
+        print("\nLogin request scheduled, waiting for callback\n")
+
+    def parse_history(self, response, id):
+        print("\n\nParsing: %s\n" % response.url)
+        html = bs4.BeautifulSoup(response.body, "lxml")
+        ul = html.find("ul", {"id": "pagehistory"}).find_all("li")
+        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
+        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
+        d = self.pages.get(id, {})
+        d["published"] = created
+        d["updated_at"] = last
+        self.pages[id] = d
+
+    def parse_page(self, response):
+        print("\n\nParsing: %s\n" % response.url)
+        html = bs4.BeautifulSoup(response.body, "lxml")
+        title = html.find("h1", {"id": "firstHeading"}).text.strip()
+        if title == "Anmeldung erforderlich":
+            # page says "Anmeldung erforderlich" (login required), so log in again
+            print("login required")
+            yield scrapy.Request(
+                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
+                callback=self.callback_login,
+            )
+            print("Exiting, title: %s\n\n" % title)
+            return
+        print("Title: %s\n\n" % title)
+
+        # fall back to the page title as key when there is no history link
+        id = title
+        h = html.find("link", {"rel": "alternate"})
+        if h:
+            hist_url = furl(
+                h["href"]
+            )  # /index.php?title=ET-Crashkurs&action=history
+            hist_url.host = "wiki.fet.at"
+            hist_url.args["action"] = "history"
+            hist_url.scheme = "https"
+            id = hist_url.args["title"]
+            yield scrapy.Request(
+                hist_url.url,
+                self.parse_history,
+                cb_kwargs={"id": id},
+            )
+        else:
+            print("No history for this page: %s" % id)
+
+        d = self.pages.get(id, {})
+        d["title"] = title
+        d["raw"] = html.find("div", {"id": "bodyContent"})
+        d["text"] = d["raw"].get_text(" ")
+        d["url"] = re.search(r"http[s]?://(.+)$", response.url).group(1)
+        self.pages[id] = d
+
+        return None
diff --git a/requirements.txt b/requirements.txt
index eb894a2..6e208d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 beautifulsoup4
 scrapy
-
+furl