from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import scrapy
import bs4
import re
import getpass
import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals
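
# Note on the project-local Elasticsearch helpers imported above: their
# interface is inferred purely from the call sites in closed_handler() below --
# check_elastic_document(page_dict) is expected to return a dict with at least
# a "url" key, and push_to_index(url, document) to index that document.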


def publish(response: scrapy.http.response.html.HtmlResponse):
    # Append every scraped URL to a local log file.
    print("Response type: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


class FetWikiSpider(SitemapSpider):
    name = "fetwiki_spider"
    allowed_domains = ["wiki.fet.at"]
    sitemap_urls = ["https://wiki.fet.at/sitemap/sitemap-wiki-NS_0-0.xml"]
    # Route every URL found in the sitemap to parse_page.
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Password for FET user andis: ")
    # Collected page data, keyed by page title; each entry gathers
    # title/raw/text/url (parse_page) and published/updated_at (parse_history).
    pages = {}
    http_auth_domain = "wiki.fet.at"

    # rules = (
    #     Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
    #     Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    # )
    # start_urls = ['https://legacy.fet.at/themengruppen/15']

    # custom_settings = {
    #     'DEPTH_LIMIT': '1',
    # }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Register a handler so the collected pages are pushed once the crawl ends.
        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider
    def closed_handler(self, spider, reason):
        print("spider_closed handler called")
        print(reason)
        print("found %d elements in pages" % len(self.pages))
        for id, p in self.pages.items():
            try:
                output = check_elastic_document(p)
                print("pushing: %s" % output["url"])
                push_to_index(output["url"], output)
            except AttributeError as e:
                print(e)
                print("Error occurred at id: --%s--" % id)

    def callback_login_done(self, response):
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
        print("\nlogin callback done %s\n" % h)

        if h == "FET-Wiki":
            # Login succeeded: feed the sitemap URLs into SitemapSpider's own
            # sitemap handling.
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
            h = html.find("div", {"class": "errorbox"})
            if h:
                print("\nerrorbox %s" % h.text)
            return

    def callback_login(self, response):
        print("\nStart Login:\n")

        html = bs4.BeautifulSoup(response.body, "lxml")
        # The MediaWiki login form carries a hidden CSRF token that must be
        # posted back together with the credentials.
        token = html.find("input", {"name": "wpLoginToken"})["value"]
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        formdata = {
            "wpName": self.http_user,
            "wpPassword": self.http_pass,
            "domain": "fet.htu.tuwien.ac.at",
            "wpLoginToken": token,
            "authAction": "login",
            "pluggableauthlogin": "Anmelden",
            "title": "Spezial:Anmelden",
        }
        yield scrapy.FormRequest(
            login_url,
            formdata=formdata,
            # meta={"cookiejar": response.meta["cookiejar"]},
            callback=self.callback_login_done,
        )
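
    # Hedged alternative, not wired into the spider: Scrapy's
    # FormRequest.from_response() can pre-fill hidden fields (including
    # wpLoginToken) straight from the login page, which would replace the
    # manual BeautifulSoup token lookup above. The method below is only a
    # sketch; whether the default formnumber=0 selects the right form on
    # wiki.fet.at is an assumption and has not been verified.
    def callback_login_via_from_response(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                "wpName": self.http_user,
                "wpPassword": self.http_pass,
            },
            callback=self.callback_login_done,
        )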

    def start_requests(self):
        # Fetch the login page first; the actual sitemap crawl is kicked off
        # from callback_login_done() once the login has gone through.
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        # self.cookie_jar = CookieJar()
        yield scrapy.Request(
            login_url,
            callback=self.callback_login,
        )

    def parse_history(self, response, id):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
        # The history list is ordered newest-first: the first entry is the
        # latest edit, the last entry is the page creation.
        ul = html.find("ul", {"id": "pagehistory"}).find_all("li")
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
        d["published"] = created
        d["updated_at"] = last
        self.pages[id] = d
        return

    def parse_page(self, response):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        # "Anmeldung erforderlich" means the wiki is asking for a login.
        if title == "Anmeldung erforderlich":
            print("login required")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
                callback=self.callback_login,
            )
            print("Exiting, title: %s\n\n" % title)
            return
        print("Title: %s\n\n" % title)

        h = html.find("link", {"rel": "alternate"})
        if h:
            # e.g. /index.php?title=ET-Crashkurs&action=history
            hist_url = furl(h["href"])
            hist_url.host = "wiki.fet.at"
            hist_url.args["action"] = "history"
            hist_url.scheme = "https"
            id = hist_url.args["title"]
            yield scrapy.Request(
                hist_url.url,
                self.parse_history,
                cb_kwargs={"id": hist_url.args["title"]},
            )
        else:
            # No alternate/history link: fall back to the response URL as the
            # page key so the entry can still be stored.
            id = response.url
            print("No history for this page: %s" % id)
        d = self.pages.get(id, {})
        d["title"] = title
        d["raw"] = html.find("div", {"id": "bodyContent"})
        d["text"] = d["raw"].get_text(" ")

        # Store the URL without its scheme.
        d["url"] = re.search(r"http[s]?://(.+)$", response.url).group(1)

        self.pages[id] = d

        return None
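

# Hedged usage sketch, not part of the original project: the spider can also be
# driven directly with CrawlerProcess instead of `scrapy crawl fetwiki_spider`.
# The settings below are illustrative assumptions, not the project's settings.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "LOG_LEVEL": "INFO",
        # Be polite to the wiki; this delay is an assumption, tune as needed.
        "DOWNLOAD_DELAY": 0.5,
    })
    process.crawl(FetWikiSpider)
    process.start()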