upgrade scraping and searching

2023-05-14 18:15:10 +02:00
parent 38a428eb52
commit fc84fdf4f6
12 changed files with 239 additions and 120 deletions


@@ -4,11 +4,11 @@ import scrapy
import bs4
import re
import getpass
import urllib.parse
from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals
from html_scrapy import guess_date
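# publish(): debug helper; prints the response and appends to scraped_urls.log (presumably the scraped URL; the rest of the body lies outside this hunk)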
def publish(response: scrapy.http.response.html.HtmlResponse):
print("Response typ: %s, obj: %s" % (type(response), response))
with open("scraped_urls.log", "a+") as f:
@@ -22,6 +22,7 @@ class FetWikiSpider(SitemapSpider):
sitemap_rules = [("/", "parse_page")]
http_user = "andis"
http_pass = getpass.getpass("Password of FET user andis: ")
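# Login endpoint defined once as a class attribute and reused by start_requests and the login form submission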
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
pages = {}
http_auth_domain = "wiki.fet.at"
# rules = (
@@ -36,34 +37,27 @@ class FetWikiSpider(SitemapSpider):
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
print("connect signal")
crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
return spider
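# spider_closed handler (connected above via crawler.signals): pushes every collected page to Elasticsearch and records its URL in scraped_urls.log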
def closed_handler(self, spider, reason):
print("spider closed, running cleanup handler")
print(reason)
print("found %d elements in pages" % len(spider.pages))
print("found %d elements in pages" % len(self.pages))
with open("scraped_urls.log", "a+") as f:
for id,p in spider.pages.items():
try:
output = check_elastic_document(p)
print(f"pushing: %s" % output["url"])
push_to_index(output["url"], output)
f.write(output["url"] + "\n")
except AttributeError as e:
print(e)
print(f"Error occured at id: --%s--" %id)
# do stuff here
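# Two-step login flow: callback_login_step1 posts the credentials, callback_login_step2 checks the resulting page heading and, on success, starts the sitemap requests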
def callback_login_done(self, response):
def callback_login_step2(self, response):
html = bs4.BeautifulSoup(response.body, "lxml")
h = html.find("h1", {"id": "firstHeading"}).text
print(f"\nlogin callback done %s\n" % h)
if h == "FET-Wiki":
if h == "FET-Wiki": # Login successful
for url in self.sitemap_urls:
yield scrapy.Request(url, self._parse_sitemap)
else:
@@ -72,12 +66,11 @@ class FetWikiSpider(SitemapSpider):
print(f"\nerrorbox %s" % h.text)
return
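# Step 1 of the login: pull the wpLoginToken CSRF token out of the login form and submit the credentials as a FormRequest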
def callback_login(self, response):
def callback_login_step1(self, response):
print(f"\nStart Login:\n")
html = bs4.BeautifulSoup(response.body, "lxml")
token = html.find("input", {"name": "wpLoginToken"})["value"]
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
formdata = {
"wpName": self.http_user,
"wpPassword": self.http_pass,
@@ -88,22 +81,17 @@ class FetWikiSpider(SitemapSpider):
"title": "Spezial:Anmelden"
}
yield scrapy.FormRequest(
login_url,
self.login_url,
formdata=formdata,
#meta={"cookiejar": response.meta["cookiejar"]},
callback=self.callback_login_done,
callback=self.callback_login_step2,
)
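# Override SitemapSpider.start_requests: request the login page first; sitemap parsing only starts from callback_login_step2 after a successful login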
def start_requests(self):
login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
#self.cookie_jar = CookieJar()
yield scrapy.Request(
"https://wiki.fet.at/index.php?title=Spezial:Anmelden",
callback=self.callback_login,
self.login_url,
callback=self.callback_login_step1,
)
print(f"\nLogin done Processing Sitemap:\n")
def parse_history(self, response, id):
print(f"\n\n Parsing: %s\n" % response.url)
html = bs4.BeautifulSoup(response.body, "lxml")
@@ -111,10 +99,9 @@ class FetWikiSpider(SitemapSpider):
last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
d = self.pages.get(id,{})
d["published"]= created
d["updated_at"]= last
d["published"]= guess_date.parse(created)
d["updated_at"]= guess_date.parse(last)
self.pages[id]=d
return
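# parse_page: if the wiki returns the "Anmeldung erforderlich" (login required) page, the session has expired, so re-run the login flow instead of indexing the page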
def parse_page(self, response):
@@ -122,10 +109,9 @@ class FetWikiSpider(SitemapSpider):
html = bs4.BeautifulSoup(response.body, "lxml")
title = html.find("h1", {"id": "firstHeading"}).text.strip()
if title == "Anmeldung erforderlich":
print("login erforderlich")
yield scrapy.Request(
"https://wiki.fet.at/index.php?title=Spezial:Anmelden",
callback=self.callback_login,
callback=self.callback_login_step1,
)
print(f"Exiting Title : %s\n\n" % title)
return