upgrade scraping and searching
@@ -20,6 +20,7 @@ def check_elastic_document(element):
        "text": element["text"],
        "title": element["title"],
        # "source": get_source(post),
        "prio": element.get("prio", 1000),
        "url": element["url"],
        "updated_at": str(element["updated_at"])
    }
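For orientation: the fields touched in this hunk suggest the scraped pages are plain dicts with roughly the shape below before they are handed to check_elastic_document. This is an illustrative sketch, not part of the commit; the values are invented, only the key names and the prio default of 1000 come from the hunk above.

# Hypothetical input to check_elastic_document -- sample values are made up.
page = {
    "text": "Plain-text body of the scraped wiki page",
    "title": "FET-Wiki: Example page",
    "prio": 1000,  # element.get("prio", 1000) falls back to 1000 when the key is missing
    "url": "https://wiki.fet.at/index.php?title=Example_page",
    "updated_at": "2021-03-03T14:27:00.000000Z",
}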
32  html_scrapy/guess_date.py  Normal file
@@ -0,0 +1,32 @@
from datetime import datetime
from dateutil.parser import parse as dateutil_parse  # renamed so it is not shadowed by parse() below
import locale
from contextlib import suppress


def parse(s):
    with suppress(BaseException):
        date_format = '%H:%M, %d. %b. %Y'
        locale.setlocale(locale.LC_TIME, 'en_US')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    with suppress(BaseException):
        date_format = '%H:%M, %d. %b. %Y'
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    with suppress(BaseException):
        date_format = '%H:%M, %d. %B %Y'
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = datetime.strptime(s, date_format)
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    with suppress(BaseException):
        locale.setlocale(locale.LC_TIME, 'de_DE')
        parsed_date = dateutil_parse(s, dayfirst=True)  # dateutil fallback for anything the fixed formats miss
        return parsed_date.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    return s
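A minimal usage sketch of the new helper (illustrative, not part of the commit), assuming the en_US and de_DE locales are installed on the host; the timestamp is a made-up sample in the MediaWiki history format the patterns above target.

from html_scrapy import guess_date

# Made-up MediaWiki history timestamp; with de_DE available it should match
# '%H:%M, %d. %B %Y' and come back as an ISO-like string such as
# '2021-03-03T14:27:00.000000Z'.
print(guess_date.parse("14:27, 3. März 2021"))

# Strings that match none of the formats are returned unchanged.
print(guess_date.parse("not a date"))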
@@ -4,11 +4,11 @@ import scrapy
import bs4
import re
import getpass
import urllib.parse

from html_scrapy.elastic_publish import push_to_index, check_elastic_document
from furl import furl
from scrapy import signals

from html_scrapy import guess_date

def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
@@ -22,6 +22,7 @@ class FetWikiSpider(SitemapSpider):
    sitemap_rules = [("/", "parse_page")]
    http_user = "andis"
    http_pass = getpass.getpass("Passwort von FET User andis: ")
    login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
    pages = {}
    http_auth_domain = "wiki.fet.at"
    # rules = (
@@ -36,34 +37,27 @@ class FetWikiSpider(SitemapSpider):
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(FetWikiSpider, cls).from_crawler(crawler, *args, **kwargs)
        print("connect signal")
        crawler.signals.connect(spider.closed_handler, signal=signals.spider_closed)
        return spider

    def closed_handler(self, spider, reason):
        print("closing !! Handler")
        print(reason)
        print("found %d elements in pages" % len(spider.pages))
        print("found %d elements in pages" % len(self.pages))
        with open("scraped_urls.log", "a+") as f:
            for id, p in spider.pages.items():
                try:
                    output = check_elastic_document(p)
                    print("pushing: %s" % output["url"])
                    push_to_index(output["url"], output)
                    f.write(output["url"] + "\n")
                except AttributeError as e:
                    print(e)
                    print("Error occurred at id: --%s--" % id)

        # do stuff here

    def callback_login_done(self, response):

    def callback_login_step2(self, response):
        html = bs4.BeautifulSoup(response.body, "lxml")
        h = html.find("h1", {"id": "firstHeading"}).text
        print("\nlogin callback done %s\n" % h)

        if h == "FET-Wiki":
        if h == "FET-Wiki":  # Login successful
            for url in self.sitemap_urls:
                yield scrapy.Request(url, self._parse_sitemap)
        else:
@@ -72,12 +66,11 @@ class FetWikiSpider(SitemapSpider):
            print("\nerrorbox %s" % h.text)
            return

    def callback_login(self, response):
    def callback_login_step1(self, response):
        print("\nStart Login:\n")

        html = bs4.BeautifulSoup(response.body, "lxml")
        token = html.find("input", {"name": "wpLoginToken"})["value"]
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"

        formdata = {
            "wpName": self.http_user,
            "wpPassword": self.http_pass,
@@ -88,22 +81,17 @@ class FetWikiSpider(SitemapSpider):
            "title": "Spezial:Anmelden"
        }
        yield scrapy.FormRequest(
            login_url,
            self.login_url,
            formdata=formdata,
            # meta={"cookiejar": response.meta["cookiejar"]},
            callback=self.callback_login_done,
            callback=self.callback_login_step2,
        )

    def start_requests(self):
        login_url = "https://wiki.fet.at/index.php?title=Spezial:Anmelden"
        # self.cookie_jar = CookieJar()
        yield scrapy.Request(
            "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
            callback=self.callback_login,
            self.login_url,
            callback=self.callback_login_step1,
        )

        print("\nLogin done Processing Sitemap:\n")

    def parse_history(self, response, id):
        print("\n\n Parsing: %s\n" % response.url)
        html = bs4.BeautifulSoup(response.body, "lxml")
@@ -111,10 +99,9 @@ class FetWikiSpider(SitemapSpider):
        last = ul[0].find("a", {"class": "mw-changeslist-date"}).text
        created = ul[-1].find("a", {"class": "mw-changeslist-date"}).text
        d = self.pages.get(id, {})
        d["published"] = created
        d["updated_at"] = last
        d["published"] = guess_date.parse(created)
        d["updated_at"] = guess_date.parse(last)
        self.pages[id] = d

        return

    def parse_page(self, response):
@@ -122,10 +109,9 @@ class FetWikiSpider(SitemapSpider):
        html = bs4.BeautifulSoup(response.body, "lxml")
        title = html.find("h1", {"id": "firstHeading"}).text.strip()
        if title == "Anmeldung erforderlich":
            print("login erforderlich")
            yield scrapy.Request(
                "https://wiki.fet.at/index.php?title=Spezial:Anmelden",
                callback=self.callback_login,
                callback=self.callback_login_step1,
            )
            print("Exiting Title : %s\n\n" % title)
            return
@@ -1,30 +1,22 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle

import scrapy
# import redis as redis
import bs4
import re
import getpass
# r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import pickle
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response typ: %s, obj: %s" % (type(response), response))

    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)

    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
            "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
            "text": d.get("text", ""),
            "raw": d.get("raw", ""),
            "prio": 100,
            "url": "legacy.fet.at/documents/" + str(d["id"]),
            "published": d["created_at"],
            "updated_at": d["updated_at"]
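These legacy documents go down the same publish path as the wiki pages; below is a hedged sketch of that last step (illustrative, not part of the commit: the sample document is invented, the call pattern mirrors closed_handler above).

from html_scrapy.elastic_publish import check_elastic_document, push_to_index

doc = {
    "title": "Protokoll - Thema - Sitzung",     # invented sample values;
    "text": "plain text of the document",       # key names taken from the hunk above
    "raw": "<p>raw html of the document</p>",
    "prio": 100,
    "url": "legacy.fet.at/documents/42",
    "published": "2015-05-01T10:00:00",
    "updated_at": "2015-05-02T12:00:00",
}

output = check_elastic_document(doc)    # appears to validate/normalize the fields
push_to_index(output["url"], output)    # same (id, document) call as in closed_handler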