from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle
import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json

from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    # Log every fetched URL; the redis publishing path is kept for reference.
    print(f"Response type: {type(response)}, obj: {response}")
    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Module-level cache: parse_document() fills it with document JSON,
# parse_themengruppe() enriches the entries and pushes them to the index.
documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]

    # HTTP basic auth for the legacy site; the password is asked for interactively.
    http_user = 'andis'
    http_pass = getpass.getpass("Password for FET user andis: ")
    #http_auth_domain = 'legacy.fet.at'

    rules = (
        Rule(LinkExtractor(allow=('documents',)), callback='parse_document',
             process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$",)), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']
    custom_settings = {
        'DEPTH_LIMIT': 1,
    }

    def fix_document_links(self, links):
        # Rewrite document links so the JSON representation is fetched.
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        # Same rewrite for topic ("Thema") links.
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def parse_document(self, response):
        # Store the document JSON, keeping the raw HTML plus a plain-text version.
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body

    def parse_themen(self, response):
        # Store a topic JSON with its text stripped of HTML markup.
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body

    def parse_themengruppe(self, response):
        # Walk the topic-group page, attach topic and meeting metadata to the
        # previously collected documents, then push everything to the index.
        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')
        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print(f"found {len(thema_tags)} topics")
        for thema in thema_tags:
            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print(f"\n\ncrawling topic {t.get('id', '?')} - {t.get('title', '?')}")

            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print(f"\nfound {len(meeting_tags)} meetings for topic {t['title']}")
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {'id': m_id, 'title': meeting.find("a").text}
                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print(f"crawling meeting {m.get('title')} with {len(docs)} documents")
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if doc_id not in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t

            # Documents attached directly to the topic (outside any meeting).
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if doc_id not in documents:
                    continue
                documents[doc_id]["thema"] = t

        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - "
                         + d.get("thema", {}).get("title", "") + " - "
                         + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"],
            })
            push_to_index(output[k]["url"], output[k])
            print(f"Document added: {output[k].get('title', '')}")
        #print("\n\nDocuments" + json.dumps(output))