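"""Scrapy spider for legacy.fet.at.

Crawls topic-group pages ("Themengruppen"), fetches the linked documents as
JSON, strips the HTML from their text, attaches topic and meeting metadata,
and pushes the result into the Elasticsearch index via
html_scrapy.elastic_publish.
"""
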
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import scrapy
import bs4
import re
import getpass
import pickle
import json

from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    # Debug helper: log every scraped URL to a file.
    print("Response type: %s, obj: %s" % (type(response), response))
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Collected documents, keyed by their numeric id.
documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]
    http_user = 'andis'
    http_pass = getpass.getpass("Password for FET user andis: ")
    #http_auth_domain = 'legacy.fet.at'
    rules = (
        # Document pages are rewritten to their JSON representation before the request is made.
        Rule(LinkExtractor(allow=('documents',)), callback='parse_document', process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$",)), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']

    custom_settings = {
        'DEPTH_LIMIT': '1',
    }

    def fix_document_links(self, links):
        # Rewrite document links so the JSON representation is requested.
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        # Rewrite topic ("Thema") links so the JSON representation is requested.
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def parse_document(self, response):
        # Store the document JSON; keep the original HTML as "raw" and a plain-text version as "text".
        global documents
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themen(self, response):
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themengruppe(self, response):
        global documents

        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')

        # Each topic ("Thema") is rendered as a schema.org Article.
        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print("found %d topics" % len(thema_tags))
        for thema in thema_tags:
            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print("\n\ncrawling topic %s - %s" % (t.get("id", "?"), t.get("title", "?")))
            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print("\nfound %d meetings for topic %s" % (len(meeting_tags), t["title"]))
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {
                    'id': m_id,
                    'title': meeting.find("a").text,
                }

                # Attach meeting and topic metadata to every document listed under this meeting.
                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print("crawling meeting %s with %d documents" % (m.get("title"), len(docs)))
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if doc_id not in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t

            # Make sure every document under this topic gets topic metadata,
            # including documents that are not part of any meeting.
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if doc_id not in documents:
                    continue
                documents[doc_id]["thema"] = t

        # Enrich every collected document and push it to the Elasticsearch index.
        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "prio": 100,
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"],
            })
            push_to_index(output[k]["url"], output[k])

            print("Document added: %s" % output[k].get("title", ""))
        #print("\n\nDocuments" + json.dumps(output))
        return None
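

# A minimal sketch of running the spider standalone, assuming it is started from
# within the Scrapy project; the usual invocation would be `scrapy crawl legacy_spider`.
# CrawlerProcess, crawl() and start() are standard Scrapy API.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(LegacySpider)
    process.start()  # blocks until the crawl finishes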