first_commit
html_scrapy/__init__.py (new file, empty)
html_scrapy/elastic_publish.py (new file)
import os

from elasticsearch import Elasticsearch

from searching import es_client, ELASTIC_INDEX


def push_to_index(id, element):
    # Validate the document, then write it to the configured Elasticsearch index.
    element = check_elastic_document(element)
    try:
        with es_client() as client:
            client.index(index=ELASTIC_INDEX, id=id, document=element)
    except Exception as e:
        print(e)


def check_elastic_document(element):
    # Make sure every required field is present, then return a normalized document.
    for field in ["url", "title", "text", "published", "updated_at"]:
        if field not in element:
            raise AttributeError("A %s is needed for the Elastic Element" % field)

    return {
        "published": str(element["published"]),
        "text": element["text"],
        "title": element["title"],
        #"source": get_source(post),
        "url": element["url"],
        "updated_at": str(element["updated_at"]),
    }
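For context, a minimal usage sketch (not part of the commit): the field values below are placeholders, and es_client / ELASTIC_INDEX come from the project's searching module, which this commit does not include.

# Hypothetical usage of the helpers above; all values are placeholders.
from datetime import datetime, timezone

from html_scrapy.elastic_publish import push_to_index

doc = {
    "url": "https://example.org/documents/1",
    "title": "Example document",
    "text": "Plain-text body of the document",
    "published": datetime.now(timezone.utc),
    "updated_at": datetime.now(timezone.utc),
}
# check_elastic_document() runs inside push_to_index() and raises
# AttributeError if any of the five required fields is missing.
push_to_index(doc["url"], doc)  # the URL doubles as the Elasticsearch document id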
html_scrapy/settings.py (new file)
# Scrapy settings for t project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'html_scrapy'

SPIDER_MODULES = ['html_scrapy.spiders']
NEWSPIDER_MODULE = 'html_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 't (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    't.middlewares.TSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 't.middlewares.TDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 543
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    't.pipelines.TPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
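The only change against the generated template is enabling Scrapy's built-in HttpAuthMiddleware. It takes its credentials from attributes on the spider, roughly as in this sketch (an illustrative spider, not part of the commit; the real credentials live in legacy_spider.py below):

# Illustrative spider showing the attributes HttpAuthMiddleware looks for;
# the actual spider in this commit is html_scrapy/spiders/legacy_spider.py.
import scrapy

class AuthExampleSpider(scrapy.Spider):
    name = 'auth_example'                    # hypothetical spider name
    http_user = 'someuser'                   # placeholder credentials
    http_pass = 'somepassword'
    http_auth_domain = 'example.org'         # restrict basic auth to this domain
    start_urls = ['https://example.org/']

    def parse(self, response):
        yield {'url': response.url, 'status': response.status}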
html_scrapy/spiders/__init__.py (new file, empty)
html_scrapy/spiders/legacy_spider.py (new file)
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import pickle
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    # Log a fetched response; the Redis publishing below is currently disabled.
    print("Response type: %s, obj: %s" % (type(response), response))

    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)

    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Documents collected across callbacks, keyed by their numeric id.
documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]
    http_user = 'andis'
    http_pass = getpass.getpass("Password for FET user andis: ")
    #http_auth_domain = 'legacy.fet.at'
    rules = (
        Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']

    custom_settings = {
        'DEPTH_LIMIT': '1',
    }

    def fix_document_links(self, links):
        # Rewrite document links to their JSON representation.
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        # Same rewrite for topic ("Themen") links.
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def parse_document(self, response):
        # Store the JSON document, keeping the raw HTML and a plain-text version.
        global documents
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themen(self, response):
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themengruppe(self, response):
        # Walk the topic-group page, attach topic and meeting metadata to the
        # documents collected so far, then push everything to Elasticsearch.
        global documents

        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')

        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print("found %d topics" % len(thema_tags))
        for thema in thema_tags:
            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print("\n\ncrawling topic %s - %s" % (t.get("id", "?"), t.get("title", "?")))

            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print("\nfound %d meetings for topic %s" % (len(meeting_tags), t["title"]))
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {
                    'id': m_id,
                    'title': meeting.find("a").text,
                }

                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print("crawling meeting %s with %d documents" % (m.get("title"), len(docs)))
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if doc_id not in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t

            # Documents attached directly to the topic, outside any meeting.
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if doc_id not in documents:
                    continue
                documents[doc_id]["thema"] = t

        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"]
            })
            push_to_index(output[k]["url"], output[k])

            print("Document added: %s" % output[k].get("title", ""))
        #print("\n\nDocuments" + json.dumps(output))
        return None
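To run the spider outside the scrapy CLI (the usual `scrapy crawl legacy_spider` also works), something along these lines should do. This is a sketch, not part of the commit, and it assumes the project settings are discoverable (e.g. via scrapy.cfg or SCRAPY_SETTINGS_MODULE):

# Hypothetical runner script, not included in this commit.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from html_scrapy.spiders.legacy_spider import LegacySpider

process = CrawlerProcess(get_project_settings())  # picks up html_scrapy/settings.py
process.crawl(LegacySpider)
process.start()  # blocks until the crawl finishes; documents are pushed to Elasticsearch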