Upgrade scraping and searching

2023-05-14 18:15:10 +02:00
parent 38a428eb52
commit fc84fdf4f6
12 changed files with 239 additions and 120 deletions


@@ -1,30 +1,22 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle
import scrapy
# import redis as redis
import bs4
import re
import getpass
# r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document

def publish(response: scrapy.http.response.html.HtmlResponse):
    # Record every fetched page; the redis fan-out below is disabled.
    print("Response type: %s, obj: %s" % (type(response), response))
    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")
documents = {}
class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
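
The spider's Rule definitions fall outside this hunk (the file's middle lines are elided). For context, here is a minimal sketch of how a CrawlSpider typically wires a callback such as publish; the rule pattern, start URL, spider name, and callback name are assumptions for illustration, not the repository's actual configuration:

# Sketch only: rule pattern, start URL, and callback name are assumed.
class ExampleLegacySpider(CrawlSpider):
    name = 'example_legacy_spider'
    allowed_domains = ['legacy.fet.at']
    start_urls = ['https://legacy.fet.at/']  # assumed entry point

    rules = (
        # Follow every in-domain link and hand each response to parse_page.
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        publish(response)  # appends response.url to scraped_urls.log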
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
"title": d.get("name","")+ " - " + d.get("thema", {}).get("title","") + " - " + d.get("meeting",{}).get("title",""),
"text": d.get("text",""),
"raw": d.get("raw",""),
"prio": 100,
"url": "legacy.fet.at/documents/" + str(d["id"]),
"published": d["created_at"],
"updated_at": d["updated_at"]