upgrade scraping and searching
@@ -1,30 +1,22 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle

import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
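    """Log the scraped response and append its URL to scraped_urls.log."""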
    print("Response type: %s, obj: %s" % (type(response), response))

    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)

    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


documents = {}


class LegacySpider(CrawlSpider):
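    """Crawl legacy.fet.at and push extracted documents to the search index."""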
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
@@ -112,6 +104,7 @@ class LegacySpider(CrawlSpider):
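            # Assemble the search-index payload for one legacy document d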
            "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
            "text": d.get("text", ""),
            "raw": d.get("raw", ""),
            "prio": 100,
            "url": "legacy.fet.at/documents/" + str(d["id"]),
            "published": d["created_at"],
            "updated_at": d["updated_at"]
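The hunk ends before showing where this payload is handed to the Elasticsearch helpers imported at the top of the file. As a rough sketch only, with every name and signature below assumed rather than taken from this diff (the real check_elastic_document / push_to_index signatures are not visible here), the wiring might look like:

from html_scrapy.elastic_publish import push_to_index, check_elastic_document

def index_legacy_document(d: dict) -> None:
    # Hypothetical glue code, not part of this commit.
    doc_url = "legacy.fet.at/documents/" + str(d["id"])
    # Assumed API: skip documents that are already indexed, then push.
    if not check_elastic_document(doc_url):
        push_to_index({
            "title": d.get("name", ""),
            "text": d.get("text", ""),
            "url": doc_url,
            "published": d["created_at"],
        })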