first_commit

2023-02-19 10:04:10 +01:00
commit ed610970cb
23 changed files with 11434 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
archiv/*
.pytest_cache
test_elastic
*.pyc
*.bak

16
Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM python:3.8
WORKDIR /srv
RUN pip3 install setuptools_scm
RUN pip3 install elasticsearch
RUN pip3 install scrapy
COPY requirements.txt .
RUN pip3 install -r requirements.txt
COPY scrapy.cfg .
COPY html_scrapy ./html_scrapy
COPY searching ./searching
CMD ls && scrapy crawl legacy_spider
# spider: html_scrapy/spiders/legacy_spider.py

9
Dockerfile_index Normal file
View File

@@ -0,0 +1,9 @@
FROM python:3.8
WORKDIR /srv
RUN pip3 install setuptools_scm
RUN pip3 install elasticsearch
COPY searching ./searching
COPY reset_index.py .
CMD python3 reset_index.py

2
build_docker Normal file
View File

@@ -0,0 +1,2 @@
docker build -t docker.fet.at/andis_scrapy:0.1.2 .
docker build -t docker.fet.at/andis_index:0.1.2 -f Dockerfile_index .

0
html_scrapy/__init__.py Normal file
View File

25
html_scrapy/elastic_publish.py Normal file
View File

@@ -0,0 +1,25 @@
import os
from elasticsearch import Elasticsearch
from searching import es_client, ELASTIC_INDEX


def push_to_index(id, element):
    """Validate the element and index it under the given id."""
    element = check_elastic_document(element)
    try:
        with es_client() as client:
            client.index(index=ELASTIC_INDEX, id=id, document=element)
    except Exception as e:
        print(e)


def check_elastic_document(element):
    """Ensure all required fields are present and normalise the document."""
    for e in ["url", "title", "text", "published", "updated_at"]:
        if e not in element:
            raise AttributeError(f"A {e} is needed for the Elastic Element")
    return {
        "published": str(element["published"]),
        "text": element["text"],
        "title": element["title"],
        # "source": get_source(post),
        "url": element["url"],
        "updated_at": str(element["updated_at"]),
    }

93
html_scrapy/settings.py Normal file
View File

@@ -0,0 +1,93 @@
# Scrapy settings for the html_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'html_scrapy'
SPIDER_MODULES = ['html_scrapy.spiders']
NEWSPIDER_MODULE = 'html_scrapy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 't (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 't.middlewares.TSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 't.middlewares.TDownloaderMiddleware': 543,
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 543
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 't.pipelines.TPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

123
html_scrapy/spiders/legacy_spider.py Normal file
View File

@@ -0,0 +1,123 @@
from scrapy.spiders import SitemapSpider, CrawlSpider, Rule, Spider
from scrapy.linkextractors import LinkExtractor
import pickle
import scrapy
#import redis as redis
import bs4
import re
import getpass
#r = redis.Redis(host='localhost', port=6379, password="12345678", db=0)
import json
from html_scrapy.elastic_publish import push_to_index, check_elastic_document


def publish(response: scrapy.http.response.html.HtmlResponse):
    print("Response type: %s, obj: %s" % (type(response), response))
    # r.publish("fetlegacy", pickle.dumps({"url": response.url, "body": response.body}))
    # r.set(response.url, response.body)
    with open("scraped_urls.log", "a+") as f:
        f.write(response.url + "\n")


# Documents collected across callbacks, keyed by their numeric id.
documents = {}


class LegacySpider(CrawlSpider):
    name = 'legacy_spider'
    allowed_domains = ['legacy.fet.at']
    #sitemap_urls = ['https://fet.at/sitemap.xml']
    #sitemap_rules = [('/posts/', 'parse')]
    http_user = 'andis'
    http_pass = getpass.getpass("Password for FET user andis: ")
    #http_auth_domain = 'legacy.fet.at'
    rules = (
        Rule(LinkExtractor(allow=('documents', )), callback='parse_document', process_links='fix_document_links'),
        Rule(LinkExtractor(allow=(r"themengruppen/\d+$", )), callback='parse_themengruppe'),
    )
    start_urls = ['https://legacy.fet.at/themengruppen/15']
    custom_settings = {
        'DEPTH_LIMIT': '1',
    }

    def fix_document_links(self, links):
        # Request the JSON representation of each document instead of the HTML page.
        for link in links:
            if re.match(r".*documents/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def fix_themen_links(self, links):
        for link in links:
            if re.match(r".*themen/\d+$", link.url):
                link.url = link.url + ".json"
            yield link

    def parse_document(self, response):
        global documents
        body = json.loads(response.body)
        body["raw"] = body["text"]
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themen(self, response):
        body = json.loads(response.body)
        body["text"] = bs4.BeautifulSoup(body["text"], 'lxml').get_text(" ")
        documents[int(body['id'])] = body
        return

    def parse_themengruppe(self, response):
        global documents
        themengruppe = bs4.BeautifulSoup(response.body, 'lxml')
        thema_tags = themengruppe.find_all("div", itemtype="http://schema.org/Article")
        print("found %d topics" % len(thema_tags))
        for thema in thema_tags:
            t_link = thema.find("h2").find_all("a")[0]
            t = {
                'url': t_link["href"],
                'title': t_link.text,
                'id': int(re.search(r"/(\d+)$", t_link["href"]).group(1)),
            }
            print("\n\ncrawling topic %s - %s" % (t.get("id", "?"), t.get("title", "?")))
            meeting_tags = thema.find_all("div", id=re.compile(r'meeting_\d+'))
            print("\nfound %d meetings for topic %s" % (len(meeting_tags), t["title"]))
            for meeting in meeting_tags:
                m_id = re.search(r"meeting_(\d+)$", meeting['id']).group(1)
                m = {'id': m_id,
                     'title': meeting.find("a").text
                     }
                docs = meeting.find_all(id=re.compile(r'document_\d+'))
                print("crawling meeting %s with %d documents" % (m.get("title"), len(docs)))
                for d in docs:
                    doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                    if doc_id not in documents:
                        continue
                    documents[doc_id]["meeting"] = m
                    documents[doc_id]["thema"] = t
            # Documents attached directly to the topic, outside of a meeting.
            for d in thema.find_all(id=re.compile(r'document_\d+')):
                doc_id = int(re.search(r"document_(\d+)$", d['id']).group(1))
                if doc_id not in documents:
                    continue
                documents[doc_id]["thema"] = t
        output = {}
        for k, d in documents.items():
            output[k] = check_elastic_document({
                "title": d.get("name", "") + " - " + d.get("thema", {}).get("title", "") + " - " + d.get("meeting", {}).get("title", ""),
                "text": d.get("text", ""),
                "raw": d.get("raw", ""),
                "url": "legacy.fet.at/documents/" + str(d["id"]),
                "published": d["created_at"],
                "updated_at": d["updated_at"]
            })
            push_to_index(output[k]["url"], output[k])
            print("Document added: %s" % output[k].get("title", ""))
        #print("\n\nDocuments" + json.dumps(output))
        return None

68
httpdemo/__init__.py Normal file
View File

@@ -0,0 +1,68 @@
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from fastapi import Request
from elasticsearch import Elasticsearch
import os
import sys
import elastic_transport

ELASTIC_HOST = "http://localhost:9200"
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")

client = Elasticsearch(ELASTIC_HOST, verify_certs=False)
app = FastAPI(debug=True)
templates = Jinja2Templates(directory="./httpdemo")
#app.mount("/", StaticFiles(directory="/"))

#@app.get("/")
#def read_root():
#    return {"Hello"}

# Default query/highlight bodies (the handler below builds its own query).
query = {
    "multi_match": {
        "query": ELASTIC_QUERY,
        "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
        "type": "most_fields"
    }
}
highlight = {
    "fields": {
        "title": {},
        "text": {}
    }
}


@app.get("/")
def serve_home(request: Request, q: str):
    # Wildcard variant, kept for reference; overridden by the match query below.
    query = {
        "bool": {
            "should": [{"wildcard": {"body": {"value": "*%s*" % q, "case_insensitive": True}}},
                       {"wildcard": {"title": {"value": "*%s*" % q, "case_insensitive": True}}}],
            "minimum_should_match": 1
        }}
    query = {
        "match": {
            "body": q
        }
    }
    try:
        resp = client.search(
            index="posts2",
            size=10,
            #analyze_wildcard=True,
            #q="sdf*",
            query=query
        )
    except (elastic_transport.ConnectionError, elastic_transport.ConnectionTimeout) as e:
        print(e, file=sys.stderr)
        results = []
    else:
        results = resp["hits"]["hits"]
    return templates.TemplateResponse("index.html", context={"request": request, "results": results})

10
httpdemo/index.html Normal file
View File

@@ -0,0 +1,10 @@
<body>
<h1>Hello Index httpdemo</h1>
<pre>{{request}}</pre>
<ul>{% for r in results %}
    <li><a href="{{r['_source']['url']}}">
        {{r["_source"]["body"]|safe}}
    </a></li>
{% endfor %}
</ul>
</body>

10909
httpdemo/jquery.js vendored Normal file

File diff suppressed because it is too large.

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
beautifulsoup4
scrapy

4
reset_index.py Normal file
View File

@@ -0,0 +1,4 @@
from searching.index import reset_index
reset_index()

1
run Normal file
View File

@@ -0,0 +1 @@
docker run --rm -it --network elastic $(docker build -q .)

1
run_elastic Normal file
View File

@@ -0,0 +1 @@
docker run --name elastic --rm --net elastic -v esdata:/usr/share/elasticsearch/data -e ES_JAVA_OPTS="-Xms1g -Xmx1g" -e xpack.security.transport.ssl.enabled=false -e discovery.type=single-node -e ELASTIC_PASSWORD=*l9qNGoojiCC4n9KcZhj -p 9200:9200 -it docker.elastic.co/elasticsearch/elasticsearch:8.6.1

1
run_index Normal file
View File

@@ -0,0 +1 @@
docker run --rm -it -e ELASTIC_HOST=https://elastic:9200 --network elastic $(docker build -q -f Dockerfile_index .)

12
scrap_facebook.py Normal file
View File

@@ -0,0 +1,12 @@
# https://pypi.org/project/facebook-page-scraper/
from facebook_page_scraper import Facebook_scraper
page_name = "fsmbtu"
posts_count = 20
browser = "chrome"
#proxy = "IP:PORT" #if proxy requires authentication then user:password@IP:PORT
timeout = 120  # timeout in seconds
headless = True
meta_ai = Facebook_scraper(page_name, posts_count, browser, timeout=timeout, headless=headless)
json_data = meta_ai.scrap_to_json()
print(json_data)

4
scrap_facebook_data.py Normal file
View File

@@ -0,0 +1,4 @@
posts = [
{"pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl": {"name": "Fachschaft Maschinenbau & Verfahrenstechnik der TU Wien", "shares": 0, "reactions": {"likes": 1, "loves": 0, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 1, "comments": 0, "content": "Du studierst oder hast Interesse am Master Materialwissenschaften?Dann komm zum Einführungstutorium/ Semesteropening, dort kannst du deine KollegInnen kennenlernen und alle Fragen klären!Wann? 13.10. 18:00Wo? GM7 Kleiner Schiffbau#bussi", "posted_on": "2022-10-11T09:34:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311170027_629045595356569_7847357821067350500_n.jpg?stp=dst-jpg_s600x600&_nc_cat=109&ccb=1-7&_nc_sid=730e14&_nc_ohc=aKyj-g-6ZzcAX_fgezX&_nc_ht=scontent-vie1-1.xx&oh=00_AfAIA7Wm9pPltcSK1n-ZJ2DPcIUFSKdvljZ14FNp208FCg&oe=63632F0B"], "post_url": "https://www.facebook.com/fsmbtu/posts/pfbid02Zpa9GaGnyh431W29SMTdMFzwCnvndgCqDuRKY4kfmLzMhVfZyf1BG2mSokE4HXCsl?__cft__[0]=AZVKCuhSkdYgll3ZFvIsXEx0U9ybfnyKM7ua43FdC1OloGDAkzTrNwNqNI3tcmBsqUbme0jH423_h1Wvy_ec-Xj-2QZxcQy49C6VeA78y4mD8Ex_VbitnZvxkJIm0rbYwkFiCOnwBnLbUTy5bia7yQXGNVtgJixRiY8aYIgC50mPMwjf4dLZbzX2NARd-eAXx6BBhNfeScm8n4TAkEuZiTk5FaiG97WMv2_AucJshoZ72g&__tn__=%2CO%2CP-R"}},
{"pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl": {"name": "Fachschaft Elektrotechnik", "shares": 0, "reactions": {"likes": 17, "loves": 1, "wow": 0, "cares": 0, "sad": 0, "angry": 0, "haha": 0}, "reaction_count": 18, "comments": 0, "content": "Nach 2 Jahren Pause ist es nun wieder so weit. Wir feiern am 21.10. das 20. Fetzn Fliegen Komm vorbei und lassen wir gemeinsam die Fetzn Fliegen #comeback #jubiläum #party #fetznfliegen #tuwien #semesteropening", "posted_on": "2022-10-13T12:09:00", "video": [], "image": ["https://scontent-vie1-1.xx.fbcdn.net/v/t39.30808-6/311172846_632995741860626_2839195490689716775_n.jpg?stp=dst-jpg_p526x296&_nc_cat=102&ccb=1-7&_nc_sid=730e14&_nc_ohc=5crB-6ISDf0AX-pE9Iw&_nc_oc=AQmfhBkwtkWsD_RCLws86g4MwFGNXJCU1ZnufsKI3mnZkdFla-Mx7s9SOgbIhpd2PVk&_nc_ht=scontent-vie1-1.xx&oh=00_AfDwNsC-aZ3Jhxbeok5P9Bvf0IpIvyY61sDyhl7uWK3MKg&oe=63640FE3"], "post_url": "https://www.facebook.com/FachschaftET/posts/pfbid094MJ3H1jaioLwm9ZBdyN4bEQHnFr8JAgpB6K7UBZ7xTqfZdfifVzvhJdgyLrwdNAl?__cft__[0]=AZWL_tw6LUTREPksX4y2qquFekfyKm-6l3a7UTsojWf-Ij9D8fObP55jGZUYB0QFe3zomuCLsWOeV7fTrsz6sXO6otteao4t0g0JUy6hHKeQvKNyfHB9lymnXvzwremH5sCh7CqJOQOeqlGCVtQ8Pqbcq82KGA09P5BdWyVfToNkoenUOMHxdaoso9cK-ijPG_fsn6pivRT38MdC1UXWENU3Edw4eXee92n18KvjKVhVTQ&__tn__=%2CO%2CP-R"}}
]

2
scrapy.cfg Normal file
View File

@@ -0,0 +1,2 @@
[settings]
default = html_scrapy.settings

57
searching/__init__.py Normal file
View File

@@ -0,0 +1,57 @@
import os
from elasticsearch import Elasticsearch, helpers
import contextlib

ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "http://localhost:9200")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "*l9qNGoojiCC4n9KcZhj")
ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")
ELASTIC_INDEX = os.environ.get("ELASTIC_INDEX", "legacy")


# Connect to the Elasticsearch client and close it again when done.
@contextlib.contextmanager
def es_client():
    client = Elasticsearch(ELASTIC_HOST, verify_certs=False, basic_auth=('elastic', ELASTIC_PASSWORD))
    yield client
    client.close()


def es_query(query: str):
    return {
        "multi_match": {
            "query": query,
            "fields": ["title^20", "title.ngrams^10", "text", "text.ngrams"],
            "type": "most_fields"
        }
    }


def es_highlight():
    return {
        "fields": {
            "title": {},
            "text": {}
        }
    }


def es_search(query: str):
    with es_client() as client:
        result = client.search(
            index=ELASTIC_INDEX,
            size=10,
            query=es_query(query),
            highlight=es_highlight()
        )
    return result


#for hit in resp["hits"]["hits"]:
#    print(hit)

if __name__ == "__main__":
    resp = es_search(ELASTIC_QUERY)
    for hit in resp["hits"]["hits"]:
        print("\n\n%s\n%s\n%s - %s" % (
            hit.get("_source", {}).get("url", ""),
            hit.get("_source", {}).get("title", ""),
            " ".join(hit.get("highlight", {}).get("title", [""])),
            " ".join(hit.get("highlight", {}).get("text", [""]))
        ))

71
searching/index.py Normal file
View File

@@ -0,0 +1,71 @@
from . import ELASTIC_INDEX
from . import es_client
import elasticsearch

settings = {
    "index": {
        "max_ngram_diff": 3
    },
    "analysis": {
        "analyzer": {
            "my_analyzer": {
                "tokenizer": "my_tokenizer",
                "filter": [
                    "lowercase",
                ]
            }
        },
        "tokenizer": {
            "my_tokenizer": {
                "type": "ngram",
                "min_gram": 3,
                "max_gram": 6,
                "token_chars": [
                    "letter",
                    "digit"
                ]
            }
        }
    }
}

mapping = {
    "properties": {
        "title": {
            "type": "text",
            "fields": {
                "ngrams": {
                    "type": "text",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "standard",
                }
            }
        },
        "text": {
            "type": "text",
            "fields": {
                "ngrams": {
                    "type": "text",
                    "analyzer": "my_analyzer",
                    "search_analyzer": "standard",
                }
            }
        },
        "url": {"type": "text", "index": False},
        "published": {"type": "date", "format": "date_optional_time"},
        "updated_at": {"type": "date", "format": "date_optional_time"},
        "raw": {
            "type": "text",
            "index": False
        },
    }
}


def reset_index():
    # Drop the index if it exists, then recreate it with the ngram settings and mapping.
    with es_client() as client:
        try:
            client.indices.delete(index=ELASTIC_INDEX)
        except elasticsearch.NotFoundError:
            print("Index already removed")
        client.indices.create(index=ELASTIC_INDEX, settings=settings, mappings=mapping)

18
test_search.py Normal file
View File

@@ -0,0 +1,18 @@
import os
from elasticsearch import Elasticsearch, helpers
from searching import es_search, es_client

ELASTIC_QUERY = os.environ.get("ELASTIC_QUERY", "Anwesend")

#for hit in resp["hits"]["hits"]:
#    print(hit)

if __name__ == "__main__":
    resp = es_search(ELASTIC_QUERY)
    for hit in resp["hits"]["hits"]:
        print("\n\n%s\n%s\n%s - %s" % (
            hit.get("_source", {}).get("url", ""),
            hit.get("_source", {}).get("title", ""),
            " ".join(hit.get("highlight", {}).get("title", [""])),
            " ".join(hit.get("highlight", {}).get("text", [""]))
        ))