forked from bofh/fetsite
crawler
This commit is contained in:
29
app/controllers/crawler_controller.rb
Normal file
29
app/controllers/crawler_controller.rb
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# Admin-only controller for reviewing crawled objects: lists the crawled
# news entries, triggers attachment downloads and turns a crawled news entry
# into a Neuigkeit. Every action requires the :doadmin ability on User.
class CrawlerController < ApplicationController
  # objtype used for crawled HTU news entries (the value Crawlobject.crawl_htu
  # assigns and the index view lists under "HTU News").
  NEWS_OBJTYPE = 5

  # Lists all crawl objects of the news type.
  def index
    authorize! :doadmin, User

    @crawlobjs = Crawlobject.where(:objtype => NEWS_OBJTYPE)
  end

  # Downloads the attachments of the crawl object given by params[:id], if
  # it is of a kind that has attachments. HTML requests are redirected, JS
  # requests render load_attachments.js.erb.
  def load_attachments
    authorize! :doadmin, User

    crawlobject = Crawlobject.find(params[:id])
    crawlobject.load_attachments if crawlobject.has_attachments?

    respond_to do |format|
      format.html { redirect_to redirect_target_for(crawlobject) }
      format.js
    end
  end

  # Creates or refreshes the Neuigkeit belonging to the crawl object given by
  # params[:id], using the author and rubrik ids from the crawl configuration.
  # Objects of any other type are left untouched; the request is always
  # answered with a redirect instead of falling through to a missing template.
  def move_to_news
    authorize! :doadmin, User

    crawlobject = Crawlobject.find(params[:id])
    if crawlobject.objtype == NEWS_OBJTYPE
      settings = Rails.configuration.crawlconfig[NEWS_OBJTYPE]
      crawlobject.move_to_neuigkeit(User.find(settings["user_id"]),
                                    Rubrik.find(settings["rubrik_id"]))
    end

    redirect_to redirect_target_for(crawlobject)
  end

  private

  # Returns the record linked to the crawl object when it exists and has been
  # saved; otherwise falls back to the index action, because redirect_to
  # cannot build a URL for nil or for an unsaved record.
  def redirect_target_for(crawlobject)
    target = crawlobject.something
    if target && target.persisted?
      target
    else
      { :action => :index }
    end
  end
end
|
||||||
4
app/helpers/crawlobject_helper.rb
Normal file
4
app/helpers/crawlobject_helper.rb
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
# View helper module for crawl objects; it currently defines no helper methods.
module CrawlobjectHelper
|
||||||
|
|
||||||
|
|
||||||
|
end
|
||||||
@@ -1,8 +1,44 @@
|
|||||||
class Crawlobject < ActiveRecord::Base
|
class Crawlobject < ActiveRecord::Base
|
||||||
attr_accessible :children_count, :crawltime, :crawlurl, :depth, :lft, :name, :parent_id, :published_at, :raw, :referenced, :rgt, :schematype, :text, :type, :url
|
attr_accessible :children_count, :crawltime, :crawlurl, :depth, :lft, :name, :parent_id, :published_at, :raw, :referenced, :rgt, :schematype, :text, :type, :url
|
||||||
acts_as_nested_set
|
acts_as_nested_set
|
||||||
|
has_many :attachments, :as=>:parent
|
||||||
|
|
||||||
belongs_to :something, :polymorphic=>true
|
belongs_to :something, :polymorphic=>true
|
||||||
|
# Returns the crawler settings stored on the Rails configuration object
# under +crawlconfig+.
def self.config
  settings = Rails.configuration
  settings.crawlconfig
end
|
||||||
|
# Tells whether this crawl object can carry attachments.
#
# Returns true only for objtype 2 (the type load_attachments handles as
# "ET Comments"), false for every other value including nil.
def has_attachments?
  objtype == 2
end
|
||||||
|
# Copies this crawled news entry into a Neuigkeit.
#
# When no record is linked through +something+ yet, a new Neuigkeit is
# created, saved and linked to this crawl object (which is then saved too).
# When one is already linked, it is refreshed from the current crawl data.
#
# user   - author assigned to the Neuigkeit.
# rubrik - rubrik assigned to the Neuigkeit.
#
# Returns the Neuigkeit in both cases, or nil when this object is not a news
# entry (objtype 5), in which case nothing is changed.
def move_to_neuigkeit(user, rubrik)
  return unless objtype == 5

  linked = something
  neuigkeit = linked || Neuigkeit.new

  neuigkeit.title = name
  neuigkeit.text = text
  neuigkeit.datum = published_at
  neuigkeit.author = user
  neuigkeit.rubrik = rubrik
  neuigkeit.origurl = url
  neuigkeit.save

  # Only a freshly created Neuigkeit still has to be attached to this object.
  if linked.nil?
    self.something = neuigkeit
    save
  end

  neuigkeit
end
|
||||||
def parse_children
|
def parse_children
|
||||||
if self.objtype == 1 # ET Forum Article loaded
|
if self.objtype == 1 # ET Forum Article loaded
|
||||||
self.json["comments"].each do |com|
|
self.json["comments"].each do |com|
|
||||||
@@ -25,6 +61,25 @@ class Crawlobject < ActiveRecord::Base
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Downloads the files listed under "attachments" in the crawled JSON and
# stores each one as an Attachment of this object. Only objects of objtype 2
# (ET comments) are processed.
#
# download_dir - directory in which the download script leaves its files;
#                defaults to the location that was previously hard-coded.
#
# For each URL the external script ../microdata/download_file.py is run and
# its stripped standard output is used as the file name. A file is skipped
# when the script printed nothing or when this object already has an
# attachment with that name.
#
# Returns nil for other object types, otherwise the list of URLs iterated.
def load_attachments(download_dir = "/home/andreas/www/microdata/tmp/")
  return unless objtype == 2 # ET Comments only

  # Array() tolerates crawled JSON without an "attachments" key.
  Array(json["attachments"]).each do |url|
    # Argument-array form: the crawled URL never passes through a shell.
    output = IO.popen(["python", "../microdata/download_file.py", url.to_s]) { |io| io.read }
    filename = output.to_s.strip
    next if filename.empty?
    next if attachments.where(:name => filename).count > 0

    # Block form closes the file handle once the attachment has been saved.
    File.open(File.join(download_dir, filename)) do |file|
      attachment = Attachment.new
      attachment.datei = file
      attachment.name = filename
      attachment.parent = self
      attachment.save
      attachments << attachment
    end
  end
end
|
||||||
def parse_object
|
def parse_object
|
||||||
|
|
||||||
if self.objtype == 1 # ET Forum Article loaded
|
if self.objtype == 1 # ET Forum Article loaded
|
||||||
@@ -41,9 +96,12 @@ class Crawlobject < ActiveRecord::Base
|
|||||||
self.name = self.json["properties"]["name"].try(:first)
|
self.name = self.json["properties"]["name"].try(:first)
|
||||||
self.published_at = self.json["properties"]["commentTime"].try(:first)
|
self.published_at = self.json["properties"]["commentTime"].try(:first)
|
||||||
self.text = self.json["properties"]["commentText"].try(:first)
|
self.text = self.json["properties"]["commentText"].try(:first)
|
||||||
|
end
|
||||||
|
if self.objtype==5
|
||||||
|
self.name=self.json["name"].strip
|
||||||
|
self.text=self.json["text"]
|
||||||
|
self.published_at=Time.parse(self.json["date"].strip)
|
||||||
|
self.url="http://www.htu.at"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
def calc_hash
|
def calc_hash
|
||||||
@@ -54,5 +112,22 @@ class Crawlobject < ActiveRecord::Base
|
|||||||
# Parses the string stored in +raw+ as JSON and returns the result; the string is parsed again on every call.
def json
|
def json
|
||||||
JSON.parse(self.raw)
|
JSON.parse(self.raw)
|
||||||
end
|
end
|
||||||
|
# Runs the external crawler script ../microdata/foswikicrawl.py, parses its
# standard output as a JSON list and stores every entry as a crawl object of
# objtype 5.
#
# An entry whose objhash2 matches an already stored object of that type
# refreshes the stored object instead of creating a second one.
#
# Returns the parsed list of entries.
def self.crawl_htu
  news_type = 5

  JSON.parse(`python ../microdata/foswikicrawl.py`).each do |entry|
    raw = entry.to_json

    # Build a throwaway object first: its hash is needed for the lookup.
    candidate = Crawlobject.new(:raw => raw)
    candidate.objtype = news_type
    candidate.parse_object
    candidate.calc_hash

    # Single lookup instead of a count query followed by the same query again.
    stored = Crawlobject.where(:objhash2 => candidate.objhash2, :objtype => news_type).first
    if stored
      stored.raw = raw
      stored.parse_object
      stored.calc_hash
    end

    (stored || candidate).save
  end
end
|
||||||
end
|
end
|
||||||
|
|||||||
9
app/views/crawler/index.html.erb
Normal file
9
app/views/crawler/index.html.erb
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
<%# Admin list of the crawl objects in @crawlobjs: one table row per object with its rendered partial, a move_to_news link and, if present, the record linked through something. %>
<h2>HTU News</h2>
|
||||||
|
<table>
|
||||||
|
<% @crawlobjs.each do |co| %>
|
||||||
|
<tr> <td> <%= render co %> </td>
|
||||||
|
<td>
|
||||||
|
<%= link_to "move_to_news",move_to_news_crawler_path(co) %>
|
||||||
|
<%= render co.something unless co.something.nil? %>
|
||||||
|
</td> </tr><% end %>
|
||||||
|
</table>
|
||||||
1
app/views/crawler/load_attachments.js.erb
Normal file
1
app/views/crawler/load_attachments.js.erb
Normal file
@@ -0,0 +1 @@
|
|||||||
|
alert("attachments loaded")neuigkeit.updated_at.try(:utc).try(:to_s)
|
||||||
@@ -1,4 +1,9 @@
|
|||||||
<li><%= link_to crawlobject.url do %> <b> <%= crawlobject.name %> (<%= crawlobject.published_at %>)</b> <%= crawlobject.text %>
|
<li><%= link_to crawlobject.url do %>
|
||||||
|
<b> <%= crawlobject.name %> (<%= crawlobject.published_at %>)</b><% end %>
|
||||||
|
<%# The text is HTML fetched from an external site: sanitize it instead of marking it html_safe, so scripts and unsafe attributes are stripped. %>
<%= sanitize(crawlobject.text) unless crawlobject.text.nil? %>
|
||||||
|
<% if crawlobject.has_attachments? %>
|
||||||
|
<%= render_attachments_for(crawlobject) %>
|
||||||
|
<%= link_to "load attachments", load_attachments_crawler_path(crawlobject), remote: true %>
|
||||||
<% end %>
|
<% end %>
|
||||||
|
|
||||||
<% if crawlobject.children.count >0 %>
|
<% if crawlobject.children.count >0 %>
|
||||||
@@ -8,4 +13,5 @@
|
|||||||
<% end %>
|
<% end %>
|
||||||
</ul>
|
</ul>
|
||||||
<% end %>
|
<% end %>
|
||||||
|
|
||||||
</li>
|
</li>
|
||||||
|
|||||||
@@ -49,7 +49,9 @@ end
|
|||||||
</div>
|
</div>
|
||||||
<% unless @neuigkeit.published? %>
|
<% unless @neuigkeit.published? %>
|
||||||
<div class="sticker sticker-red"> Not Published</div> <% end %>
|
<div class="sticker sticker-red"> Not Published</div> <% end %>
|
||||||
|
<% unless @neuigkeit.origurl.nil? || @neuigkeit.origurl.empty? %>
|
||||||
|
<div class="alert"><%= link_to "Zitiert von "+ @neuigkeit.origurl, @neuigkeit.origurl %></div>
|
||||||
|
<% end %>
|
||||||
<div class="media">
|
<div class="media">
|
||||||
<% unless @neuigkeit.picture_robust.big_thumb.to_s.empty? %>
|
<% unless @neuigkeit.picture_robust.big_thumb.to_s.empty? %>
|
||||||
<div class="pull-left" href="#">
|
<div class="pull-left" href="#">
|
||||||
|
|||||||
2
config/initializers/crawler.rb
Normal file
2
config/initializers/crawler.rb
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# Loads config/crawl_config.yml from the application root and stores the parsed YAML as config.crawlconfig.
Rails.application.config.crawlconfig= YAML.load_file("#{::Rails.root.to_s}/config/crawl_config.yml")
|
||||||
|
|
||||||
5
db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb
Normal file
5
db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Adds the string column origurl to the neuigkeiten table.
class AddOrigurlToNeuigkeiten < ActiveRecord::Migration
|
||||||
|
def change
|
||||||
|
add_column :neuigkeiten, :origurl, :string
|
||||||
|
end
|
||||||
|
end
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
# Adds the timestamp column crawled_at to the crawlobjects table.
class AddCrawledAtToCrawlobjects < ActiveRecord::Migration
|
||||||
|
def change
|
||||||
|
add_column :crawlobjects, :crawled_at, :timestamp
|
||||||
|
end
|
||||||
|
end
|
||||||
Reference in New Issue
Block a user