crawler
This commit is contained in:
29
app/controllers/crawler_controller.rb
Normal file
29
app/controllers/crawler_controller.rb
Normal file
@@ -0,0 +1,29 @@
|
||||
# Admin endpoints for inspecting crawled objects, loading their attachments
# and promoting crawled HTU news entries to Neuigkeit records.
#
# Every action calls authorize! with the :doadmin ability on User first.
class CrawlerController < ApplicationController
  # Lists all crawl objects with objtype 5 for the index view.
  def index
    authorize! :doadmin, User

    @crawlobjs = Crawlobject.where(:objtype => 5)
  end

  # Loads the attachments of the crawl object given by params[:id], if that
  # object reports having attachments, then answers as HTML redirect or JS.
  def load_attachments
    authorize! :doadmin, User

    crawlobject = Crawlobject.find(params[:id])
    crawlobject.load_attachments if crawlobject.has_attachments?

    respond_to do |format|
      # redirect_to cannot take nil, so fall back to the index while the
      # object is not linked to anything yet.
      format.html { redirect_to(crawlobject.something || { :action => :index }) }
      format.js
    end
  end

  # Creates or refreshes the Neuigkeit for the crawl object given by
  # params[:id], using the author and rubrik configured under key 5 of the
  # crawl config, and redirects to the linked record.
  def move_to_news
    authorize! :doadmin, User

    crawlobject = Crawlobject.find(params[:id])
    if crawlobject.objtype == 5
      settings = Rails.configuration.crawlconfig[5]
      author = User.find(settings["user_id"])
      rubrik = Rubrik.find(settings["rubrik_id"])
      crawlobject.move_to_neuigkeit(author, rubrik)
      redirect_to crawlobject.something
    else
      # Without this branch the action neither rendered nor redirected and
      # Rails fell through to a template lookup for move_to_news.
      redirect_to({ :action => :index }, :alert => "Only crawl objects of type 5 can be moved to news")
    end
  end
end
|
||||
4
app/helpers/crawlobject_helper.rb
Normal file
4
app/helpers/crawlobject_helper.rb
Normal file
@@ -0,0 +1,4 @@
|
||||
# View helper namespace for crawl objects; intentionally defines no helpers yet.
module CrawlobjectHelper
end
|
||||
@@ -1,8 +1,44 @@
|
||||
class Crawlobject < ActiveRecord::Base
|
||||
# Attributes that may be set through mass assignment.
attr_accessible :children_count, :crawltime, :crawlurl, :depth, :lft, :name, :parent_id, :published_at, :raw, :referenced, :rgt, :schematype, :text, :type, :url

# Declared once; the second identical declaration was redundant.
acts_as_nested_set

# Files attached to this crawl object (polymorphic owner named "parent").
has_many :attachments, :as=>:parent

# Polymorphic link to the record created from this crawl object
# (move_to_neuigkeit stores a Neuigkeit here).
belongs_to :something, :polymorphic=>true
|
||||
# Returns the crawler settings stored on the Rails application configuration
# under `crawlconfig`.
def self.config
  Rails.configuration.crawlconfig
end
|
||||
# Whether this crawl object can carry attachments.
#
# @return [Boolean] true exactly when objtype is 2 (the type handled by
#   load_attachments), false otherwise
def has_attachments?
  objtype == 2
end
|
||||
# Creates or refreshes the Neuigkeit that mirrors this crawl object.
#
# Only objects with objtype 5 are handled. When nothing is linked through
# `something` yet, a new Neuigkeit is built, saved and linked; otherwise the
# already linked record is updated with the current crawled values.
#
# @param user   assigned as the Neuigkeit's author
# @param rubrik assigned as the Neuigkeit's rubrik
# @return the new Neuigkeit when one was created, the result of its `save`
#   when a linked record was refreshed, nil for any other objtype
def move_to_neuigkeit(user, rubrik)
  return unless objtype == 5

  linked = something
  if linked.nil?
    neuigkeit = Neuigkeit.new
    fill_neuigkeit(neuigkeit, user, rubrik)
    neuigkeit.save
    self.something = neuigkeit
    save
    neuigkeit
  else
    fill_neuigkeit(linked, user, rubrik)
    linked.save
  end
end

# Copies the crawled name, text, publication time and url plus the given
# author and rubrik onto +neuigkeit+ without saving it.
def fill_neuigkeit(neuigkeit, user, rubrik)
  neuigkeit.title = name
  neuigkeit.text = text
  neuigkeit.datum = published_at
  neuigkeit.author = user
  neuigkeit.rubrik = rubrik
  neuigkeit.origurl = url
end
private :fill_neuigkeit
|
||||
def parse_children
|
||||
if self.objtype == 1 # ET Forum Article loaded
|
||||
self.json["comments"].each do |com|
|
||||
@@ -25,6 +61,25 @@ class Crawlobject < ActiveRecord::Base
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Downloads every URL listed under "attachments" in this object's JSON and
# stores each downloaded file as an Attachment of this object.
#
# Only objtype 2 is handled. A file is skipped when the download helper
# reports no file name or when an attachment with that name already exists.
#
# NOTE(review): the helper script path is relative to the process working
# directory while the download directory is an absolute path; confirm both
# match the deployment layout.
#
# @return the iterated list of URLs for objtype 2, nil otherwise
def load_attachments
  return unless objtype == 2 # ET Comments only

  (json["attachments"] || []).each do |url|
    # Argument-array form: no shell is involved, so a crawled URL cannot
    # inject commands the way it could inside a backtick string.
    output = IO.popen(["python", "../microdata/download_file.py", url.to_s], &:read)
    filename = output.to_s.strip
    next if filename.empty?

    # Deduplicate on the file that was actually downloaded.
    next if attachments.where(:name => filename).exists?

    # Block form closes the file handle once the attachment is stored.
    File.open("/home/andreas/www/microdata/tmp/" + filename) do |file|
      attachment = Attachment.new
      attachment.datei = file
      attachment.name = filename
      attachment.parent = self
      attachment.save
      attachments << attachment
    end
  end
end
|
||||
def parse_object
|
||||
|
||||
if self.objtype == 1 # ET Forum Article loaded
|
||||
@@ -41,9 +96,12 @@ class Crawlobject < ActiveRecord::Base
|
||||
self.name = self.json["properties"]["name"].try(:first)
|
||||
self.published_at = self.json["properties"]["commentTime"].try(:first)
|
||||
self.text = self.json["properties"]["commentText"].try(:first)
|
||||
|
||||
|
||||
|
||||
end
|
||||
if self.objtype==5
|
||||
self.name=self.json["name"].strip
|
||||
self.text=self.json["text"]
|
||||
self.published_at=Time.parse(self.json["date"].strip)
|
||||
self.url="http://www.htu.at"
|
||||
end
|
||||
end
|
||||
def calc_hash
|
||||
@@ -54,5 +112,22 @@ class Crawlobject < ActiveRecord::Base
|
||||
# Parses the stored `raw` JSON string on every call and returns the result.
#
# @return the parsed JSON structure (Hash or Array)
# @raise [JSON::ParserError] when `raw` is not valid JSON
def json
  JSON.parse(raw)
end
|
||||
|
||||
# Runs the foswiki crawler script and stores each returned entry as a crawl
# object of type 5.
#
# Each entry is parsed and hashed on a fresh object first; when a type-5
# object with the same objhash2 already exists, that record is refreshed from
# the new raw data instead of inserting a duplicate.
#
# NOTE(review): the script path is relative to the process working directory.
def self.crawl_htu
  entries = JSON.parse(`python ../microdata/foswikicrawl.py`)
  entries.each do |entry|
    raw = entry.to_json

    candidate = Crawlobject.new(:raw => raw)
    candidate.objtype = 5
    candidate.parse_object
    candidate.calc_hash

    # Single lookup instead of count followed by first.
    existing = Crawlobject.where(:objhash2 => candidate.objhash2, :objtype => 5).first
    if existing.nil?
      candidate.save
    else
      existing.raw = raw
      existing.parse_object
      existing.calc_hash
      existing.save
    end
  end
end
|
||||
end
|
||||
|
||||
9
app/views/crawler/index.html.erb
Normal file
9
app/views/crawler/index.html.erb
Normal file
@@ -0,0 +1,9 @@
|
||||
<%# One table row per crawl object: the rendered object, a link to move it to news, and the linked record when present. %>
<h2>HTU News</h2>
<table>
  <% @crawlobjs.each do |co| %>
    <tr>
      <td><%= render co %></td>
      <td>
        <%= link_to "move_to_news", move_to_news_crawler_path(co) %>
        <%= render co.something unless co.something.nil? %>
      </td>
    </tr>
  <% end %>
</table>
|
||||
1
app/views/crawler/load_attachments.js.erb
Normal file
1
app/views/crawler/load_attachments.js.erb
Normal file
@@ -0,0 +1 @@
|
||||
alert("attachments loaded")neuigkeit.updated_at.try(:utc).try(:to_s)
|
||||
@@ -1,4 +1,9 @@
|
||||
<li><%= link_to crawlobject.url do %> <b> <%= crawlobject.name %> (<%= crawlobject.published_at %>)</b> <%= crawlobject.text %>
|
||||
<li><%= link_to crawlobject.url do %>
|
||||
<b> <%= crawlobject.name %> (<%= crawlobject.published_at %>)</b><% end %>
|
||||
<%# The text is crawled from external pages, so sanitize it instead of marking it html_safe. %>
<%= sanitize(crawlobject.text) unless crawlobject.text.nil? %>
|
||||
<% if crawlobject.has_attachments? %>
|
||||
<%= render_attachments_for(crawlobject) %>
|
||||
<%= link_to "load attachments", load_attachments_crawler_path(crawlobject), remote: true %>
|
||||
<% end %>
|
||||
|
||||
<% if crawlobject.children.count >0 %>
|
||||
@@ -8,4 +13,5 @@
|
||||
<% end %>
|
||||
</ul>
|
||||
<% end %>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -49,7 +49,9 @@ end
|
||||
</div>
|
||||
<% unless @neuigkeit.published? %>
|
||||
<div class="sticker sticker-red"> Not Published</div> <% end %>
|
||||
|
||||
<% unless @neuigkeit.origurl.nil? || @neuigkeit.origurl.empty? %>
|
||||
<div class="alert"><%= link_to "Zitiert von "+ @neuigkeit.origurl, @neuigkeit.origurl %></div>
|
||||
<% end %>
|
||||
<div class="media">
|
||||
<% unless @neuigkeit.picture_robust.big_thumb.to_s.empty? %>
|
||||
<div class="pull-left" href="#">
|
||||
|
||||
2
config/initializers/crawler.rb
Normal file
2
config/initializers/crawler.rb
Normal file
@@ -0,0 +1,2 @@
|
||||
# Loads config/crawl_config.yml once at boot and exposes it as
# Rails.application.config.crawlconfig.
Rails.application.config.crawlconfig = YAML.load_file(Rails.root.join("config", "crawl_config.yml").to_s)
|
||||
|
||||
5
db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb
Normal file
5
db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb
Normal file
@@ -0,0 +1,5 @@
|
||||
# Adds the string column `origurl` to the `neuigkeiten` table.
class AddOrigurlToNeuigkeiten < ActiveRecord::Migration
  def change
    add_column(:neuigkeiten, :origurl, :string)
  end
end
|
||||
@@ -0,0 +1,5 @@
|
||||
# Adds the timestamp column `crawled_at` to the `crawlobjects` table.
class AddCrawledAtToCrawlobjects < ActiveRecord::Migration
  def change
    add_column(:crawlobjects, :crawled_at, :timestamp)
  end
end
|
||||
Reference in New Issue
Block a user