This commit is contained in:
Andreas Stephanides
2015-04-22 18:43:24 +02:00
parent de215c16eb
commit 036f90e6e5
10 changed files with 146 additions and 8 deletions

View File

@@ -0,0 +1,29 @@
# Admin-only actions for browsing crawled objects, loading their attachments
# and turning crawled news entries into Neuigkeit records.
# Every action requires the :doadmin ability.
class CrawlerController < ApplicationController
  # Object type listed by #index and accepted by #move_to_news; it is also the
  # key used to look up the target user/rubrik in Rails.configuration.crawlconfig.
  NEWS_OBJTYPE = 5

  # Lists all crawl objects of type NEWS_OBJTYPE.
  def index
    authorize! :doadmin, User
    @crawlobjs = Crawlobject.where(:objtype => NEWS_OBJTYPE)
  end

  # Triggers the attachment download for one crawl object (only if the object
  # reports that it can have attachments) and answers HTML or JS requests.
  def load_attachments
    authorize! :doadmin, User
    c = Crawlobject.find(params[:id])
    c.load_attachments if c.has_attachments?
    respond_to do |format|
      # A crawl object without a linked record has nothing to redirect to;
      # redirect_to(nil) would raise, so fall back to the overview.
      format.html { redirect_to(c.something || { :action => :index }) }
      format.js
    end
  end

  # Creates or refreshes the Neuigkeit for a crawl object of type NEWS_OBJTYPE,
  # using the user and rubrik ids configured for that type, then redirects to it.
  def move_to_news
    authorize! :doadmin, User
    co = Crawlobject.find(params[:id])
    unless co.objtype == NEWS_OBJTYPE
      # Previously this path sent no response at all, which ends in a
      # missing-template error; send the admin back to the overview instead.
      redirect_to :action => :index
      return
    end
    cfg = Rails.configuration.crawlconfig[NEWS_OBJTYPE]
    co.move_to_neuigkeit(User.find(cfg["user_id"]), Rubrik.find(cfg["rubrik_id"]))
    redirect_to co.something
  end
end

View File

@@ -0,0 +1,4 @@
# Placeholder module: it currently defines no methods.
module CrawlobjectHelper
end

View File

@@ -1,8 +1,44 @@
class Crawlobject < ActiveRecord::Base
attr_accessible :children_count, :crawltime, :crawlurl, :depth, :lft, :name, :parent_id, :published_at, :raw, :referenced, :rgt, :schematype, :text, :type, :url
acts_as_nested_set
has_many :attachments, :as=>:parent
belongs_to :something, :polymorphic=>true
# Returns the crawler configuration stored on the Rails application config
# under `crawlconfig`.
def self.config
  Rails.configuration.crawlconfig
end
# True exactly when this object's objtype is 2, false otherwise.
def has_attachments?
  objtype == 2
end
# Copies this crawl object's name, text, publication time and url into a
# Neuigkeit owned by +user+ and filed under +rubrik+.
#
# Only objects of objtype 5 are handled; for any other type nothing happens
# and nil is returned.
# * No linked record yet: a new Neuigkeit is built and saved, linked through
#   +something+, this object is saved, and the Neuigkeit is returned.
# * Linked record present: that record is overwritten with the current values
#   and the result of its +save+ call is returned.
def move_to_neuigkeit(user, rubrik)
  return unless objtype == 5

  fresh = something.nil?
  news = fresh ? Neuigkeit.new : something

  news.title   = name
  news.text    = text
  news.datum   = published_at
  news.author  = user
  news.rubrik  = rubrik
  news.origurl = url
  saved = news.save

  # The update path ends here and hands back whatever save returned.
  return saved unless fresh

  self.something = news
  save
  news
end
def parse_children
if self.objtype == 1 # ET Forum Article loaded
self.json["comments"].each do |com|
@@ -25,6 +61,25 @@ class Crawlobject < ActiveRecord::Base
end
end
end
# Downloads every URL listed under "attachments" in this object's JSON and
# stores each file as an Attachment of this object.
#
# Only objects of objtype 2 are processed; other types return nil. For each
# URL the external download script is run, and its (stripped) stdout is used
# as the file name inside the download directory. A file whose name is already
# attached to this object is not attached again.
def load_attachments
  return unless objtype == 2 # ET Comments only

  # Array() turns a missing "attachments" key into an empty list instead of
  # raising on nil.
  Array(json["attachments"]).each do |url|
    # Argument-vector form: the crawled URL is handed to the script directly
    # and never interpreted by a shell.
    fn = IO.popen(["python", "../microdata/download_file.py", url.to_s]) { |io| io.read }.strip
    next if fn.empty? # the script reported no file name
    # Compare against the file that was actually downloaded (the check used to
    # be pinned to one fixed file name).
    next if attachments.where(:name => fn).count > 0

    # Block form closes the handle once the attachment has been stored.
    File.open("/home/andreas/www/microdata/tmp/" + fn) do |file|
      a = Attachment.new
      a.datei = file
      a.name = fn
      a.parent = self
      a.save
      attachments << a
    end
  end
end
def parse_object
if self.objtype == 1 # ET Forum Article loaded
@@ -41,9 +96,12 @@ class Crawlobject < ActiveRecord::Base
self.name = self.json["properties"]["name"].try(:first)
self.published_at = self.json["properties"]["commentTime"].try(:first)
self.text = self.json["properties"]["commentText"].try(:first)
end
if self.objtype==5
self.name=self.json["name"].strip
self.text=self.json["text"]
self.published_at=Time.parse(self.json["date"].strip)
self.url="http://www.htu.at"
end
end
def calc_hash
@@ -54,5 +112,22 @@ class Crawlobject < ActiveRecord::Base
# Parses the stored raw string as JSON on every call and returns the result.
def json
  JSON.parse(raw)
end
# Runs the foswikicrawl.py script, parses its stdout as JSON and stores every
# entry as a crawl object of objtype 5.
#
# Each entry is first parsed into a fresh object to compute its hash. If an
# object of objtype 5 with the same objhash2 already exists, that stored
# object is refreshed from the new raw data; otherwise the fresh object is
# saved.
def self.crawl_htu
  entries = JSON.parse(`python ../microdata/foswikicrawl.py`)
  entries.each do |entry|
    raw = entry.to_json

    candidate = Crawlobject.new(:raw => raw)
    candidate.objtype = 5
    candidate.parse_object
    candidate.calc_hash

    # One lookup instead of a count followed by an identical second query.
    existing = Crawlobject.where(:objhash2 => candidate.objhash2, :objtype => 5).first
    if existing.nil?
      candidate.save
    else
      existing.raw = raw
      existing.parse_object
      existing.calc_hash
      existing.save
    end
  end
end
end

View File

@@ -0,0 +1,9 @@
<%# One table row per entry of @crawlobjs: the rendered crawl object on the left; on the right a "move_to_news" link and, when a record is linked through `something`, that record rendered as well. %>
<h2>HTU News</h2>
<table>
<% @crawlobjs.each do |co| %>
<tr> <td> <%= render co %> </td>
<td>
<%= link_to "move_to_news",move_to_news_crawler_path(co) %>
<%= render co.something unless co.something.nil? %>
</td> </tr><% end %>
</table>

View File

@@ -0,0 +1 @@
// Confirmation shown in the browser once the script response has been evaluated.
alert("attachments loaded");

View File

@@ -1,4 +1,9 @@
<li><%= link_to crawlobject.url do %> <b> <%= crawlobject.name %> (<%= crawlobject.published_at %>)</b> <%= crawlobject.text %>
<li><%= link_to crawlobject.url do %>
<b> <%= crawlobject.name %> (<%= crawlobject.published_at %>)</b><% end %>
<%= (crawlobject.text.html_safe) unless crawlobject.text.nil?%>
<% if crawlobject.has_attachments? %>
<%= render_attachments_for(crawlobject) %>
<%= link_to "load attachments", load_attachments_crawler_path(crawlobject), remote: true %>
<% end %>
<% if crawlobject.children.count >0 %>
@@ -8,4 +13,5 @@
<% end %>
</ul>
<% end %>
</li>

View File

@@ -49,7 +49,9 @@ end
</div>
<% unless @neuigkeit.published? %>
<div class="sticker sticker-red"> Not Published</div> <% end %>
<% unless @neuigkeit.origurl.nil? || @neuigkeit.origurl.empty? %>
<div class="alert"><%= link_to "Zitiert von "+ @neuigkeit.origurl, @neuigkeit.origurl %></div>
<% end %>
<div class="media">
<% unless @neuigkeit.picture_robust.big_thumb.to_s.empty? %>
<div class="pull-left" href="#">

View File

@@ -0,0 +1,2 @@
# Load config/crawl_config.yml once at boot and expose it as
# Rails.application.config.crawlconfig.
crawl_config_path = ::Rails.root.join("config", "crawl_config.yml").to_s
Rails.application.config.crawlconfig = YAML.load_file(crawl_config_path)

View File

@@ -0,0 +1,5 @@
# Adds a string column `origurl` to the `neuigkeiten` table.
class AddOrigurlToNeuigkeiten < ActiveRecord::Migration
def change
add_column :neuigkeiten, :origurl, :string
end
end

View File

@@ -0,0 +1,5 @@
# Adds a timestamp column `crawled_at` to the `crawlobjects` table.
class AddCrawledAtToCrawlobjects < ActiveRecord::Migration
def change
add_column :crawlobjects, :crawled_at, :timestamp
end
end