diff --git a/app/controllers/crawler_controller.rb b/app/controllers/crawler_controller.rb
new file mode 100644
index 0000000..0ceab2e
--- /dev/null
+++ b/app/controllers/crawler_controller.rb
@@ -0,0 +1,39 @@
+class CrawlerController < ApplicationController
+  # Admin-only actions for reviewing crawled objects and moving them to news.
+
+  # Lists the crawl objects of objtype 5.
+  def index
+    authorize! :doadmin, User
+
+    @crawlobjs = Crawlobject.where(:objtype => 5)
+  end
+
+  # Loads the attachments of a crawl object that reports having some.
+  def load_attachments
+    authorize! :doadmin, User
+
+    crawlobj = Crawlobject.find(params[:id])
+    crawlobj.load_attachments if crawlobj.has_attachments?
+    respond_to do |format|
+      # +something+ may be nil (the model only assigns it in move_to_neuigkeit)
+      # and redirect_to raises on nil, so fall back to the overview.
+      format.html { redirect_to(crawlobj.something || { :action => :index }) }
+      format.js
+    end
+  end
+
+  # Creates or refreshes the Neuigkeit that belongs to an objtype 5 crawl object.
+  def move_to_news
+    authorize! :doadmin, User
+
+    crawlobj = Crawlobject.find(params[:id])
+    if crawlobj.objtype == 5
+      settings = Crawlobject.config[5]
+      crawlobj.move_to_neuigkeit(User.find(settings["user_id"]), Rubrik.find(settings["rubrik_id"]))
+      redirect_to crawlobj.something
+    else
+      # Every path has to respond; otherwise Rails looks for a move_to_news template.
+      redirect_to({ :action => :index }, :alert => "Only HTU news entries can be moved to news.")
+    end
+  end
+end
diff --git a/app/helpers/crawlobject_helper.rb b/app/helpers/crawlobject_helper.rb
new file mode 100644
index 0000000..59aec5c
--- /dev/null
+++ b/app/helpers/crawlobject_helper.rb
@@ -0,0 +1,4 @@
+module CrawlobjectHelper
+
+
+end
diff --git a/app/models/crawlobject.rb b/app/models/crawlobject.rb
index 0071c57..d906c9e 100644
--- a/app/models/crawlobject.rb
+++ b/app/models/crawlobject.rb
@@ -1,8 +1,42 @@
 class Crawlobject < ActiveRecord::Base
   attr_accessible :children_count, :crawltime, :crawlurl, :depth, :lft, :name, :parent_id, :published_at, :raw, :referenced, :rgt, :schematype, :text, :type, :url
 
-  acts_as_nested_set
-
+  acts_as_nested_set
+  has_many :attachments, :as => :parent
+  belongs_to :something, :polymorphic => true
+
+  # Crawler settings from Rails.application.config.crawlconfig.
+  def self.config
+    Rails.application.config.crawlconfig
+  end
+
+  # True for objtype 2 (ET comments), the only type load_attachments handles.
+  def has_attachments?
+    objtype == 2
+  end
+
+  # Creates the Neuigkeit for an HTU news object (objtype 5), or refreshes the
+  # one already linked through +something+, copying name, text, date and URL.
+  # Returns the Neuigkeit, or nil when this object is not of objtype 5.
+  def move_to_neuigkeit(user, rubrik)
+    return unless objtype == 5
+
+    news = something || Neuigkeit.new
+    news.title = name
+    news.text = text
+    news.datum = published_at
+    news.author = user
+    news.rubrik = rubrik
+    news.origurl = url
+    news.save
+    if something.nil?
+      # First conversion: store the link so later calls update in place.
+      self.something = news
+      save
+    end
+    news
+  end
+
   def parse_children
     if self.objtype == 1 # ET Forum Article loaded
       self.json["comments"].each do |com|
@@ -25,6 +59,30 @@ class Crawlobject < ActiveRecord::Base
       end
     end
   end
+
+  # Downloads every attachment URL listed in the crawled JSON and stores each
+  # file as an Attachment of this object. Only objtype 2 is handled.
+  def load_attachments
+    return unless objtype == 2 # ET Comments only
+
+    (json["attachments"] || []).each do |url|
+      # Argument-array form: the crawled URL never passes through a shell.
+      fn = IO.popen(["python", "../microdata/download_file.py", url.to_s], &:read).strip
+      next if fn.empty? # nothing on stdout, so there is no file name to open
+      # Skip files that are already attached under the same name.
+      next if attachments.where(:name => fn).exists?
+
+      # Block form closes the handle once the attachment has been saved.
+      File.open("/home/andreas/www/microdata/tmp/" + fn) do |file|
+        a = Attachment.new
+        a.datei = file
+        a.name = fn
+        a.parent = self
+        a.save
+        attachments << a
+      end
+    end
+  end
 
   def parse_object
     if self.objtype == 1 # ET Forum Article loaded
@@ -41,9 +99,15 @@ class Crawlobject < ActiveRecord::Base
       self.name = self.json["properties"]["name"].try(:first)
       self.published_at = self.json["properties"]["commentTime"].try(:first)
       self.text = self.json["properties"]["commentText"].try(:first)
-
-
-
+    end
+    if objtype == 5 # HTU news entry
+      data = json # parse the raw payload once
+      self.name = data["name"].try(:strip)
+      self.text = data["text"]
+      date = data["date"].try(:strip)
+      # Entries without a date leave published_at untouched instead of raising.
+      self.published_at = Time.parse(date) if date.present?
+      self.url = "http://www.htu.at"
     end
   end
   def calc_hash
@@ -54,5 +118,28 @@ class Crawlobject < ActiveRecord::Base
   def json
     JSON.parse(self.raw)
   end
-
+
+  # Runs the HTU crawler script and stores every returned entry as an objtype 5
+  # Crawlobject; an entry whose objhash2 is already stored updates that record.
+  def self.crawl_htu
+    entries = JSON.parse(`python ../microdata/foswikicrawl.py`)
+    entries.each do |entry|
+      raw = entry.to_json
+      fresh = new(:raw => raw)
+      fresh.objtype = 5
+      fresh.parse_object
+      fresh.calc_hash
+
+      # Single lookup: refresh the stored record when the hash is known, else insert.
+      record = where(:objhash2 => fresh.objhash2, :objtype => 5).first
+      if record
+        record.raw = raw
+        record.parse_object
+        record.calc_hash
+      else
+        record = fresh
+      end
+      record.save
+    end
+  end
 end
diff --git a/app/views/crawler/index.html.erb b/app/views/crawler/index.html.erb
new file mode 100644
index 0000000..08eeeb1
--- /dev/null
+++ b/app/views/crawler/index.html.erb
@@ -0,0 +1,9 @@
+

HTU News

+ +<% @crawlobjs.each do |co| %> + +<% end %> +
<%= render co %> +<%= link_to "move_to_news",move_to_news_crawler_path(co) %> +<%= render co.something unless co.something.nil? %> +
diff --git a/app/views/crawler/load_attachments.js.erb b/app/views/crawler/load_attachments.js.erb
new file mode 100644
index 0000000..af3e63f
--- /dev/null
+++ b/app/views/crawler/load_attachments.js.erb
@@ -0,0 +1 @@
+alert("attachments loaded");
\ No newline at end of file
diff --git a/app/views/crawlobjects/_crawlobject.html.erb b/app/views/crawlobjects/_crawlobject.html.erb
index 3df8f92..7c6096f 100644
--- a/app/views/crawlobjects/_crawlobject.html.erb
+++ b/app/views/crawlobjects/_crawlobject.html.erb
@@ -1,4 +1,9 @@
-
  • <%= link_to crawlobject.url do %> <%= crawlobject.name %> (<%= crawlobject.published_at %>) <%= crawlobject.text %> +
  • <%= link_to crawlobject.url do %> + <%= crawlobject.name %> (<%= crawlobject.published_at %>)<% end %> +<%= sanitize(crawlobject.text) unless crawlobject.text.nil? %> +<% if crawlobject.has_attachments? %> +<%= render_attachments_for(crawlobject) %> +<%= link_to "load attachments", load_attachments_crawler_path(crawlobject), remote: true %> <% end %> <% if crawlobject.children.count >0 %> @@ -8,4 +13,5 @@ <% end %> <% end %> +
  • diff --git a/app/views/neuigkeiten/show.html.erb b/app/views/neuigkeiten/show.html.erb index 02b4b50..77ae9a7 100755 --- a/app/views/neuigkeiten/show.html.erb +++ b/app/views/neuigkeiten/show.html.erb @@ -49,7 +49,9 @@ end <% unless @neuigkeit.published? %>
    Not Published
    <% end %> - + <% unless @neuigkeit.origurl.nil? || @neuigkeit.origurl.empty? %> +
    <%= link_to "Zitiert von "+ @neuigkeit.origurl, @neuigkeit.origurl %>
    + <% end %>
    <% unless @neuigkeit.picture_robust.big_thumb.to_s.empty? %>
    diff --git a/config/initializers/crawler.rb b/config/initializers/crawler.rb new file mode 100644 index 0000000..c8b672c --- /dev/null +++ b/config/initializers/crawler.rb @@ -0,0 +1,2 @@ +Rails.application.config.crawlconfig= YAML.load_file("#{::Rails.root.to_s}/config/crawl_config.yml") + diff --git a/db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb b/db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb new file mode 100644 index 0000000..877c2c0 --- /dev/null +++ b/db/migrate/20150417131341_add_origurl_to_neuigkeiten.rb @@ -0,0 +1,5 @@ +class AddOrigurlToNeuigkeiten < ActiveRecord::Migration + def change + add_column :neuigkeiten, :origurl, :string + end +end diff --git a/db/migrate/20150417132658_add_crawled_at_to_crawlobjects.rb b/db/migrate/20150417132658_add_crawled_at_to_crawlobjects.rb new file mode 100644 index 0000000..f0c9e1a --- /dev/null +++ b/db/migrate/20150417132658_add_crawled_at_to_crawlobjects.rb @@ -0,0 +1,5 @@ +class AddCrawledAtToCrawlobjects < ActiveRecord::Migration + def change + add_column :crawlobjects, :crawled_at, :timestamp + end +end