# Listing metadata from the source viewer: 134 lines, 3.8 KiB, Ruby.
# A crawled item (forum article, forum comment or HTU crawl item) stored as a
# node of a nested set. The unparsed crawler payload is kept as a JSON string
# in +raw+; #parse_object copies selected fields of that payload into columns.
#
# Two SHA-512 digests are maintained by #calc_hash:
# * +objhash+  - digest of the raw payload (detects byte-identical re-crawls)
# * +objhash2+ - digest of url + name + schematype + published_at (used to
#   find the already stored record for an item whose payload changed)
class Crawlobject < ActiveRecord::Base
  # Values of +objtype+ handled by this class.
  OBJTYPE_ET_ARTICLE = 1 # ET forum article
  OBJTYPE_ET_COMMENT = 2 # ET forum comment, stored as child of an article
  OBJTYPE_HTU_ITEM   = 5 # item produced by the crawler run in .crawl_htu

  # External downloader script and the directory the downloaded files are
  # opened from (the script prints the file name on stdout).
  DOWNLOAD_SCRIPT = "../microdata/download_file.py"
  DOWNLOAD_DIR    = "/home/andreas/www/microdata/tmp/"

  attr_accessible :children_count, :crawltime, :crawlurl, :depth, :lft, :name,
                  :parent_id, :published_at, :raw, :referenced, :rgt,
                  :schematype, :text, :type, :url

  acts_as_nested_set

  has_many :attachments, :as => :parent

  belongs_to :something, :polymorphic => true

  # Returns the crawler configuration stored in the Rails application config.
  def self.config
    Rails.application.config.crawlconfig
  end

  # Runs the external foswiki crawler, parses its JSON output and stores one
  # record (objtype 5) per returned item. An item whose +objhash2+ is already
  # stored updates that record instead of creating a new one.
  def self.crawl_htu
    items = JSON.parse(`python ../microdata/foswikicrawl.py`)

    items.each do |item|
      raw_item = item.to_json

      candidate = Crawlobject.new(:raw => raw_item)
      candidate.objtype = OBJTYPE_HTU_ITEM
      candidate.parse_object
      candidate.calc_hash

      # Single lookup instead of a count query followed by a second fetch.
      existing = Crawlobject.where(:objhash2 => candidate.objhash2,
                                   :objtype  => OBJTYPE_HTU_ITEM).first
      if existing
        existing.raw = raw_item
        existing.parse_object
        existing.calc_hash
        existing.save
      else
        candidate.save
      end
    end
  end

  # True only for objects of type 2 (ET forum comments); see #load_attachments.
  def has_attachments?
    objtype == OBJTYPE_ET_COMMENT
  end

  # Copies this object (objtype 5 only) into a Neuigkeit.
  #
  # * No Neuigkeit linked yet: a new one is created, linked through
  #   +something+, this object is saved, and the Neuigkeit is returned.
  # * Neuigkeit already linked: its fields are refreshed and the result of
  #   its +save+ is returned.
  # * Any other objtype: nothing happens and nil is returned.
  #
  # user   - assigned to the Neuigkeit's +author+
  # rubrik - assigned to the Neuigkeit's +rubrik+
  def move_to_neuigkeit(user, rubrik)
    return unless objtype == OBJTYPE_HTU_ITEM

    newly_created = something.nil?
    neuigkeit = newly_created ? Neuigkeit.new : something

    neuigkeit.title   = name
    neuigkeit.text    = text
    neuigkeit.datum   = published_at
    neuigkeit.author  = user
    neuigkeit.rubrik  = rubrik
    neuigkeit.origurl = url
    saved = neuigkeit.save

    # The two paths deliberately keep their historical return values.
    return saved unless newly_created

    self.something = neuigkeit
    save
    neuigkeit
  end

  # For an ET forum article (objtype 1): stores every entry of the payload's
  # "comments" list as a child object of type 2. A comment whose exact payload
  # is already stored (same +objhash+) is skipped; a comment matching an
  # existing child by +objhash2+ updates that child; otherwise a new child is
  # saved.
  def parse_children
    return unless objtype == OBJTYPE_ET_ARTICLE

    # Array() tolerates a payload without a "comments" key.
    Array(json["comments"]).each do |comment|
      raw_comment = comment.to_json
      next if children.where(:objhash => Digest::SHA512.hexdigest(raw_comment)).exists?

      # NOTE(review): the candidate is built through the association even when
      # an existing child is updated below, exactly as before - confirm the
      # unsaved candidate is not autosaved with the parent later.
      candidate = children.new(:raw => raw_comment, :crawlurl => crawlurl)
      candidate.objtype = OBJTYPE_ET_COMMENT
      candidate.parse_object
      candidate.calc_hash

      existing = children.where(:objhash2 => candidate.objhash2).first
      if existing
        existing.raw = raw_comment
        existing.parse_object
        existing.calc_hash
        existing.save
      else
        candidate.save
      end
    end
  end

  # For an ET forum comment (objtype 2): downloads every URL in the payload's
  # "attachments" list with the external downloader script and stores the
  # resulting file as an Attachment, unless an attachment with the same file
  # name is already linked to this object.
  def load_attachments
    return unless objtype == OBJTYPE_ET_COMMENT

    # Array() tolerates a payload without an "attachments" key.
    Array(json["attachments"]).each do |url|
      # Argument-vector form of popen: no shell is involved, so characters in
      # the crawled URL cannot be interpreted as shell syntax.
      output = IO.popen(["python", DOWNLOAD_SCRIPT, url.to_s]) { |io| io.read }
      filename = output.to_s.strip

      next if filename.empty? # downloader printed no file name
      next if attachments.where(:name => filename).exists?

      # Block form closes the file handle once the attachment is stored.
      File.open(File.join(DOWNLOAD_DIR, filename)) do |file|
        attachment = Attachment.new
        attachment.datei  = file
        attachment.name   = filename
        attachment.parent = self
        attachment.save
        attachments << attachment
      end
    end
  end

  # Copies fields of the raw JSON payload into this record's attributes,
  # depending on +objtype+. Other objtypes are left untouched (and the payload
  # is not parsed at all for them).
  def parse_object
    case objtype
    when OBJTYPE_ET_ARTICLE
      data = json
      props = data["properties"]
      self.schematype   = data["type"].first
      self.name         = props["name"].first
      self.url          = props["url"].first
      self.published_at = props["dateCreated"].first
    when OBJTYPE_ET_COMMENT
      data = json
      props = data["properties"]
      self.schematype   = data["type"].first
      self.url          = props["replyToUrl"].first
      # These properties may be absent on a comment, hence try.
      self.name         = props["name"].try(:first)
      self.published_at = props["commentTime"].try(:first)
      self.text         = props["commentText"].try(:first)
    when OBJTYPE_HTU_ITEM
      data = json
      self.name         = data["name"].strip
      self.text         = data["text"]
      self.published_at = Time.parse(data["date"].strip)
      self.url          = "http://www.htu.at"
    end
  end

  # Recomputes both digests from the current attribute values.
  # A missing +published_at+ (possible for comments without "commentTime")
  # contributes an empty string instead of raising.
  def calc_hash
    self.objhash = Digest::SHA512.hexdigest(raw)

    identity = [url, name, schematype, published_at.try(:utc)].map(&:to_s).join
    self.objhash2 = Digest::SHA512.hexdigest(identity)
  end

  # Parses and returns the raw payload. Parsed on every call because +raw+
  # may be reassigned between calls.
  def json
    JSON.parse(raw)
  end
end