Parent

Class/Module Index [+]

Quicksearch

Crawlobject

Public Class Methods

config() click to toggle source
# File app/models/crawlobject.rb, line 7
def self.config
  Rails.application.config.crawlconfig
end
crawl_news(id) click to toggle source
# File app/models/crawlobject.rb, line 122
def self.crawl_news(id)
  cfg = Rails.application.config
  res = JSON.parse(`python #{Rails.root}/bin/#{cfg.crawlconfig[id]['bin']} #{cfg.crawlconfig[id]['url']}`)
  res.each do |r|
    cc=Crawlobject.new(:raw=>r.to_json)
    cc.objtype=id
    cc.parse_object
    cc.calc_hash
    if Crawlobject.where(:objhash2=>cc.objhash2, :objtype=>id).count==0
      cc.save
    else
      cc = Crawlobject.where(:objhash2=>cc.objhash2, :objtype=>id).first
      cc.raw=r.to_json
      cc.parse_object
      cc.calc_hash
      cc.save
    end
  end
end

Public Instance Methods

calc_hash() click to toggle source
# File app/models/crawlobject.rb, line 114
def calc_hash
  self.objhash=Digest::SHA512.hexdigest(self.raw)
  self.objhash2=Digest::SHA512.hexdigest(self.url.to_s+self.try(:name).to_s+self.schematype.to_s+self.published_at.try(:utc).to_s)
  
end
has_attachments?() click to toggle source
# File app/models/crawlobject.rb, line 10
def has_attachments?
if self.objtype==2 
  return true
else
  return false
end
end
json() click to toggle source
# File app/models/crawlobject.rb, line 119
def json
  JSON.parse(self.raw)
end
load_attachments() click to toggle source
# File app/models/crawlobject.rb, line 65
def load_attachments
  if self.objtype == 2 # ET Comments only
    self.json["attachments"].each do |url|
      fn = `python ../microdata/download_file.py "#{url}"`
      
      unless self.attachments.where(:name=>"Et_21.01.2015_L_sung.pdf").count > 0
      
        a=Attachment.new
        a.datei=File.open("/home/andreas/www/microdata/tmp/"+fn.strip)        
        a.name=fn.strip
        a.parent=self
        a.save
        self.attachments<< a
        a.save
      end
    end
  end
end
move_to_neuigkeit(user,rubrik) click to toggle source
# File app/models/crawlobject.rb, line 17
def move_to_neuigkeit(user,rubrik)
  if ( self.objtype == 5 || self.objtype == 6 )and self.something.nil?
    n=Neuigkeit.new
    n.title=self.name
    n.text=self.text
    n.datum=self.published_at
    n.author=user
    n.rubrik=rubrik
    n.origurl = self.url
    n.save
    self.something=n
    self.save
    return n
  elsif (   self.objtype == 5 || self.objtype==6)
    n=self.something
    n.title=self.name
    n.text=self.text
    n.datum=self.published_at
    n.author=user
    n.rubrik=rubrik
    n.origurl = self.url
    n.save
    
  end
end
parse_children() click to toggle source
# File app/models/crawlobject.rb, line 42
def parse_children
  if self.objtype == 1 # ET Forum Article loaded
    self.json["comments"].each do |com|
      if self.children.where(:objhash=>Digest::SHA512.hexdigest(com.to_json)).empty?
        cco = self.children.new(:raw=>com.to_json,:crawlurl=>self.crawlurl)
        cco.objtype=2
        cco.parse_object
        cco.calc_hash
        
        if self.children.where(:objhash2=>cco.objhash2).empty?
          cco.save
        else
          cco=self.children.where(:objhash2=>cco.objhash2).first
          cco.raw=com.to_json
          cco.parse_object
          cco.calc_hash
          cco.save
        end
      end
    end
  end
end
parse_object() click to toggle source
# File app/models/crawlobject.rb, line 83
def parse_object
    
  if self.objtype == 1 # ET Forum Article loaded
    # ET - Forum 
    self.schematype = self.json["type"].first
    self.name = self.json["properties"]["name"].first
    self.url = self.json["properties"]["url"].first
    self.published_at = self.json["properties"]["dateCreated"].first
    
  end
  if self.objtype == 2 # ET Forum Comment loaded is part of Article 
    self.schematype = self.json["type"].first
    self.url = self.json["properties"]["replyToUrl"].first
    self.name = self.json["properties"]["name"].try(:first)
    self.published_at = self.json["properties"]["commentTime"].try(:first)
    self.text = self.json["properties"]["commentText"].try(:first)
  end
  if self.objtype==5
    self.name=self.json["name"].strip
    self.text=self.json["text"]
    self.published_at=Time.parse(self.json["date"].strip)
    self.url="http://www.htu.at"
  end
  if self.objtype==6
    self.name=self.json["name"].strip
    self.text=self.json["text"]
    self.published_at=Time.parse(self.json["date"].strip)
    self.url="http://etit.tuwien.ac.at"
  end

end

[Validate]

Generated with the Darkfish Rdoc Generator 2.