[Ovirt-devel] [PATCH] Use multiple processes to check host status

Ian Main imain at redhat.com
Fri Jun 13 21:38:16 UTC 2008


This patch causes host-status to fork() up to node_count/5 times to
connect out to hosts via libvirt.  This guarantees that it takes at
most 5 timeouts in a row to verify all nodes.  This should help with the
bottleneck we were seeing with libvirt connect timeouts.  Testing with 105
nodes, almost all of which were down, it took 27s to query all of them.

Signed-off-by: Ian Main <imain at redhat.com>
---
 wui/src/host-status/host-status.rb |  194 +++++++++++++++++++++---------------
 1 files changed, 115 insertions(+), 79 deletions(-)

diff --git a/wui/src/host-status/host-status.rb b/wui/src/host-status/host-status.rb
index 41638da..fcfd586 100755
--- a/wui/src/host-status/host-status.rb
+++ b/wui/src/host-status/host-status.rb
@@ -1,5 +1,5 @@
 #!/usr/bin/ruby
-# 
+#
 # Copyright (C) 2008 Red Hat, Inc.
 # Written by Chris Lalancette <clalance at redhat.com>
 #
@@ -29,7 +29,7 @@ include Daemonize
 $logfile = '/var/log/ovirt-wui/host-status.log'
 
 do_daemon = true
-sleeptime = 5
+sleeptime = 20
 opts = OptionParser.new do |opts|
   opts.on("-h", "--help", "Print help message") do
     puts opts
@@ -97,104 +97,140 @@ def kick_taskomatic(msg, vm)
   task.save
 end
 
-loop do
-  get_credentials
 
-  hosts = Host.find(:all)
-  hosts.each do |host|
-    
-    begin
-      conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
-    rescue
-      # we couldn't contact the host for whatever reason.  Since we can't get
-      # to this host, we have to mark all vms on it as disconnected or stopped
-      # or such.
-      if host.state != "unavailable"
-        puts "Updating host state to unavailable: " + host.hostname
-        host.state = "unavailable"
-        host.save
-      end
+def check_status(host)
 
-      Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
-        # Since we can't reach the host on which the vms reside, we mark these
-        # as STATE_UNREACHABLE.  If they come back up we can mark them as
-        # running again, else they'll be stopped.  At least for now the user
-	# will know what's going on.
-        #
-        # If this causes too much trouble in the UI, this can be changed to
-        # STATE_STOPPED for now until it is resolved of another solution is
-        # brought forward.
-
-        if vm.state != Vm::STATE_UNREACHABLE:
-          kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
-        end
+  # This is in a new process, we need a new database connection.
+  database_connect
+
+  begin
+    puts "Connecting to host " + host.hostname
+    conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
+  rescue
+    # we couldn't contact the host for whatever reason.  Since we can't get
+    # to this host, we have to mark all vms on it as disconnected or stopped
+    # or such.
+    if host.state != "unavailable"
+      puts "Updating host state to unavailable: " + host.hostname
+      host.state = "unavailable"
+      host.save
+    end
+
+    Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
+      # Since we can't reach the host on which the vms reside, we mark these
+      # as STATE_UNREACHABLE.  If they come back up we can mark them as
+      # running again, else they'll be stopped.  At least for now the user
+      # will know what's going on.
+      #
+      # If this causes too much trouble in the UI, this can be changed to
+      # STATE_STOPPED for now until it is resolved of another solution is
+      # brought forward.
+
+      if vm.state != Vm::STATE_UNREACHABLE:
+        kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
       end
 
+    end
+
+    return
+  end
+
+  if host.state != "available"
+    puts "Updating host state to available: " + host.hostname
+    host.state = "available"
+    host.save
+  end
+
+  begin
+    vm_ids = conn.list_domains
+  rescue
+    puts "Failed to request domain list on host " + host.hostname
+    conn.close
+    next
+  end
+
+  # Here we're going through every vm listed through libvirt.  This
+  # really only lets us find ones that are started that shouldn't be.
+  vm_ids.each do |vm_id|
+    puts "VM ID: %d" % [vm_id]
+    begin
+      dom = conn.lookup_domain_by_id(vm_id)
+    rescue
+      puts "Failed to find domain " + vm.description
       next
     end
 
-    if host.state != "available"
-      puts "Updating host state to available: " + host.hostname
-      host.state = "available"
-      host.save
+    vm_uuid = dom.uuid
+    info = dom.info
+
+    puts "VM UUID: %s" % [vm_uuid]
+    info = dom.info
+
+    vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
+    if vm == nil
+      puts "VM Not found in database, must be created by user.  giving up."
+      next
     end
 
+    check_state(vm, info)
+  end
+
+  # Now we get a list of all vms that should be on this system and see if
+  # they are all running.
+  Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
+
     begin
-      vm_ids = conn.list_domains
+      dom = conn.lookup_domain_by_uuid(vm.uuid)
     rescue
-      puts "Failed to request domain list on host " + host.hostname
-      conn.close
+      # OK.  We couldn't find the UUID that we thought was there.  The only
+      # explanation is that the domain is dead.
+      puts "Failed to find domain " + vm.description
+      kick_taskomatic(Vm::STATE_STOPPED, vm)
       next
     end
+    info = dom.info
+    check_state(vm, info)
 
-    # Here we're going through every vm listed through libvirt.  This
-    # really only lets us find ones that are started that shouldn't be.
-    vm_ids.each do |vm_id|
-      puts "VM ID: %d" % [vm_id]
-      begin
-        dom = conn.lookup_domain_by_id(vm_id)
-      rescue
-        puts "Failed to find domain " + vm.description
-        next
-      end
-      
-      vm_uuid = dom.uuid
-      info = dom.info
-
-      puts "VM UUID: %s" % [vm_uuid]
-      info = dom.info
-      puts info.to_s
- 
-      vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
-      if vm == nil
-        puts "VM Not found in database, must be created by user.  giving up."
-        next
-      end
+    conn.close
 
-      check_state(vm, info)
-    end
+  end
+end
 
-    # Now we get a list of all vms that should be on this system and see if
-    # they are all running.
-    Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
-    
-      begin
-        dom = conn.lookup_domain_by_uuid(vm.uuid)
-      rescue
-        # OK.  We couldn't find the UUID that we thought was there.  The only
-        # explanation is that the domain is dead.
-        puts "Failed to find domain " + vm.description
-        kick_taskomatic(Vm::STATE_STOPPED, vm)
-        next
-      end
-      info = dom.info
-      check_state(vm, info)
+get_credentials
 
-      conn.close
+loop do
+
+  # fork() seems to really mess with our db connection.  Need to have this
+  # in the main connection as well.  I verified it's not leaking connections/fds.
+  database_connect
+  hosts = Host.find(:all)
+
+  p_count = 0
+  hosts.each do |host|
+
+    p_count += 1
 
+    # Only allow up to n_hosts / 5 processes running at a time.  If we go above this
+    # Then we wait for one to exit before continuing.  This guarantees it will take
+    # at most 5 timeouts to check all hosts.
+    if p_count > hosts.length / 5
+      Process.wait
+      p_count -= 1
     end
+
+    fork do
+      check_status(host)
+      exit 0
+    end
+
   end
 
+  while p_count > 0
+    Process.wait
+    p_count -= 1
+  end
+
+
   STDOUT.flush
   sleep sleeptime
 end
-- 
1.5.5.1




More information about the ovirt-devel mailing list