Koji bandaid

Mike McGrath mmcgrath at redhat.com
Sun Feb 17 03:50:20 UTC 2008


The koji builders don't check back in automatically[1] if they've lost a
connection to the host.  I put this script together in an attempt to fix
it, and thought I'd post it here before sticking it on the builders.  The
basic premise is to check whether the builder has checked in within the
last 5 minutes (which should be way more than enough), unless the box is
under high load, in which case it allows 15 minutes, which might be
overkill.

I'd like to run this check via cron every 5 minutes on each builder.
Does anyone have any suggested fixes, or any objections to me running this?

	-Mike

[1] https://fedorahosted.org/koji/ticket/66
-------------- next part --------------
#!/usr/bin/python

import urllib
import koji
import socket
import datetime
import time
import os
import sys

# Staleness thresholds, in seconds, compared against the age of a builder's
# last check-in by check_host().
FIVE_MIN = 300
FIFTEEN_MIN = 900

# Client session for the Koji hub at this URL (second argument is an empty
# dict).  NOTE: this and the listHosts() call below run at import time, so
# merely importing this module talks to the hub.
k = koji.ClientSession('https://publictest8.fedora.phx.redhat.com/kojihub', {})
# Host records fetched once up front; check_host() iterates over them.
hosts = k.listHosts()

# First dot-separated label of this machine's hostname; check_host() uses it
# as a prefix to pick out this machine's entries in `hosts`.
me = socket.gethostname().split('.')[0]

def restart():
    """Run ``/etc/init.d/kojid reload`` and print whatever it outputs.

    stderr is folded into stdout so the combined output is printed in one
    piece.  A non-zero exit status, or a failure to launch the init script
    at all, is reported on stdout instead of being silently dropped.

    Returns:
        None.
    """
    # Local import, matching the original's style.  subprocess is available
    # on both Python 2 and 3, whereas the previously used ``commands`` module
    # exists only on Python 2.
    import subprocess

    try:
        proc = subprocess.Popen(
            ['/etc/init.d/kojid', 'reload'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True
        )
    except OSError:
        # sys.exc_info() avoids both `except X, e` (Python 2 only) and
        # `except X as e` (Python 2.6+ only).
        print("Unable to run /etc/init.d/kojid reload: %s" % sys.exc_info()[1])
        return

    out = proc.communicate()[0]
    # commands.getstatusoutput() dropped one trailing newline; keep doing so
    # to leave the printed output unchanged.
    if out[-1:] == '\n':
        out = out[:-1]
    print(out)

    if proc.returncode != 0:
        print("kojid reload failed with exit status %s" % proc.returncode)

def lock(path='/var/lock/subsys/koji_check'):
    """Write this process's pid into the lock file, replacing any old content.

    Args:
        path: Lock file to create or overwrite.  Defaults to the location
            this script has always used, so existing ``lock()`` calls behave
            as before.

    Raises:
        IOError: If the file cannot be opened or written.
    """
    f = open(path, 'w')
    try:
        f.write('%s' % os.getpid())
    finally:
        # Close the handle even if the write fails, so it is never leaked.
        f.close()

def check_pid(pid):
    """Report whether a process with the given pid currently exists.

    Existence is probed by opening ``/proc/<pid>/status``.

    Args:
        pid: Process id as an int or a string.  Surrounding whitespace is
            ignored.  Anything that is not purely decimal digits is treated
            as "no such process", so values like ``"self"`` cannot match a
            /proc entry by accident.

    Returns:
        1 if the status file could be opened, otherwise 0.
    """
    text = str(pid).strip()
    if not text.isdigit():
        return 0
    try:
        f = open('/proc/%s/status' % text, 'r')
    except (IOError, OSError):
        return 0
    f.close()
    return 1

def remove_lock(path='/var/lock/subsys/koji_check'):
    """Delete the lock file.

    Args:
        path: Lock file to delete.  Defaults to the location this script has
            always used, so existing ``remove_lock()`` calls behave as before.

    Raises:
        OSError: If the file does not exist or cannot be removed.
    """
    os.remove(path)

def check_lock():
    """Refuse to run alongside another live check; clear a stale lock file.

    Reads the pid stored in ``/var/lock/subsys/koji_check``:

    * lock file missing or unreadable -> return and let the caller proceed;
    * pid belongs to a live process   -> print a message and exit with
      status 1;
    * pid is dead                     -> print a message and remove the lock.

    Returns:
        None.

    Raises:
        SystemExit: When the recorded pid is still running.
    """
    try:
        f = open('/var/lock/subsys/koji_check', 'r')
        try:
            old_pid = f.read()
        finally:
            # The original wrote `f.close` without parentheses, so the file
            # was never actually closed.
            f.close()
    except IOError:
        return

    # Ignore stray whitespace or newlines around the stored pid so a live
    # process is not mistaken for a dead one.
    if check_pid(old_pid.strip()):
        print("Check still running!")
        sys.exit(1)

    print("Lockfile exists, pid dead.  Removing lock")
    remove_lock()

def _checkin_age(stamp):
    """Return the seconds elapsed since the hub timestamp string *stamp*."""
    # Drop everything from the first '.' on (fractional seconds) before
    # parsing.
    parsed = time.strptime(stamp.split('.')[0], "%Y-%m-%d %H:%M:%S")
    # NOTE(review): time.mktime() interprets the parsed value as local time;
    # confirm the hub reports timestamps in this machine's timezone.
    return time.time() - time.mktime(parsed)


def check_host():
    """Reload kojid if this builder has not checked in with the hub recently.

    For every enabled entry in ``hosts`` whose name starts with ``me``:

    * checked in within FIVE_MIN seconds -> nothing to do;
    * older than FIVE_MIN, and the host is not ready with ``task_load`` at or
      above ``capacity`` -> restart only once the check-in is older than
      FIFTEEN_MIN;
    * older than FIVE_MIN otherwise -> restart.

    An entry with no recorded check-in is reported and skipped, rather than
    crashing on the missing timestamp.

    Returns:
        None.
    """
    for host in hosts:
        # NOTE(review): startswith() is a prefix match, so a short name such
        # as "builder1" would also select "builder10" — confirm host naming.
        if not (host['name'].startswith(me) and host['enabled']):
            continue

        stamp = k.getLastHostUpdate(host['id'])
        if not stamp:
            # Presumably the hub returns nothing when the host has never
            # checked in — verify against the hub API.  Without a timestamp
            # there is no age to compare.
            print("No check-in recorded for %s, skipping" % host['name'])
            continue

        # Compute the age once so both thresholds see the same value.
        age = _checkin_age(stamp)
        if age <= FIVE_MIN:
            continue

        # Check whether the box just happens to be under load.
        overloaded = (not host['ready']
                      and host['task_load'] >= host['capacity'])
        if overloaded:
            # Under high load, be a bit more lenient.
            if age > FIFTEEN_MIN:
                print("Restarting under high load")
                restart()
        else:
            # No load and it has been 5 minutes: restart.
            print("Restarting")
            restart()

def main():
    """Run a single check cycle under the pid lock file.

    Exits early (inside check_lock()) if another check is still running.
    Otherwise takes the lock, checks this builder, and always removes the
    lock afterwards, even if the check itself raises.
    """
    check_lock()
    lock()
    try:
        check_host()
    finally:
        # Do not leave a lock file behind when check_host() fails.
        remove_lock()


if __name__ == '__main__':
    main()



More information about the Fedora-infrastructure-list mailing list