Container monitoring tool
Jump to navigation
Jump to search
Table creation script for container lab
create database nvidia_lab;
CREATE TABLE users
(id INT PRIMARY KEY,
pass VARCHAR(20),
node_no INT,
gpu_no INT,
misc VARCHAR(20),
misc_count INT);
Usage:
- Check status of containers : $ lab-manager status
- Boot all assigned containers for a node : $ lab-manager boot
- Create accounts for specific users : $ lab-manager create -list 1 2 3 5 8
#!/usr/bin/env python
import sys
import argparse
import MySQLdb as mysql
import os, binascii
import subprocess
import time
# IP address of every node in the lab.  The node_no value stored for a user
# is used as an index into this list.
NODE_LIST = [
    '172.28.0.132',
    '172.28.0.190',
    '172.28.1.135',
]

# Number of nodes that user accounts are spread across.
NO_OF_NODES = len(NODE_LIST)

# GPUs per node; assigned gpu_no values are taken modulo this number.
GPU_PER_NODE = 4

# Host of the MySQL server holding the users table.
DB_IP = "172.28.0.132"
def connect_db():
    """Open and return a new MySQLdb connection to the lab database.

    The connection parameters default to the values this script has always
    used, but each one can be overridden through an environment variable
    (LAB_DB_HOST, LAB_DB_USER, LAB_DB_PASS, LAB_DB_NAME) so the password does
    not have to live in the source file.

    Returns:
        The connection object returned by ``MySQLdb.connect``.  The caller is
        responsible for closing it.
    """
    # NOTE(review): the fallback credentials are still hard-coded for
    # backward compatibility; prefer setting the environment variables.
    return mysql.connect(
        host=os.environ.get("LAB_DB_HOST", DB_IP),
        user=os.environ.get("LAB_DB_USER", "boston"),
        passwd=os.environ.get("LAB_DB_PASS", "Boston2016"),
        db=os.environ.get("LAB_DB_NAME", "nvidia_lab"),
    )
def get_list(args):
    """Return the user ids selected on the command line.

    Args:
        args: parsed arguments exposing ``list`` (a list of ints or None),
            ``start`` and ``stop`` (ints).

    Returns:
        ``args.list`` itself when it is a non-empty list; otherwise the list
        of ids from ``args.start`` to ``args.stop`` inclusive (empty when
        ``start`` is greater than ``stop``).
    """
    explicit_ids = args.list
    # None and [] both mean "no explicit list given".
    if explicit_ids:
        return explicit_ids
    # list(...) keeps the return type a list on Python 3 as well as Python 2.
    return list(range(args.start, args.stop + 1))
def check_ip(node_ip):
    """Return True when *node_ip* is an address configured on this host.

    Runs ``ip addr show`` and compares *node_ip* against every ``inet`` /
    ``inet6`` address it lists.  The comparison is exact, so "172.28.0.19"
    does not match a host that owns "172.28.0.190" (a plain substring search
    of the command output would).

    Args:
        node_ip: address to look for, without a prefix length.

    Returns:
        True if the address is present, False otherwise -- including when the
        ``ip`` command cannot be executed.
    """
    try:
        proc = subprocess.Popen(
            ["ip", "addr", "show"],
            stdout=subprocess.PIPE,
            universal_newlines=True,  # text output on Python 2 and 3
        )
        output, _ = proc.communicate()
    except OSError:
        # ``ip`` is not installed or not executable: nothing can match.
        return False

    for line in output.splitlines():
        fields = line.split()
        if len(fields) < 2 or fields[0] not in ("inet", "inet6"):
            continue
        # Addresses are printed as "<address>/<prefix length>".
        if fields[1].split("/")[0] == node_ip:
            return True
    return False
def check_container_status(id):
    """Return True when the container of user *id* is running on this host.

    The container is looked up by its exact name ``container_user<id>`` in
    the names reported by ``docker ps``.  Matching whole names matters: a
    substring search would report user 1 as running whenever
    ``container_user10`` is up.

    Args:
        id: user id (anything ``str()`` can render).

    Returns:
        True if a running container has that name, False otherwise --
        including when the ``docker`` command cannot be executed.
    """
    wanted = "container_user" + str(id)
    try:
        proc = subprocess.Popen(
            ["docker", "ps", "--format", "{{.Names}}"],
            stdout=subprocess.PIPE,
            universal_newlines=True,  # text output on Python 2 and 3
        )
        output, _ = proc.communicate()
    except OSError:
        # docker is not installed or not executable: nothing is running.
        return False

    for line in output.splitlines():
        # A container can carry several comma-separated names.
        names = [name.strip() for name in line.split(",")]
        if wanted in names:
            return True
    return False
def change_user_pass(id, password):
    """Set the root password inside the container of user *id*.

    Connects with ssh to ``root@localhost`` on port ``8000 + id`` and runs
    ``chpasswd`` there.  The ``root:<password>`` line is sent on standard
    input rather than being embedded in a shell command, so the password
    needs no quoting and is not echoed as part of the command line.

    Args:
        id: user id; selects the ssh port.
        password: new root password.

    Returns:
        True when ssh/chpasswd exited with status 0, False otherwise.
    """
    failure_msg = ("*** Password change failed. "
                   "Default password 'screencast' is in effect ***")

    if not password:
        # Never send an empty or missing password to chpasswd.
        print("*** No password stored for container " + str(id) + " ***")
        print(failure_msg)
        return False

    cmd = [
        "ssh", "-oStrictHostKeyChecking=no",
        "-p", str(8000 + int(id)),
        "root@localhost",
        "chpasswd",
    ]
    print(" ".join(cmd))

    try:
        proc = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        proc.communicate("root:" + str(password) + "\n")
        exit_code = proc.returncode
    except OSError as err:
        # ssh could not be started at all.
        print(err)
        exit_code = -1

    # Any non-zero status is a failure, including negative (signal) codes.
    if exit_code != 0:
        print(exit_code)
        print(failure_msg)
        return False

    print("=== Password for container " + str(id) + " changed to '"
          + str(password) + "' ===")
    return True
def boot_container(id, gpu):
    """Start the container of user *id* on GPU *gpu* and wait for it to run.

    Launches ``nvidia-docker run -d`` with the ``boston-tf-nvidia`` image,
    hostname ``gpunode<id>``, name ``container_user<id>`` and host port
    ``8000 + id`` mapped to container port 22.  The GPU is selected through
    the ``NV_GPU`` environment variable.  The command's output, error output
    and exit status are printed.

    Args:
        id: user id.
        gpu: value placed in ``NV_GPU``.

    Returns:
        True as soon as ``check_container_status(id)`` reports the container
        running (checked up to five times, one second apart), else False.
    """
    argv = [
        "nvidia-docker", "run", "-d",
        "-h", "gpunode" + str(id),
        "-p", str(8000 + int(id)) + ":22",
        "--name", "container_user" + str(id),
        "boston-tf-nvidia",
    ]
    # Show the command in the shell form an operator would type.
    print("NV_GPU=" + str(gpu) + " " + " ".join(argv))

    child_env = dict(os.environ, NV_GPU=str(gpu))
    try:
        proc = subprocess.Popen(
            argv,
            env=child_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,  # without this, err would always be None
            universal_newlines=True,
        )
        output, err = proc.communicate()
        exit_code = proc.returncode
    except OSError as exc:
        # nvidia-docker could not be started at all.
        output, err, exit_code = "", str(exc), -1

    print("Starting container " + str(id) + " : " + str(output))
    print(err)
    print(exit_code)

    # Give the container a few seconds to show up in ``docker ps``.
    for _ in range(5):
        if check_container_status(id):
            return True
        time.sleep(1)
    return False
def status(args):
    """Print whether each selected user's container is running on this host.

    For every id returned by ``get_list(args)`` the user's row is read from
    the ``users`` table.  Users whose node is not this host are reported with
    the node they are assigned to and are counted neither as running nor as
    failed.  Totals are printed at the end.

    Args:
        args: parsed command-line arguments (``list``, ``start``, ``stop``).
    """
    print(args)
    print(args.start)
    print(args.stop)
    user_list = get_list(args)
    print(user_list)

    running = 0
    failed = 0
    db = connect_db()
    try:
        cursor = db.cursor()
        for user_id in user_list:
            print("---------------------------------------")
            try:
                # Explicit columns: do not depend on the table's column order.
                cursor.execute(
                    "SELECT id, pass, node_no, gpu_no FROM users WHERE id = %s",
                    (user_id,))
                row = cursor.fetchone()
            except mysql.Error as err:
                print(err)
                continue
            if row is None:
                print("*** No account found for user " + str(user_id) + " ***")
                continue

            node_no = row[2]
            if node_no is None or not 0 <= node_no < len(NODE_LIST):
                print("*** User " + str(user_id) + " has invalid node number "
                      + str(node_no) + " ***")
                continue

            node_ip = NODE_LIST[node_no]
            if not check_ip(node_ip):
                msg = ("Container assigned to node " + str(node_no)
                       + " with ip " + node_ip)
                print("=== Container " + str(user_id) + " : " + msg + " ===")
                continue

            if check_container_status(user_id):
                running += 1
                print("=== Container " + str(user_id) + " : RUNNING ===")
            else:
                failed += 1
                print("*** Container " + str(user_id) + " : NOT RUNNING ***")
    finally:
        # Always release the connection, even if a check raises.
        db.close()

    print("Total running containers : " + str(running))
    print("Total failed containers : " + str(failed))
def boot(args):
    """Start the containers of the selected users that belong to this host.

    For every id returned by ``get_list(args)`` the user's row is read from
    the ``users`` table.  Users assigned to another node are skipped with a
    hint about where to run the command.  A container that is not yet running
    is started with ``boot_container`` and, on success, its root password is
    set to the stored one with ``change_user_pass``.

    Args:
        args: parsed command-line arguments (``list``, ``start``, ``stop``).
    """
    print(args)
    print(args.start)
    print(args.stop)
    user_list = get_list(args)
    print(user_list)

    db = connect_db()
    try:
        cursor = db.cursor()
        for user_id in user_list:
            print("---------------------------------------")
            try:
                # Explicit columns: do not depend on the table's column order.
                cursor.execute(
                    "SELECT id, pass, node_no, gpu_no FROM users WHERE id = %s",
                    (user_id,))
                row = cursor.fetchone()
            except mysql.Error as err:
                print(err)
                continue
            if row is None:
                print("*** No account found for user " + str(user_id) + " ***")
                continue

            _, password, node_no, gpu_no = row
            if node_no is None or not 0 <= node_no < len(NODE_LIST):
                print("*** User " + str(user_id) + " has invalid node number "
                      + str(node_no) + " ***")
                continue

            node_ip = NODE_LIST[node_no]
            if not check_ip(node_ip):
                print("Container for user " + str(user_id)
                      + " assigned to node " + str(node_no)
                      + " with ip " + node_ip)
                print("Cannot continue on this host. "
                      "Run this command on the above mentioned host")
                continue

            if check_container_status(user_id):
                print("=== Container " + str(user_id)
                      + " is already running on current node ===")
                continue

            if boot_container(user_id, gpu_no):
                print("=== Container " + str(user_id)
                      + " started successfully ===")
                # Short pause before connecting to the fresh container.
                time.sleep(2)
                change_user_pass(user_id, password)
            else:
                print("*** Container " + str(user_id)
                      + " failed to start ***")
    finally:
        # Always release the connection, even if a step raises.
        db.close()
def create_accounts(args):
    """Insert a row into the ``users`` table for every selected user id.

    Ids are spread round-robin over the nodes: id 1 goes to node 0, id 2 to
    node 1, and so on, wrapping after ``NO_OF_NODES``.  Each time the ids
    wrap around the nodes, the GPU number advances by one, modulo
    ``GPU_PER_NODE``.  Every user gets a random 16-character hexadecimal
    password.  Each insert is committed individually; a failed insert is
    rolled back and reported, and processing continues with the next id.

    Args:
        args: parsed command-line arguments (``list``, ``start``, ``stop``).
    """
    print(args)
    print(args.start)
    print(args.stop)
    user_list = get_list(args)

    db = connect_db()
    try:
        cursor = db.cursor()
        print("No of nodes = " + str(NO_OF_NODES))
        print("GPU per node = " + str(GPU_PER_NODE))
        for user_id in user_list:
            # Integer arithmetic on every Python version: node_index counts
            # how many full rounds over the nodes precede this id.
            node_index, node_no = divmod(user_id - 1, NO_OF_NODES)
            gpu_no = node_index % GPU_PER_NODE
            # b2a_hex returns bytes on Python 3; store the password as text.
            password = binascii.b2a_hex(os.urandom(8)).decode("ascii")
            try:
                # Parameterized query: the driver does the quoting.
                cursor.execute(
                    "INSERT INTO users(id, pass, node_no, gpu_no) "
                    "VALUES (%s, %s, %s, %s)",
                    (user_id, password, node_no, gpu_no))
                db.commit()
            except mysql.Error as err:
                print(err)
                db.rollback()
            else:
                print("User " + str(user_id) + " created")
    finally:
        # Always release the connection, even if something unexpected raises.
        db.close()
def main(argv=None):
    """Parse the command line and run the selected sub-command.

    Sub-commands are ``create``, ``boot`` and ``status``.  Each accepts
    ``--start`` (default 1), ``--stop`` (default 40) and ``-list`` (one or
    more ints).

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', action="store_true")
    subparsers = parser.add_subparsers()

    commands = (
        ('create', create_accounts),
        ('boot', boot),
        ('status', status),
    )
    # All sub-commands share the same user-selection options.
    for name, handler in commands:
        sub = subparsers.add_parser(name)
        sub.add_argument('--start', type=int, default=1)
        sub.add_argument('--stop', type=int, default=40)
        sub.add_argument('-list', type=int, nargs='+')
        sub.set_defaults(func=handler)

    args = parser.parse_args(sys.argv[1:] if argv is None else argv)

    handler = getattr(args, 'func', None)
    if handler is None:
        # Python 3's argparse does not require a sub-command by default;
        # fail with a usage message instead of an AttributeError.
        parser.error("too few arguments")
    handler(args)


if __name__ == '__main__':
    main()