Container monitoring tool

From Define Wiki
Jump to navigation Jump to search

Table creation script for container lab

# GPU lab image: TensorFlow plus an SSH daemon, so that each lab user can log
# in to their own container as root on the port the host maps to 22.
#
# NOTE(review): `nightly-gpu` is a moving tag, so rebuilds are not
# reproducible. Pin a dated tag or digest once a known-good build is chosen.
FROM tensorflow/tensorflow:nightly-gpu

# All OS packages go in one layer so `apt-get update` is never cached apart
# from the install it serves, and the package lists are removed in the same
# layer that created them. ca-certificates is listed explicitly because
# --no-install-recommends would otherwise be free to drop it, and the HTTPS
# `git clone` below needs it.
# The original also ran `service ssh start` here; a process started during a
# build step does not survive into the image, so it has been dropped. sshd is
# started by CMD at the bottom of this file.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        git \
        openssh-server \
        wget \
    && rm -rf /var/lib/apt/lists/*

# `sklearn` on PyPI is a deprecated alias; the real distribution is
# `scikit-learn`.
RUN pip --no-cache-dir install \
        pandas \
        scikit-image \
        scikit-learn

# sshd refuses to start without its privilege-separation directory.
RUN mkdir -p /var/run/sshd /root/.ssh

# SECURITY: this default root password is baked into the image and is visible
# to anyone who can pull it.
# NOTE(review): the lab-manager script replaces the root password over SSH
# after boot; confirm that step always runs before a container is handed out.
RUN echo 'root:tensorflow' | chpasswd

WORKDIR /root
RUN git clone https://github.com/alrojo/tensorflow_tutorial.git

# Public key that is allowed to log in as root without a password.
RUN echo "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC9W1a/uxXVkH/gaoamFxyLVWDnApYDhAoAMUk7dJYwzA6v7VxBrq/3hJ9S6BsG2FRLihNHXClu9bKAheA2G+p7eJQumarOcoqVDuONYgQpyLQpDSe9iPdGX53e4m0kSXWHeK31VPA6lNqDnVk2r5bvbl97ZNOkBj4VZ4w7Ne7Z2a/ZY2FvB4XyadyN2fbz4dgv0k/XfOMeYvgJp+JGRs1VdkA2qPP94qLGImrDxZalSPjIUveVA5UZPiKmBja0i1t4uNiZEFfEAd1Z7KE+yteIpolkL1bzuT86BLmw1ye1uL1XC4DRgzc06BJnSmZK7kxhD6whm5QYBNdxX0q/YrGp root@nvlab1" >> /root/.ssh/authorized_keys \
    && chmod 700 /root/.ssh \
    && chmod 600 /root/.ssh/authorized_keys

# Allow root to log in with a password. The shipped default differs between
# base-image releases (`without-password`, `prohibit-password`, or commented
# out), so rewrite whichever form is present and append the directive if no
# line matched at all.
RUN sed -ri 's/^#?\s*PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
    && (grep -q '^PermitRootLogin yes' /etc/ssh/sshd_config \
        || echo 'PermitRootLogin yes' >> /etc/ssh/sshd_config)

# SSH login fix. Otherwise user is kicked off after login
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

ENV NOTVISIBLE="in users profile"
RUN echo "export VISIBLE=now" >> /etc/profile

# The double quotes are deliberate: $PATH and $LD_LIBRARY_PATH are expanded
# at build time, so the values written to .bashrc carry the image's own
# settings into SSH login shells.
RUN echo "export PATH=$PATH:/usr/local/nvidia/bin/" >> /root/.bashrc
RUN echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib64" >> /root/.bashrc

#RUN echo "cd /opt/ " >> /opt/run.sh && echo "nohup /usr/sbin/sshd -D &" >> /opt/run.sh && echo "/bin/bash /run_jupyter.sh" >> /opt/run.sh && chmod +x /opt/run.sh
#RUN cat /opt/run.sh

#EXPOSE 8888
EXPOSE 22
#CMD ["/bin/bash", "/opt/run.sh"]

# Run sshd in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]
-- Schema for the container lab. IF NOT EXISTS makes the script safe to run
-- more than once.
CREATE DATABASE IF NOT EXISTS nvidia_lab;

-- Select the new database; without this the table is created in whatever
-- database the session happens to have active.
USE nvidia_lab;

-- One row per lab user / container.
--   id         user number; the lab-manager script also uses it for the
--              container name and the SSH port (8000 + id)
--   pass       root password that lab-manager sets inside the container
--   node_no    index into lab-manager's NODE_LIST
--   gpu_no     GPU index passed to nvidia-docker as NV_GPU
--   misc, misc_count
--              not read or written by the lab-manager script
CREATE TABLE IF NOT EXISTS users
(id INT PRIMARY KEY,
pass VARCHAR(20),
node_no INT,
gpu_no INT,
misc VARCHAR(20),
misc_count INT);
  • Check status of containers : $ lab-manager status
  • Boot all assigned containers for a node : $ lab-manager boot
  • Create accounts for specific users : $ lab-manager create -list 1 2 3 5 8
#!/usr/bin/env python

import sys
import argparse

import MySQLdb as mysql
import os, binascii

import subprocess
import time


# Cluster layout. A user's node_no (third column of the users table) is used
# as an index into NODE_LIST by status() and boot().
NODE_LIST = [
	'172.28.0.132',
	'172.28.0.190',
	'172.28.1.135',
]
NO_OF_NODES = len(NODE_LIST)
# create_accounts() cycles gpu_no through 0 .. GPU_PER_NODE - 1.
GPU_PER_NODE = 4

# Host that connect_db() opens the MySQL connection to.
DB_IP = "172.28.0.132"
def connect_db():
	"""Open a connection to the lab's MySQL database.

	Every setting defaults to the value this script has always used, so
	existing deployments behave as before. Each one can be overridden
	through the environment (LAB_DB_HOST, LAB_DB_USER, LAB_DB_PASSWORD,
	LAB_DB_NAME) so that credentials do not have to be edited into the
	source.

	Returns the connection object from MySQLdb's connect(); the caller is
	responsible for closing it.
	"""
	return mysql.connect(
		os.environ.get("LAB_DB_HOST", DB_IP),
		os.environ.get("LAB_DB_USER", "boston"),
		os.environ.get("LAB_DB_PASSWORD", "Boston2016"),
		os.environ.get("LAB_DB_NAME", "nvidia_lab"),
	)


def get_list(args):
	"""Pick the user ids a sub-command should act on.

	A non-empty args.list is returned as-is. Otherwise the result is the
	inclusive range args.start .. args.stop.
	"""
	explicit_ids = args.list
	if explicit_ids is None or len(explicit_ids) == 0:
		return range(args.start, args.stop+1)
	return explicit_ids


def check_ip(node_ip):
	"""Report whether node_ip appears as an address in `ip a` output.

	node_ip -- dotted address string to look for.

	Returns True when the address is present as a whole address, False
	otherwise (including when `ip a` produces no output).
	"""
	import re

	p = subprocess.Popen("ip a", stdout=subprocess.PIPE, shell=True)
	(output, err) = p.communicate()
	if not isinstance(output, str):
		# Python 3 returns bytes from the pipe; decode so the text search works.
		output = output.decode("utf-8", "replace")

	# A bare substring test would let 172.28.0.13 match 172.28.0.132, so
	# refuse a match that has a digit or dot directly on either side.
	pattern = r"(?<![0-9.])" + re.escape(node_ip) + r"(?![0-9.])"
	return re.search(pattern, output) is not None

def check_container_status(id):
	"""Report whether container_user<id> is listed by `docker ps`.

	id -- user number; the container name is 'container_user' + str(id).

	Returns True only on an exact name match. The previous substring test
	reported container_user1 as running whenever container_user10 ..
	container_user19 were.
	"""
	p = subprocess.Popen("docker ps", stdout=subprocess.PIPE, shell=True)
	(output, err) = p.communicate()
	if not isinstance(output, str):
		# Python 3 returns bytes from the pipe; decode before splitting.
		output = output.decode("utf-8", "replace")

	wanted = 'container_user' + str(id)
	for line in output.splitlines():
		columns = line.split()
		# NAMES is the last column of the default `docker ps` table; it can
		# hold several comma-separated names.
		if columns and wanted in columns[-1].split(','):
			return True
	return False

def change_user_pass(id, password):
	"""Set the root password inside a user's container over SSH.

	id       -- user number; the container's sshd is reached on
	            localhost port 8000 + id.
	password -- new root password, fed to chpasswd inside the container.

	Prints the command and the outcome; returns nothing.
	"""
	# int()/str() coercion matches boot_container, so ids and passwords that
	# arrive as other types (e.g. from the database driver) still work.
	port = 8000 + int(id)
	# NOTE(review): the password is interpolated into a shell command without
	# quoting. That is safe for the hex passwords create_accounts generates;
	# confirm no other source of passwords exists.
	cmd = "ssh -oStrictHostKeyChecking=no -p " + str(port) + " root@localhost 'echo root:" + str(password) + " | chpasswd'"

	print(cmd)
	p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
	p.communicate()
	exit_code = p.returncode

	# Any non-zero code is a failure; a negative code means ssh was killed by
	# a signal, which `> 0` used to report as success.
	if exit_code != 0:
		print(exit_code)
		# NOTE(review): the Dockerfile on this page sets the default root
		# password to 'tensorflow', not 'screencast' -- confirm which image
		# is actually deployed before changing this message.
		print("*** Password change failed. Default password 'screencast' is in effect ***")
	else:
		print("=== Password for container " + str(id) + " changed to '" + str(password) + "' ===")

def boot_container(id, gpu):
	"""Start the container for one user and wait for it to show as running.

	id  -- user number; sets the hostname (gpunode<id>), the container name
	       (container_user<id>) and the host port (8000 + id) mapped to the
	       container's port 22.
	gpu -- value passed to nvidia-docker through NV_GPU.

	Returns True once check_container_status(id) reports the container,
	False if the run command fails or the container does not appear within
	about five seconds.
	"""
	cmd = "NV_GPU=" + str(gpu) + " nvidia-docker run -d -h gpunode" + str(id) + " -p " \
		+ str(8000 + int(id)) + ":22 --name container_user" + str(id) + " boston-tf-nvidia"
	print(cmd)

	# stderr is not piped, so error text from nvidia-docker goes straight to
	# the terminal.
	p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
	(output, err) = p.communicate()
	exit_code = p.returncode

	print("Starting container " + str(id) + " : " + str(output))
	print(exit_code)

	if exit_code != 0:
		# The run command itself failed, so polling would only waste time.
		return False

	# Poll once per second, up to five times.
	for attempt in range(5):
		if check_container_status(id):
			return True
		time.sleep(1)
	return False

def status(args):
	"""Print whether each selected user's container is running on this host.

	args -- parsed command-line namespace; get_list(args) supplies the ids.

	Users assigned to another node are reported with that node's address and
	are not counted. Totals of running and not-running containers are
	printed at the end.
	"""
	print(args)
	print(args.start)
	print(args.stop)

	user_list = get_list(args)

	print(user_list)

	running = 0
	failed = 0

	db = connect_db()
	try:
		cursor = db.cursor()

		for id in user_list:

			print("---------------------------------------")

			try:
				# Let the driver substitute the id rather than building SQL text.
				cursor.execute("SELECT * FROM users WHERE id = %s", (id,))
				rows = cursor.fetchall()
			except Exception as err:
				print("*** Container " + str(id) + " : database query failed: " + str(err) + " ***")
				continue

			if not rows:
				print("*** Container " + str(id) + " : no such user in the database ***")
				continue

			# Columns follow the CREATE TABLE order: id, pass, node_no, gpu_no, ...
			user = rows[0]

			if not check_ip(NODE_LIST[user[2]]):
				msg = "Container assigned to node " + str(user[2]) + " with ip " + NODE_LIST[user[2]]
				print("=== Container " + str(id) + " : " + msg + " ===")
				continue

			if check_container_status(id):
				running = running + 1
				print("=== Container " + str(id) + " : RUNNING ===")
			else:
				failed = failed + 1
				print("*** Container " + str(id) + " : NOT RUNNING ***")
	finally:
		# The connection was previously left open on every path.
		db.close()

	print("Total running containers : " + str(running))
	print("Total failed containers : " + str(failed))
	 

def boot(args):
	"""Start the containers of the selected users that belong on this host.

	args -- parsed command-line namespace; get_list(args) supplies the ids.

	For each user the assigned node is looked up in the database. Users
	assigned to another node are skipped with a hint. A container that is
	not yet running is started and then has its root password changed to
	the one stored for the user.
	"""
	print(args)
	print(args.start)
	print(args.stop)

	user_list = get_list(args)

	print(user_list)

	db = connect_db()
	try:
		cursor = db.cursor()

		for id in user_list:

			print("---------------------------------------")

			try:
				# Let the driver substitute the id rather than building SQL text.
				cursor.execute("SELECT * FROM users WHERE id = %s", (id,))
				rows = cursor.fetchall()
			except Exception as err:
				print("*** Container " + str(id) + " : database query failed: " + str(err) + " ***")
				continue

			if not rows:
				print("*** Container " + str(id) + " : no such user in the database ***")
				continue

			# Columns follow the CREATE TABLE order: id, pass, node_no, gpu_no, ...
			user = rows[0]

			if not check_ip(NODE_LIST[user[2]]):
				print("Container for user " + str(id) + " assigned to node " + str(user[2]) + " with ip " + NODE_LIST[user[2]])
				print("Cannot continue on this host. Run this command on the above mentioned host")
				continue

			# Starting containers
			if check_container_status(id):
				print("=== Container " + str(id) + " is already running on current node ===")
			elif boot_container(id, user[3]):
				print("=== Container " + str(id) + " started successfully ===")
				# Short pause before connecting to the new container over SSH.
				time.sleep(2)
				change_user_pass(id, user[1])
			else:
				print("*** Container " + str(id) + " failed to start ***")
	finally:
		# The connection was previously left open on every path.
		db.close()


def create_accounts(args):
	"""Insert a users row, with a fresh random password, for each selected id.

	args -- parsed command-line namespace; get_list(args) supplies the ids.

	Ids are spread round-robin over the nodes: id 1 goes to node 0, id 2 to
	node 1, and so on. Each full pass over the nodes moves on to the next
	GPU index, wrapping at GPU_PER_NODE. A failed insert is rolled back and
	reported; the remaining ids are still processed.
	"""
	print(args)
	print(args.start)
	print(args.stop)

	user_list = get_list(args)

	db = connect_db()
	try:
		cursor = db.cursor()

		print("No of nodes = " + str(NO_OF_NODES))
		print("GPU per node = " + str(GPU_PER_NODE))

		for id in user_list:

			# divmod keeps the arithmetic in integers on Python 2 and 3 alike;
			# the earlier `/` relied on Python 2 integer division.
			node_index, node_no = divmod(id - 1, NO_OF_NODES)
			gpu_no = node_index % GPU_PER_NODE

			# 8 random bytes -> 16 hex characters, which fits pass VARCHAR(20).
			password = binascii.b2a_hex(os.urandom(8)).decode("ascii")

			try:
				# Let the driver substitute the values rather than building SQL text.
				cursor.execute(
					"INSERT INTO users(id, pass, node_no, gpu_no) VALUES (%s, %s, %s, %s)",
					(id, password, node_no, gpu_no))
				db.commit()
				print("User " + str(id) + " created")

			except Exception as err:
				# Rollback in case there is any error
				print("*** User " + str(id) + " not created: " + str(err) + " ***")
				db.rollback()
	finally:
		# Close the connection even if something unexpected escapes the loop.
		db.close()


def main():
	"""Parse the command line and dispatch to the chosen sub-command.

	The three sub-commands (create, boot, status) accept the same options:
	--start and --stop give an inclusive id range (default 1..40), and
	-list gives explicit ids.
	"""
	top_parser = argparse.ArgumentParser()
	top_parser.add_argument('--verbose', action="store_true")

	commands = top_parser.add_subparsers()

	# Sub-command name -> handler, registered in the original order.
	handlers = (
		('create', create_accounts),
		('boot', boot),
		('status', status),
	)
	for command_name, handler in handlers:
		sub = commands.add_parser(command_name)
		sub.add_argument('--start', type=int, default=1)
		sub.add_argument('--stop', type=int, default=40)
		sub.add_argument('-list', type=int, nargs='+')
		sub.set_defaults(func=handler)

	parsed = top_parser.parse_args(sys.argv[1:])
	parsed.func(parsed)

if __name__ == '__main__':
	main()