Docker: Setting up tensorflow on docker (NVIDIA Lab)

From Define Wiki
Revision as of 21:53, 13 September 2016 by David (talk | contribs) (Created page with "* Assumes you have Docker basic installation complete: http://wiki.bostonlabs.co.uk/w/index.php?title=Docker:_Installation_on_Centos_7 * Steps below are to run on the hypervis...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Install CUDA dependencies

# Enable the EPEL repository (provides dkms), then install the NVIDIA
# driver build prerequisites: dkms/pciutils plus kernel headers/devel
# matching the *running* kernel ($(uname -r)).
# '-y' keeps yum non-interactive so these steps can run unattended.
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
yum install -y dkms pciutils
yum install -y kernel-devel-"$(uname -r)" kernel-headers-"$(uname -r)"

Install CUDA

# Check here for the latest version: https://developer.nvidia.com/cuda-downloads
# Repo-based install (alternative path, left commented out):
#wget http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda-repo-rhel7-7-5-local-7.5-18.x86_64.rpm
#rpm -ivh cuda-repo-rhel7-7-5-local-7.5-18.x86_64.rpm
#yum install cuda 
# Check for errors - the NVIDIA kernel driver is built during the install
# (make sure kernel-devel and gcc are installed first).
# If the repo install errors out, retry with the standalone .run installer:
wget http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda_7.5.18_linux.run
chmod +x cuda_7.5.18_linux.run
./cuda_7.5.18_linux.run
# Check this log for driver errors: vi /var/log/nvidia-installer.log
# Verify the kernel module is loaded and the driver talks to the GPUs:
lsmod | grep nvidia
nvidia-smi


Get nvidia-docker

# Check here for the latest version: https://github.com/NVIDIA/nvidia-docker
# Download and install the nvidia-docker (v1) RPM, then start the plugin
# service that exposes the host GPU devices/driver volumes to containers.
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.0-rc.3/nvidia-docker-1.0.0.rc.3-1.x86_64.rpm
sudo rpm -i /tmp/nvidia-docker*.rpm && rm /tmp/nvidia-docker*.rpm
sudo systemctl start nvidia-docker

Set up the Dockerfile and build the image

[root@nvlab2 nvidia-tslab]# ls
Dockerfile
[root@nvlab2 nvidia-tslab]# cat Dockerfile 
# GPU-enabled TensorFlow base image (ships CUDA/cuDNN userspace libs).
FROM gcr.io/tensorflow/tensorflow:latest-gpu

# Install wget and the SSH server in one layer; clean the apt cache to
# keep the image small. NOTE: the original ran 'service ssh start' here,
# which is a no-op — processes started during a RUN step do not survive
# into the final image (sshd is launched by CMD below instead).
RUN apt-get update && \
    apt-get install -y wget openssh-server && \
    rm -rf /var/lib/apt/lists/*

# Python extras for the lab. 'scikit-learn' is the real PyPI package
# name; the old 'sklearn' alias is deprecated and now rejected by PyPI.
RUN pip --no-cache-dir install scikit-learn scikit-image pandas

# sshd requires its privilege-separation directory to exist.
RUN mkdir -p /var/run/sshd

# Demo-only root password — acceptable only on an isolated lab network.
RUN echo 'root:screencast' | chpasswd

# Permit root password login (the image default is key-only).
RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config

# SSH login fix. Otherwise user is kicked off after login
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

ENV NOTVISIBLE "in users profile"
RUN echo "export VISIBLE=now" >> /etc/profile

# Put the NVIDIA driver tools and CUDA libraries on the path for
# interactive root shells. $PATH / $LD_LIBRARY_PATH expand at *build*
# time inside the container, which bakes in the image's base values.
RUN echo "export PATH=$PATH:/usr/local/nvidia/bin/" >> /root/.bashrc
RUN echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib64" >> /root/.bashrc

# Only SSH is exposed; Jupyter (8888) stays internal.
#EXPOSE 8888
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
# Build the image from the Dockerfile in the current directory.
docker build . -t boston-tf-nvidia
# Confirm the image is present.
docker images 
# Run detached with hostname gpunode01; host port 8888 maps to the
# container's SSH port 22. 'nvidia-docker' injects the host GPU devices
# and driver volumes into the container.
nvidia-docker run -d -h gpunode01 -p 8888:22 boston-tf-nvidia
  • In this instance the host IP is: 172.28.0.190

Access the container and run TensorFlow

[david@head-Boston ~]$ ssh -p8888 root@172.28.0.190
root@172.28.0.190's password: 
Welcome to Ubuntu 14.04.4 LTS (GNU/Linux 3.10.0-327.el7.x86_64 x86_64)

 * Documentation:  https://help.ubuntu.com/
Last login: Wed Sep 14 01:42:45 2016 from 10.16.1.1
root@gpunode01:~# /usr/local/nvidia/bin/nvidia-smi
Wed Sep 14 01:49:32 2016       
+------------------------------------------------------+                       
| NVIDIA-SMI 352.39     Driver Version: 352.39         |                       
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K80           Off  | 0000:84:00.0     Off |                  Off |
| N/A   24C    P8    26W / 149W |     23MiB / 12287MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           Off  | 0000:85:00.0     Off |                  Off |
| N/A   30C    P8    29W / 149W |     23MiB / 12287MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
root@gpunode01:~# export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
root@gpunode01:~# export CUDA_HOME=/usr/local/cuda
root@gpunode01:~# python
Python 2.7.6 (default, Jun 22 2015, 17:58:13) 
[GCC 4.8.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally
I tensorflow/stream_executor/dso_loader.cc:99] Couldn't open CUDA library libcudnn.so. LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
I tensorflow/stream_executor/cuda/cuda_dnn.cc:1562] Unable to load cuDNN DSO
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally
>>> hello = tf.constant('Hello, TensorFlow!')
>>> sess = tf.Session()
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties: 
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:84:00.0
Total memory: 12.00GiB
Free memory: 11.91GiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 1 with properties: 
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:85:00.0
Total memory: 12.00GiB
Free memory: 11.91GiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 1 
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0:   Y Y 
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 1:   Y Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:84:00.0)
I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:85:00.0)
>>> print(sess.run(hello))
Hello, TensorFlow!
>>> a = tf.constant(10)
>>> b = tf.constant(32)
>>> print(sess.run(a + b))
42
>>>