Docker: Setting up tensorflow on docker (NVIDIA Lab)
Jump to navigation
Jump to search
- Assumes you have Docker basic installation complete: http://wiki.bostonlabs.co.uk/w/index.php?title=Docker:_Installation_on_Centos_7
- Steps below are to run on the hypervisor (needs docker + cuda before we get going)
Install CUDA dependencies
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
yum install dkms pciutils
yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)
Install CUDA
# check here for latest version: https://developer.nvidia.com/cuda-downloads
#wget http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda-repo-rhel7-7-5-local-7.5-18.x86_64.rpm
#rpm -ivh cuda-repo-rhel7-7-5-local-7.5-18.x86_64.rpm
#yum install cuda
# check for errors - the nvidia driver will be built during the process (make sure you have kernel-devel and gcc installed)
# if the rpm install errors out, try again using the cuda .run file instead:
wget http://developer.download.nvidia.com/compute/cuda/7.5/Prod/local_installers/cuda_7.5.18_linux.run
chmod +x cuda_7.5.18_linux.run
./cuda_7.5.18_linux.run
# check the following files for driver errors: vi /var/log/nvidia-installer.log
lsmod | grep nvidia
nvidia-smi
Get nvidia-docker
# check here for latest version: https://github.com/NVIDIA/nvidia-docker
wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.0-rc.3/nvidia-docker-1.0.0.rc.3-1.x86_64.rpm
sudo rpm -i /tmp/nvidia-docker*.rpm && rm /tmp/nvidia-docker*.rpm
sudo systemctl start nvidia-docker
Setup the Dockerfile and build image
[root@nvlab2 nvidia-tslab]# ls
Dockerfile
[root@nvlab2 nvidia-tslab]# cat Dockerfile
FROM gcr.io/tensorflow/tensorflow:latest-gpu
RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
RUN pip --no-cache-dir install sklearn && pip --no-cache-dir install scikit-image && pip --no-cache-dir install pandas
RUN apt-get update && apt-get install -y openssh-server && service ssh start
RUN mkdir -p /var/run/sshd
RUN echo 'root:screencast' | chpasswd
RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
# SSH login fix. Otherwise user is kicked off after login
RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
ENV NOTVISIBLE "in users profile"
RUN echo "export VISIBLE=now" >> /etc/profile
RUN echo "export PATH=$PATH:/usr/local/nvidia/bin/" >> /root/.bashrc
RUN echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib64" >> /root/.bashrc
#EXPOSE 8888
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
docker build . -t boston-tf-nvidia
docker images
nvidia-docker run -d -h gpunode01 -p 8888:22 boston-tf-nvidia
- In this instance the host IP is: 172.28.0.190
Access the container and run tensorflow
[david@head-Boston ~]$ ssh -p8888 root@172.28.0.190
root@172.28.0.190's password:
Welcome to Ubuntu 14.04.4 LTS (GNU/Linux 3.10.0-327.el7.x86_64 x86_64)
* Documentation: https://help.ubuntu.com/
Last login: Wed Sep 14 01:42:45 2016 from 10.16.1.1
root@gpunode01:~# /usr/local/nvidia/bin/nvidia-smi
Wed Sep 14 01:49:32 2016
+------------------------------------------------------+
| NVIDIA-SMI 352.39 Driver Version: 352.39 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla K80 Off | 0000:84:00.0 Off | Off |
| N/A 24C P8 26W / 149W | 23MiB / 12287MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla K80 Off | 0000:85:00.0 Off | Off |
| N/A 30C P8 29W / 149W | 23MiB / 12287MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
root@gpunode01:~# export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
root@gpunode01:~# export CUDA_HOME=/usr/local/cuda
root@gpunode01:~# python
Python 2.7.6 (default, Jun 22 2015, 17:58:13)
[GCC 4.8.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcublas.so locally
I tensorflow/stream_executor/dso_loader.cc:99] Couldn't open CUDA library libcudnn.so. LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
I tensorflow/stream_executor/cuda/cuda_dnn.cc:1562] Unable to load cuDNN DSO
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcufft.so locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:105] successfully opened CUDA library libcurand.so locally
>>> hello = tf.constant('Hello, TensorFlow!')
>>> sess = tf.Session()
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 0 with properties:
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:84:00.0
Total memory: 12.00GiB
Free memory: 11.91GiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:102] Found device 1 with properties:
name: Tesla K80
major: 3 minor: 7 memoryClockRate (GHz) 0.8235
pciBusID 0000:85:00.0
Total memory: 12.00GiB
Free memory: 11.91GiB
I tensorflow/core/common_runtime/gpu/gpu_init.cc:126] DMA: 0 1
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 0: Y Y
I tensorflow/core/common_runtime/gpu/gpu_init.cc:136] 1: Y Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:84:00.0)
I tensorflow/core/common_runtime/gpu/gpu_device.cc:755] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:85:00.0)
>>> print(sess.run(hello))
Hello, TensorFlow!
>>> a = tf.constant(10)
>>> b = tf.constant(32)
>>> print(sess.run(a + b))
42
>>>