2019년 4월 23일 화요일

nvidia-docker 로 Docker 컨테이너 실행 시, 이미지 내 nvidia-driver 로 인한 driver version mismatch 문제


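GPU를 사용하기 위한 Docker 이미지를 생성할 때에는 보통 이미지 내에 CUDA 를 설치하게 됩니다. 이때 이미지 안에 nvidia-driver 까지 설치되어 있으면, nvidia-docker 로 컨테이너를 실행하더라도 nvidia-smi 실행 시 "Failed to initialize NVML: Driver/library version mismatch" 오류가 발생합니다. 이미지에 설치된 드라이버 버전(아래 예에서는 418.40.04)이 호스트의 드라이버 버전(아래 예에서는 418.39)과 다르기 때문입니다.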

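이런 이미지는 컨테이너 안에서 apt-get remove nvidia-driver-xxx 로 드라이버를 지우고(Ubuntu 기준, xxx 는 설치된 드라이버 버전. 아래 예에서는 418), apt autoremove 로 관련 패키지를 삭제한 다음, docker commit 으로 이미지를 재생성하면 정상적으로 사용할 수 있습니다. 아래는 그 과정을 담았습니다. (아래 예에서는 apt autoremove 도중 sudo 패키지 제거가 거부되어 dpkg 오류가 표시되지만, 이후 commit 한 이미지에서 nvidia-smi 는 정상적으로 동작합니다.)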

[root@powerlinux ~]# docker images
REPOSITORY                                                TAG                 IMAGE ID            CREATED             SIZE
test                                                      v0.2                10ac6caf42fa        5 minutes ago       18.1GB
test                                                      v0.1                83c8bb389baa        11 minutes ago      18.1GB
bsyu/ubuntu18.04_cuda10-0_python368_tf1.12_ppc64le        v0.2                2b036cf22b81        20 hours ago        16.8GB
bsyu/ubuntu18.04_cuda10-0_python368_pytorch1.01_ppc64le   v0.2                abbd4b2ae7b4        20 hours ago        9.39GB
bsyu/ubuntu18.04_cuda10-0_python352_pytorch1.01_ppc64le   v0.3                80e339ebaa70        20 hours ago        9.18GB
bsyu/ubuntu18.04_cuda10-0_python352_pytorch1.01_ppc64le   v0.4                2a568a91c905        20 hours ago        9.18GB
bsyu/ubuntu18.04_cuda10-0_python368_tf1.12_ppc64le        v0.1                93044ddff649        5 days ago          16.8GB
bsyu/ubuntu18.04_cuda10-0_python368_pytorch1.01_ppc64le   v0.1                da0a0d186b11        5 days ago          9.39GB
bsyu/ubuntu18.04_cuda9-2_python352_pytorch1.01_ppc64le    v0.1                2344d1c2e1db        5 days ago          8.92GB
bsyu/ubuntu18.04_cuda9-2_python352_tf1.12_ppc64le         v0.1                c651ab05d934        5 days ago          17.4GB
bsyu/ubuntu18.04_cuda10-0_python352_pytorch1.01_ppc64le   v0.2                45fe1b559906        6 days ago          9.17GB
bsyu/ubuntu18.04_cuda9-2_python368_tf1.12_ppc64le         v0.1                4b7c25ac1922        6 days ago          18.5GB
bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le        v0.2                bb8a068050a7        6 days ago          18.1GB
nvidia/cuda-ppc64le                                       latest              07e3c70d64ff        6 months ago        2.24GB
[root@powerlinux ~]# docker run --runtime=nvidia --rm -ti bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le:v0.1
Unable to find image 'bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le:v0.1' locally
v0.1: Pulling from bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le
3bc68f2d2d19: Already exists
e0b8fe3eb9f1: Already exists
42fd213a0950: Already exists
eb8002c958b1: Already exists
96de9d1c2a16: Already exists
75d6d0885764: Already exists
e9aa4a68112e: Already exists
55db9dfc542a: Already exists
63e390dd1af8: Already exists
e25f36d860fe: Already exists
1e0f7888dfc5: Already exists
f6b91b686571: Already exists
370524eb9fd3: Already exists
3d83f613aa65: Already exists
cde5b7631e7f: Already exists
f180629981ca: Already exists
7b0ffa6553bc: Already exists
4097bfc39c0a: Already exists
c394c77e3800: Already exists
410f9af8eb56: Already exists
6f3aba555bb6: Already exists
5bb471683588: Already exists
e35585fcd4e2: Already exists
40d3a36277cb: Already exists
Digest: sha256:5ecd6d3777dde6f56ecf68f6ec3a90c9ca4ee769bc087a4b8ff636e7c9e642b7
Status: Downloaded newer image for bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le:v0.1
root@7b3af9f91c0f:/# nvidia-smi
Failed to initialize NVML: Driver/library version mismatch
root@7b3af9f91c0f:/# dpkg -l | grep nvidia-driver
ii  nvidia-driver-418               418.40.04-0ubuntu1                ppc64el      NVIDIA driver metapackage
root@7b3af9f91c0f:/# dpkg -l | grep nvidia
ii  libnvidia-cfg1-418:ppc64el      418.40.04-0ubuntu1                ppc64el      NVIDIA binary OpenGL/GLX configuration library
ii  libnvidia-common-418            418.40.04-0ubuntu1                all          Shared files used by the NVIDIA libraries
ii  libnvidia-compute-418:ppc64el   418.40.04-0ubuntu1                ppc64el      NVIDIA libcompute package
ii  libnvidia-decode-418:ppc64el    418.40.04-0ubuntu1                ppc64el      NVIDIA Video Decoding runtime libraries
ii  libnvidia-encode-418:ppc64el    418.40.04-0ubuntu1                ppc64el      NVENC Video Encoding runtime library
ii  libnvidia-fbc1-418:ppc64el      418.40.04-0ubuntu1                ppc64el      NVIDIA OpenGL-based Framebuffer Capture runtime library
ii  libnvidia-gl-418:ppc64el        418.40.04-0ubuntu1                ppc64el      NVIDIA OpenGL/GLX/EGL/GLES GLVND libraries and Vulkan ICD
ii  libnvidia-ifr1-418:ppc64el      418.40.04-0ubuntu1                ppc64el      NVIDIA OpenGL-based Inband Frame Readback runtime library
ii  nvidia-compute-utils-418        418.40.04-0ubuntu1                ppc64el      NVIDIA compute utilities
ii  nvidia-dkms-418                 418.40.04-0ubuntu1                ppc64el      NVIDIA DKMS package
ii  nvidia-driver-418               418.40.04-0ubuntu1                ppc64el      NVIDIA driver metapackage
ii  nvidia-kernel-common-418        418.40.04-0ubuntu1                ppc64el      Shared files used with the kernel module
ii  nvidia-kernel-source-418        418.40.04-0ubuntu1                ppc64el      NVIDIA kernel source package
ii  nvidia-modprobe                 418.40.04-0ubuntu1                ppc64el      Load the NVIDIA kernel driver and create device files
ii  nvidia-prime                    0.8.8.2                           all          Tools to enable NVIDIA's Prime
ii  nvidia-settings                 418.40.04-0ubuntu1                ppc64el      Tool for configuring the NVIDIA graphics driver
ii  nvidia-utils-418                418.40.04-0ubuntu1                ppc64el      NVIDIA driver support binaries
ii  xserver-xorg-video-nvidia-418   418.40.04-0ubuntu1                ppc64el      NVIDIA binary Xorg driver
root@7b3af9f91c0f:/# dpkg -l | grep cuda
ii  cuda-10-0                       10.0.130-1                        ppc64el      CUDA 10.0 meta-package
ii  cuda-command-line-tools-10-0    10.0.130-1                        ppc64el      CUDA command-line tools
ii  cuda-compat-10-0                410.104-1                         ppc64el      CUDA Compatibility Platform
ii  cuda-compiler-10-0              10.0.130-1                        ppc64el      CUDA compiler
ii  cuda-core-10-0                  10.0.130-1                        ppc64el      CUDA core tools
ii  cuda-cublas-10-0                10.0.130-1                        ppc64el      CUBLAS native runtime libraries
ii  cuda-cublas-dev-10-0            10.0.130-1                        ppc64el      CUBLAS native dev links, headers
ii  cuda-cudart-10-0                10.0.130-1                        ppc64el      CUDA Runtime native Libraries
ii  cuda-cudart-dev-10-0            10.0.130-1                        ppc64el      CUDA Runtime native dev links, headers
ii  cuda-cufft-10-0                 10.0.130-1                        ppc64el      CUFFT native runtime libraries
ii  cuda-cufft-dev-10-0             10.0.130-1                        ppc64el      CUFFT native dev links, headers
ii  cuda-cuobjdump-10-0             10.0.130-1                        ppc64el      CUDA cuobjdump
ii  cuda-cupti-10-0                 10.0.130-1                        ppc64el      CUDA profiling tools interface.
ii  cuda-curand-10-0                10.0.130-1                        ppc64el      CURAND native runtime libraries
ii  cuda-curand-dev-10-0            10.0.130-1                        ppc64el      CURAND native dev links, headers
ii  cuda-cusolver-10-0              10.0.130-1                        ppc64el      CUDA solver native runtime libraries
ii  cuda-cusolver-dev-10-0          10.0.130-1                        ppc64el      CUDA solver native dev links, headers
ii  cuda-cusparse-10-0              10.0.130-1                        ppc64el      CUSPARSE native runtime libraries
ii  cuda-cusparse-dev-10-0          10.0.130-1                        ppc64el      CUSPARSE native dev links, headers
ii  cuda-documentation-10-0         10.0.130-1                        ppc64el      CUDA documentation
ii  cuda-driver-dev-10-0            10.0.130-1                        ppc64el      CUDA Driver native dev stub library
ii  cuda-drivers                    418.40.04-1                       ppc64el      CUDA Driver meta-package


root@7b3af9f91c0f:/# apt-get remove nvidia-driver-418
Reading package lists... Done
Building dependency tree
Reading state information... Done
The following packages were automatically installed and are no longer required:
  dkms keyboard-configuration kmod libegl-mesa0 libegl1 libgbm1 libgles2 libjansson4 libnvidia-cfg1-418 libnvidia-common-418 libnvidia-compute-418
  libnvidia-decode-418 libnvidia-encode-418 libnvidia-fbc1-418 libnvidia-gl-418 libnvidia-ifr1-418 libopengl0 libpciaccess0 libpolkit-agent-1-0
  libpolkit-backend-1-0 libpolkit-gobject-1-0 libunwind8 libvdpau1 libwayland-server0 libxcb-xfixes0 libxfont2 libxkbfile1 libxnvctrl0
  linux-headers-4.15.0-47 linux-headers-4.15.0-47-generic linux-headers-generic mesa-vdpau-drivers nvidia-compute-utils-418 nvidia-dkms-418
  nvidia-kernel-common-418 nvidia-kernel-source-418 nvidia-modprobe nvidia-prime nvidia-settings nvidia-utils-418 pkg-config policykit-1 policykit-1-gnome
  python3-xkit screen-resolution-extra sudo udev vdpau-driver-all x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils xserver-common
  xserver-xorg-core-hwe-18.04 xserver-xorg-video-nvidia-418
Use 'apt autoremove' to remove them.
The following packages will be REMOVED:
  cuda-10-0 cuda-drivers cuda-runtime-10-0 nvidia-driver-418
0 upgraded, 0 newly installed, 4 to remove and 5 not upgraded.
After this operation, 1141 kB disk space will be freed.
Do you want to continue? [Y/n] Y
(Reading database ... 80881 files and directories currently installed.)
Removing cuda-10-0 (10.0.130-1) ...
Removing cuda-runtime-10-0 (10.0.130-1) ...
Removing cuda-drivers (418.40.04-1) ...
Removing nvidia-driver-418 (418.40.04-0ubuntu1) ...

root@7b3af9f91c0f:/# apt autoremove
Reading package lists... Done
Building dependency tree
Reading state information... Done
The following packages will be REMOVED:
  dkms keyboard-configuration kmod libegl-mesa0 libegl1 libgbm1 libgles2 libjansson4 libnvidia-cfg1-418 libnvidia-common-418 libnvidia-compute-418
  libnvidia-decode-418 libnvidia-encode-418 libnvidia-fbc1-418 libnvidia-gl-418 libnvidia-ifr1-418 libopengl0 libpciaccess0 libpolkit-agent-1-0
  libpolkit-backend-1-0 libpolkit-gobject-1-0 libunwind8 libvdpau1 libwayland-server0 libxcb-xfixes0 libxfont2 libxkbfile1 libxnvctrl0
  linux-headers-4.15.0-47 linux-headers-4.15.0-47-generic linux-headers-generic mesa-vdpau-drivers nvidia-compute-utils-418 nvidia-dkms-418
  nvidia-kernel-common-418 nvidia-kernel-source-418 nvidia-modprobe nvidia-prime nvidia-settings nvidia-utils-418 pkg-config policykit-1 policykit-1-gnome
  python3-xkit screen-resolution-extra sudo udev vdpau-driver-all x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils xserver-common
  xserver-xorg-core-hwe-18.04 xserver-xorg-video-nvidia-418
0 upgraded, 0 newly installed, 55 to remove and 5 not upgraded.
After this operation, 351 MB disk space will be freed.
Do you want to continue? [Y/n] Y
If you would prefer to access the root account with su(1)
or by logging in directly,
you must set a root password with "sudo passwd".

If you have arranged other means to access the root account,
and you are sure this is what you want,
you may bypass this check by setting an environment variable
(export SUDO_FORCE_REMOVE=yes).

Refusing to remove sudo.
dpkg: error processing package sudo (--remove):
installed sudo package pre-removal script subprocess returned error exit status 1
Removing udev (237-3ubuntu10.19) ...
invoke-rc.d: could not determine current runlevel
invoke-rc.d: policy-rc.d denied execution of stop.
Removing xfonts-base (1:1.0.4+nmu1) ...
Removing xfonts-utils (1:7.7+6) ...
Removing xfonts-encodings (1:1.0.4-2) ...
Removing libegl1:ppc64el (1.0.0-2ubuntu2.2) ...
Removing libegl-mesa0:ppc64el (18.2.8-0ubuntu0~18.04.2) ...
Removing libgbm1:ppc64el (18.2.8-0ubuntu0~18.04.2) ...
Removing libnvidia-compute-418:ppc64el (418.40.04-0ubuntu1) ...
Removing libwayland-server0:ppc64el (1.16.0-1ubuntu1.1~18.04.1) ...
Removing libxcb-xfixes0:ppc64el (1.13-2~ubuntu18.04) ...
Errors were encountered while processing:
nvidia-utils-418
nvidia-compute-utils-418
sudo
E: Sub-process /usr/bin/dpkg returned an error code (1)

위 컨테이너는 --rm 옵션으로 실행했기 때문에 종료하면 바로 삭제됩니다. 따라서 컨테이너를 종료하지 않은 상태에서, 호스트의 다른 터미널 창에서 docker commit 을 수행합니다.

[root@powerlinux ~]# docker commit 7b3af9f91c0f test:v0.5
sha256:1f77da5befa581a867371a0ffd5c7e75e09ac04249fdb0e5b09aa014037d0583
[root@powerlinux ~]# docker run --runtime=nvidia --rm -ti test:v0.5
root@07a93a6f8fa6:/# nvidia-smi
Wed Apr 24 03:33:53 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39       Driver Version: 418.39       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-SXM2...  On   | 00000004:04:00.0 Off |                    0 |
| N/A   39C    P0    39W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000004:05:00.0 Off |                    0 |
| N/A   40C    P0    38W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000035:03:00.0 Off |                    0 |
| N/A   39C    P0    38W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla V100-SXM2...  On   | 00000035:04:00.0 Off |                    0 |
| N/A   42C    P0    37W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

root@07a93a6f8fa6:/# python
Python 3.5.6 |Anaconda custom (64-bit)| (default, Aug 26 2018, 22:03:11)
[GCC 7.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
>>> sess=tf.Session()
2019-04-24 03:34:40.080740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0004:04:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.190470: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 1 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0004:05:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.305558: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 2 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0035:03:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.426210: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 3 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0035:04:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.426315: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1, 2, 3
2019-04-24 03:34:42.072754: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-04-24 03:34:42.072804: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988]      0 1 2 3
2019-04-24 03:34:42.072820: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0:   N Y Y Y
2019-04-24 03:34:42.072832: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1:   Y N Y Y
2019-04-24 03:34:42.072844: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 2:   Y Y N Y
2019-04-24 03:34:42.072856: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 3:   Y Y Y N
2019-04-24 03:34:42.077306: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14941 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0004:04:00.0, compute capability: 7.0)
2019-04-24 03:34:42.078146: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 14941 MB memory) -> physical GPU (device: 1, name: Tesla V100-SXM2-16GB, pci bus id: 0004:05:00.0, compute capability: 7.0)
2019-04-24 03:34:42.078729: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 14941 MB memory) -> physical GPU (device: 2, name: Tesla V100-SXM2-16GB, pci bus id: 0035:03:00.0, compute capability: 7.0)
2019-04-24 03:34:42.079688: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 14945 MB memory) -> physical GPU (device: 3, name: Tesla V100-SXM2-16GB, pci bus id: 0035:04:00.0, compute capability: 7.0)
>>>