GPU를 사용하기 위한 Docker 이미지를 생성할 때는 보통 이미지 안에 CUDA를 설치하게 됩니다. 이때 이미지 안에 nvidia-driver까지 함께 설치되어 있고 그 버전이 호스트에 설치된 드라이버 버전과 다르면, nvidia-docker로 컨테이너를 실행하더라도 nvidia-smi 실행 시 "Failed to initialize NVML: Driver/library version mismatch" 오류가 발생합니다.
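이런 이미지는 (Ubuntu 기준) 컨테이너 안에서 apt-get remove nvidia-driver-xxx 로 드라이버를 지우고, apt autoremove 로 함께 설치되었던 관련 패키지를 삭제한 다음, docker commit 으로 이미지를 다시 생성하면 정상적으로 사용할 수 있습니다. 아래는 그 과정을 담은 것입니다. 참고로 아래 예시에서는 apt autoremove 도중 sudo 패키지의 삭제가 거부되어 dpkg 오류가 출력되지만, libnvidia-compute-418 등 드라이버 라이브러리는 그 전에 이미 삭제되었기 때문에 commit 한 이미지에서는 nvidia-smi 가 정상적으로 동작합니다.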
[root@powerlinux ~]# docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
test v0.2 10ac6caf42fa 5 minutes ago 18.1GB
test v0.1 83c8bb389baa 11 minutes ago 18.1GB
bsyu/ubuntu18.04_cuda10-0_python368_tf1.12_ppc64le v0.2 2b036cf22b81 20 hours ago 16.8GB
bsyu/ubuntu18.04_cuda10-0_python368_pytorch1.01_ppc64le v0.2 abbd4b2ae7b4 20 hours ago 9.39GB
bsyu/ubuntu18.04_cuda10-0_python352_pytorch1.01_ppc64le v0.3 80e339ebaa70 20 hours ago 9.18GB
bsyu/ubuntu18.04_cuda10-0_python352_pytorch1.01_ppc64le v0.4 2a568a91c905 20 hours ago 9.18GB
bsyu/ubuntu18.04_cuda10-0_python368_tf1.12_ppc64le v0.1 93044ddff649 5 days ago 16.8GB
bsyu/ubuntu18.04_cuda10-0_python368_pytorch1.01_ppc64le v0.1 da0a0d186b11 5 days ago 9.39GB
bsyu/ubuntu18.04_cuda9-2_python352_pytorch1.01_ppc64le v0.1 2344d1c2e1db 5 days ago 8.92GB
bsyu/ubuntu18.04_cuda9-2_python352_tf1.12_ppc64le v0.1 c651ab05d934 5 days ago 17.4GB
bsyu/ubuntu18.04_cuda10-0_python352_pytorch1.01_ppc64le v0.2 45fe1b559906 6 days ago 9.17GB
bsyu/ubuntu18.04_cuda9-2_python368_tf1.12_ppc64le v0.1 4b7c25ac1922 6 days ago 18.5GB
bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le v0.2 bb8a068050a7 6 days ago 18.1GB
nvidia/cuda-ppc64le latest 07e3c70d64ff 6 months ago 2.24GB
[root@powerlinux ~]# docker run --runtime=nvidia --rm -ti bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le:v0.1
Unable to find image 'bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le:v0.1' locally
v0.1: Pulling from bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le
3bc68f2d2d19: Already exists
e0b8fe3eb9f1: Already exists
42fd213a0950: Already exists
eb8002c958b1: Already exists
96de9d1c2a16: Already exists
75d6d0885764: Already exists
e9aa4a68112e: Already exists
55db9dfc542a: Already exists
63e390dd1af8: Already exists
e25f36d860fe: Already exists
1e0f7888dfc5: Already exists
f6b91b686571: Already exists
370524eb9fd3: Already exists
3d83f613aa65: Already exists
cde5b7631e7f: Already exists
f180629981ca: Already exists
7b0ffa6553bc: Already exists
4097bfc39c0a: Already exists
c394c77e3800: Already exists
410f9af8eb56: Already exists
6f3aba555bb6: Already exists
5bb471683588: Already exists
e35585fcd4e2: Already exists
40d3a36277cb: Already exists
Digest: sha256:5ecd6d3777dde6f56ecf68f6ec3a90c9ca4ee769bc087a4b8ff636e7c9e642b7
Status: Downloaded newer image for bsyu/ubuntu18.04_cuda10-0_python352_tf1.12_ppc64le:v0.1
root@7b3af9f91c0f:/# nvidia-smi
Failed to initialize NVML: Driver/library version mismatch
root@7b3af9f91c0f:/# dpkg -l | grep nvidia-driver
ii nvidia-driver-418 418.40.04-0ubuntu1 ppc64el NVIDIA driver metapackage
root@7b3af9f91c0f:/# dpkg -l | grep nvidia
ii libnvidia-cfg1-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVIDIA binary OpenGL/GLX configuration library
ii libnvidia-common-418 418.40.04-0ubuntu1 all Shared files used by the NVIDIA libraries
ii libnvidia-compute-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVIDIA libcompute package
ii libnvidia-decode-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVIDIA Video Decoding runtime libraries
ii libnvidia-encode-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVENC Video Encoding runtime library
ii libnvidia-fbc1-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVIDIA OpenGL-based Framebuffer Capture runtime library
ii libnvidia-gl-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVIDIA OpenGL/GLX/EGL/GLES GLVND libraries and Vulkan ICD
ii libnvidia-ifr1-418:ppc64el 418.40.04-0ubuntu1 ppc64el NVIDIA OpenGL-based Inband Frame Readback runtime library
ii nvidia-compute-utils-418 418.40.04-0ubuntu1 ppc64el NVIDIA compute utilities
ii nvidia-dkms-418 418.40.04-0ubuntu1 ppc64el NVIDIA DKMS package
ii nvidia-driver-418 418.40.04-0ubuntu1 ppc64el NVIDIA driver metapackage
ii nvidia-kernel-common-418 418.40.04-0ubuntu1 ppc64el Shared files used with the kernel module
ii nvidia-kernel-source-418 418.40.04-0ubuntu1 ppc64el NVIDIA kernel source package
ii nvidia-modprobe 418.40.04-0ubuntu1 ppc64el Load the NVIDIA kernel driver and create device files
ii nvidia-prime 0.8.8.2 all Tools to enable NVIDIA's Prime
ii nvidia-settings 418.40.04-0ubuntu1 ppc64el Tool for configuring the NVIDIA graphics driver
ii nvidia-utils-418 418.40.04-0ubuntu1 ppc64el NVIDIA driver support binaries
ii xserver-xorg-video-nvidia-418 418.40.04-0ubuntu1 ppc64el NVIDIA binary Xorg driver
root@7b3af9f91c0f:/# dpkg -l | grep cuda
ii cuda-10-0 10.0.130-1 ppc64el CUDA 10.0 meta-package
ii cuda-command-line-tools-10-0 10.0.130-1 ppc64el CUDA command-line tools
ii cuda-compat-10-0 410.104-1 ppc64el CUDA Compatibility Platform
ii cuda-compiler-10-0 10.0.130-1 ppc64el CUDA compiler
ii cuda-core-10-0 10.0.130-1 ppc64el CUDA core tools
ii cuda-cublas-10-0 10.0.130-1 ppc64el CUBLAS native runtime libraries
ii cuda-cublas-dev-10-0 10.0.130-1 ppc64el CUBLAS native dev links, headers
ii cuda-cudart-10-0 10.0.130-1 ppc64el CUDA Runtime native Libraries
ii cuda-cudart-dev-10-0 10.0.130-1 ppc64el CUDA Runtime native dev links, headers
ii cuda-cufft-10-0 10.0.130-1 ppc64el CUFFT native runtime libraries
ii cuda-cufft-dev-10-0 10.0.130-1 ppc64el CUFFT native dev links, headers
ii cuda-cuobjdump-10-0 10.0.130-1 ppc64el CUDA cuobjdump
ii cuda-cupti-10-0 10.0.130-1 ppc64el CUDA profiling tools interface.
ii cuda-curand-10-0 10.0.130-1 ppc64el CURAND native runtime libraries
ii cuda-curand-dev-10-0 10.0.130-1 ppc64el CURAND native dev links, headers
ii cuda-cusolver-10-0 10.0.130-1 ppc64el CUDA solver native runtime libraries
ii cuda-cusolver-dev-10-0 10.0.130-1 ppc64el CUDA solver native dev links, headers
ii cuda-cusparse-10-0 10.0.130-1 ppc64el CUSPARSE native runtime libraries
ii cuda-cusparse-dev-10-0 10.0.130-1 ppc64el CUSPARSE native dev links, headers
ii cuda-documentation-10-0 10.0.130-1 ppc64el CUDA documentation
ii cuda-driver-dev-10-0 10.0.130-1 ppc64el CUDA Driver native dev stub library
ii cuda-drivers 418.40.04-1 ppc64el CUDA Driver meta-package
root@7b3af9f91c0f:/# apt-get remove nvidia-driver-418
Reading package lists... Done
Building dependency tree
Reading state information... Done
The following packages were automatically installed and are no longer required:
dkms keyboard-configuration kmod libegl-mesa0 libegl1 libgbm1 libgles2 libjansson4 libnvidia-cfg1-418 libnvidia-common-418 libnvidia-compute-418
libnvidia-decode-418 libnvidia-encode-418 libnvidia-fbc1-418 libnvidia-gl-418 libnvidia-ifr1-418 libopengl0 libpciaccess0 libpolkit-agent-1-0
libpolkit-backend-1-0 libpolkit-gobject-1-0 libunwind8 libvdpau1 libwayland-server0 libxcb-xfixes0 libxfont2 libxkbfile1 libxnvctrl0
linux-headers-4.15.0-47 linux-headers-4.15.0-47-generic linux-headers-generic mesa-vdpau-drivers nvidia-compute-utils-418 nvidia-dkms-418
nvidia-kernel-common-418 nvidia-kernel-source-418 nvidia-modprobe nvidia-prime nvidia-settings nvidia-utils-418 pkg-config policykit-1 policykit-1-gnome
python3-xkit screen-resolution-extra sudo udev vdpau-driver-all x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils xserver-common
xserver-xorg-core-hwe-18.04 xserver-xorg-video-nvidia-418
Use 'apt autoremove' to remove them.
The following packages will be REMOVED:
cuda-10-0 cuda-drivers cuda-runtime-10-0 nvidia-driver-418
0 upgraded, 0 newly installed, 4 to remove and 5 not upgraded.
After this operation, 1141 kB disk space will be freed.
Do you want to continue? [Y/n] Y
(Reading database ... 80881 files and directories currently installed.)
Removing cuda-10-0 (10.0.130-1) ...
Removing cuda-runtime-10-0 (10.0.130-1) ...
Removing cuda-drivers (418.40.04-1) ...
Removing nvidia-driver-418 (418.40.04-0ubuntu1) ...
root@7b3af9f91c0f:/# apt autoremove
Reading package lists... Done
Building dependency tree
Reading state information... Done
The following packages will be REMOVED:
dkms keyboard-configuration kmod libegl-mesa0 libegl1 libgbm1 libgles2 libjansson4 libnvidia-cfg1-418 libnvidia-common-418 libnvidia-compute-418
libnvidia-decode-418 libnvidia-encode-418 libnvidia-fbc1-418 libnvidia-gl-418 libnvidia-ifr1-418 libopengl0 libpciaccess0 libpolkit-agent-1-0
libpolkit-backend-1-0 libpolkit-gobject-1-0 libunwind8 libvdpau1 libwayland-server0 libxcb-xfixes0 libxfont2 libxkbfile1 libxnvctrl0
linux-headers-4.15.0-47 linux-headers-4.15.0-47-generic linux-headers-generic mesa-vdpau-drivers nvidia-compute-utils-418 nvidia-dkms-418
nvidia-kernel-common-418 nvidia-kernel-source-418 nvidia-modprobe nvidia-prime nvidia-settings nvidia-utils-418 pkg-config policykit-1 policykit-1-gnome
python3-xkit screen-resolution-extra sudo udev vdpau-driver-all x11-xkb-utils xfonts-base xfonts-encodings xfonts-utils xserver-common
xserver-xorg-core-hwe-18.04 xserver-xorg-video-nvidia-418
0 upgraded, 0 newly installed, 55 to remove and 5 not upgraded.
After this operation, 351 MB disk space will be freed.
Do you want to continue? [Y/n] Y
If you would prefer to access the root account with su(1)
or by logging in directly,
you must set a root password with "sudo passwd".
If you have arranged other means to access the root account,
and you are sure this is what you want,
you may bypass this check by setting an environment variable
(export SUDO_FORCE_REMOVE=yes).
Refusing to remove sudo.
dpkg: error processing package sudo (--remove):
installed sudo package pre-removal script subprocess returned error exit status 1
Removing udev (237-3ubuntu10.19) ...
invoke-rc.d: could not determine current runlevel
invoke-rc.d: policy-rc.d denied execution of stop.
Removing xfonts-base (1:1.0.4+nmu1) ...
Removing xfonts-utils (1:7.7+6) ...
Removing xfonts-encodings (1:1.0.4-2) ...
Removing libegl1:ppc64el (1.0.0-2ubuntu2.2) ...
Removing libegl-mesa0:ppc64el (18.2.8-0ubuntu0~18.04.2) ...
Removing libgbm1:ppc64el (18.2.8-0ubuntu0~18.04.2) ...
Removing libnvidia-compute-418:ppc64el (418.40.04-0ubuntu1) ...
Removing libwayland-server0:ppc64el (1.16.0-1ubuntu1.1~18.04.1) ...
Removing libxcb-xfixes0:ppc64el (1.13-2~ubuntu18.04) ...
Errors were encountered while processing:
nvidia-utils-418
nvidia-compute-utils-418
sudo
E: Sub-process /usr/bin/dpkg returned an error code (1)
이제 위 컨테이너를 종료하지 않은 상태에서(--rm 옵션으로 실행했으므로 종료하면 컨테이너가 삭제됩니다), 호스트의 다른 터미널 창에서 해당 컨테이너를 새 이미지로 commit 합니다.
[root@powerlinux ~]# docker commit 7b3af9f91c0f test:v0.5
sha256:1f77da5befa581a867371a0ffd5c7e75e09ac04249fdb0e5b09aa014037d0583
[root@powerlinux ~]# docker run --runtime=nvidia --rm -ti test:v0.5
root@07a93a6f8fa6:/# nvidia-smi
Wed Apr 24 03:33:53 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.39 Driver Version: 418.39 CUDA Version: 10.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla V100-SXM2... On | 00000004:04:00.0 Off | 0 |
| N/A 39C P0 39W / 300W | 0MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla V100-SXM2... On | 00000004:05:00.0 Off | 0 |
| N/A 40C P0 38W / 300W | 0MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla V100-SXM2... On | 00000035:03:00.0 Off | 0 |
| N/A 39C P0 38W / 300W | 0MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla V100-SXM2... On | 00000035:04:00.0 Off | 0 |
| N/A 42C P0 37W / 300W | 0MiB / 16130MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
root@07a93a6f8fa6:/# python
Python 3.5.6 |Anaconda custom (64-bit)| (default, Aug 26 2018, 22:03:11)
[GCC 7.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
>>> sess=tf.Session()
2019-04-24 03:34:40.080740: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0004:04:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.190470: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 1 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0004:05:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.305558: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 2 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0035:03:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.426210: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 3 with properties:
name: Tesla V100-SXM2-16GB major: 7 minor: 0 memoryClockRate(GHz): 1.53
pciBusID: 0035:04:00.0
totalMemory: 15.75GiB freeMemory: 15.44GiB
2019-04-24 03:34:40.426315: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0, 1, 2, 3
2019-04-24 03:34:42.072754: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-04-24 03:34:42.072804: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0 1 2 3
2019-04-24 03:34:42.072820: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N Y Y Y
2019-04-24 03:34:42.072832: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 1: Y N Y Y
2019-04-24 03:34:42.072844: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 2: Y Y N Y
2019-04-24 03:34:42.072856: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 3: Y Y Y N
2019-04-24 03:34:42.077306: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14941 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0004:04:00.0, compute capability: 7.0)
2019-04-24 03:34:42.078146: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 14941 MB memory) -> physical GPU (device: 1, name: Tesla V100-SXM2-16GB, pci bus id: 0004:05:00.0, compute capability: 7.0)
2019-04-24 03:34:42.078729: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:2 with 14941 MB memory) -> physical GPU (device: 2, name: Tesla V100-SXM2-16GB, pci bus id: 0035:03:00.0, compute capability: 7.0)
2019-04-24 03:34:42.079688: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:3 with 14945 MB memory) -> physical GPU (device: 3, name: Tesla V100-SXM2-16GB, pci bus id: 0035:04:00.0, compute capability: 7.0)
>>>
댓글 없음:
댓글 쓰기