1 CentOS 7.7 宿主机安装NVIDIA驱动

NVIDIA驱动官方下载地址

(1)关闭nouveau driver

echo "blacklist nouveau" >> /etc/modprobe.d/blacklist.conf
cat >>/lib/modprobe.d/dist-blacklist.conf<<EOF
blacklist nouveau
options nouveau modeset=0
EOF

(2)给当前镜像做备份,并建立新的镜像

mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r)-nouveau.img
dracut /boot/initramfs-$(uname -r).img $(uname -r)
reboot # 重启机器

(3)安装NVIDIA驱动

rpm -ivh NVIDIA-vGPU-rhel-7.7-460.32.04.x86_64.rpm
reboot # 安装完后重启系统

(4)验证系统是否加载NVIDIA vGPU驱动

[root@localhost NVIDIA]# lsmod | grep vfio
nvidia_vgpu_vfio       50250  19 
nvidia              34001825  295 nvidia_vgpu_vfio
vfio_mdev              12841  1 
mdev                   20336  2 vfio_mdev,nvidia_vgpu_vfio
vfio_iommu_type1       22440  1 
vfio                   32657  6 vfio_mdev,nvidia_vgpu_vfio,vfio_iommu_type1

(5)nvidia-smi验证

[root@localhost NVIDIA]# nvidia-smi 
Tue May 11 15:51:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.04    Driver Version: 460.32.04    CUDA Version: N/A      |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla P40           On   | 00000000:3D:00.0 Off |                    0 |
| N/A   19C    P8    19W / 250W |     41MiB / 23039MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P40           On   | 00000000:41:00.0 Off |                    0 |
| N/A   20C    P8    19W / 250W |  22899MiB / 23039MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    1   N/A  N/A      5268    C+G   vgpu                            22848MiB |
+-----------------------------------------------------------------------------+

2 KVM虚拟机安装NVIDIA 驱动

(1)配置kvm虚拟机网络跟宿主机桥接

# 关闭NetworkManager服务
systemctl stop NetworkManager
systemctl disable NetworkManager
# 创建新的br0网卡,跟物理网卡进行绑定
virsh iface-bridge enp97s0f0 br0 

(2)网卡验证

[root@localhost ~]# brctl show
bridge name	bridge id		STP enabled	interfaces
br0		8000.b4055d3f1e78	yes		enp97s0f0
							vnet0
virbr0		8000.525400789520	yes		virbr0-nic

(3)拆分物理GPU,创建vGPU

# 通过nvidia-smi查看显卡bus_id,以 0000:41:00.0 为例(与下文创建出的vGPU所在的物理GPU一致)
cd /sys/class/mdev_bus/0000:41:00.0/mdev_supported_types

(4)列出物理GPU可以拆分的类型

[root@localhost mdev_supported_types]# for i in nvidia-* ; do echo -n "$i "; cat "$i/name" ; done
nvidia-156 GRID P40-2B
nvidia-215 GRID P40-2B4
nvidia-241 GRID P40-1B4
nvidia-283 GRID P40-4C
nvidia-284 GRID P40-6C
nvidia-285 GRID P40-8C
nvidia-286 GRID P40-12C
nvidia-287 GRID P40-24C
nvidia-46 GRID P40-1Q
nvidia-47 GRID P40-2Q
nvidia-48 GRID P40-3Q
nvidia-49 GRID P40-4Q
nvidia-50 GRID P40-6Q
nvidia-51 GRID P40-8Q
nvidia-52 GRID P40-12Q
nvidia-53 GRID P40-24Q
nvidia-54 GRID P40-1A
nvidia-55 GRID P40-2A
nvidia-56 GRID P40-3A
nvidia-57 GRID P40-4A
nvidia-58 GRID P40-6A
nvidia-59 GRID P40-8A
nvidia-60 GRID P40-12A
nvidia-61 GRID P40-24A
nvidia-62 GRID P40-1B

(5)这里我用的是 Q 渲染模式

# 详细的模式选择可参考官方文档
# https://docs.nvidia.com/grid/10.0/grid-vgpu-user-guide/index.html#virtual-gpu-types-grid
[root@localhost mdev_supported_types]# cd nvidia-53
[root@localhost nvidia-53]# ls
available_instances  create  description  device_api  devices  name
[root@localhost nvidia-53]# cat available_instances 
1
# 为vGPU生成唯一标识符
[root@localhost nvidia-53]# uuidgen
3d107946-44ef-4c87-aff2-dff273402208
[root@localhost nvidia-53]# echo "3d107946-44ef-4c87-aff2-dff273402208" > create
# 确认vGPU被创建
[root@localhost ~]# ls -l /sys/bus/mdev/devices/
total 0
lrwxrwxrwx. 1 root root 0 Apr 30 15:44 3d107946-44ef-4c87-aff2-dff273402208 -> ../../../devices/pci0000:3a/0000:3a:00.0/0000:3b:00.0/0000:3c:14.0/0000:41:00.0/3d107946-44ef-4c87-aff2-dff273402208

(6)kvm虚拟机引用此vGPU

[root@localhost ~]# virsh list --all
 Id    Name                           State
----------------------------------------------------
 -     centos7.0                        shut off
[root@localhost ~]# virsh edit centos7.0
# 添加如下字段
    <hostdev mode='subsystem' type='mdev' managed='no' model='vfio-pci' display='off'>
      <source>
        <address uuid='3d107946-44ef-4c87-aff2-dff273402208'/>
      </source>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x0a' function='0x0'/>
    </hostdev>
# 启动kvm虚拟机
[root@localhost ~]# virsh start centos7.0

(7)将NVIDIA的.run安装程序拷贝到KVM虚拟机中(该可执行文件包含在官网下载的zip包内),以下操作均在虚拟机内执行

[root@aster NVIDIA]# ls
NVIDIA-Linux-x86_64-460.32.03-grid.run
# 关闭nouveau driver
echo "blacklist nouveau" >> /etc/modprobe.d/blacklist.conf
cat >>/lib/modprobe.d/dist-blacklist.conf<<EOF
blacklist nouveau
options nouveau modeset=0
EOF
# 给当前镜像做备份,并建立新的镜像
mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r)-nouveau.img
dracut /boot/initramfs-$(uname -r).img $(uname -r)
reboot # 重启机器

(8)安装NVIDIA driver,需要在init3模式下

[root@aster NVIDIA]# init 3
[root@aster NVIDIA]# chmod 777 NVIDIA-Linux-x86_64-460.32.03-grid.run
[root@aster NVIDIA]# bash NVIDIA-Linux-x86_64-460.32.03-grid.run
[root@aster NVIDIA]# reboot  # 安装完之后reboot
# 重启后查看
[root@aster NVIDIA]# nvidia-smi 
Tue May 11 16:36:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GRID P40-24Q        On   | 00000000:00:0A.0 Off |                  N/A |
| N/A   N/A    P8    N/A /  N/A |   3636MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

vGPU类型系列及其适用场景:

| Series   | Optimal Workload |
|----------|------------------|
| Q-series | Virtual workstations for creative and technical professionals who require the performance and features of Quadro technology |
| C-series | Compute-intensive server workloads, such as artificial intelligence (AI), deep learning, or high-performance computing (HPC) |
| B-series | Virtual desktops for business professionals and knowledge workers |
| A-series | App streaming or session-based solutions for virtual applications users |

参考文章:NVIDIA VIRTUAL GPU SOFTWARE DOCUMENTATION

Logo

华为开发者空间,是为全球开发者打造的专属开发空间,汇聚了华为优质开发资源及工具,致力于让每一位开发者拥有一台云主机,基于华为根生态开发、创新。

更多推荐