Download the following package: https://hpc.lenovo.com/lico/downloads/7.1/intel_graphics.el8.x86_64.tgz
#### Step 2. Upload the package to the /root directory and set the environment variables.
# Work from /root, the directory the package was uploaded to.
cd /root
# Write the two repository path variables to intel-graphic_env.local.
# The heredoc body contains no $-expansions, so it is written out verbatim.
cat > intel-graphic_env.local << EOF
# repository directory for intel-graphic
intel_graphics_repo_dir="/install/custom/intel-graphics-rhel8.6"
# link name of repository directory for intel-graphic
link_intel_graphics_repo_dir="/install/custom/intel-graphics"
EOF
Configure the repository for the management node:
# Load the cluster settings and the Intel graphics repository paths.
cd /root
source lico_env.local
source intel-graphic_env.local
# Unpack the package into the versioned repository directory.
mkdir -p "$intel_graphics_repo_dir"
tar -xvf intel_graphics.el8.x86_64.tgz -C "$intel_graphics_repo_dir"
# Point the stable link name at the versioned directory, replacing any old link.
rm -rf "$link_intel_graphics_repo_dir"
ln -s "$intel_graphics_repo_dir" "$link_intel_graphics_repo_dir"
# Run the repository setup script shipped in the package
# (presumably creates /etc/yum.repos.d/intel-graphics.repo — confirm).
"$link_intel_graphics_repo_dir"/mklocalrepo.sh
Attention: Before running the commands, ensure that the management node has configured a local operating system repository for the above and the subsequent actions.
Configure the repository for other nodes:
# Copy the management node's repo file into the shared installer directory.
# NOTE(review): $share_installer_dir and ${sms_name} are not defined in this
# section — presumably they come from lico_env.local; confirm.
cp /etc/yum.repos.d/intel-graphics.repo $share_installer_dir
# Drop the existing baseurl= and gpgkey= lines from the shared copy.
sed -i '/^baseurl=/d' $share_installer_dir/intel-graphics.repo
sed -i '/^gpgkey=/d' $share_installer_dir/intel-graphics.repo
# Re-add both entries as HTTP URLs served from ${sms_name}, each appended
# directly after the "name=Intel-graphics" line. The backslash-newline inside
# the double quotes joins the two physical lines, so each URL is one string.
sed -i "/name=Intel-graphics/a\baseurl=http://${sms_name}\
${link_intel_graphics_repo_dir}/graphics/rhel/8.6" $share_installer_dir/intel-graphics.repo
sed -i "/name=Intel-graphics/a\gpgkey=http://${sms_name}\
${link_intel_graphics_repo_dir}/intel-graphics.key" $share_installer_dir/intel-graphics.repo
# Install the rewritten repo file on the nodes and rebuild the dnf cache.
# NOTE(review): the file is copied to "all" nodes but the cache is refreshed
# only on "compute" — confirm this difference is intended.
nodeshell all cp $share_installer_dir/intel-graphics.repo /etc/yum.repos.d/
nodeshell compute dnf clean all
nodeshell compute dnf makecache
The GPU driver should be installed on each GPU compute node. If only a subset of nodes is installed with GPUs, replace the compute argument in nodeshell commands with the corresponding node range of GPU nodes.
# Install compiler/build tooling and dkms on the compute nodes.
nodeshell compute dnf install -y openssl-devel gcc bison flex bc bzip2-devel elfutils-libelf-devel rsync \
rpm-build make perl lz4 redhat-lsb-core patch dkms
# NOTE(review): this command has no nodeshell prefix, so it runs only on the
# node where it is typed and $(uname -r) is that node's kernel version —
# confirm whether it must also be run on each GPU compute node.
dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r)
# Install the Intel GPU DKMS kernel-module packages on the compute nodes.
# -y answers dnf's confirmation prompt, which cannot be answered through
# nodeshell; this matches the prerequisite install command.
nodeshell compute dnf install -y intel-i915-dkms \
intel-dmabuf-dkms \
intel-platform-vsec-dkms intel-platform-cse-dkms
# Install the Intel GPU user-space runtime packages on the compute nodes.
# Every continuation backslash is preceded by a space: a backslash glued to a
# package name would join it with the first name on the next line.
# -y answers dnf's confirmation prompt, which cannot be answered through nodeshell.
nodeshell compute dnf install -y \
intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2 \
level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
mesa-libxatracker libvpl-tools intel-metrics-discovery \
intel-metrics-library intel-igc-core intel-igc-cm libva libva-utils intel-gmmlib intel-cmemu libmetee intel-gsc
If you will be doing development, you may want to install optional development packages. oneAPI users will need to install these packages for some of the oneAPI tools to function correctly:
# Install the optional development packages on the compute nodes.
# --refresh forces dnf to re-check repository metadata first; -y answers the
# confirmation prompt, which cannot be answered through nodeshell.
nodeshell compute dnf install -y --refresh \
intel-igc-opencl-devel \
level-zero-devel \
intel-gsc-devel libmetee-devel
Reboot the system for these changes to take effect
# Reboots every node in the "compute" range; substitute the GPU node range
# if only a subset of the nodes has GPUs.
nodeshell compute reboot
The Intel XPU Manager package should be installed on each GPU compute node.
Download the XPU Manager package and install it on the compute node:
# Download the XPU Manager (xpu-smi) RPM, then install that same file.
# The file name passed to dnf must match the file wget just saved.
wget https://github.com/intel/xpumanager/releases/download/v1.1.0_golden2/xpu-smi-1.1.0-20221026.163421.06e646c9.centos8.5.x86_64.rpm
dnf install xpu-smi-1.1.0-20221026.163421.06e646c9.centos8.5.x86_64.rpm
Run the following command on the GPU nodes to verify that the Intel GPU driver was installed successfully:
[root@c1 ~]# xpu-smi discovery
+-----------+--------------------------------------------------------------------------------------+
| Device ID | Device Information |
+-----------+--------------------------------------------------------------------------------------+
| 0 | Device Name: Intel(R) Graphics [0x56c1] |
| | Vendor Name: Intel(R) Corporation |
| | UUID: 01000000-0000-0000-0000-000000b70000 |
| | PCI BDF Address: 0000:b7:00.0 |
| | DRM Device: /dev/dri/card0 |
+-----------+--------------------------------------------------------------------------------------+
| 1 | Device Name: Intel(R) Graphics [0x56c1] |
| | Vendor Name: Intel(R) Corporation |
| | UUID: 01000000-0000-0000-0000-000000bb0000 |
| | PCI BDF Address: 0000:bb:00.0 |
| | DRM Device: /dev/dri/card1 |
+-----------+--------------------------------------------------------------------------------------+
The following typical fields need to be configured in slurm.conf:
GresTypes=gpu
NodeName=c1 Gres=gpu:2 Sockets=2 CPUs=108 CoresPerSocket=27 ThreadsPerCore=2 State=UNKNOWN
NodeName=c2 Gres=gpu:2 Sockets=2 CPUs=108 CoresPerSocket=27 ThreadsPerCore=2 State=UNKNOWN
Modify the following content in gres.conf:
AutoDetect=oneapi
Sync the modified slurm.conf, cgroup.conf, and gres.conf files to the /etc/slurm/ directory on the compute nodes.
Run the following commands on the management node to start the slurmctld and slurmd services.
# Enable slurmctld at boot and (re)start it on the node where this is run.
systemctl enable slurmctld
systemctl restart slurmctld
# Enable and (re)start slurmd on every node in the "all" range.
nodeshell all systemctl enable slurmd
nodeshell all systemctl restart slurmd