Difference between revisions of "Broadcom install for RoCE on ROCm devices"
Jump to navigation
Jump to search
(Created page with "<pre> ❯ cat broadcom-roce-setup.txt # ubuntu 22.04 apt install ibverbs-utils libibverbs1 rdmacm-utils librdmacm1 librdmacm-dev infiniband-diags perftest hwloc ibutils numa...") |
(No difference)
|
Revision as of 10:46, 23 April 2024
❯ cat broadcom-roce-setup.txt
# ubuntu 22.04
apt install ibverbs-utils libibverbs1 rdmacm-utils librdmacm1 librdmacm-dev infiniband-diags perftest hwloc ibutils numactl
# latest version from: https://www.broadcom.com/products/ethernet-connectivity/network-adapters/p1200g
curl -LO https://docs.broadcom.com/docs-and-downloads/ethernet-network-adapters/NXE/BRCM_229.1.123.0/bcm_229.1.123.0b.tar.gz
tar zxvf bcm_229.1.123.0b.tar.gz
cd bcm_229.1.123.0b/
cd Linux/Linux_Installer/
cd niccli/Linux_x86_64
dpkg -i ./niccli_229.0.150.0-1_x86_64.deb
cd niccli/
tar zxvf sliff-229.0.150.0.tar.gz
cd sliff-229.0.150.0/
make
make install
modprobe sliff
cd Linux/KMP-RoCE-Lib/
tar zxvf libbnxt_re-229.0.139.0.tar.gz
cd libbnxt_re-229.0.139.0/
sh autogen.sh
./configure
make
make install
cp bnxt_re.driver /etc/libibverbs.d/
echo "/usr/local/lib" >> /etc/ld.so.conf
ldconfig -v
# little hacky bits: remove (comment out) libbnxtre from the install.sh script, as it's already installed above.
229 ROCE_LINES=""
230 if [ "$ROCE" == "true" ]; then
231 ROCE_LINES="
232 # - libbnxtre
233 - modules-load
234 - roce_cc
235 "
236 fi
cd Linux/Linux_Installer/
bash install.sh -i enp139s0np0
# reboot system
# on reboot we should see
root@gpu4:~/scratch/bcm_229.1.123.0b# ibstatus
Infiniband device 'bnxt_re0' port 1 status:
default gid: fe80:0000:0000:0000:7ec2:55ff:feba:7388
base lid: 0x0
sm lid: 0x0
state: 4: ACTIVE
phys state: 5: LinkUp
rate: 100 Gb/sec (4X EDR)
link_layer: Ethernet
root@gpu4:~/scratch/bcm_229.1.123.0b/Linux/Linux_Installer# rdma link show
link bnxt_re0/1 state ACTIVE physical_state LINK_UP netdev enp139s0np0
# benchmarks - standard bond1 numbers, no roce
ubuntu@gpu3:~$ /opt/apps/ompi/5.0.x/bin/mpirun --mca btl_tcp_if_include bond1.103 -np 2 -H gpu3-ib,gpu4-ib /usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_latency
[gpu3:08818] No HIP capabale device found. Disabling component.
# OSU MPI Latency Test v7.3
# Size Latency (us)
# Datatype: MPI_CHAR.
1 29.19
2 29.23
4 29.17
8 29.14
16 29.05
32 29.23
64 29.32
128 29.28
256 30.56
512 30.65
1024 30.90
2048 31.47
4096 33.62
8192 36.95
16384 46.44
32768 59.56
65536 163.48
131072 176.76
262144 260.66
524288 377.56
1048576 632.83
2097152 1135.50
4194304 2127.84
ubuntu@gpu3:~$ /opt/apps/ompi/5.0.x/bin/mpirun --mca btl_tcp_if_include bond1.103 -np 2 -H gpu3-ib,gpu4-ib /usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_
osu_bibw osu_bw osu_latency osu_latency_mp osu_latency_mt osu_mbw_mr osu_multi_lat
ubuntu@gpu3:~$ /opt/apps/ompi/5.0.x/bin/mpirun --mca btl_tcp_if_include bond1.103 -np 2 -H gpu3-ib,gpu4-ib /usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_bw
[gpu3:08906] No HIP capabale device found. Disabling component.
# OSU MPI Bandwidth Test v7.3
# Size Bandwidth (MB/s)
# Datatype: MPI_CHAR.
1 0.33
2 0.66
4 1.31
8 2.64
16 5.33
32 8.42
64 19.40
128 38.31
256 74.01
512 127.42
1024 249.89
2048 455.06
4096 789.85
8192 1276.01
16384 1839.29
32768 2287.82
65536 2351.62
131072 3006.95
262144 3881.76
524288 4573.31
1048576 4889.55
2097152 4917.25
4194304 4944.65
# checking rdma
# server: rping -s -d -C 1
# client: rping -c -a 192.168.17.2 -d -C 1
# ib_send_lat -d bnxt_re3
# ib_send_lat -d bnxt_re3 192.168.17.1
rdma link show
link bnxt_re0/1 state DOWN physical_state DISABLED netdev enp37s0np0
link bnxt_re1/1 state DOWN physical_state DISABLED netdev enp12s0np0
link bnxt_re2/1 state ACTIVE physical_state LINK_UP netdev enp180s0np0
link bnxt_re3/1 state ACTIVE physical_state LINK_UP netdev enp139s0np0