Lustre: Using lustre-iokit

Lustre IO Kit is available from: http://downloads.lustre.org/public/tools/lustre-iokit/

The instructions below are based on version 1.2. Tests were conducted on standard compute nodes over 1GbE links; the disks are capable of ~130MB/s read/write. The test filesystem comprises one MDT and four OSSes, with one OST per OSS.

lustre-iokit requires the following packages to be installed (both are part of the CentOS repository in Platform HPC):

  • sg3_utils
  • sg3_utils-libs

Install lustre-iokit update for EL7

yum install sg3_utils 
wget https://downloads.hpdd.intel.com/public/lustre/lustre-2.7.0/el7/client/RPMS/x86_64/lustre-iokit-2.7.0-3.10.0_123.20.1.el7.x86_64.x86_64.rpm
rpm -ivh lustre-iokit-2.7.0-3.10.0_123.20.1.el7.x86_64.x86_64.rpm

Download/Build lustre-iokit

# Get src RPM
wget http://downloads.lustre.org/public/tools/lustre-iokit/lustre-iokit-1.2-200709210921.src.rpm

# Install src RPM
rpm -ivh lustre-iokit-1.2-200709210921.src.rpm

# Build binary RPM package
rpmbuild -bb /usr/src/redhat/SPECS/lustre-iokit.spec

# Install built package
rpm -ivh  /usr/src/redhat/RPMS/noarch/lustre-iokit-1.2-200709210921.noarch.rpm
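To confirm the package installed cleanly, query the RPM database:

rpm -q lustre-iokit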

lustre-iokit Contents

The following files are provided as part of the package:

[root@atass ~]$ rpm -qpl /usr/src/redhat/RPMS/noarch/lustre-iokit-1.2-200709210921.noarch.rpm 
/usr/bin/config.sh
/usr/bin/gather_stats_everywhere.sh
/usr/bin/ior-survey
/usr/bin/libecho
/usr/bin/lstats.sh
/usr/bin/obdfilter-survey
/usr/bin/ost-survey
/usr/bin/parse-ior
/usr/bin/plot-obdfilter
/usr/bin/plot-ost
/usr/bin/plot-sgpdd
/usr/bin/sgpdd-survey
/usr/share/doc/lustre-iokit-1.2
/usr/share/doc/lustre-iokit-1.2/README.ior-survey
/usr/share/doc/lustre-iokit-1.2/README.lstats.sh
/usr/share/doc/lustre-iokit-1.2/README.obdfilter-survey
/usr/share/doc/lustre-iokit-1.2/README.ost-survey
/usr/share/doc/lustre-iokit-1.2/README.sgpdd-survey
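Each survey ships with a README under /usr/share/doc/lustre-iokit-1.2 (listed above); these are worth reading before a first run, e.g.:

less /usr/share/doc/lustre-iokit-1.2/README.sgpdd-survey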

The toolkit contains the following tests:

  • sgpdd-survey
  • obdfilter-survey
  • ost-survey
  • stats-collect

sgpdd-survey

This is a bare-metal dd performance test run at block level against the OST disk devices. Running sgp_dd will ERASE the contents of the disk devices; it must NOT be run on any OST holding data you care about!

Note: this requires the sg kernel module to be loaded, and the device must appear in the output of sg_map.
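For example, to load the module and confirm the target device appears in the mapping (using /dev/sda as in the run below):

modprobe sg
sg_map | grep /dev/sda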

[root@atass07 ~]$ scsidevs=/dev/sda size=128 crghi=16 thrhi=21 /usr/bin/sgpdd-survey 
Wed Aug 17 17:37:25 BST 2011 sgpdd-survey on /dev/sda from atass07
total_size   131072K rsz 1024 crg     1 thr     1 write  135.23 MB/s     1 x 135.98 =  135.98 MB/s read  135.35 MB/s     1 x 136.08 =  136.08 MB/s 
total_size   131072K rsz 1024 crg     1 thr     2 write  140.37 MB/s     1 x 141.12 =  141.12 MB/s read  131.26 MB/s     1 x 131.92 =  131.92 MB/s 
total_size   131072K rsz 1024 crg     1 thr     4 write  138.47 MB/s     1 x 139.25 =  139.25 MB/s read  131.55 MB/s     1 x 132.27 =  132.27 MB/s 
total_size   131072K rsz 1024 crg     1 thr     8 write  139.05 MB/s     1 x 139.86 =  139.86 MB/s read  132.54 MB/s     1 x 133.28 =  133.28 MB/s 
total_size   131072K rsz 1024 crg     1 thr    16 write  135.06 MB/s     1 x 135.78 =  135.78 MB/s read  131.55 MB/s     1 x 132.24 =  132.24 MB/s 
total_size   131072K rsz 1024 crg     2 thr     2 write  124.71 MB/s     2 x  62.69 =  125.39 MB/s read  120.18 MB/s     2 x  60.41 =  120.81 MB/s 
total_size   131072K rsz 1024 crg     2 thr     4 write  116.48 MB/s     2 x  58.53 =  117.05 MB/s read  126.02 MB/s     2 x  63.33 =  126.67 MB/s 
total_size   131072K rsz 1024 crg     2 thr     8 write  122.01 MB/s     2 x  61.32 =  122.64 MB/s read  126.48 MB/s     2 x  63.57 =  127.14 MB/s 
total_size   131072K rsz 1024 crg     2 thr    16 write  120.14 MB/s     2 x  60.38 =  120.75 MB/s read  126.68 MB/s     2 x  63.66 =  127.32 MB/s 
total_size   131072K rsz 1024 crg     4 thr     4 write  108.01 MB/s     4 x  27.13 =  108.53 MB/s read  126.74 MB/s     4 x  31.85 =  127.41 MB/s 
total_size   131072K rsz 1024 crg     4 thr     8 write  110.71 MB/s     4 x  27.84 =  111.35 MB/s read  120.44 MB/s     4 x  30.28 =  121.12 MB/s 
total_size   131072K rsz 1024 crg     4 thr    16 write  126.18 MB/s     4 x  31.75 =  126.99 MB/s read  134.38 MB/s     4 x  33.79 =  135.15 MB/s 
total_size   131072K rsz 1024 crg     8 thr     8 write  122.43 MB/s     8 x  15.42 =  123.37 MB/s read  131.48 MB/s     8 x  16.57 =  132.52 MB/s 
total_size   131072K rsz 1024 crg     8 thr    16 write  133.87 MB/s     8 x  16.86 =  134.89 MB/s read  144.50 MB/s     8 x  18.17 =  145.34 MB/s 
total_size   131072K rsz 1024 crg    16 thr    16 write  131.38 MB/s    16 x   8.30 =  132.75 MB/s read  140.90 MB/s    16 x   8.86 =  141.75 MB/s
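The kit includes a plot-sgpdd helper (see the package contents above) that turns the survey's summary output into gnuplot data. Invocation should be along the lines of the following; the summary filename is illustrative, as sgpdd-survey names its result files with a timestamp:

plot-sgpdd sgpdd_survey.summary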

obdfilter-survey

The Lustre IO Kit provides the obdfilter-survey script, which exercises the obdfilter layer of the Lustre I/O stack by reading, writing and rewriting Lustre objects. obdfilter-survey is primarily used for sizing OST throughput, both locally and over the network.

The Lustre OSS needs to be configured before running the survey against its OSTs. Check the output of the lctl dl command on the OSS nodes to verify that obdfilter instances exist:

[root@atass04 ~]$ lctl dl 
  0 UP mgc MGC172.20.0.4@tcp cfc518c4-e41d-5dc5-240f-86f24981edfb 5
  1 UP ost OSS OSS_uuid 3
  2 UP obdfilter lustrewt-OST0003 lustrewt-OST0003_UUID 31

Ensure the obdecho module is present:

modprobe obdecho
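A quick check that the module actually loaded:

lsmod | grep obdecho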

obdfilter-survey can be invoked with the following parameters:

  • case: disk, network or netdisk. Runs the survey on disk-backed local obdfilter instances, over the network against the echo layer (loopback), or over the network against disk-backed instances
  • thrlo-thrhi: low and high thread counts
  • nobjlo-nobjhi: low and high numbers of objects to read/write
  • rszlo-rszhi: low and high record sizes in KB
  • size: total IO size in MB
  • targets: names of the obdfilter instances

Recommended parameters are:

  • rszlo=rszhi=1024, nobjhi=128, thrhi=128


Run locally against an OST (on the OSS)

[root@oss01 ~]$ rszlo=1024 rszhi=1024 nobjhi=128 thrhi=128 size=1024 case=disk sh obdfilter-survey
Thu Aug 18 11:57:16 BST 2011 Obdfilter-survey for case=disk from atass04
ost  1 sz  1048576K rsz 1024 obj    1 thr    1 write   22.27 [  20.96,  22.98] 
ost  1 sz  1048576K rsz 1024 obj    1 thr    2 write   22.28 [  20.98,  22.98] 
ost  1 sz  1048576K rsz 1024 obj    1 thr    4 write   38.06 [  32.94,  39.96] 
ost  1 sz  1048576K rsz 1024 obj    1 thr    8 write   53.47 [  47.91,  55.95] 
ost  1 sz  1048576K rsz 1024 obj    1 thr   16 write   76.96 [  64.94,  79.93] 
ost  1 sz  1048576K rsz 1024 obj    1 thr   32 write   96.43 [  94.91,  96.91] 
ost  1 sz  1048576K rsz 1024 obj    1 thr   64 write  100.21 [  91.91, 127.88] 
ost  1 sz  1048576K rsz 1024 obj    1 thr  128 write  113.79 [ 127.88, 127.88] 
... [etc]

Run over the network from a Lustre client

Unmount the /mnt/lustre filesystem on the client before running (umount /mnt/lustre).

[root@comp10 ~]$ rszlo=1024 rszhi=1024 nobjhi=8 thrhi=8 size=1024 targets="oss01" case=network sh obdfilter-survey
Warning: Permanently added 'atass01,172.20.0.5' (RSA) to the list of known hosts.
Thu Aug 18 12:27:36 BST 2011 Obdfilter-survey for case=network from atass10
ost  1 sz  1048576K rsz 1024 obj    1 thr    1 write   45.35 [  44.91,  45.96] 
ost  1 sz  1048576K rsz 1024 obj    1 thr    2 write   88.53 [  86.71,  90.92] 
ost  1 sz  1048576K rsz 1024 obj    1 thr    4 write  100.02 [  98.91, 100.91] 
ost  1 sz  1048576K rsz 1024 obj    1 thr    8 write  105.18 [ 103.91, 107.90] 
ost  1 sz  1048576K rsz 1024 obj    2 thr    2 write   90.30 [  89.83,  91.82] 
ost  1 sz  1048576K rsz 1024 obj    2 thr    4 write  100.06 [  99.81, 100.91] 
ost  1 sz  1048576K rsz 1024 obj    2 thr    8 write  104.68 [ 103.90, 106.90] 
ost  1 sz  1048576K rsz 1024 obj    4 thr    4 write   99.62 [  98.91, 100.91] 
ost  1 sz  1048576K rsz 1024 obj    4 thr    8 write  104.81 [ 102.91, 109.90] 
ost  1 sz  1048576K rsz 1024 obj    8 thr    8 write  104.54 [ 103.87, 104.90] 
done!

Script for running obdfilter-survey

#!/bin/bash
 
# To get the names of the targets, run the following:
#[root@head ~]# pdsh -w oss[1-2] lctl dl '| grep obd'
#oss2:   3 UP obdfilter lfs1-OST0004 lfs1-OST0004_UUID 107
#oss2:   6 UP obdfilter lfs1-OST0005 lfs1-OST0005_UUID 107
#oss2:   9 UP obdfilter lfs1-OST0006 lfs1-OST0006_UUID 107
#oss1:   3 UP obdfilter lfs1-OST0001 lfs1-OST0001_UUID 107
#oss1:   6 UP obdfilter lfs1-OST0002 lfs1-OST0002_UUID 107
#oss1:   9 UP obdfilter lfs1-OST0003 lfs1-OST0003_UUID 107

TARGETS="
lustre02-oss1.boston.co.uk:lfs2-OST0003
lustre02-oss2.boston.co.uk:lfs2-OST0000
lustre02-oss3.boston.co.uk:lfs2-OST0002
lustre02-oss4.boston.co.uk:lfs2-OST0001
"

NOBJLO=1
NOBJHI=256
THRLO=1
THRHI=256
#OUTPUT="/root/test_results/obd/12lun_00_01"
OUTPUT="/root/test_results/obd/short_peak"
 
# The test dataset size (MB) for each LUN. The total dataset size must be
# larger than twice the RAM size in order to avoid caching effects.
# The size is calculated as:
#   (RAM size * 2) / number of LUNs
# For example, if the server RAM is 24GB and there are 5 LUNs,
# the size should be (24GB * 2) / 5 ~= 10GB = 10240 MB.
# However, it is a good idea to run a very short trial first to make sure the
# test configuration is working properly before scheduling a complete test.
# SIZE=100 is a good number for the trial run.
 
 
#SIZE="46000"
SIZE="32000"
#SIZE="100"
 
ssh lustre02-oss1 mkdir -p $OUTPUT
ssh lustre02-oss2 mkdir -p $OUTPUT
ssh lustre02-oss3 mkdir -p $OUTPUT
ssh lustre02-oss4 mkdir -p $OUTPUT

thrhi=$THRHI thrlo=$THRLO nobjhi=$NOBJHI nobjlo=$NOBJLO size=$SIZE case="disk" targets=$TARGETS rslt_loc=$OUTPUT obdfilter-survey
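The SIZE setting follows the (RAM size * 2) / LUNs rule from the comments in the script; a minimal sketch for computing it on an OSS (the variable names are illustrative):

# Per-LUN dataset size in MB: twice the RAM, divided across the LUNs
RAM_MB=$(awk '/MemTotal/ {printf "%d", $2/1024}' /proc/meminfo)
NLUNS=4
SIZE=$(( RAM_MB * 2 / NLUNS ))
echo "SIZE=${SIZE}"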


Script for LNET selftest

#!/bin/bash
#
# Simple wrapper script for LNET Selftest
#

MODULE_LOAD=$(lsmod | grep -c lnet_selftest)

if [ ${MODULE_LOAD} -eq 0 ] ; then
    modprobe lnet_selftest
fi

#Output file
ST=lst-output-$(date +%Y-%m-%d-%H:%M:%S)

# Concurrency (this default is overridden by the for loop below)
CN=64
#Size
SZ=1M
# Length of time to run test (secs)
TM=60
# Which BRW test to run (read or write)
BRW=write
# Checksum calculation (simple or full)
CKSUM=simple

# The LST "from" list -- e.g. Lustre clients. Space separated list of NIDs.
LFROM="192.168.0.21@o2ib0"
# The LST "to" list -- e.g. Lustre servers. Space separated list of NIDs.
LTO="192.168.0.3@o2ib0"

### End of customisation.
#for CN in 4 8 16 32 64 128 256 512; do 
for CN in 4 8 16 ; do 
echo "####### START ####### CONCURRENCY = $CN"

export LST_SESSION=$$
echo LST_SESSION = ${LST_SESSION}
lst new_session lst${BRW}
lst add_group lfrom ${LFROM}
lst add_group lto ${LTO}
lst add_batch bulk_${BRW}
lst add_test --batch bulk_${BRW} --from lfrom --to lto brw ${BRW} --concurrency=${CN} check=${CKSUM} size=${SZ}
lst run bulk_${BRW}
echo -n "Capturing statistics for ${TM} secs "
lst stat lfrom lto > ${ST}-${CN} 2>&1 &
#lst stat lfrom lto &
LSTPID=$!
# Delay loop with interval markers displayed every 5 secs. Test time is rounded
# up to the nearest 5 seconds.
i=1
j=$((${TM}/5))
if [ $((${TM}%5)) -ne 0 ]; then let j++; fi
while [ $i -le $j ]; do
  sleep 5
  let i++
done
echo
kill ${LSTPID}
lst show_error lfrom lto
lst stop bulk_${BRW}
lst end_session
echo "####### END"
echo
done
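Each pass writes its statistics to a separate lst-output-* file, one per concurrency level. The bandwidth figures can then be pulled out with grep, assuming the usual lst stat output format in which [R]/[W] rate lines follow an [LNet Bandwidth ...] header:

grep -h -A 2 'LNet Bandwidth' lst-output-*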

ost-survey

Run on a Lustre client with the filesystem mounted.

TBC: Why not ~100MB/s? Both the disks and the network are capable of this.

[root@atass07 ~]$ ost-survey /mnt/lustre/
/usr/bin/ost-survey: 08/30/11 OST speed survey on /mnt/lustre/ from 172.20.0.11@tcp
Number of Active OST devices : 4
warning: deprecated usage of setstripe positional parameters.  Use -c, -i, -s instead.
warning: deprecated usage of setstripe positional parameters.  Use -c, -i, -s instead.
warning: deprecated usage of setstripe positional parameters.  Use -c, -i, -s instead.
warning: deprecated usage of setstripe positional parameters.  Use -c, -i, -s instead.
Worst  Read OST indx: 0 speed: 56.321735
Best   Read OST indx: 1 speed: 59.597370
Read Average: 57.899567 +/- 1.248223 MB/s
Worst  Write OST indx: 2 speed: 59.103396
Best   Write OST indx: 0 speed: 68.555924
Write Average: 63.212332 +/- 3.600104 MB/s
Ost#  Read(MB/s)  Write(MB/s)  Read-time  Write-time
----------------------------------------------------
0     56.322       68.556        0.533      0.438
1     59.597       60.915        0.503      0.492
2     57.187       59.103        0.525      0.508
3     58.493       64.275        0.513      0.467

# Increase the file size to 500MB with -s 500:
[root@blade1 ~]# ost-survey -s 500 /mnt/lustre/

Ost#  Read(MB/s)  Write(MB/s)  Read-time  Write-time
----------------------------------------------------
0     4006.794       90.196        0.125      5.543
1     3972.007       89.683        0.126      5.575
2     4010.817       92.355        0.125      5.414
3     3996.448       89.458        0.125      5.589
4     4055.085       92.344        0.123      5.415