== Introduction ==
Lustre SFF (Small Form Factor) is a compact deployment of ZFS-backed (Zettabyte File System) Lustre, intended as an alternative to NFS at comparable capacity and scalability.
http://lustre.ornl.gov/ecosystem-2016/documents/tutorials/Stearman-LLNL-ZFS.pdf

== Intel Enterprise Edition for Lustre White Paper ==

The Intel January 2014 white paper "Architecting a high performance storage system" serves as a good starting point for optimizing Lustre SFF.

== Backend Storage ==

=== smartctl ===

smartctl (from smartmontools; Self-Monitoring, Analysis and Reporting Technology) is used to uniquely identify devices, run device self-tests, and assess device health.
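
For instance, identity, overall health, and a short self-test can be checked as follows; the device name is a placeholder:

<syntaxhighlight>
# Hypothetical device name; substitute the disk under test.
smartctl -i /dev/sda          # Identity: model, serial number, WWN.
smartctl -H /dev/sda          # Overall SMART health assessment.
smartctl -t short /dev/sda    # Start a short self-test.
smartctl -l selftest /dev/sda # Review self-test results when complete.
</syntaxhighlight>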

=== sgpdd_survey ===

sgpdd-survey (from sg3_utils{,-libs} and the Lustre iokit, https://downloads.hpdd.intel.com/public/lustre ) is used to analyze backend storage; dd is not suitable, as the response to multiple concurrent IO threads is what is of interest.
; rszlo-rszhi : Record size in KB. Affects how many blocks are transferred in each transaction; simulates the Lustre RPC size.
; crglo-crghi : Number of regions. Simulates multiple Lustre clients per OST; more regions require more seeking and hence give lower performance.
; thrlo-thrhi : Number of threads. Simulates OSS threads.
; size : Total size in MB.
; blocksize : Block size; default 512 B.

The default size is 8 GB and the default blocksize is 512 B, but 32 GB (or twice system memory) and a 1 MB blocksize are recommended to simulate a Lustre sequential workload.

Recommended parameters: rszhi=1024, thrhi=16, crghi=16, size=32768 (or twice RAM), dio=1, oflag=direct, iflag=direct, bs=1048576.
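
A sketch of an invocation with the recommended parameters; the script is driven by environment variables, and the scsidevs list is a placeholder for the devices under test:

<syntaxhighlight>
# Hypothetical run of sgpdd-survey from the Lustre iokit;
# replace scsidevs with the actual devices under test.
size=32768 rszlo=1024 rszhi=1024 crglo=1 crghi=16 thrlo=1 thrhi=16 \
  scsidevs="/dev/sdb /dev/sdc" ./sgpdd-survey
</syntaxhighlight>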

=== obdfilter-survey ===

; case : local-disk, network-echo, or network-disk. Runs the survey against disk-backed local obdfilter instances, the network loopback, or disk instances over the network.
; thrlo-thrhi : Number of threads.
; nobjlo-nobjhi : Number of objects to read/write.
; rszlo-rszhi : Record size in KB.
; size : Total IO size in MB.
; targets : Names of the obdfilter instances.

Recommended parameters: rszlo=rszhi=1024, nobjhi=128, thrhi=128.
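
A sketch of a local disk run with the recommended parameters; note that the Lustre manual spells this case simply "disk", and the OST target name here is an assumption:

<syntaxhighlight>
# Hypothetical local-disk run; the target name is a placeholder and can
# usually be discovered from the local OSTs.
nobjhi=128 thrhi=128 rszlo=1024 rszhi=1024 size=32768 \
  case=disk targets="lzfs-OST0000" ./obdfilter-survey
</syntaxhighlight>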
http://wiki.lustre.org/images/4/40/Wednesday_shpc-2009-benchmarking.pdf

== Manually Installing Production Intel Enterprise Edition for Lustre ==

<syntaxhighlight>
cd ee-3*
./create_installer zfs
tar xzvpf lustre-zfs*.tar.gz
cd lustre-zfs
./install    # Takes some time, as custom modules are compiled against the kernel.
reboot       # Would use a new kernel if one were installed, but not applicable for ZFS.
modprobe spl # Implements Solaris kernel compatibility interfaces.
modprobe zfs
</syntaxhighlight>

Official documentation points to http://build.hpdd.intel.com/job/lustre-manual/lastSuccessfulBuild/artifact/lustre_manual.xhtml for installing manually.
A mirror is created to be used as a combined MGS/MDT file system following "10.1 Configuring a simple Lustre file system".

<syntaxhighlight>
zpool create mgt -o cachefile=none mirror -f /dev/disk/by-id/wwn-0x5001???????????? # Replace device names as required.
zpool create ost0001 -o cachefile=none raidz2 -o ashift=12 -O recordsize=1M -f /dev/disk/by-id/wwn-0x5000c5002??????? # Replace device names as required.
zpool create ost0002 -o cachefile=none raidz2 -o ashift=12 -O recordsize=1M -f /dev/disk/by-id/wwn-0x5000c5005??????? # Replace device names as required.
</syntaxhighlight>

The errors "one or more devices is currently unavailable" or "no such pool or dataset" may be seen; this is suggested to be caused by slow updating of the /dev/disk/by-id symlinks. Work around it by creating the pools with /dev/sd? devices and then exporting and re-importing them by their by-id names, as below.
https://github.com/zfsonlinux/zfs/issues/3708

<syntaxhighlight>
# Replace the by-id symlinks with full device paths, placed on a single line.
echo $(readlink -f $(ls /dev/disk/by-id/wwn-0x5000c5002???????) | tr '\n' ' ')
zpool create mgt -o cachefile=none mirror -f $(readlink -f $(ls /dev/disk/by-id/wwn-0x5001????????????) | tr '\n' ' ')
zpool create ost0001 -o cachefile=none raidz2 -o ashift=12 -O recordsize=1M -f $(readlink -f $(ls /dev/disk/by-id/wwn-0x5000c5002???????) | tr '\n' ' ')
zpool create ost0002 -o cachefile=none raidz2 -o ashift=12 -O recordsize=1M -f $(readlink -f $(ls /dev/disk/by-id/wwn-0x5000c5005???????) | tr '\n' ' ')
zpool export mgt
zpool export ost0001
zpool export ost0002
zpool import -d /dev/disk/by-id mgt
zpool import -d /dev/disk/by-id ost0001
zpool import -d /dev/disk/by-id ost0002
</syntaxhighlight>
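
After the re-import, it is worth confirming that the pools are online and now reference by-id device names:

<syntaxhighlight>
# Confirm the pools are online and backed by by-id device names.
zpool list
zpool status mgt ost0001 ost0002
</syntaxhighlight>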

=== Format Lustre Filesystem ===

Create the combined MGS/MDT on the MGS node, then format and mount the OSTs on the OSS:

<syntaxhighlight>
MGS=192.168.0.1@o2ib
OSS=192.168.0.2@o2ib
FSNAME=lzfs
ssh $MGS mkfs.lustre --fsname=$FSNAME --mgs --mdt --index=0 --backfstype=zfs --servicenode=$OSS --mgsnode=$MGS --mgsnode=$OSS --reformat mgt/mgt
ssh $OSS mkfs.lustre --fsname=$FSNAME --ost --index=0000 --backfstype=zfs --servicenode=$MGS --mgsnode=$MGS --mgsnode=$OSS ost0001/ost0001
ssh $OSS mkfs.lustre --fsname=$FSNAME --ost --index=0001 --backfstype=zfs --servicenode=$MGS --mgsnode=$MGS --mgsnode=$OSS ost0002/ost0002
ssh $MGS mkdir -p /zfs/mgt
ssh $MGS mount -t lustre mgt/mgt /zfs/mgt
ssh $OSS mkdir -p /zfs/ost000{1,2}
ssh $OSS mount -t lustre ost0001/ost0001 /zfs/ost0001
ssh $OSS mount -t lustre ost0002/ost0002 /zfs/ost0002
</syntaxhighlight>
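
With the targets mounted, a client can mount the filesystem; a minimal sketch, assuming a client with working o2ib connectivity and an arbitrary mount point:

<syntaxhighlight>
# Hypothetical client mount; the colon-separated NIDs are the primary and
# failover management nodes, and /mnt/lzfs is an assumed mount point.
mkdir -p /mnt/lzfs
mount -t lustre 192.168.0.1@o2ib:192.168.0.2@o2ib:/lzfs /mnt/lzfs
</syntaxhighlight>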

=== Adding 90 Bay ===

<syntaxhighlight>
zpool create -f mgt mirror /dev/sd{a,b}1
zpool create -f mdt mirror /dev/sd{a,b}2

# Collect device IDs (lsscsi -i) for the enclosure's drives, map them to
# /dev/mapper multipath nodes, and shuffle them so each raidz2 vdev draws
# from across the enclosure. Note: use the /dev/mapper paths (foo2).
foo=($(lsscsi -i | grep 0023 | grep -v 531b | awk '{print $7}' | sort | uniq | sort --random-sort))
foo2=(${foo[@]/#//dev/mapper/})
for i in {0..7}; do zpool create ost000$i raidz2 -f ${foo2[@]:((11*i)):11}; done

mkfs.lustre --mgs --backfstype=zfs --reformat mgt/mgt
mkfs.lustre --mdt --backfstype=zfs --fsname=lzfs --index=0 --mgsnode=192.168.0.1@o2ib --reformat mdt/mdt
for i in {0..7}; do mkfs.lustre --ost --backfstype=zfs --fsname=lzfs --index=$i --mgsnode=192.168.0.1@o2ib --reformat ost000$i/ost000$i; done
# Adding back in-chassis
foo3=($(lsscsi -i | grep 0001 | awk '{print $7}' | sort | uniq | sort --random-sort))
foo4=(${foo3[@]/#//dev/mapper/})
foo5=($(lsscsi -i | grep SS | awk '{print $7}' | sort | uniq | sort --random-sort))
foo6=(${foo5[@]/#//dev/mapper/})
zpool create ost0008 raidz2 -f ${foo4[@]}
zpool create ost0009 raidz2 -f ${foo6[@]}
mkfs.lustre --ost --backfstype=zfs --fsname=lzfs --index=8 --mgsnode=192.168.0.1@o2ib --reformat ost0008/ost0008
mkfs.lustre --ost --backfstype=zfs --fsname=lzfs --index=9 --mgsnode=192.168.0.1@o2ib --reformat ost0009/ost0009
for i in mgt; do mount -t lustre $i/$i /zfs/$i; done
for i in mdt; do mount -t lustre $i/$i /zfs/$i; done
for i in ost000{0..3}; do mount -t lustre $i/$i /zfs/$i; done
for i in ost000{4..7}; do mount -t lustre $i/$i /zfs/$i; done
</syntaxhighlight>

=== Advanced Configuration ===

http://lustre.ornl.gov/ecosystem-2015/documents/LustreEco2015-Tutorial2.pdf
Configure InfiniBand as required: https://docs.oracle.com/cd/E19436-01/820-3522-10/ch4-linux.html
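
As a minimal sketch of bringing LNet up over InfiniBand (the ib0 interface name is an assumption; adjust to the local fabric):

<syntaxhighlight>
# Hypothetical LNet setup for o2ib; ib0 is an assumed interface name.
echo 'options lnet networks=o2ib0(ib0)' > /etc/modprobe.d/lustre.conf
modprobe lnet
lctl network up   # Bring LNet online.
lctl list_nids    # Should print a NID such as 192.168.0.1@o2ib.
</syntaxhighlight>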

=== Installing Non-Production Test Lustre (ZFS provided) ===

This will conflict with IEEL (Intel Enterprise Edition for Lustre).
<syntaxhighlight>
yum install epel-release # Provides DKMS.
gpg --quiet --with-fingerprint /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7
# pub 4096R/352C64E5 2013-12-16 Fedora EPEL (7) <epel@fedoraproject.org>
# Key fingerprint = 91E9 7D7C 4A5E 96F1 7F3E 888F 6A2F AEA2 352C 64E5
yum install http://download.zfsonlinux.org/epel/zfs-release$(rpm -E %dist).noarch.rpm
gpg --quiet --with-fingerprint /etc/pki/rpm-gpg/RPM-GPG-KEY-zfsonlinux
# pub 2048R/F14AB620 2013-03-21 ZFS on Linux <zfs@zfsonlinux.org>
# Key fingerprint = C93A FFFD 9F3F 7B03 C310 CEB6 A9D5 A1C0 F14A B620
# sub 2048R/99685629 2013-03-21
</syntaxhighlight>

https://github.com/zfsonlinux/zfs/wiki/RHEL-%26-CentOS

<syntaxhighlight>
yum install lustre-dkms-* lustre-osd-zfs-mount* # Downloaded from HPDD.
</syntaxhighlight>

=== Installing ZFS ===

http://lustre.ornl.gov/ecosystem-2016/documents/tutorials/Stearman-LLNL-ZFS.pdf

<syntaxhighlight>
yum localinstall --nogpgcheck http://archive.zfsonlinux.org/epel/zfs-release.el7.noarch.rpm
yum install kernel-devel zfs
</syntaxhighlight>
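
After installation it is worth confirming that the ZFS kernel modules build and load; a quick check:

<syntaxhighlight>
# Confirm the DKMS-built modules load cleanly.
modprobe zfs
lsmod | grep -E '^(zfs|spl)'
zpool status   # "no pools available" is expected on a fresh install.
</syntaxhighlight>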

=== ZFS Pools ===

ZFS Best Practices: http://www.solarisinternals.com/wiki/index.php/ZFS_Best_Practices_Guide
Example pool creation:
<syntaxhighlight>
zpool create scratchZ -o cachefile=none -o ashift=12 -O recordsize=1M -f $(lsscsi -i | grep ST1 | awk '{printf " /dev/disk/by-id/scsi-"$7 }')
</syntaxhighlight>
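
The resulting pool properties can then be sanity-checked against the creation options:

<syntaxhighlight>
zpool list scratchZ
zpool get ashift scratchZ    # Expect 12 (4 KiB sectors).
zfs get recordsize scratchZ  # Expect 1M.
</syntaxhighlight>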