Difference between revisions of "OpenHPC:Headnode install"

From Define Wiki
Jump to navigation Jump to search
 
(46 intermediate revisions by 7 users not shown)
Line 1: Line 1:
 
== Basic Initial System Configuration: ==
 
== Basic Initial System Configuration: ==
 +
'''Prep: Make a note of MAC addresses of interfaces of computenodes. MAC address stickers were found facing the front of the drawer which was misleading in terms of the left-right order of the interfaces.  In this particular case, the stickers should have been facing the rear of the drawer.  Conclusion: the MAC addresses may be the reverse of what you expect if the compute node is unable to connect to the head node.'''
 +
 +
'''Note; RPMS/repos; http://build.openhpc.community/OpenHPC:/'''
  
 
'''OpenHPC is designed to deploy CentOS 7.x based clusters, please install a fresh copy of CentOS 7.x onto a system. Please modify the partitioning and make the “/” partition a reasonable size.  
 
'''OpenHPC is designed to deploy CentOS 7.x based clusters, please install a fresh copy of CentOS 7.x onto a system. Please modify the partitioning and make the “/” partition a reasonable size.  
Line 18: Line 21:
 
<syntaxhighlight>
 
<syntaxhighlight>
 
echo “head.ohpc.net” > /etc/hostname
 
echo “head.ohpc.net” > /etc/hostname
 +
hostnamectl set-hostname head.ohpc.net
 +
# Private interface must take a "static" (not "dhcp") address IPADDR that will be used later.  NETMASK=255.255.255.0 should also be set.
 
vi /etc/sysconfig/network-scripts/ifcfg-eno1
 
vi /etc/sysconfig/network-scripts/ifcfg-eno1
 
vi /etc/sysconfig/network-scripts/ifcfg-eno2
 
vi /etc/sysconfig/network-scripts/ifcfg-eno2
Line 25: Line 30:
  
 
<syntaxhighlight>
 
<syntaxhighlight>
yum install kernel* tk* tcl* tigervnc* ipmitool* freeipmi* cairo* perl* gcc* glibc* screen epel-release vim; yum install htop; yum -y install kernel* tk* tcl*  
+
yum -y install kernel* tk* tcl* tigervnc* ipmitool* freeipmi* cairo* perl* gcc* glibc* screen epel-release vim ntp libnl lsof libxml2-python python mlocate numactl* yum-utils stop xinitd
tigervnc* ipmitool* freeipmi* cairo* perl* gcc* glibc* screen epel-release vim ntp libnl lsof libxml2-python python mlocate numactl* yum-utils; yum -y install
+
yum -y groupinstall "Development Tools" "X Windows System" “Base”
htop; yum -y groupinstall "Development Tools" "X Windows System" “Base”; yum –y update
+
yum -y update
 
</syntaxhighlight>
 
</syntaxhighlight>
  
Line 33: Line 38:
  
 
<syntaxhighlight>
 
<syntaxhighlight>
CHROOT=/opt/ohpc/admin/images/centos7.1
+
CHROOT=/opt/ohpc/admin/images/centos7.2
ohpc_repo=http://build.openhpc.community/OpenHPC:/1.0/CentOS_7.1/OpenHPC:1.0.repo
+
ohpc_repo=http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/OpenHPC:1.3.repo
 
sms_name=head.ohpc.net # Hostanem of Headnode
 
sms_name=head.ohpc.net # Hostanem of Headnode
 
sms_ip=10.10.10.1 # Private Interface IP of Headnode
 
sms_ip=10.10.10.1 # Private Interface IP of Headnode
Line 45: Line 50:
 
sms_ipoib=10.10.20.1 # IPoIB Address of Headnode
 
sms_ipoib=10.10.20.1 # IPoIB Address of Headnode
 
ipoib_netmask=255.255.255.0 # IPoIB Netmask of Headnode
 
ipoib_netmask=255.255.255.0 # IPoIB Netmask of Headnode
 +
</syntaxhighlight>
  
 +
Use the variables:
 +
 +
<syntaxhighlight>
 
source /root/.bashrc
 
source /root/.bashrc
 
wget -P /etc/yum.repos.d ${ohpc_repo}
 
wget -P /etc/yum.repos.d ${ohpc_repo}
 
yum clean all
 
yum clean all
 +
</syntaxhighlight>
 +
 +
'''Setup Network Time Protocol'''
 +
 +
<syntaxhighlight>
 +
# Set local timezone:
 +
ln -sf /usr/share/zoneinfo/GB /etc/localtime
 +
service ntp stop
 +
ntpdate 0.centos.pool.ntp.org
 +
vi /etc/ntp.conf #Modify with your ntp server
 +
service ntp restart
 +
chkconfig ntp on
 +
# Alternatively
 +
systemctl restart ntpd
 +
systemctl enable ntpd
 
</syntaxhighlight>
 
</syntaxhighlight>
  
Line 56: Line 80:
  
 
<syntaxhighlight>
 
<syntaxhighlight>
yum groupinstall ohpc-base ohpc-warewulf
+
yum -y install ohpc-base ohpc-warewulf
yum -y groupinstall ohpc-slurm-server  
+
yum -y install ohpc-slurm-server  
useradd slurm
+
useradd slurm # Might already be create by ohpc-slurm-server package
 +
</syntaxhighlight>
 +
 
 +
== Modify Warewulf core Configuration Files to provision Correctly ==
 +
 
 +
'''Modify warewulf provision.conf and bootstrap.conf to correctly include the correct kernel modules and configuration'''
 +
 
 +
<syntaxhighlight>
 +
vi /etc/warewulf/vnfs.conf #Ensure exclude looks like this.
 +
 
 +
exclude += /tmp/*
 +
exclude += /var/log/*
 +
exclude += /var/chroots/*
 +
#exclude += /var/cache
 +
exclude += /usr/src
 +
#exclude += /usr/share
 +
#exclude += /home/*
 +
</syntaxhighlight>
 +
 
 +
<syntaxhighlight>
 +
vi /etc/warewulf/bootstrap.conf #Hash out all Infiniband drivers
 +
 
 +
# Infiniband drivers and Mellanox drivers
 +
#drivers += ib_ipath, ib_iser, ib_srpt, ib_sdp, ib_mthca, ib_qib, iw_cxgb3, cxgb3
 +
#drivers += iw_nes, mlx4_ib, ib_srp, ib_ipoib, ib_addr, rdma_cm, ib_ucm
 +
#drivers += ib_ucm, ib_uverbs, ib_umad, ib_cm, ib_mad, iw_cm, ib_core
 +
#drivers += rdma_ucm, ib_sa, mlx4_en, mlx4_core
 +
#drivers += rds, rds_rdma, rds_tcp, mlx4_vnic, mlx4_vnic_helper
 +
 
 +
#Unhash the modprobe for the Mellanox Modules
 +
modprobe += mlx4_core log_num_mtts=20 log_mtts_per_seg=6, ib_srp
 +
</syntaxhighlight>
 +
 
 +
'''Modify some Warewulf provisioning files to use the correct interfaces and some general Warewulf files to allow provisioning to work.'''
 +
<syntaxhighlight>
 +
perl -pi -e "s/device = eth1/device = ${sms_eth_internal}/" /etc/warewulf/provision.conf
 +
perl -pi -e "s/^\s+disable\s+= yes/ disable = no/" /etc/xinetd.d/tftp
 +
export MODFILE=/etc/httpd/conf.d/warewulf-httpd.conf
 +
perl -pi -e "s/cgi-bin>\$/cgi-bin>\n Require all granted/" $MODFILE
 +
perl -pi -e "s/Allow from all/Require all granted/" $MODFILE
 +
perl -ni -e "print unless /^\s+Order allow,deny/" $MODFILE
 +
perl -pi -e "s/ControlMachine=\S+/ControlMachine=head.ohpc.net/" /etc/slurm/slurm.conf
 +
</syntaxhighlight>
 +
 
 +
<syntaxhighlight>
 +
systemctl restart xinetd # This failed
 +
systemctl enable mariadb.service
 +
systemctl restart mariadb
 +
systemctl enable httpd.service
 +
systemctl restart httpd.service
 +
systemctl restart rpcbind.service
 +
systemctl enable rpcbind.service
 +
systemctl restart nfs-server.service
 +
systemctl enable nfs-server.service
 +
</syntaxhighlight>
 +
 
 +
== Build and Configure the Chroot ==
 +
 
 +
'''Make Initial VNFS (Chroot, compute Node template) and install some Base components into the chroot operating system'''
 +
 
 +
<syntaxhighlight>
 +
wwmkchroot centos-7 $CHROOT
 +
</syntaxhighlight>
 +
 
 +
An error may be seen regarding a deprecated URL, in which case an edit needs to be made in /usr/libexec/warewulf/wwmkchroot/centos-7.tmpl
 +
 
 +
<syntaxhighlight>
 +
yum -y --installroot=$CHROOT groupinstall Base
 +
yum -y --installroot=$CHROOT install kernel* grub* sudo ipmitool* epel-release htop nano tk* tcl* tigervnc* ipmitool* freeipmi* cairo* perl* gcc* glibc* screen yum-utils vim ntp libnl
 +
lsof libxml2-python python mlocate numactl* lmod-ohpc ohpc-slurm-client lmod-ohpc ganglia-gmond-ohpc environment-modules hwlock-libs libfabric libpsm2 intel-clck-ohpc
 +
 
 +
yum -y --installroot=$CHROOT install ohpc-base-compute
 +
 
 +
#ohpc-slurm-client may not exist:
 +
yum -y --installroot=$CHROOT install slurm-ohpc slurm-munge-ohpc munge-devel-ohpc
 +
 
 +
## Our X7 1U Twin hardware seems to have issues with Mellanox OFED, OpenIB seems to be the way to go...
 +
yum -y --installroot=$CHROOT install openib ibutils infiniband-diags
  
mkdir /tmp/setup-filesystems
 
cd /tmp/setup-filesystems
 
cat /srv/warewulf/initramfs/capabilities/setup-filesystems | cpio –i
 
vi warewulf/provision/80-mkbootable
 
 
</syntaxhighlight>
 
</syntaxhighlight>
  
'''Modify the file with the below contents'''
+
'''Setup SSH Keys for the Cluster – this is required for the root user only, /home will be exported so user ssh keys will be available. '''
  
 
<syntaxhighlight>
 
<syntaxhighlight>
#!/bin/sh
+
wwinit ssh_keys
#
+
cat ~/.ssh/cluster.pub >> $CHROOT/root/.ssh/authorized_keys
# Copyright (c) 2001-2003 Gregory M. Kurtzer
+
# Comment out GSSAPI lines on head and in CHROOT.
#
+
sed -i 's/^\(GSSAPI.\)/#\1/g' {,${CHROOT}}/etc/ssh/sshd_config
# Copyright (c) 2003-2011, The Regents of the University of California,
+
# Don't use DNS on head or CHROOT.
# through Lawrence Berkeley National Laboratory (subject to receipt of any
+
sed -i 's/#UseDNS yes/UseDNS no/' {,${CHROOT}}/etc/ssh/sshd_config
# required approvals from the U.S. Dept. of Energy).  All rights reserved.
+
</syntaxhighlight>
#
 
  
 +
'''Setup NFS exports and FSTAB on the compute image'''
  
# Install a bootloader if $WWBOOTLOADER is set and the root device is known
+
<syntaxhighlight>
if [ -n "$WWBOOTLOADER" -a -f "/tmp/rootdev" ]; then
+
echo "${sms_ip}:/home /home nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0" >> $CHROOT/etc/fstab
    if ! echo $WWBOOTLOADER | grep -q "^/dev/"; then
+
echo "${sms_ip}:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0" >> $CHROOT/etc/fstab
        WWBOOTLOADER="/dev/$WWBOOTLOADER"
 
    fi
 
    if [ -b "$WWBOOTLOADER" ]; then
 
        KERNEL=`cd $NEWROOT; find boot/vmlinuz-* 2>/dev/null | tail -n 1`
 
        if [ -n "$KERNEL" ]; then
 
            KERNELVERSION=`echo "$KERNEL" | sed -e 's@.*boot/vmlinuz-@@'`
 
            if [ -x "$NEWROOT/sbin/dracut" ]; then
 
                chroot $NEWROOT /sbin/dracut --force '' $KERNELVERSION
 
                INITRD=`cd $NEWROOT; find boot/initr*-${KERNELVERSION}.img* 2>/dev/null | tail -n 1`
 
            elif [ -x "$NEWROOT/sbin/mkinitrd" ]; then
 
                INITRD=`cd $NEWROOT; find boot/initr*-${KERNELVERSION}.img* 2>/dev/null | tail -n 1`
 
                if [ -n "$INITRD" ]; then
 
                    INITRD="boot/initramfs-$KERNELVERSION.img"
 
                fi
 
                mount -t sysfs none $NEWROOT/sys
 
                chroot $NEWROOT /sbin/mkinitrd -f $INITRD $KERNELVERSION
 
                umount $NEWROOT/sys
 
            fi
 
            if [ -z "${WWKARGS}" ]; then
 
                WWKARGS="rhgb"
 
            fi
 
            if [ -n "${WWCONSOLE}" ]; then
 
                # Parse the kernel console option for serial settings for grub
 
                sed_serial() { echo $1 | sed -nr "s/.*ttyS([0-9]),([0-9]{4,6})([n,o,e])?([5-8])?(r)?.*/\\${2}/p"; }
 
                SERIALUNIT=`sed_serial ${WWCONSOLE} 1`
 
                SERIALSPEED=`sed_serial ${WWCONSOLE} 2`
 
                if [ -z $SERIALSPEED ]; then
 
                    SERIALSPEED=115200
 
                fi
 
                SERIALPARITY=`sed_serial ${WWCONSOLE} 3`
 
                if [ -z $SERIALPARITY ] || [ "$SERIALPARITY" = "n" ]; then
 
                    SERIALPARITY=no
 
                elif [ $SERIALPARITY = "o" ]; then
 
                    SERIALPARITY=odd
 
                elif [ $SERIALPARITY = "e" ]; then
 
                    SERIALPARITY=even
 
                fi
 
                SERIALWORD=`sed_serial ${WWCONSOLE} 4`
 
                if [ -z $SERIALWORD ]; then
 
                    SERIALWORD=8
 
                fi
 
            fi
 
  
            if [ -x "$NEWROOT/usr/sbin/grub2-install" ]; then
+
echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
                if [ -n "${WWCONSOLE}" ]; then
+
echo "/opt/ohpc/pub *(ro,no_subtree_check,fsid=11)" >> /etc/exports
                    echo "GRUB_CMDLINE_LINUX='${WWKARGS} console=tty0 console=${WWCONSOLE}'" >> $NEWROOT/etc/default/grub
 
                    if [ -n SERIALUNIT ]; then
 
                        echo "GRUB_TERMINAL='console serial'" >> $NEWROOT/etc/default/grub
 
                        echo "GRUB_SERIAL_COMMAND='serial --speed=${SERIALSPEED} --unit=${SERIALUNIT} --word=${SERIALWORD} --parity=${SERIALPARITY}'" >> $NEWROOT/etc/default/grub
 
                    fi
 
                else
 
                    echo "GRUB_CMDLINE_LINUX='${WWKARGS}'" >> $NEWROOT/etc/default/grub
 
                fi
 
                chroot $NEWROOT /usr/sbin/grub2-mkconfig -o /boot/grub2/grub.cfg >/dev/null
 
  
                if chroot $NEWROOT /usr/sbin/grub2-install $WWBOOTLOADER >/dev/null; then
+
exportfs -a
                    exit 0
+
systemctl restart rpcbind
                fi
+
systemctl restart nfs-server.service
            elif [ -x "$NEWROOT/sbin/grub-install" ]; then
+
</syntaxhighlight>
                ROOTDEV=`cat /tmp/rootdev`
 
                if [ -n "$INITRD" -a -n "$KERNEL" ]; then
 
                    if [ -f "$NEWROOT/etc/redhat-release" ]; then
 
                        OSVERSION=`sed -e 's@ (.*@@' $NEWROOT/etc/redhat-release`
 
                    elif [ -f "$NEWROOT/etc/release" ]; then
 
                        OSVERSION=`cat $NEWROOT/etc/redhat-release | head -n 1`
 
                    else
 
                        OSVERSION="Warewulf"
 
                    fi
 
                    if [ -f /tmp/mptab ]; then
 
                        if grep -q "^/boot " /tmp/mptab; then
 
                            INITRD=${INITRD##boot/}
 
                            KERNEL=${KERNEL##boot/}
 
                        fi
 
                    fi
 
  
                    echo "# This file was written by Warewulf bootstrap (capability setup-filesystems)" > $NEWROOT/boot/grub/device.map
+
'''Copy over  resolv.conf to the chroot and modify the contents to point to the headnode and google dns.'''
                    echo "(hd0) $WWBOOTLOADER" >> $NEWROOT/boot/grub/device.map
 
  
                    echo "# This file was written by Warewulf bootstrap (capability setup-filesystems)" > $NEWROOT/boot/grub/grub.conf
+
<syntaxhighlight>
                    if [ -n $SERIALUNIT ]; then
+
cp /etc/resolv.conf $CHROOT/etc/
                        echo "serial --speed=${SERIALSPEED} --unit=${SERIALUNIT} --word=${SERIALWORD} --parity=${SERIALPARITY}" >> $NEWROOT/boot/grub/grub.conf
+
vi $CHROOT/etc/resolv.conf
                        echo "terminal_input console serial; terminal_output console serial" >> $NEWROOT/boot/grub/grub.conf
+
</syntaxhighlight>
                    fi
 
                    echo "default 0" >>$NEWROOT/boot/grub/grub.conf
 
                    echo "timeout 10" >>$NEWROOT/boot/grub/grub.conf
 
                    echo "root (hd0,0)" >>$NEWROOT/boot/grub/grub.conf
 
                    echo "" >>$NEWROOT/boot/grub/grub.conf
 
                    echo "title $OSVERSION - $KERNELVERSION" >>$NEWROOT/boot/grub/grub.conf
 
                    if [ -n "${WWCONSOLE}" ]; then
 
                        echo "    kernel /$KERNEL ro root=$ROOTDEV rhgb ${WWKARGS} console=tty0 console=${WWCONSOLE}" >>$NEWROOT/boot/grub/grub.conf
 
                    else
 
                        echo "    kernel /$KERNEL ro root=$ROOTDEV rhgb ${WWKARGS}" >>$NEWROOT/boot/grub/grub.conf
 
                    fi
 
                    echo "    initrd /$INITRD" >>$NEWROOT/boot/grub/grub.conf
 
  
                    if [ -f "/tmp/mtab" ]; then
+
'''Modify Limits to unlimited on headnode and compute nodes..'''
                        cp /tmp/mtab $NEWROOT/etc/mtab
 
                    fi
 
  
                    mkdir $NEWROOT/dev/mapper
+
<syntaxhighlight>
                    mknod $NEWROOT/dev/mapper/control c 10 58
+
echo "* soft memlock unlimited" >> /etc/security/limits.conf
 +
echo "* hard memlock unlimited" >> /etc/security/limits.conf
 +
echo "* soft memlock unlimited" >> $CHROOT/etc/security/limits.conf
 +
echo "* hard memlock unlimited" >> $CHROOT/etc/security/limits.conf
 +
</syntaxhighlight>
  
                    if chroot $NEWROOT /sbin/grub-install $WWBOOTLOADER >/dev/null; then
+
'''Import Warewulf files into the database; these will be kept in sync with the compute nodes.'''
                        exit 0
 
                    else
 
                        gscript="/root/grubinstall.sh";
 
                        grubscript=${NEWROOT}${gscript};
 
                        grubtext=$NEWROOT/root/grubinstall.txt;
 
  
                        echo -n "Running grub-install failed. Trying manually.";
+
<syntaxhighlight>
 +
wwinit database
 +
wwsh file import /etc/passwd
 +
wwsh file import /etc/shadow
 +
wwsh file import /etc/group
 +
wwsh file import /etc/slurm/slurm.conf
 +
wwsh file import /etc/munge/munge.key
 +
wwsh file import /opt/ohpc/pub/examples/network/centos/ifcfg-ib0.ww
 +
wwsh -y file set ifcfg-ib0.ww --path=/etc/sysconfig/network-scripts/ifcfg-ib0
 +
wwsh file resync # If changes are made and do not want to wait 5 mins for resync.
 +
</syntaxhighlight>
  
                        # Setup Commands to install GRUB
+
'''Building the bootstrap and vnfs images'''
                        echo "root (hd0,0)" > $grubtext;
 
                        echo "setup (hd0)" >> $grubtext;
 
                        echo "quit" >> $grubtext;
 
  
                        # Bash script to run grub.
+
<syntaxhighlight>
                        echo "#!/bin/bash" > $grubscript;
+
# -T may need to be removed from head of wwbootstrap and wwvnfs scripts.
                        echo "/sbin/grub --batch < /root/grubinstall.txt &>/root/grubinstall.out" >> $grubscript;
+
wwbootstrap 3.10.0-229.20.1.el7.x86_64 # Alternatively wwbootstrap $(uname -r)
                        chmod 755 $grubscript;
+
wwvnfs -y --chroot $CHROOT
 +
</syntaxhighlight>
  
                        if chroot $NEWROOT $gscript &>/dev/null; then
+
'''NB: This will need to be updated depending on the kernel version being used. For example:'''
                            exit 0;
+
<syntaxhighlight>
                        else
+
[root@head setup-filesystems]# wwbootstrap 3.10.0-327.10.1.el7.x86_64
                            echo "Running grub-install failed!"
+
Number of drivers included in bootstrap: 433
                            exit 255
+
Number of firmware images included in bootstrap: 93
                        fi
+
Building and compressing bootstrap
                    fi
+
Integrating the Warewulf bootstrap: 3.10.0-327.10.1.el7.x86_64
                else
+
Including capability: provision-adhoc
                    echo "Could not find INITRD and/or KERNEL version!"
+
Including capability: provision-files
                    exit 2
+
Including capability: provision-selinux
                fi
+
Including capability: provision-vnfs
            else
+
Including capability: setup-filesystems
                echo "GRUB is not installed!"
+
Including capability: transport-http
                exit 2
+
Compressing the initramfs
            fi
+
Locating the kernel object
        else
+
Bootstrap image '3.10.0-327.10.1.el7.x86_64' is ready
            echo "Could not identify kernel version in VNFS!"
+
Done.
            exit 2
+
[root@head setup-filesystems]# wwvnfs -y --chroot $CHROOT
        fi
+
Using 'centos7.1' as the VNFS name
    else
+
Creating VNFS image from centos7.1
        echo "BOOTLOADER=$BOOTLOADER is invalid!"
+
Building new chroot...
        exit 2
+
Building and compressing the final image
    fi
+
Cleaning temporary files
else
+
VNFS 'centos7.1' has been imported
    exit 1
+
Done.
fi
+
Wrote a new configuration file at: /etc/warewulf/vnfs/centos7.1.conf
 +
</syntaxhighlight>
  
# vim: filetype=sh:syntax=sh:expandtab:ts=4:sw=4:
+
'''Configure IPv4 forwarding on the head node'''
 +
 
 +
<syntaxhighlight>
 +
echo "net.ipv4.ip_forward = 1" >>  /usr/lib/sysctl.d/50-default.conf
 +
/sbin/sysctl -p
 +
firewall-cmd --permanent --direct --passthrough ipv4 -t nat -I POSTROUTING -o enp3s0f0 -j MASQUERADE -s 192.168.92.0/24
 +
firewall-cmd --reload
 +
</syntaxhighlight>
 +
 
 +
enp3s0f0 being the external(public) interface and 192.168.92.0/24 the private(internal) network
 +
 
 +
== Compute Node Configuration for Stateful Provisioning ==
 +
 
 +
'''Adding the compute nodes to the database with the correct parameters to allow for stateful provisioning'''
 +
 
 +
<syntaxhighlight>
 +
# Gateway, -G, is the private IP of headnode.
 +
wwsh node new c0 --ipaddr=some.ip.address.here -M 255.255.255.0 -G ${sms_ip} --domain=ohpc.net  --hwaddr=some.mac.address.here -D eno1
 +
echo "GATEWAYDEV=${eth_provision}" > /tmp/network.$$
 +
wwsh -y file import /tmp/network.$$ --name network
 +
wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0
 +
 
 +
wwsh -y provision set c0 --files=dynamic_hosts,passwd,group,shadow,slurm.conf,munge.key,network
 +
wwsh -y provision set c0 --vnfs=centos7.1 --bootstrap=kernel.version.here
 +
 
 +
wwsh node clone c0 c1 # Clone node.
 +
wwsh node set c1 --ipaddr=some.ip.address.here --hwaddr=some.mac.address.here -D eno1 #sometimes it does not work with eno1, replace it with eth1/eth0 and it should work
 +
</syntaxhighlight>
 +
 
 +
'''Setup Bootloader and Partitions'''
 +
<syntaxhighlight>
 +
wwsh -y object modify -s bootloader=sda c0
 +
wwsh -y object modify -s diskpartition=sda c0
 +
wwsh -y object modify -s diskformat=sda1,sda2,sda3 c0
 +
wwsh -y object modify -s filesystems="mountpoint=/boot:dev=sda1:type=ext4:size=500,dev=sda2:type=swap:size=32768,mountpoint=/:dev=sda3:type=ext4:size=fill" c0
 +
# Setup disks for other nodes as desired.
 +
</syntaxhighlight>
 +
 
 +
<syntaxhighlight>
 +
systemctl restart dhcpd
 +
wwsh pxe update
 +
wwsh dhcp update
 +
wwsh node list # Show all the nodes you just added
 
</syntaxhighlight>
 
</syntaxhighlight>

Latest revision as of 12:36, 30 January 2018

Basic Initial System Configuration:

Prep: Make a note of MAC addresses of interfaces of computenodes. MAC address stickers were found facing the front of the drawer which was misleading in terms of the left-right order of the interfaces. In this particular case, the stickers should have been facing the rear of the drawer. Conclusion: the MAC addresses may be the reverse of what you expect if the compute node is unable to connect to the head node.

Note: RPMs and repositories are available at http://build.openhpc.community/OpenHPC:/

OpenHPC is designed to deploy CentOS 7.x based clusters. Please install a fresh copy of CentOS 7.x onto a system, modifying the partitioning so that the "/" partition is a reasonable size.

service NetworkManager stop
service iptables stop
chkconfig NetworkManager off
chkconfig iptables off
chkconfig firewalld off
setenforce 0
vi /etc/selinux/config

Modify the system hostname to use a fully qualified domain name. Also modify the network interfaces so that there is one private interface and one public interface: eno1 must be the private (provisioning) interface and eno2 can be the public interface. There are alternative ways to specify which interface to use during provisioning; however, with CentOS 7.1 I have been unable to find them.

echo "head.ohpc.net" > /etc/hostname
hostnamectl set-hostname head.ohpc.net
# Private interface must take a "static" (not "dhcp") address IPADDR that will be used later.  NETMASK=255.255.255.0 should also be set.
vi /etc/sysconfig/network-scripts/ifcfg-eno1
vi /etc/sysconfig/network-scripts/ifcfg-eno2

Yum update the system to the latest package versions, install additional packages and reboot.

yum -y install kernel* tk* tcl* tigervnc* ipmitool* freeipmi* cairo* perl* gcc* glibc* screen epel-release vim ntp libnl lsof libxml2-python python mlocate numactl* yum-utils htop xinetd
yum -y groupinstall "Development Tools" "X Window System" "Base"
yum -y update

Add the following variables to your /root/.bashrc for convenience, as you will be using them frequently.

	
CHROOT=/opt/ohpc/admin/images/centos7.2
ohpc_repo=http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/OpenHPC:1.3.repo
sms_name=head.ohpc.net			# Hostname of Headnode
sms_ip=10.10.10.1				# Private Interface IP of Headnode
sms_eth_internal=eno1				# Private Interface of Headnode
eth_provision=eno1				# Provisioning Interface of Headnode
internal_netmask=255.255.255.0		# Netmask of Private Interface
ntp_server=0.centos.pool.ntp.org		# Some NTP Server
bmc_username=ADMIN
bmc_password=ADMIN
sms_ipoib=10.10.20.1				# IPoIB Address of Headnode
ipoib_netmask=255.255.255.0			# IPoIB Netmask of Headnode

Use the variables:

source /root/.bashrc
wget -P /etc/yum.repos.d ${ohpc_repo}
yum clean all

Setup Network Time Protocol

# Set local timezone:
ln -sf /usr/share/zoneinfo/GB /etc/localtime
service ntpd stop
ntpdate 0.centos.pool.ntp.org
vi /etc/ntp.conf		#Modify with your ntp server
service ntpd restart
chkconfig ntpd on
# Alternatively
systemctl restart ntpd
systemctl enable ntpd

Installing and Patching the OpenHPC Base components

Install the basic OpenHPC components and patch them to work correctly with Grub2. The patching part of this step is for stateful provisioning only; if systems are deployed only as RAM disks, it is not necessary.

yum -y install ohpc-base ohpc-warewulf
yum -y install ohpc-slurm-server 
useradd slurm # Might already be created by the ohpc-slurm-server package

Modify Warewulf core Configuration Files to provision Correctly

Modify Warewulf's provision.conf and bootstrap.conf so that they include the correct kernel modules and configuration

vi /etc/warewulf/vnfs.conf		#Ensure exclude looks like this.

exclude += /tmp/*
exclude += /var/log/*
exclude += /var/chroots/*
#exclude += /var/cache
exclude += /usr/src
#exclude += /usr/share
#exclude += /home/*
vi /etc/warewulf/bootstrap.conf		#Hash out all Infiniband drivers

# Infiniband drivers and Mellanox drivers
#drivers += ib_ipath, ib_iser, ib_srpt, ib_sdp, ib_mthca, ib_qib, iw_cxgb3, cxgb3
#drivers += iw_nes, mlx4_ib, ib_srp, ib_ipoib, ib_addr, rdma_cm, ib_ucm
#drivers += ib_ucm, ib_uverbs, ib_umad, ib_cm, ib_mad, iw_cm, ib_core
#drivers += rdma_ucm, ib_sa, mlx4_en, mlx4_core
#drivers += rds, rds_rdma, rds_tcp, mlx4_vnic, mlx4_vnic_helper

#Unhash the modprobe for the Mellanox Modules
modprobe += mlx4_core log_num_mtts=20 log_mtts_per_seg=6, ib_srp

Modify some Warewulf provisioning files to use the correct interfaces and some general Warewulf files to allow provisioning to work.

perl -pi -e "s/device = eth1/device = ${sms_eth_internal}/" /etc/warewulf/provision.conf
perl -pi -e "s/^\s+disable\s+= yes/ disable = no/" /etc/xinetd.d/tftp
export MODFILE=/etc/httpd/conf.d/warewulf-httpd.conf
perl -pi -e "s/cgi-bin>\$/cgi-bin>\n Require all granted/" $MODFILE
perl -pi -e "s/Allow from all/Require all granted/" $MODFILE
perl -ni -e "print unless /^\s+Order allow,deny/" $MODFILE
perl -pi -e "s/ControlMachine=\S+/ControlMachine=head.ohpc.net/" /etc/slurm/slurm.conf
systemctl restart xinetd # This failed
systemctl enable mariadb.service
systemctl restart mariadb
systemctl enable httpd.service
systemctl restart httpd.service
systemctl restart rpcbind.service
systemctl enable rpcbind.service
systemctl restart nfs-server.service
systemctl enable nfs-server.service

Build and Configure the Chroot

Make Initial VNFS (Chroot, compute Node template) and install some Base components into the chroot operating system

wwmkchroot centos-7 $CHROOT

An error may be seen regarding a deprecated URL, in which case an edit needs to be made in /usr/libexec/warewulf/wwmkchroot/centos-7.tmpl

yum -y --installroot=$CHROOT groupinstall Base
yum -y --installroot=$CHROOT install kernel* grub* sudo ipmitool* epel-release htop nano tk* tcl* tigervnc* ipmitool* freeipmi* cairo* perl* gcc* glibc* screen yum-utils vim ntp libnl \
lsof libxml2-python python mlocate numactl* lmod-ohpc ohpc-slurm-client lmod-ohpc ganglia-gmond-ohpc environment-modules hwloc-libs libfabric libpsm2 intel-clck-ohpc

yum -y --installroot=$CHROOT install ohpc-base-compute

#ohpc-slurm-client may not exist:
yum -y --installroot=$CHROOT install slurm-ohpc slurm-munge-ohpc munge-devel-ohpc

## Our X7 1U Twin hardware seems to have issues with Mellanox OFED, OpenIB seems to be the way to go...
yum -y --installroot=$CHROOT install openib ibutils infiniband-diags

Setup SSH Keys for the Cluster – this is required for the root user only, /home will be exported so user ssh keys will be available.

wwinit ssh_keys
cat ~/.ssh/cluster.pub >> $CHROOT/root/.ssh/authorized_keys
# Comment out GSSAPI lines on head and in CHROOT.
sed -i 's/^\(GSSAPI.\)/#\1/g' {,${CHROOT}}/etc/ssh/sshd_config
# Don't use DNS on head or CHROOT.
sed -i 's/#UseDNS yes/UseDNS no/' {,${CHROOT}}/etc/ssh/sshd_config

Setup NFS exports and FSTAB on the compute image

echo "${sms_ip}:/home /home nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0" >> $CHROOT/etc/fstab
echo "${sms_ip}:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0" >> $CHROOT/etc/fstab

echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
echo "/opt/ohpc/pub *(ro,no_subtree_check,fsid=11)" >> /etc/exports

exportfs -a
systemctl restart rpcbind
systemctl restart nfs-server.service

Copy resolv.conf into the chroot and modify its contents to point to the headnode and Google DNS.

cp /etc/resolv.conf $CHROOT/etc/
vi $CHROOT/etc/resolv.conf

Set the memlock limits to unlimited on the headnode and compute nodes.

echo "* soft memlock unlimited" >> /etc/security/limits.conf
echo "* hard memlock unlimited" >> /etc/security/limits.conf
echo "* soft memlock unlimited" >> $CHROOT/etc/security/limits.conf
echo "* hard memlock unlimited" >> $CHROOT/etc/security/limits.conf

Import Warewulf files into the database; these will be kept in sync with the compute nodes.

wwinit database 
wwsh file import /etc/passwd
wwsh file import /etc/shadow
wwsh file import /etc/group
wwsh file import /etc/slurm/slurm.conf
wwsh file import /etc/munge/munge.key
wwsh file import /opt/ohpc/pub/examples/network/centos/ifcfg-ib0.ww
wwsh -y file set ifcfg-ib0.ww --path=/etc/sysconfig/network-scripts/ifcfg-ib0
wwsh file resync # Run this if changes are made and you do not want to wait 5 minutes for the resync.

Building the bootstrap and vnfs images

# -T may need to be removed from head of wwbootstrap and wwvnfs scripts.
wwbootstrap 3.10.0-229.20.1.el7.x86_64 # Alternatively wwbootstrap $(uname -r)
wwvnfs -y --chroot $CHROOT

NB: This will need to be updated depending on the kernel version being used. For example:

[root@head setup-filesystems]# wwbootstrap 3.10.0-327.10.1.el7.x86_64
Number of drivers included in bootstrap: 433
Number of firmware images included in bootstrap: 93
Building and compressing bootstrap
Integrating the Warewulf bootstrap: 3.10.0-327.10.1.el7.x86_64
Including capability: provision-adhoc
Including capability: provision-files
Including capability: provision-selinux
Including capability: provision-vnfs
Including capability: setup-filesystems
Including capability: transport-http
Compressing the initramfs
Locating the kernel object
Bootstrap image '3.10.0-327.10.1.el7.x86_64' is ready
Done.
[root@head setup-filesystems]# wwvnfs -y --chroot $CHROOT
Using 'centos7.1' as the VNFS name
Creating VNFS image from centos7.1
Building new chroot...
Building and compressing the final image
Cleaning temporary files
VNFS 'centos7.1' has been imported
Done.
Wrote a new configuration file at: /etc/warewulf/vnfs/centos7.1.conf

Configure IPv4 forwarding on the head node

echo "net.ipv4.ip_forward = 1" >>  /usr/lib/sysctl.d/50-default.conf
/sbin/sysctl --system
firewall-cmd --permanent --direct --passthrough ipv4 -t nat -I POSTROUTING -o enp3s0f0 -j MASQUERADE -s 192.168.92.0/24
firewall-cmd --reload

Here enp3s0f0 is the external (public) interface and 192.168.92.0/24 is the private (internal) network.

Compute Node Configuration for Stateful Provisioning

Adding the compute nodes to the database with the correct parameters to allow for stateful provisioning

# Gateway, -G, is the private IP of headnode.
wwsh node new c0 --ipaddr=some.ip.address.here -M 255.255.255.0 -G ${sms_ip} --domain=ohpc.net  --hwaddr=some.mac.address.here -D eno1
echo "GATEWAYDEV=${eth_provision}" > /tmp/network.$$
wwsh -y file import /tmp/network.$$ --name network
wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0

wwsh -y provision set c0 --files=dynamic_hosts,passwd,group,shadow,slurm.conf,munge.key,network
wwsh -y provision set c0 --vnfs=centos7.1 --bootstrap=kernel.version.here

wwsh node clone c0 c1 # Clone node.
wwsh node set c1 --ipaddr=some.ip.address.here --hwaddr=some.mac.address.here -D eno1 # Sometimes this does not work with eno1; replace it with eth1/eth0 and it should work

Setup Bootloader and Partitions

wwsh -y object modify -s bootloader=sda c0
wwsh -y object modify -s diskpartition=sda c0
wwsh -y object modify -s diskformat=sda1,sda2,sda3 c0
wwsh -y object modify -s filesystems="mountpoint=/boot:dev=sda1:type=ext4:size=500,dev=sda2:type=swap:size=32768,mountpoint=/:dev=sda3:type=ext4:size=fill" c0
# Setup disks for other nodes as desired.
systemctl restart dhcpd
wwsh pxe update
wwsh dhcp update
wwsh node list		# Show all the nodes you just added