Orch:Headnode install
In the following we assume the availability of a single master/head node, at least one compute node, and the Intel® HPC Orchestrator ISO file (if you don't have it, contact David). The master node is provisioned with CentOS 7.2 and serves as the overall system management server (SMS). In its role as the SMS, the master node is configured to provision the remaining compute nodes in a stateless configuration using Warewulf.

We assume the ISO file has been copied to a location such as /tmp/Intel_HPC_Orchestrator-rhel7.2-16.01.004.ga.iso
First, change the hostname of the master/head node:

<syntaxhighlight>
echo "head.orch.net" > /etc/hostname
hostnamectl set-hostname head.orch.net
</syntaxhighlight>

== Enable local Intel® HPC Orchestrator repository ==
On the head node, mount the image and enable Orchestrator as a local repository using the "Intel_HPC_Orchestrator_release" rpm package.
<syntaxhighlight>
mkdir -p /mnt/hpc_orch_iso
mount -o loop /tmp/Intel_HPC_Orchestrator-rhel7.2-16.01.004.ga.iso /mnt/hpc_orch_iso ; echo $?
rpm -Uvh /mnt/hpc_orch_iso/x86_64/Intel_HPC_Orchestrator_release-*.x86_64.rpm
rpm --import /etc/pki/pgp/HPC-Orchestrator*.asc
rpm --import /etc/pki/pgp/PSXE-keyfile.asc
</syntaxhighlight>

== Add provisioning services to master node ==
With the Intel® HPC Orchestrator repository enabled, we proceed by adding the orch-base and orch-warewulf provisioning package groups to the master node.
<syntaxhighlight>
yum -y groupinstall orch-base
yum -y groupinstall orch-warewulf
</syntaxhighlight>

Provisioning services with Warewulf rely on the DHCP, TFTP, and HTTP network protocols. Default firewall rules may block these services, so we disable the firewall for now. (Once the installation is complete, it is highly recommended to re-enable the firewall on the head node and configure it to allow only SSH access on port 22 from the external interface, while still allowing traffic on the internal interfaces; see the sketch below.)

<syntaxhighlight>
rpm -q firewalld && systemctl disable firewalld
rpm -q firewalld && systemctl stop firewalld
</syntaxhighlight>
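When you re-enable the firewall later, a minimal sketch using firewalld might look like the following; the external interface name eno2 is an assumption, so adjust the interface names and zones to your system.

<syntaxhighlight>
# Sketch only: re-enable firewalld once the installation is complete
systemctl enable firewalld
systemctl start firewalld
# Trust the internal provisioning interface so DHCP/TFTP/HTTP/NFS traffic keeps flowing
firewall-cmd --permanent --zone=trusted --change-interface=${sms_eth_internal}
# Put the external interface (assumed to be eno2 here) in the public zone and allow only SSH
firewall-cmd --permanent --zone=public --change-interface=eno2
firewall-cmd --permanent --zone=public --add-service=ssh
firewall-cmd --permanent --zone=public --remove-service=dhcpv6-client
firewall-cmd --reload
</syntaxhighlight>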
Intel® HPC Orchestrator relies on synchronized clocks throughout the system and uses the NTP protocol to facilitate this synchronization. To enable NTP services on the head node with a specific server ${ntp_server}, issue the following on the head node:

<syntaxhighlight>
systemctl enable ntpd.service
# Disable default external servers
sed -i 's|^server|#server|' /etc/ntp.conf
echo "server ${ntp_server}" >> /etc/ntp.conf
echo "server 127.127.1.0 # local clock" >> /etc/ntp.conf
echo "fudge 127.127.1.0 stratum 10" >> /etc/ntp.conf
systemctl restart ntpd
</syntaxhighlight>

== Add resource management services to the master node ==
The following commands add the SLURM workload manager server components to the head node; client-side components will be added to the compute image later.
<syntaxhighlight>
yum -y groupinstall orch-slurm-server
# Add PDSH support to determine the nodelist of a Slurm job and run a command on those nodes
yum -y install pdsh-mod-slurm-orch
</syntaxhighlight>

SLURM requires a designated system user to run the underlying resource management daemons. The default configuration file supplied with the Intel® HPC Orchestrator build of SLURM sets this SlurmUser to a dedicated user named "slurm", and this user must exist:

<syntaxhighlight>
getent passwd slurm || useradd slurm
</syntaxhighlight>

SLURM can also be configured to control which local resource limits get propagated to a user's allocated resources by enabling SLURM's PAM support:
perl -pi -e "s|^#UsePAM=|UsePAM=1|" /etc/slurm/slurm.conf
cat <<- HERE > /etc/pam.d/slurm
account required pam_unix.so
account required pam_slurm.so
auth required pam_localuser.so
session required pam_limits.so
HEREBy default all resource limits are propagated from the session a user submitted a job from. With PAM support enabled configuration can be added to SLURM's configuration, e.g. adding PropagateResourceLimitsExcept=NOFILE will prevent the user's resource limit on open files from being set on their allocated nodes.
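A minimal sketch of that change, using the /etc/slurm/slurm.conf path assumed elsewhere in this guide (slurm.conf is imported into Warewulf later, so compute nodes pick the change up automatically):

<syntaxhighlight>
# Example: do not propagate the submitting shell's open-file (NOFILE) limit to allocated nodes
echo "PropagateResourceLimitsExcept=NOFILE" >> /etc/slurm/slurm.conf
# Once slurmctld is up and running, make it re-read the configuration
scontrol reconfigure
</syntaxhighlight>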
== Add genders ==
This is not required for a basic installation; an optional sketch is shown below.
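If you do want hostname-based node groupings for tools like pdsh, a minimal sketch follows. The plain "genders" package name is an assumption (check which genders packages the Orchestrator or EPEL repositories actually provide), and the node names match the c0/c1 examples used later in this guide.

<syntaxhighlight>
# Sketch only: the package name "genders" is an assumption; adjust to your repositories
yum -y install genders
# Minimal /etc/genders file mapping the example compute nodes to a "compute" group
cat << EOF > /etc/genders
c0 compute
c1 compute
EOF
# With pdsh's genders module installed, commands can then target the group: pdsh -g compute uptime
</syntaxhighlight>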
== Complete basic Warewulf setup for master node ==
At this stage, we have to decide which interface on the head node will be used for provisioning, e.g. eno1.
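For reference, a minimal persistent static configuration for that internal interface might look like the sketch below, using the example interface name and addressing from this guide (adjust DEVICE, IPADDR, and NETMASK to your system); the commands that follow assume these values.

<syntaxhighlight>
# Sketch only: persistent static IP configuration for the internal provisioning interface
cat << EOF > /etc/sysconfig/network-scripts/ifcfg-eno1
DEVICE=eno1
BOOTPROTO=static
IPADDR=10.10.10.1
NETMASK=255.255.255.0
ONBOOT=yes
EOF
</syntaxhighlight>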
<syntaxhighlight>
# We recommend adding the following exports to your .bashrc file so that you do not have to re-export them after every reboot
export sms_eth_internal=eno1
export internal_netmask=255.255.255.0
export sms_ip=10.10.10.1
export sms_name=head.orch.net

# Configure Warewulf provisioning to use the desired internal interface
perl -pi -e "s/device = eth1/device = ${sms_eth_internal}/" /etc/warewulf/provision.conf

# Enable tftp service for compute node image distribution
perl -pi -e "s/^\s+disable\s+= yes/ disable = no/" /etc/xinetd.d/tftp

# Enable http access for Warewulf cgi-bin directory to support newer apache syntax
export MODFILE=/etc/httpd/conf.d/warewulf-httpd.conf
perl -pi -e "s/cgi-bin>\$/cgi-bin>\n Require all granted/" $MODFILE
perl -pi -e "s/Allow from all/Require all granted/" $MODFILE
perl -ni -e "print unless /^\s+Order allow,deny/" $MODFILE

# Enable the internal interface for provisioning. It is better to edit /etc/sysconfig/network-scripts/ifcfg-eno1
# and set BOOTPROTO=static, IPADDR=10.10.10.1, NETMASK=255.255.255.0, and ONBOOT=yes (see the sketch above) so the
# settings persist across reboots. Otherwise, run the following:
ifconfig ${sms_eth_internal} ${sms_ip} netmask ${internal_netmask} up   # ifconfig not installed? run yum install net-tools

# Restart/enable relevant services to support provisioning
systemctl restart xinetd
systemctl enable mariadb.service
systemctl restart mariadb
systemctl enable httpd.service
systemctl restart httpd
</syntaxhighlight>

== Build initial Base OS image ==
<syntaxhighlight>
export CHROOT=/opt/intel/hpc-orchestrator/admin/images/Centos7.2
wwmkchroot centos-7 $CHROOT
yum -y --installroot=$CHROOT groupinstall Base
yum -y --installroot=$CHROOT install kernel* grub* sudo ipmitool* epel-release htop nano tk* tcl* tigervnc* freeipmi* cairo* perl* gcc* glibc* screen yum-utils vim ntp libnl lsof libxml2-python python mlocate numactl* environment-modules hwloc-libs libfabric libpsm2
</syntaxhighlight>

Mount the Orchestrator image in $CHROOT/mnt:
<syntaxhighlight>
mkdir -p $CHROOT/mnt/hpc_orch_iso
mount -o loop /tmp/Intel_HPC_Orchestrator-rhel7.2-16.01.004.ga.iso $CHROOT/mnt/hpc_orch_iso ; echo $?
chroot $CHROOT rpm -Uvh /mnt/hpc_orch_iso/x86_64/Intel_HPC_Orchestrator_release-*.x86_64.rpm
chroot $CHROOT rpm --import /etc/pki/pgp/HPC-Orchestrator*.asc
chroot $CHROOT rpm --import /etc/pki/pgp/PSXE-keyfile.asc
</syntaxhighlight>

== Add InfiniBand support services to the head node ==
yum -y groupinstall "InfiniBand Support"
yum -y install infinipath-psm
# Re-run udev rules
/sbin/udevadm trigger --subsystem-match=ipath
# Load IB drivers
systemctl enable rdma
systemctl start rdma
# Bring up ib0
ifup ib0Add Intel® HPC Orchestrator components to compute node image
<syntaxhighlight>
cp -p /etc/resolv.conf $CHROOT/etc/resolv.conf
</syntaxhighlight>

Now, add the Orchestrator to the image:
<syntaxhighlight>
# Make sure signing keys are available inside the chroot
cp -prv /etc/pki $CHROOT/etc
chroot $CHROOT yum -y install Intel_HPC_Orchestrator_release
# Add Slurm client support
chroot $CHROOT yum -y groupinstall orch-slurm-client
# Add PDSH support to determine the nodelist of a Slurm job and run a command on those nodes
chroot $CHROOT yum -y install pdsh-mod-slurm-orch
# Add Network Time Protocol (NTP) support
chroot $CHROOT yum -y install ntp
# Add kernel drivers
chroot $CHROOT yum -y install kernel
# Include modules user environment
chroot $CHROOT yum -y install lmod-orch
# Confirm dependencies of CLCK are available on compute nodes
chroot $CHROOT yum -y install dmidecode pciutils coreutils
</syntaxhighlight>

== Add InfiniBand support services to the compute node image ==
<syntaxhighlight>
# Add IB support and enable
chroot $CHROOT yum -y groupinstall "InfiniBand Support"
chroot $CHROOT yum -y install infinipath-psm
chroot $CHROOT systemctl enable rdma
</syntaxhighlight>

== Setup SSH keys for the cluster ==
This is required for the root user only; /home will be exported, so user SSH keys will be available.
<syntaxhighlight>
wwinit ssh_keys
cat ~/.ssh/cluster.pub >> $CHROOT/root/.ssh/authorized_keys
# Comment out GSSAPI lines on head and in CHROOT
sed -i 's/^\(GSSAPI.\)/#\1/g' {,${CHROOT}}/etc/ssh/sshd_config
# Don't use DNS on head or CHROOT
sed -i 's/#UseDNS yes/UseDNS no/' {,${CHROOT}}/etc/ssh/sshd_config
</syntaxhighlight>

== Setup NFS exports and fstab on the compute image ==
echo "${sms_ip}:/home /home nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0" >> $CHROOT/etc/fstab
echo "${sms_ip}:/opt/intel /opt/intel nfs nfsvers=3,rsize=1024,wsize=1024,cto 0 0" >> $CHROOT/etc/fstab
perl -pi -e "s/ControlMachine=\S+/ControlMachine=${sms_name}/" /etc/slurm/slurm.conf
# Export /home and public packages from master server to cluster compute nodes
echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
echo "/opt/intel *(ro,no_subtree_check,fsid=11,no_root_squash)" >> /etc/exports
echo "/opt/intel/hpc-orchestrator *(ro,no_subtree_check,fsid=12,no_root_squash,nohide)" /etc/exports
exportfs -a
systemctl restart rpcbind
systemctl restart nfs-server.serviceModify Limits to unlimited on headnode and compute nodes
echo "* soft memlock unlimited" >> /etc/security/limits.conf
echo "* hard memlock unlimited" >> /etc/security/limits.conf
echo "* soft memlock unlimited" >> $CHROOT/etc/security/limits.conf
echo "* hard memlock unlimited" >> $CHROOT/etc/security/limits.confImport warewulf files to the database, theses will be synced to compute nodes all of the time.
<syntaxhighlight>
wwsh file import /etc/passwd
wwsh file import /etc/shadow
wwsh file import /etc/group
wwsh file import /etc/slurm/slurm.conf
wwsh file import /etc/munge/munge.key
wwsh file import /opt/intel/hpc-orchestrator/pub/examples/network/centos/ifcfg-ib0.ww
wwsh -y file set ifcfg-ib0.ww --path=/etc/sysconfig/network-scripts/ifcfg-ib0
wwsh file resync   # Run this if changes are made and you do not want to wait 5 minutes for the automatic resync
</syntaxhighlight>

== Building the bootstrap and VNFS images ==
<syntaxhighlight>
wwbootstrap $(uname -r)
wwvnfs -y --chroot $CHROOT
</syntaxhighlight>

== Adding the compute nodes to the database with the correct parameters to allow for stateless provisioning ==
<syntaxhighlight>
wwsh node new c0 --ipaddr=some.ip.address.here -M 255.255.255.0 -G ${sms_ip} --domain=orch.net --hwaddr=some.mac.address.here -D eth1   # Note: "eno"-style interface names can cause problems here; eth1 is used instead
echo "GATEWAYDEV=${eth_provision}" > /tmp/network.$$   # eth_provision is the provisioning interface name on the compute nodes (eth1 above)
wwsh -y file import /tmp/network.$$ --name network
wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0
wwsh -y provision set c0 --files=dynamic_hosts,passwd,group,shadow,slurm.conf,munge.key,network
wwsh -y provision set c0 --vnfs=centos7.2 --bootstrap=$(uname -r)
wwsh node clone c0 c1   # Clone the node
wwsh node set c1 --ipaddr=some.ip.address.here --hwaddr=some.mac.address.here -D eth1
</syntaxhighlight>

== Update DHCP and PXE ==
<syntaxhighlight>
systemctl restart dhcpd
wwsh pxe update
wwsh dhcp update
# List all added nodes
wwsh node list
</syntaxhighlight>