Installation notes on OpenHPC 3 and Rocky 9


Headnode Installation

Setup the networking

nmcli device status
nmcli device show enp129s0f0

# UI way
nmtui

# CLI way
# set to static (from dhcp)

nmcli con mod enp129s0f0 ipv4.addresses 10.141.0.1/16
nmcli con mod enp129s0f0 ipv4.gateway 10.141.0.254
nmcli con mod enp129s0f0 ipv4.dns "8.8.8.8"
nmcli con mod enp129s0f0 ipv4.method manual

nmcli con up enp129s0f0
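
A quick sanity check that the static config took (same interface and gateway as above; substitute your own):

nmcli -f ipv4.method,ipv4.addresses con show enp129s0f0
ip addr show dev enp129s0f0
ping -c1 10.141.0.254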

OpenHPC vars.sh

[root@openhpc3-headnode scratch]# cat openhpc-vars.sh
# vars needed

# ${sms_name}
# ${sms_ip}
# ${sms_eth_internal}
# ${eth_provision}
# ${internal_netmask}
# ${ntp_server}
# ${bmc_username}
# ${bmc_password}
# ${num_computes}
# ${c_ip[0]}, ${c_ip[1]}, ...
# ${c_bmc[0]}, ${c_bmc[1]}, ...
# ${c_mac[0]}, ${c_mac[1]}, ...
# ${c_name[0]}, ${c_name[1]}, ...
# ${compute_regex}
# ${compute_prefix}

sms_name="rocky-head"
sms_ip="10.20.30.240"
sms_eth_internal="eth1"
eth_provision="eth1"
internal_netmask="255.255.255.0"
ntp_server=0.centos.pool.ntp.org
compute_regex="compute*"
compute_prefix="compute"
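
Note the per-node vars referenced later (num_computes, c_name, c_ip, c_bmc, c_mac) aren't set above. A minimal sketch of how they could look, reusing the compute IPs/MACs that appear further down (the BMC bits only matter with IPMI, which I don't have here):

num_computes=2
c_name[0]=compute-1; c_ip[0]=10.20.30.130; c_mac[0]=fa:16:3e:32:82:57
c_name[1]=compute-2; c_ip[1]=10.20.30.158; c_mac[1]=fa:16:3e:e4:a0:2a
# bmc_username=... bmc_password=... c_bmc[0]=...  (skipped - no IPMI on this setup)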

Setup hosts file

# source openhpc-vars.sh

[root@openhpc3-headnode scratch]# echo ${sms_ip} ${sms_name}
10.20.30.240 rocky-head
[root@openhpc3-headnode scratch]# echo ${sms_ip} ${sms_name} >> /etc/hosts
[root@openhpc3-headnode scratch]# cat /etc/hosts
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6

10.20.30.240 rocky-head

Disable services and SELinux

sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/sysconfig/selinux
systemctl disable firewalld
systemctl stop firewalld
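
Note the sed only applies from the next boot; to drop out of enforcing for the current session too:

setenforce 0
getenforce   # should now report Permissive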

Install OpenHPC Components

dnf -y install http://repos.openhpc.community/OpenHPC/3/EL_9/x86_64/ohpc-release-3-1.el9.x86_64.rpm
dnf -y install dnf-plugins-core
dnf -y config-manager --set-enabled crb
dnf -y groupinstall 'Development Tools'

Optional: HPC Docs and Scripts

dnf -y install docs-ohpc
# the recipe script will then be at
/opt/ohpc/pub/doc/recipes/rocky9/x86_64/warewulf/slurm/recipe.sh

However, I like to do it by hand, so let's crack on!

Add provisioning services on headnode

dnf -y install ohpc-base
dnf -y install ohpc-warewulf

Chrony / time services

systemctl enable chronyd.service
echo "local stratum 10" >> /etc/chrony.conf
echo "server ${ntp_server}" >> /etc/chrony.conf
echo "allow all" >> /etc/chrony.conf
systemctl restart chronyd
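
Quick check that chrony picked up the config:

chronyc sources    # should list ${ntp_server}
chronyc tracking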

Slurm Resource Manager

dnf -y install ohpc-slurm-server
cp /etc/slurm/slurm.conf.ohpc /etc/slurm/slurm.conf
cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf
perl -pi -e "s/SlurmctldHost=\S+/SlurmctldHost=${sms_name}/" /etc/slurm/slurm.conf
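
slurm.conf also needs node and partition definitions that match the hardware; a hypothetical example for the two small VMs used later (core counts are a guess - slurmd -C on a booted node prints the real values):

NodeName=compute-[1-2] Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN
PartitionName=normal Nodes=compute-[1-2] Default=YES MaxTime=24:00:00 State=UP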

Get Warewulf set up

perl -pi -e "s/device = eth1/device = ${sms_eth_internal}/" /etc/warewulf/provision.conf
# not needed ip link set dev ${sms_eth_internal} up
# not needed ip address add ${sms_ip}/${internal_netmask} broadcast + dev ${sms_eth_internal}
systemctl enable httpd.service
systemctl restart httpd
systemctl enable dhcpd.service
systemctl enable tftp.socket
systemctl start tftp.socket

Compute node configuration

# Define chroot location
export CHROOT=/opt/ohpc/admin/images/rocky9.2
# Build initial chroot image
wwmkchroot -v rocky-9 $CHROOT

# failed - need some perl extras
dnf -y install perl-utils
mkdir -p  /usr/local/lib64/perl5/5.32
cd /usr/include; 
h2ph * */*

# ok, let's try this again
wwmkchroot -v rocky-9 $CHROOT

mount -o bind /sys $CHROOT/sys
mount -o bind /proc $CHROOT/proc

# Enable OpenHPC and EPEL repos inside chroot
dnf -y --installroot $CHROOT install epel-release
# hmm, something odd happened; needed to: cp /etc/resolv.conf $CHROOT/etc ; chroot $CHROOT ; rpm -e epel-release ; rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
cp -p /etc/yum.repos.d/OpenHPC*.repo $CHROOT/etc/yum.repos.d
dnf -y --installroot=$CHROOT install ohpc-base-compute

# some files / creds
cp -p /etc/resolv.conf $CHROOT/etc/resolv.conf
cp /etc/passwd /etc/group $CHROOT/etc

dnf -y --installroot=$CHROOT install ohpc-slurm-client
chroot $CHROOT systemctl enable munge
chroot $CHROOT systemctl enable slurmd
echo SLURMD_OPTIONS="--conf-server ${sms_ip}" > $CHROOT/etc/sysconfig/slurmd

dnf -y --installroot=$CHROOT install chrony
echo "server ${sms_ip} iburst" >> $CHROOT/etc/chrony.conf

dnf -y --installroot=$CHROOT install kernel-`uname -r`
# failed; yum -y --installroot=$CHROOT install kernel-`uname -r`
# if you run yum install kernel on headnode, then the versions will be the same (probably avoided if you start the whole process with yum update) 
#yum -y --installroot=$CHROOT install kernel

dnf -y --installroot=$CHROOT install lmod-ohpc

Customise the system configuration

wwinit database
wwinit ssh_keys

echo "${sms_ip}:/home /home nfs nfsvers=4,nodev,nosuid 0 0" >> $CHROOT/etc/fstab
echo "${sms_ip}:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=4,nodev 0 0" >> $CHROOT/etc/fstab

echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
echo "/opt/ohpc/pub *(ro,no_subtree_check,fsid=11)" >> /etc/exports

exportfs -a
systemctl restart nfs-server
systemctl enable nfs-server
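
Sanity check the exports (showmount comes with nfs-utils):

showmount -e localhost    # should list /home and /opt/ohpc/pub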

# Update memlock settings on master
perl -pi -e 's/# End of file/\* soft memlock unlimited\n$&/s' /etc/security/limits.conf 
perl -pi -e 's/# End of file/\* hard memlock unlimited\n$&/s' /etc/security/limits.conf
# Update memlock settings within compute image
perl -pi -e 's/# End of file/\* soft memlock unlimited\n$&/s' $CHROOT/etc/security/limits.conf 
perl -pi -e 's/# End of file/\* hard memlock unlimited\n$&/s' $CHROOT/etc/security/limits.conf

# no access to compute nodes if job not running
echo "account required pam_slurm.so" >> $CHROOT/etc/pam.d/sshd

Enable InfiniBand

dnf  -y groupinstall "InfiniBand Support"
udevadm trigger --type=devices --action=add
systemctl restart rdma-load-modules@infiniband.service

# and for clients
dnf -y --installroot=$CHROOT groupinstall "InfiniBand Support"

Setup Rsyslog

# Configure SMS to receive messages and reload rsyslog configuration
echo 'module(load="imudp")' >> /etc/rsyslog.d/ohpc.conf
echo 'input(type="imudp" port="514")' >> /etc/rsyslog.d/ohpc.conf
systemctl restart rsyslog

# Define compute node forwarding destination
echo "*.* action(type=\"omfwd\" Target=\"${sms_ip}\" Port=\"514\" Protocol=\"udp\")" >> $CHROOT/etc/rsyslog.conf

# Disable most local logging on computes. Emergency and boot logs will remain on the compute nodes
perl -pi -e "s/^\*\.info/\\#\*\.info/" $CHROOT/etc/rsyslog.conf 
perl -pi -e "s/^authpriv/\\#authpriv/" $CHROOT/etc/rsyslog.conf
perl -pi -e "s/^mail/\\#mail/" $CHROOT/etc/rsyslog.conf 
perl -pi -e "s/^cron/\\#cron/" $CHROOT/etc/rsyslog.conf
perl -pi -e "s/^uucp/\\#uucp/" $CHROOT/etc/rsyslog.conf

Add Nagios (FAILED Revisit)

# Install nagios, nrpe, and all available plugins on master host
dnf -y install --skip-broken nagios nrpe nagios-plugins-*

# Install nrpe and an example plugin into compute node image
dnf -y --installroot=$CHROOT install nrpe nagios-plugins-ssh

# Enable and configure Nagios NRPE daemon in compute image
chroot $CHROOT systemctl enable nrpe
perl -pi -e "s/^allowed_hosts=/# allowed_hosts=/" $CHROOT/etc/nagios/nrpe.cfg
echo "nrpe : ${sms_ip} : ALLOW" >> $CHROOT/etc/hosts.allow
echo "nrpe : ALL : DENY" >> $CHROOT/etc/hosts.allow

# Copy example Nagios config file to define a compute group and ssh check
# (note: edit as desired to add all desired compute hosts)
cp /opt/ohpc/pub/examples/nagios/compute.cfg /etc/nagios/objects

# failed: no such file or directory in the examples... come back to this another time

Add ClusterShell

# Install ClusterShell
dnf -y install clustershell
# Setup node definitions
cd /etc/clustershell/groups.d
mv local.cfg local.cfg.orig
echo "adm: ${sms_name}" > local.cfg
# note set num_computes above
echo "compute: ${compute_prefix}[1-${num_computes}]" >> local.cfg 
echo "all: @adm,@compute" >> local.cfg

Add Genders

# Install genders
# assumes we have ipmi setup - I don't!
dnf -y install genders-ohpc
# Generate a sample genders file
echo -e "${sms_name}\tsms" > /etc/genders
for ((i=0; i<$num_computes; i++)) ; do
echo -e "${c_name[$i]}\tcompute,bmc=${c_bmc[$i]}"
done >> /etc/genders

Here's what I hacked together without BMC

[root@openhpc3-headnode ~]# cat /etc/genders
rocky-head	sms
compute-1	compute
compute-2	compute

# or with ipmi
head-rocky9     sms
gpunode01       compute,bmc=gpunode01-ipmi
gpunode02       compute,bmc=gpunode02-ipmi

Add Magpie (skipped)

Add Conman (skipped)

# Install conman to provide a front-end to compute consoles and log output
dnf -y install conman-ohpc

# Configure conman for computes (note your IPMI password is required for console access)
for ((i=0; i<$num_computes; i++)) ; do
echo -n 'CONSOLE name="'${c_name[$i]}'" dev="ipmi:'${c_bmc[$i]}'" '
echo 'ipmiopts="'U:${bmc_username},P:${IPMI_PASSWORD:-undefined},W:solpayloadsize'"'
done >> /etc/conman.conf

# or here's the end of a populated version (oghl)

<snip>
CONSOLE name="gpunode01" dev="ipmi:10.141.128.1" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode02" dev="ipmi:10.141.128.2" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode03" dev="ipmi:10.141.128.3" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode04" dev="ipmi:10.141.128.4" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode05" dev="ipmi:10.141.128.5" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode06" dev="ipmi:10.141.128.6" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode07" dev="ipmi:10.141.128.7" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode08" dev="ipmi:10.141.128.8" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode09" dev="ipmi:10.141.128.9" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode10" dev="ipmi:10.141.128.10" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode11" dev="ipmi:10.141.128.11" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode12" dev="ipmi:10.141.128.12" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode13" dev="ipmi:10.141.128.13" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode14" dev="ipmi:10.141.128.14" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode15" dev="ipmi:10.141.128.15" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="gpunode16" dev="ipmi:10.141.128.16" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
CONSOLE name="rocky-head" dev="ipmi:10.141.255.254" ipmiopts="U:ADMIN,P:ADMIN,W:solpayloadsize"
</snip>

# Enable and start conman
systemctl enable conman
systemctl start conman
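
Attaching to a node console is then just conman plus the console name:

conman gpunode01
# &. disconnects, &? shows help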

Add Node Health Check

# Install NHC on master and compute nodes
dnf -y install nhc-ohpc
dnf -y --installroot=$CHROOT install nhc-ohpc

# Register as SLURM's health check program
echo "HealthCheckProgram=/usr/sbin/nhc" >> /etc/slurm/slurm.conf
echo "HealthCheckInterval=600" >> /etc/slurm/slurm.conf # execute every 10 minutes minutes

Add GEOPM (skipped)

Check back on this power framework

Import files to Warewulf

wwsh -y file import /etc/passwd
wwsh -y file import /etc/group
wwsh -y file import /etc/shadow

wwsh -y file import /etc/munge/munge.key
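
Check they landed:

wwsh file list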

Perl Fucked

At this stage perl was fucked: it had none of the system header libraries and was complaining about sys/ioctl.ph being missing. Here's what I did to fix it:

dnf -y install perl-utils
dnf -y install glibc-devel

mkdir -p  /usr/local/lib64/perl5/5.32

cd /usr/include; h2ph * sys/* bits/*
wwsh file import /etc/passwd
export CHROOT=/opt/ohpc/admin/images/rocky9.2
wwmkchroot -v rocky-9 $CHROOT
h2ph bits/*
cd /usr/include; h2ph -r -l .
wwmkchroot -v rocky-9 $CHROOT
# now it completes without error

Finalise Compute configuration

# Build bootstrap image
wwbootstrap `uname -r`

# Assemble Virtual Node File System (VNFS) image
# if $CHROOT/proc and $CHROOT/sys are mounted then undo that
umount --force /opt/ohpc/admin/images/rocky9.2/sys
umount --force -l  /opt/ohpc/admin/images/rocky9.2/proc


wwvnfs --chroot $CHROOT

# note: changed eth_provision to eth0, as it's the compute node interface - I clearly mucked up the headnode configuration based on the docs
echo "GATEWAYDEV=${eth_provision}" > /tmp/network.$$
wwsh -y file import /tmp/network.$$ --name network
wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0

Setup pxeboot image in OpenStack

# back to openstack node with admin rc sourced.
curl --output /tmp/pxeboot.img --location https://linux.web.cern.ch/centos7/docs/pxeboot.img
openstack image create pxeboot --property os=LINUX --file /tmp/pxeboot.img --public
openstack image set --property hw_vif_model=e1000 pxeboot

# then boot 2 nodes, disable port security, grab their IP and MAC addresses

Then set up these nodes in Warewulf

# Add nodes to Warewulf data store
 wwsh -y node new compute-1 --ipaddr=10.20.30.130 --hwaddr=fa:16:3e:32:82:57 -D eth0
 wwsh -y node new compute-2 --ipaddr=10.20.30.158 --hwaddr=fa:16:3e:e4:a0:2a -D eth0

 wwsh node list

NAME                GROUPS              IPADDR              HWADDR
================================================================================
compute-1           UNDEF               10.20.30.130        fa:16:3e:32:82:57
compute-2           UNDEF               10.20.30.158        fa:16:3e:e4:a0:2a

# Define provisioning image for hosts
 wwsh -y provision set compute* --vnfs=rocky9.2 --bootstrap=`uname -r` --files=dynamic_hosts,passwd,group,shadow,munge.key,network

 wwsh provision list
NODE                VNFS            BOOTSTRAP             FILES
================================================================================
compute-1           rocky9.2        5.14.0-362.13.1.el... dynamic_hosts,grou...
compute-2           rocky9.2        5.14.0-362.13.1.el... dynamic_hosts,grou...

# OGHL 
wwsh -y node new gpunode02 --ipaddr=10.141.0.2 --netmask=255.255.0.0 --hwaddr=00:25:90:96:24:c6 -D enp129s0f0
wwsh -y provision set gpunode02 --kargs "console=ttyS1,115200" --vnfs=rocky9.2 --bootstrap=`uname -r` --files=dynamic_hosts,passwd,group,shadow,munge.key,network

# Restart dhcp / update PXE
systemctl restart dhcpd
 wwsh pxe update

No virtio_net driver in the bootstrap image - let's add it

# on the openstack env
openstack image set --property hw_vif_model=e1000 pxeboot

# the damn VMs run eth0 from the virtio_net driver; let's add it to the bootstrap

echo "drivers += virtio_net" >> /etc/warewulf/bootstrap.conf
wwbootstrap `uname -r`
# ^ that didn't work
# reboot VM

Final Slurm config (after adding nodes)

# Start munge and slurm controller on master host
systemctl enable munge
systemctl enable slurmctld
systemctl start munge
systemctl start slurmctld

# Start slurm clients on compute hosts
pdsh -w $compute_prefix[1-${num_computes}] systemctl start munge 
pdsh -w $compute_prefix[1-${num_computes}] systemctl start slurmd
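
The nodes should then report in; anything stuck in down*/unk* usually means munge keys or clocks don't match:

sinfo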

# Generate NHC configuration file based on compute node environment
pdsh -w compute-1 "/usr/sbin/nhc-genconf -H '*' -c -" | dshbak -c
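
The checks it suggests belong in /etc/nhc/nhc.conf; a couple of typical hand-written entries for flavour (targets are guesses - match them to your nodes):

* || check_fs_mount_rw -f /home
* || check_ps_service -u munge munged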

Setup the SlurmDBD service for accounting and OOD (note: did this while installing OOD, so hopefully everything is in place; otherwise come back to this at the end)

# cat /etc/slurm/slurmdbd.conf

# Authentication info
AuthType=auth/munge
#AuthInfo=/var/run/munge/munge.socket.2
#
# slurmDBD info
DbdHost=head-rocky9
DbdAddr=head-rocky9
DbdPort=6819
SlurmUser=slurm
#MessageTimeout=300
DebugLevel=4
#DefaultQOS=normal,standby
# NOTE: By default, slurmdbd will log to syslog
#LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
#PluginDir=/usr/lib/slurm
#PrivateData=accounts,users,usage,jobs
#TrackWCKey=yes
#
# Database info
StorageType=accounting_storage/mysql
#StorageHost=head-rocky9
StorageUser=slurm
StoragePass=slurmp455dbd
StorageLoc=slurm_acct_db

# chmod 600 /etc/slurm/slurmdbd.conf
# chown slurm:slurm /etc/slurm/slurmdbd.conf

# add to the slurm.conf file
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=head-rocky9
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30

Setup the DB and SLURM user for the DBD service

mysql -u root -e "create database slurm_acct_db; create user 'slurm'@'localhost' identified by 'slurmp455dbd'; grant all on slurm_acct_db.* to 'slurm'@'localhost';"

systemctl restart slurmctld slurmdbd
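
If the cluster list below comes back empty, it probably still needs registering with the accounting DB (the name must match ClusterName in slurm.conf):

sacctmgr -i add cluster cluster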

Check it's working OK

[root@head-rocky9 ~]# sacctmgr list cluster
   Cluster     ControlHost  ControlPort   RPC     Share GrpJobs       GrpTRES GrpSubmit MaxJobs       MaxTRES MaxSubmit     MaxWall                  QOS   Def QOS
---------- --------------- ------------ ----- --------- ------- ------------- --------- ------- ------------- --------- ----------- -------------------- ---------
   cluster      10.141.0.1         6817  9728         1                                                                                           normal

Installing Extra OpenHPC Software MPI / Libs

dnf -y install ohpc-autotools
dnf -y install EasyBuild-ohpc
dnf -y install hwloc-ohpc
dnf -y install spack-ohpc
dnf -y install valgrind-ohpc

dnf -y install gnu12-compilers-ohpc

dnf -y install openmpi4-pmix-gnu12-ohpc mpich-ofi-gnu12-ohpc
dnf -y install mpich-ucx-gnu12-ohpc

dnf -y install mvapich2-gnu12-ohpc

dnf -y install ohpc-gnu12-perf-tools

dnf -y install lmod-defaults-gnu12-openmpi4-ohpc

dnf -y install ohpc-gnu12-serial-libs
dnf -y install ohpc-gnu12-io-libs
dnf -y install ohpc-gnu12-python-libs
dnf -y install ohpc-gnu12-runtimes
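
A quick smoke test of the default toolchain (hello.c should come with the docs-ohpc examples installed earlier; adjust the path if it isn't there):

module load gnu12 openmpi4
mpicc -O3 -o /tmp/hello /opt/ohpc/pub/examples/mpi/hello.c
srun -n 2 --mpi=pmix /tmp/hello   # pmix to match the openmpi4-pmix stack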

Add users

adduser rsupport
wwsh file sync
pdsh -w $compute_prefix[1-${num_computes}] /warewulf/bin/wwgetfiles

Add Open OnDemand

# on headnode
yum install https://yum.osc.edu/ondemand/3.0/ondemand-release-web-3.0-1.noarch.rpm
dnf install ondemand
systemctl restart httpd

Setup basic auth using PAM (not recommended - should use Dex/LDAP)

dnf -y install mod_authnz_pam
echo "LoadModule authnz_pam_module modules/mod_authnz_pam.so" > /etc/httpd/conf.modules.d/55-authnz_pam.conf
cp /etc/pam.d/sshd /etc/pam.d/ood
chmod 640 /etc/shadow
chgrp apache /etc/shadow

# Update /etc/ood/config/ood_portal.yml to use PAM authentication
auth:
  - 'AuthType Basic'
  - 'AuthName "Open OnDemand"'
  - 'AuthBasicProvider PAM'
  - 'AuthPAMService ood'
  - 'Require valid-user'
# Capture system user name from authenticated user name
user_map_cmd: "/opt/ood/ood_auth_map/bin/ood_auth_map.regex"

# Apply modifications to the /etc/ood/config/ood_portal.yml

/opt/ood/ood-portal-generator/sbin/update_ood_portal

# needed to grab this file - doesn't seem to exist anymore
curl https://raw.githubusercontent.com/OSC/ood_auth_map/master/bin/ood_auth_map.regex > /opt/ood/ood_auth_map/bin/ood_auth_map.regex
chmod +x /opt/ood/ood_auth_map/bin/ood_auth_map.regex

# quick test 
[root@head-rocky9 log]# /opt/ood/ood_auth_map/bin/ood_auth_map.regex definetech definetech
definetech
[root@head-rocky9 log]# /opt/ood/ood_auth_map/bin/ood_auth_map.regex definetech
definetech

systemctl restart httpd

OOD needs SSL to work - set that up


TBC - tidy this up properly; cleaned up from shell history:

cd /etc/pki/tls
mkdir ood && cd ood
openssl genrsa -des3 -out ood-private.key 2048
echo "oodpass" > pphrase.txt
openssl req -key ood-private.key -new -out ood-server.csr
openssl x509 -signkey ood-private.key -in ood-server.csr -req -days 3650 -out ood-server.crt
openssl x509 -text -noout -in ood-server.crt
vi /etc/ood/config/ood_portal.yml   # point ssl: at the new crt/key

Setup the Desktop env in OOD

dnf -y --installroot=$CHROOT install nmap-ncat
dnf -y --installroot=$CHROOT install python3-websockify

# TurboVNC needs its own repo
curl -o $CHROOT/etc/yum.repos.d/TurboVNC.repo https://raw.githubusercontent.com/TurboVNC/repo/main/TurboVNC.repo
dnf -y --installroot=$CHROOT install turbovnc
dnf -y --installroot=$CHROOT group install xfce

# rebuild our vnfs
wwvnfs --chroot  ${CHROOT}

Configuration files for Desktop app

# [root@head-rocky9 config]# cat clusters.d/cluster.yml
---
v2:
   metadata:
     title: "oghl cluster"
   login:
     host: "head-rocky9.cluster.internal"
   job:
     adapter: "slurm"
     cluster: "cluster"
     bin: "/usr/bin"
     conf: "/etc/slurm/slurm.conf"
     # bin_overrides:
       # sbatch: "/usr/local/bin/sbatch"
       # squeue: ""
       # scontrol: ""
       # scancel: ""
     copy_environment: false
   batch_connect:
     basic:
       script_wrapper: |
         module purge
         %s
       set_host: "host=$(hostname -s)"
     vnc:
       script_wrapper: |
         module purge
         export PATH="/opt/TurboVNC/bin:$PATH"
         export WEBSOCKIFY_CMD="/usr/bin/websockify"
         # Workaround to avoid "Unable to contact settings server" when
         # launching xfce4-session
         #/bin/dbus-launch /bin/xfce4-session $@
         #export -f xfce4-session
         %s
       set_host: "host=$(hostname -s)"


# [root@head-rocky9 config]# cat apps/bc_desktop/cluster.yml
attributes:
    bc_queue:
        value: normal
    desktop: xfce
    node:
        help: Select a particular node or leave empty to let Slurm pick the next available
        label: Node name
        value: ''
    num_cores:
        label: Number of cores
        value: 1
cluster: cluster
description: Request a desktop to run GUI applications.
form:
- desktop
- bc_queue
- bc_num_hours
- num_cores
- node
submit: submit/submit.yml.erb
title: Remote Desktop
[root@head-rocky9 config]# cat apps/bc_desktop/submit/submit.yml.erb
---
script:
  job_name: "ood-desktop"
  native:
    - <%= "--nodes=1" %>
    - <%= "--ntasks=#{num_cores}" %>
    - <%= "--nodelist=#{node}" %>
[root@head-rocky9 config]# grep -v "^#" ood_portal.yml
---

servername: head-rocky9.cluster.internal

ssl:
  - 'SSLCertificateFile "/etc/pki/tls/ood/ood-server.crt"'
  - 'SSLCertificateKeyFile "/etc/pki/tls/ood/ood-private.key"'

auth:
  - 'AuthType Basic'
  - 'AuthName "Open OnDemand"'
  - 'AuthBasicProvider PAM'
  - 'AuthPAMService ood'
  - 'Require valid-user'
user_map_cmd: "/opt/ood/ood_auth_map/bin/ood_auth_map.regex"

root_uri: '/pun/sys/dashboard'

host_regex: '.+'
node_uri: '/node'
rnode_uri: '/rnode'

Note: Warewulf provisioning stops working, as OOD automatically puts in a redirect for 80 -> 443 (the nodes pull their images over plain HTTP on 80) - need to undo that.

# TBC work out how to fix this properly 
[root@head-rocky9 conf.d]# diff ood-portal.conf ood-portal.conf.new
46,51c46,51
< #<VirtualHost *:80>
< #  ServerName head-rocky9.cluster.internal
< #
< #  RewriteEngine On
< #  RewriteRule ^(.*) https://%{HTTP_HOST}:443$1 [R=301,NE,L]
< #</VirtualHost>
---
> <VirtualHost *:80>
>   ServerName head-rocky9.cluster.internal
>
>   RewriteEngine On
>   RewriteRule ^(.*) https://%{HTTP_HOST}:443$1 [R=301,NE,L]
> </VirtualHost>

vi /etc/httpd/conf.d/ood-portal.conf.new
# comment out the :80 virtual host section
# TLS passphrase: oodpass
systemctl restart httpd
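
A quick end-to-end check of auth + TLS from the shell (definetech is the test user from earlier; -k because the cert is self-signed):

curl -k -u definetech https://head-rocky9.cluster.internal/pun/sys/dashboard -o /dev/null -w '%{http_code}\n'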


Add Jupyter App

Setup the following files

mkdir /etc/ood/config/apps/jupyter
mkdir /etc/ood/config/apps/jupyter/submit
cd /etc/ood/config/apps/jupyter

# [root@head-rocky9 jupyter]# cat cluster.yml
attributes:
    bc_queue:
        value: normal
    extra_jupyter_args: ''
    modules: ''
    node: ''
    num_cores:
        label: Number of cores
        value: 1
cluster: cluster
description: Request a Jupyter Notebook server
form:
- modules
- extra_jupyter_args
- bc_queue
- bc_num_hours
- num_cores
- node
submit: submit/submit.yml.erb
title: Jupyter Notebook

# [root@head-rocky9 jupyter]# cat submit/submit.yml.erb
---
batch_connect:
  template: "basic"
script:
  job_name: "ood-jupyter"
  native:
    - <%= "--nodes=1" %>
    - <%= "--ntasks=#{num_cores}" %>
    - <%= "--nodelist=#{node}" %>

Setup Jupyter in the image

dnf -y --installroot=$CHROOT install python3-pip
chroot $CHROOT
pip install jupyter
exit
wwvnfs --chroot $CHROOT

Install the Jupyter app

# the app has to live at apps/sys/jupyter itself, so clone into that name
cd /var/www/ood/apps/sys
git clone https://github.com/OSC/bc_example_jupyter.git jupyter
/opt/ood/ood-portal-generator/sbin/update_ood_portal
systemctl restart httpd