From 664e08e3dee09b7ad6b7f4558b9443d4a02c1afd Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 6 Jun 2024 11:46:33 +0100 Subject: [PATCH] Image update - OpenHPC v3.1 for RL9 (#394) * bump Packer source image to RL9.4 * downgrade OFED to LTS to get stable download url * bump OOD role, now ondemand dnf package installed will be latest * Revert Packer source image to RL9.3 to avoid hanging after post-update reboot This reverts commit 851c494fa7b88581cfb4194f9e7f305b63f9e5c0. * bump OFED to get RL9.4-supported version * bump leafcloud packer vm to 8GB RAM * DEBUG: disable (working) OFED build * Revert "DEBUG: disable (working) OFED build" This reverts commit 45a48c3bab3f86c4f4e91534df3308fb653ad944. * DEBUG: output builder hostname * Revert "DEBUG: output builder hostname" This reverts commit 3f95f8ea3e14f2e3bbd022c0895d13cf8b5b4794. * fix build workflow concurrency * DEBUG: disable updates * Revert "DEBUG: disable updates" This reverts commit 3581a35529aa54cdaebaaba11d691f1684f22d0c. 
* bump packer build volume size for non-ofed to avoid RL8 build running out of root space * try to prevent stackhpc env image build connection drops * bump packer source image to fixed RL9.4 image * run test CI workflow on RL8 image if PR labeled with 'RL8' * bump CI images * bump openhpc role to fix munge checks on key path --- .github/workflows/fatimage.yml | 4 +++- .github/workflows/stackhpc.yml | 3 +++ ansible/roles/ofed/defaults/main.yml | 2 +- environments/.stackhpc/ARCUS.pkrvars.hcl | 3 ++- environments/.stackhpc/LEAFCLOUD.pkrvars.hcl | 4 ++-- environments/.stackhpc/ansible.cfg | 2 +- environments/.stackhpc/terraform/main.tf | 6 +++--- .../common/inventory/group_vars/all/openondemand.yml | 2 -- packer/openstack.pkr.hcl | 2 +- requirements.yml | 4 ++-- 10 files changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index d3c9adaf7..9209e85ea 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -8,11 +8,13 @@ name: Build fat image description: Include RL8 image build type: boolean default: false +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} # per workflow + branch/PR; the matrix context is not available at workflow level + cancel-in-progress: true jobs: openstack: name: openstack-imagebuild runs-on: ubuntu-20.04 - concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS strategy: matrix: os_version: [RL8, RL9] diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index c8bb9b06f..d0f74ad1c 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -24,10 +24,13 @@ jobs: - ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch rl8_branch: - ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge + rl8_label: + - ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created exclude: - os_version: RL8 rl8_selected: false 
rl8_branch: false + rl8_label: false env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 369e43a15..7233809bc 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,4 +1,4 @@ -ofed_version: 24.01-0.3.3.1 +ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4 ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' diff --git a/environments/.stackhpc/ARCUS.pkrvars.hcl b/environments/.stackhpc/ARCUS.pkrvars.hcl index 738a021c0..72e978c95 100644 --- a/environments/.stackhpc/ARCUS.pkrvars.hcl +++ b/environments/.stackhpc/ARCUS.pkrvars.hcl @@ -1,6 +1,7 @@ flavor = "vm.ska.cpu.general.small" use_blockstorage_volume = true -volume_size = 12 # GB. Compatible with SMS-lab's general.v1.tiny +volume_size = 15 # GB +volume_size_ofed = 15 # GB image_disk_format = "qcow2" networks = ["4b6b2722-ee5b-40ec-8e52-a6610e14cc51"] # portal-internal (DNS broken on ilab-60) ssh_keypair_name = "slurm-app-ci" diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl index 1a1bcd0ab..1f6ece01f 100644 --- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl +++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl @@ -1,6 +1,6 @@ -flavor = "ec1.medium" +flavor = "ec1.large" use_blockstorage_volume = true -volume_size = 12 # GB. 
Compatible with SMS-lab's general.v1.tiny +volume_size = 15 # GB volume_size_ofed = 15 # GB volume_type = "unencrypted" image_disk_format = "qcow2" diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index 139ffa033..aa0ec5aaf 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -12,5 +12,5 @@ roles_path = ../../ansible/roles filter_plugins = ../../ansible/filter_plugins [ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +ssh_args = -o ServerAliveInterval=10 -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 9c4d64181..26ff32c2a 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -29,9 +29,9 @@ variable "cluster_image" { description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" type = map(string) default = { - # https://github.com/stackhpc/ansible-slurm-appliance/pull/353 - RL8: "openhpc-RL8-240423-1002-4b09ba85" - RL9: "openhpc-ofed-RL9-240423-1059-4b09ba85" + # https://github.com/stackhpc/ansible-slurm-appliance/pull/394 + RL8: "openhpc-RL8-240605-1205-a3002d19" + RL9: "openhpc-ofed-RL9-240605-1204-a3002d19" } } diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 18e741ce7..5e85392ca 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -13,8 +13,6 @@ # or include regex special characters. 
openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" -ondemand_package: ondemand-3.0.3 - # Add grafana to dashboard links to OOD only if grafana group is available openondemand_dashboard_links_grafana: - name: Grafana diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 0db3591f7..262b071f4 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -49,7 +49,7 @@ variable "fatimage_source_image_name" { type = map(string) default = { RL8: "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2" - RL9: "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + RL9: "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" } } diff --git a/requirements.yml b/requirements.yml index e00e19680..995329fbf 100644 --- a/requirements.yml +++ b/requirements.yml @@ -3,7 +3,7 @@ roles: - src: stackhpc.nfs version: v23.12.1 # Tolerate state nfs file handles - src: https://github.com/stackhpc/ansible-role-openhpc.git - version: v0.25.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/167 + version: v0.26.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/168 name: stackhpc.openhpc - src: https://github.com/stackhpc/ansible-node-exporter.git version: stackhpc @@ -19,7 +19,7 @@ roles: # No versions available - src: https://github.com/OSC/ood-ansible.git name: osc.ood - version: v3.0.6 + version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount version: v24.2.0 # Support RockyLinux 9