From efd2883211fbe9563568ff13a2c2759ef9ef31a3 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:40:41 +0000 Subject: [PATCH] Add role to install NVIDIA DOCA on top of an existing "fat" image (#492) * add doca role run by fatimage * add workflow to test doca build * make packer inventory groups clearer and allow defining no extra * update packer workflows for new packer config * define builds entirely via matrix * WIP: do DOCA CI build on top of current fat image * fixup matrix for changes * fix doca workflow typo * use current fatimage for doca test build * enable fatimage to be used for volume-backed builds * bump CI image * doca workflow: clean up image and only run on relevant changes * remove commented-out code * add DOCA README * fix DOCA role actually running * tidyup DOCA play * include doca packages in image summary * fix squid being selected for any stackhopc build VM * fix nightly build concurrency * re-add squid back to Stackhpc builder group * remove debugging exit * update image build docs * update packer docs --- .github/workflows/doca.yml | 132 ++++++++++++++++++ .github/workflows/fatimage.yml | 38 +++-- .github/workflows/nightlybuild.yml | 54 +++---- ansible/.gitignore | 2 + ansible/cleanup.yml | 5 + ansible/fatimage.yml | 11 ++ ansible/roles/doca/README.md | 12 ++ ansible/roles/doca/defaults/main.yml | 3 + .../roles/doca/tasks/install-kernel-devel.yml | 24 ++++ ansible/roles/doca/tasks/install.yml | 53 +++++++ ansible/roles/doca/tasks/main.yml | 1 + docs/image-build.md | 74 +++------- .../terraform/cluster_image.auto.tfvars.json | 4 +- packer/openstack.pkr.hcl | 65 +++------ 14 files changed, 323 insertions(+), 155 deletions(-) create mode 100644 .github/workflows/doca.yml create mode 100644 ansible/roles/doca/README.md create mode 100644 ansible/roles/doca/defaults/main.yml create mode 100644 ansible/roles/doca/tasks/install-kernel-devel.yml create mode 100644 
ansible/roles/doca/tasks/install.yml create mode 100644 ansible/roles/doca/tasks/main.yml diff --git a/.github/workflows/doca.yml b/.github/workflows/doca.yml new file mode 100644 index 000000000..cfd3bb982 --- /dev/null +++ b/.github/workflows/doca.yml @@ -0,0 +1,132 @@ +name: Test DOCA extra build +on: + workflow_dispatch: + push: + branches: + - main + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - '.github/workflows/doca.yml' # this workflow file itself + pull_request: + paths: + - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - '.github/workflows/doca.yml' # this workflow file itself + +jobs: + doca: + name: doca-build + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS + cancel-in-progress: true + runs-on: ubuntu-22.04 + strategy: + fail-fast: false # allow other matrix jobs to continue even if one fails + matrix: # build RL8, RL9 + build: + - image_name: openhpc-doca-RL8 + source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + inventory_groups: doca + - image_name: openhpc-doca-RL9 + source_image_name_key: RL9 + inventory_groups: doca + env: + ANSIBLE_FORCE_COLOR: True + OS_CLOUD: openstack + CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings + ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} + + steps: + - uses: actions/checkout@v2 + + - name: Load current fat images into GITHUB_ENV + # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string + run: | + { + echo 'FAT_IMAGES<<EOF' + cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json + echo EOF + } >> "$GITHUB_ENV" + + - name: Record settings + run: | + echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo FAT_IMAGES: ${FAT_IMAGES} + + - name: Setup ssh + run: | + set -x + mkdir ~/.ssh + echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa + chmod 0600 ~/.ssh/id_rsa + shell: bash + + - name: Add bastion's ssh key
to known_hosts + run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts + shell: bash + + - name: Install ansible etc + run: dev/setup-env.sh + + - name: Write clouds.yaml + run: | + mkdir -p ~/.config/openstack/ + echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml + shell: bash + + - name: Setup environment + run: | + . venv/bin/activate + . environments/.stackhpc/activate + + - name: Build fat image with packer + id: packer_build + run: | + set -x + . venv/bin/activate + . environments/.stackhpc/activate + cd packer/ + packer init . + + PACKER_LOG=1 packer build \ + -on-error=${{ vars.PACKER_ON_ERROR }} \ + -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + openstack.pkr.hcl + + - name: Get created image names from manifest + id: manifest + run: | + . venv/bin/activate + IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) + while ! openstack image show -f value -c name $IMAGE_ID; do + sleep 5 + done + IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" + echo $IMAGE_ID > image-id.txt + echo $IMAGE_NAME > image-name.txt + + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + + - name: Delete image for automatically-run workflows + run: | + . 
venv/bin/activate + openstack image delete "${{ steps.manifest.outputs.image-id }}" + if: ${{ github.event_name != 'workflow_dispatch' }} + + - name: Upload manifest artifact + uses: actions/upload-artifact@v4 + with: + name: image-details-${{ matrix.build.image_name }} + path: | + ./image-id.txt + ./image-name.txt + overwrite: true diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 217b09c22..da933c91d 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -15,30 +15,23 @@ jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.openhpc + - image_name: openhpc-RL8 + source_image_name: rocky-latest-RL8 + inventory_groups: control,compute,login + - image_name: openhpc-RL9 + source_image_name: rocky-latest-RL9 + inventory_groups: control,compute,login env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud }} - SOURCE_IMAGES_MAP: | - { - "RL8": { - "openstack.openhpc": "rocky-latest-RL8" - }, - "RL9": { - "openstack.openhpc": "rocky-latest-RL9" - } - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -85,13 +78,11 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: 
- PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }} - name: Get created image names from manifest id: manifest @@ -102,13 +93,20 @@ jobs: sleep 5 done IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" + echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" echo $IMAGE_ID > image-id.txt echo $IMAGE_NAME > image-name.txt + - name: Make image usable for further builds + run: | + . venv/bin/activate + openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" + - name: Upload manifest artifact uses: actions/upload-artifact@v4 with: - name: image-details-${{ matrix.build }}-${{ matrix.os_version }} + name: image-details-${{ matrix.build.image_name }} path: | ./image-id.txt ./image-name.txt diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index 9f45b0890..a0e78cd0b 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -11,32 +11,29 @@ on: - SMS - ARCUS schedule: - - cron: '0 0 * * *' # Run at midnight + - cron: '0 0 * * *' # Run at midnight on default branch jobs: openstack: name: openstack-imagebuild concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 - os_version: - - RL8 - - RL9 build: - - openstack.rocky-latest + - image_name: rocky-latest-RL8 + source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2 + inventory_groups: update + - image_name: rocky-latest-RL9 + source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 + inventory_groups: update env: 
ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} - SOURCE_IMAGES_MAP: | - { - "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2", - "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2" - } ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }} steps: @@ -83,15 +80,12 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -only=${{ matrix.build }} \ -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ - -var "source_image_name=${{ env.SOURCE_IMAGE }}" \ + -var "source_image_name=${{ matrix.build.source_image_name }}" \ + -var "image_name=${{ matrix.build.image_name }}" \ + -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ openstack.pkr.hcl - env: - PKR_VAR_os_version: ${{ matrix.os_version }} - SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }} - - name: Get created image names from manifest id: manifest run: | @@ -125,7 +119,7 @@ jobs: name: upload-nightly-targets needs: openstack concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }} cancel-in-progress: true runs-on: ubuntu-22.04 strategy: @@ -135,18 +129,15 @@ jobs: - LEAFCLOUD - SMS - ARCUS - os_version: - - RL8 - - RL9 - image: - - rocky-latest + build: + - image_name: rocky-latest-RL8 + - image_name: rocky-latest-RL9 exclude: - target_cloud: LEAFCLOUD env: OS_CLOUD: openstack SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} - IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}" steps: - uses: actions/checkout@v2 @@ -161,42 +152,37 @@ jobs: . 
venv/bin/activate pip install -U pip pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) - shell: bash - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml - shell: bash - name: Download source image run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml - openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }} - shell: bash + openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }} - name: Upload to target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - openstack image create "${{ env.IMAGE_NAME }}" \ - --file "${{ env.IMAGE_NAME }}" \ + openstack image create "${{ matrix.build.image_name }}" \ + --file "${{ matrix.build.image_name }}" \ --disk-format qcow2 \ - shell: bash - name: Delete old latest image from target cloud run: | . venv/bin/activate export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml - IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l) + IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l) if [ "$IMAGE_COUNT" -gt 1 ]; then - OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1) + OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1) openstack image delete "$OLD_IMAGE_ID" else echo "Only one image exists, skipping deletion." 
fi - shell: bash diff --git a/ansible/.gitignore b/ansible/.gitignore index 48c917c4f..3fef64ecc 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -66,3 +66,5 @@ roles/* !roles/lustre/** !roles/dnf_repos/ !roles/dnf_repos/** +!roles/doca/ +!roles/doca/** diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index cf9b0bdab..3f059d157 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -61,5 +61,10 @@ os: "{{ ansible_distribution }} {{ ansible_distribution_version }}" kernel: "{{ ansible_kernel }}" ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" + doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}" cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" + +- name: Show image summary + debug: + var: image_info diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index b28e4f308..439c50e70 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -6,6 +6,9 @@ tasks: - name: Report hostname (= final image name) command: hostname + - name: Report inventory groups + debug: + var: group_names - name: Run pre.yml hook vars: @@ -199,6 +202,14 @@ name: cloudalchemy.grafana tasks_from: install.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/doca/README.md b/ansible/roles/doca/README.md new file mode 100644 index 000000000..5f898add5 --- /dev/null +++ b/ansible/roles/doca/README.md @@ -0,0 +1,12 @@ +# doca + +Install [NVIDIA DOCA](https://docs.nvidia.com/doca/sdk/index.html). + +This role is not idempotent and is only intended to be run during an image build. 
It builds DOCA kernel modules to match the installed kernel and then installs these +plus the selected DOCA packages. + +## Role Variables + +- `doca_version`: Optional. String giving doca version. +- `doca_profile`: Optional. Name of [profile](https://docs.nvidia.com/doca/sdk/nvidia+doca+profiles/index.html) defining subset of DOCA to install. Default is `doca-ofed`. +- `doca_repo_url`: Optional. URL of DOCA repository. Default is appropriate upstream public repository for DOCA version, distro version and architecture. diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml new file mode 100644 index 000000000..66437cd04 --- /dev/null +++ b/ansible/roles/doca/defaults/main.yml @@ -0,0 +1,3 @@ +doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +doca_profile: doca-ofed +doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml new file mode 100644 index 000000000..6a1943a32 --- /dev/null +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -0,0 +1,24 @@ +- name: Get installed kernels + command: dnf list --installed kernel + register: _ofed_dnf_kernels + changed_when: false + +- name: Determine running kernel + command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + register: _ofed_loaded_kernel + changed_when: false + +- name: Check current kernel is newest installed + assert: + that: _ofed_kernel_current == _ofed_dnf_kernels_newest + fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" 
+ vars: + _ofed_kernel_current: >- + {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} + _ofed_dnf_kernels_newest: >- + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} + # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + +- name: Install matching kernel-devel package + dnf: + name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml new file mode 100644 index 000000000..9d297e946 --- /dev/null +++ b/ansible/roles/doca/tasks/install.yml @@ -0,0 +1,53 @@ +- import_tasks: install-kernel-devel.yml + +- name: Install DOCA repo + ansible.builtin.yum_repository: + name: doca + file: doca + description: DOCA Online Repo + baseurl: "{{ doca_repo_url }}" + enabled: true + gpgcheck: false + +- name: Install doca-extra package + ansible.builtin.dnf: + name: doca-extra + +- name: Build DOCA kernel modules + ansible.builtin.shell: + cmd: /opt/mellanox/doca/tools/doca-kernel-support + register: _doca_kernel_build + + +- name: Find generated doca-kernel-repo + ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' + register: _doca_kernel_repo # e.g. 
/tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm + changed_when: false + +- name: Create dnf cache + ansible.builtin.command: dnf makecache + +- name: Install DOCA repository package + ansible.builtin.dnf: + name: "{{ _doca_kernel_repo.stdout }}" + disable_gpg_check: true + +- name: Install DOCA packages + ansible.builtin.dnf: + name: "{{ doca_profile }}" + +- name: Cleanup DOCA build directories + ansible.builtin.file: + state: absent + path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # top-level /tmp/DOCA.* dir, e.g. /tmp/DOCA.WVMchs2QWo ([:2] would resolve to /tmp itself) + +- name: Update initramfs + ansible.builtin.command: + cmd: dracut -f --tmpdir /var/tmp + environment: + TMPDIR: /var/tmp + register: _doca_dracut + failed_when: _doca_dracut.stderr != '' # appears rc is always 0 + +- name: Load the new driver + ansible.builtin.command: /etc/init.d/openibd restart diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml new file mode 100644 index 000000000..e7a272f38 --- /dev/null +++ b/ansible/roles/doca/tasks/main.yml @@ -0,0 +1 @@ +- include_tasks: install.yml diff --git a/docs/image-build.md b/docs/image-build.md index 4896bde57..a7d2e951b 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -2,87 +2,57 @@ The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. -The Packer configuration defined here builds "fat images" which contain binaries for all nodes, but no cluster-specific configuration. Using these: +The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: - Enables the image to be tested in CI before production use. - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g.
due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. -By default, a fat image build starts from a nightly image build containing Mellanox OFED, and updates all DNF packages already present. The 'latest' nightly build itself is from a RockyLinux GenericCloud image. - -The fat images StackHPC builds and test in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: +The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: 1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional software. +2. Extend an existing fat image with additional functionality. # Usage -The steps for building site-specific fat images or extending an existing fat image are the same: +To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments/<environment>/builder.pkrvars.hcl` containing at a minimum e.g.: +2. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g.
`environments/<environment>/builder.pkrvars.hcl` containing at a minimum: ```hcl flavor = "general.v1.small" # VM flavor to use for builder VMs networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image + inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to + ``` + Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - For additional options such as non-default private key locations or jumphost configuration see the variable descriptions in `./openstack.pkr.hcl`. - - For an example of configuration for extending an existing fat image see below. + - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficient memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficient. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. + - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. + - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image.
All possible groups are listed in `environments/common/groups` but common options for this variable will be: + - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. + - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. 3. Activate the venv and the relevant environment. 4. Build images using the relevant variable definition file, e.g.: cd packer/ - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc --on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - - Note that the `-only` flag here restricts Packer to a single specific "build" definition (in Packer terminology). Options here are: - - `-only=openstack.openhpc`: Build a fat image including Mellanox OFED - - `-only=openstack.openhpc-cuda`: Build a fat image including Mellanox OFED, Nvidia drivers and CUDA - - `-only=openstack.openhpc-extra`: Build an image which *extends* an existing fat image - -5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc-` and including a timestamp and a shortened git hash. - -# Defining an "extra" image build - -An "extra" image build starts with an existing fat image (e.g. one provided by StackHPC) rather than a RockyLinux GenericCloud image, and only runs a specific subset of the -Ansible in the appliance. This allows adding additional functionality into site-specific images, without modifying the existing functionality in the base fat image. This is the recommended way to build site-specific images. 
- -To configure an "extra" image build, prepare a Packer variable definition file as described above but also including: - -- `extra_build_image_name`: A string to add into the final image name. -- `source_image` or `source_image_name`: The UUID or name of the fat image to start from (which must already be present in OpenStack). -- `extra_build_groups`: A list of Ansible inventory groups to put the build VM into, in addition to the `builder` group. This defines the roles/functionality - which are added to the image. -- `extra_build_volume_size`: A number giving the size in GB of the volume for the build VM's root disk and therefore the resulting image size. - Note this assumes the default of `use_blockstorage_volume = true`. - -E.g. to add the lustre client to an RockyLinux 9 image: - - # environments/site/lustre.pkvars.hcl - - extra_build_image_name = "lustre" # output image name will be like "openhpc-lustre-RL9-$timestamp-$commit" - source_image_name = "openhpc-ofed-RL9-240906-1041-32568dbb" # e.g. current StackHPC RL9 image - extra_build_groups = ["lustre"] # only run lustre role during this extra build - extra_build_volume_size = 15 # default non-CUDA build image size has enough free space - - # ... 
define flavor, network, etc as normal - - -Then, reference this build and variables file in the Packer build command: + PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl - PACKER_LOG=1 /usr/bin/packer build -only=openstack.openhpc-extra --on-error=ask -var-file=environments/site/lustre.pkvars.hcl openstack.pkr.hcl + **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: -**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: + openstack image show $SOURCE_IMAGE - openstack image show $SOURCE_IMAGE + If it does, remove this property: -If it does, remove this property: + openstack image unset --property signature_verified $SOURCE_IMAGE - openstack image unset --property signature_verified $SOURCE_IMAGE + then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). -then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [Openstack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). +5. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. 
# Build Process diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 14c997596..5b9d845ef 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241203-1659-b0558b95", - "RL9": "openhpc-RL9-241203-1659-b0558b95" + "RL8": "openhpc-RL8-241211-1322-ded60c2c", + "RL9": "openhpc-RL9-241211-1322-ded60c2c" } } diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 52202ead1..2ba0a1e63 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { } locals { git_commit = data.git-commit.cwd-head.hash timestamp = formatdate("YYMMDD-hhmm", timestamp()) + image_name_version = var.image_name_version == "auto" ? "-${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_name_version } # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root @@ -39,12 +40,6 @@ variable "networks" { type = list(string) } -variable "os_version" { - type = string - description = "'RL8' or 'RL9' with default source_image_* mappings" - default = "RL9" -} - # Must supply either source_image_name or source_image_id variable "source_image_name" { type = string @@ -123,15 +118,6 @@ variable "volume_type" { } variable "volume_size" { - type = map(number) - default = { - # fat image builds, GB: - rocky-latest = 15 - openhpc = 15 - } -} - -variable "extra_build_volume_size" { type = number default = 15 } @@ -146,25 +132,22 @@ variable "metadata" { default = {} } -variable "groups" { - type = map(list(string)) - description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name" - default = { - # fat image builds: - rocky-latest = ["update"] - openhpc = ["control", "compute", "login"] - } +variable 
"inventory_groups" { + type = string + description = "Comma-separated list of additional inventory groups (other than 'builder') to add build VM to. Default is none." + default = "" } -variable "extra_build_groups" { - type = list(string) - default = [] +variable "image_name" { + type = string + description = "Name of image" + default = "openhpc" } -variable "extra_build_image_name" { +variable "image_name_version" { type = string - description = "Infix for 'extra' build image name" - default = "extra" + description = "Suffix for image name giving version. Default of 'auto' appends timestamp + short commit" + default = "auto" } source "openstack" "openhpc" { @@ -172,9 +155,11 @@ source "openstack" "openhpc" { flavor = var.flavor use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type - volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size) + volume_size = var.volume_size metadata = var.metadata - instance_metadata = {ansible_init_disable = "true"} + instance_metadata = { + ansible_init_disable = "true" + } networks = var.networks floating_ip_network = var.floating_ip_network security_groups = var.security_groups @@ -200,27 +185,13 @@ source "openstack" "openhpc" { build { - # latest nightly image: - source "source.openstack.openhpc" { - name = "rocky-latest" - image_name = "${source.name}-${var.os_version}" - } - - # fat image: - source "source.openstack.openhpc" { - name = "openhpc" - image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" - } - - # Extended site-specific image, built on fat image: source "source.openstack.openhpc" { - name = "openhpc-extra" - image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}" + image_name = "${var.image_name}${local.image_name_version}" } provisioner "ansible" { playbook_file = "${var.repo_root}/ansible/fatimage.yml" - groups = concat(["builder"], 
lookup(var.groups, source.name, var.extra_build_groups)) + groups = concat(["builder"], var.inventory_groups == "" ? [] : split(",", var.inventory_groups)) keep_inventory_file = true # for debugging use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting extra_arguments = [