Skip to content

Commit

Permalink
Add role to install NVIDIA DOCA on top of an existing "fat" image (#492)
Browse files Browse the repository at this point in the history
* add doca role run by fatimage

* add workflow to test doca build

* make packer inventory groups clearer and allow defining no extra groups

* update packer workflows for new packer config

* define builds entirely via matrix

* WIP: do DOCA CI build on top of current fat image

* fixup matrix for changes

* fix doca workflow typo

* use current fatimage for doca test build

* enable fatimage to be used for volume-backed builds

* bump CI image

* doca workflow: clean up image and only run on relevant changes

* remove commented-out code

* add DOCA README

* fix DOCA role actually running

* tidyup DOCA play

* include doca packages in image summary

* fix squid being selected for any stackhpc build VM

* fix nightly build concurrency

* re-add squid to StackHPC builder group

* remove debugging exit

* update image build docs

* update packer docs
  • Loading branch information
sjpb authored Dec 12, 2024
1 parent 4de581c commit efd2883
Show file tree
Hide file tree
Showing 14 changed files with 323 additions and 155 deletions.
132 changes: 132 additions & 0 deletions .github/workflows/doca.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
name: Test DOCA extra build
# Runs on manual dispatch, and automatically when the pinned fat image,
# the doca role, or this workflow file itself changes.
on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
      - 'ansible/roles/doca/**'
      # Path filters match whole file paths, so the extension is required.
      - '.github/workflows/doca.yml'
  pull_request:
    paths:
      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
      - 'ansible/roles/doca/**'
      # Path filters match whole file paths, so the extension is required.
      - '.github/workflows/doca.yml'

jobs:
  # Builds one DOCA image per matrix entry, on top of the current fat image.
  doca:
    name: doca-build
    runs-on: ubuntu-22.04
    # A newer run in the same group cancels the one already in flight.
    concurrency:
      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
      cancel-in-progress: true
    strategy:
      fail-fast: false # allow other matrix jobs to continue even if one fails
      matrix: # build RL8, RL9
        build:
          - image_name: openhpc-doca-RL8
            source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
            inventory_groups: doca
          - image_name: openhpc-doca-RL9
            source_image_name_key: RL9
            inventory_groups: doca
    # Shared by every step in this job.
    env:
      ANSIBLE_FORCE_COLOR: True
      OS_CLOUD: openstack
      CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

    steps:
# Check out the repository; v4 replaces v2, which targets a deprecated Node runtime.
- uses: actions/checkout@v4

- name: Load current fat images into GITHUB_ENV
  # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
  run: |
    # Read the file in a plain assignment so a read failure still aborts the step.
    fat_images_json=$(cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json)
    # printf always puts the closing delimiter on its own line, even when the
    # JSON file has no trailing newline.
    printf 'FAT_IMAGES<<EOF\n%s\nEOF\n' "$fat_images_json" >> "$GITHUB_ENV"
# Log the target cloud and the fat image definitions loaded by the previous step.
- name: Record settings
  run: |
    echo CI_CLOUD: ${{ env.CI_CLOUD }}
    echo FAT_IMAGES: ${FAT_IMAGES}
# Install the per-cloud private key (secret named <CI_CLOUD>_SSH_KEY) for this job.
- name: Setup ssh
  shell: bash
  run: |
    # Command tracing is deliberately left off so the key-writing command is
    # not echoed into the job log.
    # -p: do not fail if the directory already exists.
    mkdir -p ~/.ssh
    echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa
    chmod 0600 ~/.ssh/id_rsa

# Append the repository's recorded bastion fingerprints to known_hosts.
- name: Add bastion's ssh key to known_hosts
  shell: bash
  run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts

# Run the repository's environment setup script; later steps activate venv/,
# which this script presumably creates (confirm against dev/setup-env.sh).
- name: Install ansible etc
  run: dev/setup-env.sh

# Write the OpenStack client config from the secret named <CI_CLOUD>_CLOUDS_YAML.
- name: Write clouds.yaml
  shell: bash
  run: |
    mkdir -p ~/.config/openstack/
    echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml

# Source the virtualenv and the .stackhpc environment.
# NOTE(review): each `run` step starts a new shell, so nothing sourced here
# carries over; later steps source both again. This step therefore only checks
# that the two activate scripts load without error.
- name: Setup environment
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
# Build the DOCA image for this matrix entry with Packer.
# The source image name is looked up in FAT_IMAGES (loaded into the job
# environment earlier) under cluster_image.<source_image_name_key>.
# On-error behaviour comes from the PACKER_ON_ERROR repository variable, and
# PKR_VAR_environment_root is expected to be exported by the environment's
# activate script (confirm).
- id: packer_build
  name: Build fat image with packer
  run: |
    set -x
    . venv/bin/activate
    . environments/.stackhpc/activate
    cd packer/
    packer init .
    PACKER_LOG=1 packer build \
      -on-error=${{ vars.PACKER_ON_ERROR }} \
      -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
      -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
      -var "image_name=${{ matrix.build.image_name }}" \
      -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
      openstack.pkr.hcl
# Resolve the image built by the last Packer build in the manifest, then expose
# its id and name as step outputs (image-id, image-name) and as text files for
# the artifact upload.
- name: Get created image names from manifest
  id: manifest
  run: |
    . venv/bin/activate
    IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
    # jq prints the string "null" when the key is absent; fail early rather
    # than polling for an image that cannot exist.
    if [ -z "$IMAGE_ID" ] || [ "$IMAGE_ID" = "null" ]; then
      echo "No artifact_id found in packer/packer-manifest.json" >&2
      exit 1
    fi
    # Wait for the image to become visible, giving up after 60 tries
    # (about 5 minutes) instead of spinning until the job times out.
    attempts=0
    until openstack image show -f value -c name "$IMAGE_ID"; do
      attempts=$((attempts + 1))
      if [ "$attempts" -ge 60 ]; then
        echo "Image $IMAGE_ID still not visible after $attempts attempts" >&2
        exit 1
      fi
      sleep 5
    done
    IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
    echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
    echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
    echo "$IMAGE_ID" > image-id.txt
    echo "$IMAGE_NAME" > image-name.txt
# Remove the signature_verified property from the new image.
# NOTE(review): presumably a leftover property stops the image being used as a
# build source; confirm against the target cloud's behaviour.
- name: Make image usable for further builds
  run: |
    . venv/bin/activate
    openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
# Delete the image unless the run was started manually (workflow_dispatch),
# so only manually requested builds leave an image behind.
- name: Delete image for automatically-run workflows
  if: ${{ github.event_name != 'workflow_dispatch' }}
  run: |
    . venv/bin/activate
    openstack image delete "${{ steps.manifest.outputs.image-id }}"

# Publish the image id and name files as a per-image artifact; overwrite
# replaces an existing artifact with the same name.
- name: Upload manifest artifact
  uses: actions/upload-artifact@v4
  with:
    name: image-details-${{ matrix.build.image_name }}
    overwrite: true
    path: |
      ./image-id.txt
      ./image-name.txt
38 changes: 18 additions & 20 deletions .github/workflows/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,30 +15,23 @@ jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
os_version:
- RL8
- RL9
build:
- openstack.openhpc
- image_name: openhpc-RL8
source_image_name: rocky-latest-RL8
inventory_groups: control,compute,login
- image_name: openhpc-RL9
source_image_name: rocky-latest-RL9
inventory_groups: control,compute,login
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
SOURCE_IMAGES_MAP: |
{
"RL8": {
"openstack.openhpc": "rocky-latest-RL8"
},
"RL9": {
"openstack.openhpc": "rocky-latest-RL9"
}
}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

steps:
Expand Down Expand Up @@ -85,13 +78,11 @@ jobs:
PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-only=${{ matrix.build }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-var "source_image_name=${{ matrix.build.source_image_name }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
openstack.pkr.hcl
env:
PKR_VAR_os_version: ${{ matrix.os_version }}
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }}
- name: Get created image names from manifest
id: manifest
Expand All @@ -102,13 +93,20 @@ jobs:
sleep 5
done
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
echo $IMAGE_ID > image-id.txt
echo $IMAGE_NAME > image-name.txt
- name: Make image usable for further builds
run: |
. venv/bin/activate
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
- name: Upload manifest artifact
uses: actions/upload-artifact@v4
with:
name: image-details-${{ matrix.build }}-${{ matrix.os_version }}
name: image-details-${{ matrix.build.image_name }}
path: |
./image-id.txt
./image-name.txt
Expand Down
54 changes: 20 additions & 34 deletions .github/workflows/nightlybuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,29 @@ on:
- SMS
- ARCUS
schedule:
- cron: '0 0 * * *' # Run at midnight
- cron: '0 0 * * *' # Run at midnight on default branch

jobs:
openstack:
name: openstack-imagebuild
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
os_version:
- RL8
- RL9
build:
- openstack.rocky-latest
- image_name: rocky-latest-RL8
source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2
inventory_groups: update
- image_name: rocky-latest-RL9
source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
inventory_groups: update
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
SOURCE_IMAGES_MAP: |
{
"RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
"RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
}
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

steps:
Expand Down Expand Up @@ -83,15 +80,12 @@ jobs:
PACKER_LOG=1 packer build \
-on-error=${{ vars.PACKER_ON_ERROR }} \
-only=${{ matrix.build }} \
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
-var "source_image_name=${{ matrix.build.source_image_name }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
openstack.pkr.hcl
env:
PKR_VAR_os_version: ${{ matrix.os_version }}
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }}

- name: Get created image names from manifest
id: manifest
run: |
Expand Down Expand Up @@ -125,7 +119,7 @@ jobs:
name: upload-nightly-targets
needs: openstack
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }}
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}-${{ matrix.target_cloud }}
cancel-in-progress: true
runs-on: ubuntu-22.04
strategy:
Expand All @@ -135,18 +129,15 @@ jobs:
- LEAFCLOUD
- SMS
- ARCUS
os_version:
- RL8
- RL9
image:
- rocky-latest
build:
- image_name: rocky-latest-RL8
- image_name: rocky-latest-RL9
exclude:
- target_cloud: LEAFCLOUD
env:
OS_CLOUD: openstack
SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
TARGET_CLOUD: ${{ matrix.target_cloud }}
IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}"
steps:
- uses: actions/checkout@v2

Expand All @@ -161,42 +152,37 @@ jobs:
. venv/bin/activate
pip install -U pip
pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
shell: bash
- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.SOURCE_CLOUD)] }}" > ~/.config/openstack/source_clouds.yaml
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.TARGET_CLOUD)] }}" > ~/.config/openstack/target_clouds.yaml
shell: bash
- name: Download source image
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml
openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }}
shell: bash
openstack image save --file ${{ matrix.build.image_name }} ${{ matrix.build.image_name }}
- name: Upload to target cloud
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
openstack image create "${{ env.IMAGE_NAME }}" \
--file "${{ env.IMAGE_NAME }}" \
openstack image create "${{ matrix.build.image_name }}" \
--file "${{ matrix.build.image_name }}" \
--disk-format qcow2 \
shell: bash
- name: Delete old latest image from target cloud
run: |
. venv/bin/activate
export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l)
IMAGE_COUNT=$(openstack image list --name ${{ matrix.build.image_name }} -f value -c ID | wc -l)
if [ "$IMAGE_COUNT" -gt 1 ]; then
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1)
OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.build.image_name }}" -f value -c ID | head -n 1)
openstack image delete "$OLD_IMAGE_ID"
else
echo "Only one image exists, skipping deletion."
fi
shell: bash
2 changes: 2 additions & 0 deletions ansible/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,5 @@ roles/*
!roles/lustre/**
!roles/dnf_repos/
!roles/dnf_repos/**
!roles/doca/
!roles/doca/**
5 changes: 5 additions & 0 deletions ansible/cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,10 @@
os: "{{ ansible_distribution }} {{ ansible_distribution_version }}"
kernel: "{{ ansible_kernel }}"
ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}"
doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}"
cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}"
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"

- name: Show image summary
debug:
var: image_info
11 changes: 11 additions & 0 deletions ansible/fatimage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
tasks:
- name: Report hostname (= final image name)
command: hostname
- name: Report inventory groups
debug:
var: group_names

- name: Run pre.yml hook
vars:
Expand Down Expand Up @@ -199,6 +202,14 @@
name: cloudalchemy.grafana
tasks_from: install.yml

# Apply the doca role to hosts in the `doca` inventory group, as root, with
# facts gathered.
- hosts: doca
  gather_facts: yes
  become: yes
  tasks:
    - name: Install NVIDIA DOCA
      import_role:
        name: doca

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Expand Down
Loading

0 comments on commit efd2883

Please sign in to comment.