merge conflicts
wtripp180901 committed Jan 2, 2025
2 parents 533d7c5 + a769015 commit 8c98e16
Showing 32 changed files with 827 additions and 51 deletions.
21 changes: 13 additions & 8 deletions .github/workflows/doca.yml → .github/workflows/extra.yml
@@ -1,4 +1,4 @@
name: Test DOCA extra build
name: Test extra build
on:
workflow_dispatch:
push:
@@ -7,16 +7,18 @@ on:
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- '.github/workflows/doca'
- 'ansible/roles/cuda/**'
- '.github/workflows/extra.yml'
pull_request:
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- '.github/workflows/doca'
- 'ansible/roles/cuda/**'
- '.github/workflows/extra.yml'

jobs:
doca:
name: doca-build
name: extra-build
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
@@ -25,12 +27,14 @@ jobs:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
build:
- image_name: openhpc-doca-RL8
- image_name: openhpc-extra-RL8
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
inventory_groups: doca
- image_name: openhpc-doca-RL9
inventory_groups: doca,cuda
volume_size: 30 # needed for cuda
- image_name: openhpc-extra-RL9
source_image_name_key: RL9
inventory_groups: doca
inventory_groups: doca,cuda
volume_size: 30 # needed for cuda
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
@@ -95,6 +99,7 @@ jobs:
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
-var "volume_size=${{ matrix.build.volume_size }}" \
openstack.pkr.hcl
- name: Get created image names from manifest
2 changes: 2 additions & 0 deletions ansible/.gitignore
@@ -58,6 +58,8 @@ roles/*
!roles/squid/**
!roles/tuned/
!roles/tuned/**
!roles/compute_init/
!roles/compute_init/**
!roles/k3s/
!roles/k3s/**
!roles/k9s/
3 changes: 1 addition & 2 deletions ansible/cleanup.yml
@@ -66,5 +66,4 @@
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"

- name: Show image summary
debug:
var: image_info
command: cat /var/lib/image/image.json
25 changes: 24 additions & 1 deletion ansible/extras.yml
@@ -24,8 +24,9 @@
gather_facts: yes
tags: cuda
tasks:
- import_role:
- include_role:
name: cuda
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"

- name: Persist hostkeys across rebuilds
# Must be after filesystems.yml (for storage)
@@ -37,10 +38,32 @@
- import_role:
name: persist_hostkeys


- name: Setup NFS export for compute node configuration
hosts: compute_init:!builder
# NB: has to be after eessi and os-manila-mount
tags: compute_init
become: yes
name: Export hostvars
tasks:
- include_role:
name: compute_init
tasks_from: export.yml

- name: Install k9s
become: yes
hosts: k9s
tags: k9s
tasks:
- import_role:
name: k9s

- hosts: extra_packages
become: yes
tags:
- extra_packages
tasks:
- name: Install additional packages
dnf:
name: "{{ appliances_extra_packages }}"
when: appliances_mode != 'configure' or appliances_extra_packages_during_configure
22 changes: 20 additions & 2 deletions ansible/fatimage.yml
@@ -29,6 +29,14 @@

- import_playbook: bootstrap.yml

- hosts: doca
become: yes
gather_facts: yes
tasks:
- name: Install NVIDIA DOCA
import_role:
name: doca

- name: Run post-bootstrap.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -65,6 +73,16 @@

- import_playbook: extras.yml

# TODO: is this the right place?
- name: Install compute_init script
hosts: compute_init
tags: compute_init # tagged to allow running on cluster instances for dev
become: yes
tasks:
- include_role:
name: compute_init
tasks_from: install.yml

- hosts: builder
become: yes
gather_facts: yes
@@ -220,15 +238,15 @@
import_role:
name: doca

- import_playbook: disable-repos.yml

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
when: hook_path | exists

- import_playbook: disable-repos.yml

- hosts: builder
become: yes
gather_facts: yes
4 changes: 3 additions & 1 deletion ansible/filter_plugins/utils.py
@@ -49,7 +49,9 @@ def to_ood_regex(items):
return '|'.join(r)

def appliances_repo_to_subpath(repo_entry):
return repo_entry['path']+'/'+repo_entry['timestamp']
""" Take an element from appliances_pulp_repos and convert it to a pulp path. This assumes that the remote and local pulp structures are the same
"""
return repo_entry['path'] + '/' + repo_entry['timestamp']

class FilterModule(object):
''' Ansible core jinja2 filters '''
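(As context for the docstring added above: a hedged usage sketch of this filter from a playbook task. The repo entry values below are purely illustrative, not taken from the appliance defaults.)

```yaml
# Illustrative only - appliances_repo_to_subpath is the custom filter from ansible/filter_plugins/utils.py
- name: Show the pulp subpath for an example repo entry
  ansible.builtin.debug:
    msg: "{{ {'path': 'rocky/9.4/BaseOS/x86_64/os', 'timestamp': '20240101T000000'} | appliances_repo_to_subpath }}"
  # -> "rocky/9.4/BaseOS/x86_64/os/20240101T000000"
```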
4 changes: 2 additions & 2 deletions ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" {
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
{% endif %}
{% endfor %}
k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
k3s_token = "{{ k3s_token }}"
}
}
@@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
{% endif %}
{% endfor %}
k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
k3s_token = "{{ k3s_token }}"
}
}
130 changes: 130 additions & 0 deletions ansible/roles/compute_init/README.md
@@ -0,0 +1,130 @@
# EXPERIMENTAL: compute-init

Experimental / in-progress functionality to allow compute nodes to rejoin the
cluster after a reboot.

To enable this, add compute nodes (or a subset of them) into the `compute_init`
group.
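
For example, a minimal sketch in YAML inventory form (for illustration only; a real environment may instead use the appliance's existing groups files):

```yaml
# Illustrative only: make every compute node a member of compute_init.
compute_init:
  children:
    compute:
```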

This works as follows:
1. During image build, an ansible-init playbook and supporting files
(e.g. templates, filters, etc) are installed.
2. Cluster instances are created as usual; the above compute-init playbook does
not run.
3. The `site.yml` playbook is run as usual to configure all the instances into
a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS
share is created on the control node containing:
- an /etc/hosts file for the cluster
- Hostvars for each compute node
4. On reboot of a compute node, ansible-init runs the compute-init playbook
which:
a. Checks whether the `enable_compute` metadata flag is set, and exits if
not.
b. Tries to mount the above `/exports/cluster` NFS share from the control
node, and exits if it cannot.
c. Configures itself using the exported hostvars, depending on the
`enable_*` flags set in metadata.
d. Issues an `scontrol` command to resume the node (because Slurm will
consider it as "unexpectedly rebooted").

The check in 4b. above is what prevents the compute-init script from trying
to configure the node before the services on the control node are available
(which requires running the site.yml playbook).
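
As a rough sketch only (not the actual playbook contents), the per-boot logic described in step 4 is along these lines; the metadata lookup variable and control node address are placeholders:

```yaml
# Illustrative sketch - not the real compute-init playbook.
- name: End play unless the enable_compute metadata flag is set
  ansible.builtin.meta: end_play
  when: not (os_metadata.meta.enable_compute | default(false) | bool)  # os_metadata is a placeholder

- name: Attempt to mount the cluster share exported by the control node
  ansible.posix.mount:
    src: "{{ control_node_ip }}:/exports/cluster"  # placeholder address
    path: /mnt/cluster
    fstype: nfs
    state: mounted
  register: _cluster_mount
  ignore_errors: true

- name: End play if the control node export is not available yet
  ansible.builtin.meta: end_play
  when: _cluster_mount is failed

# ... configure the node from the exported hostvars, per the enable_* flags ...

- name: Resume the node in Slurm after the "unexpected reboot"
  ansible.builtin.command: scontrol update nodename={{ ansible_hostname }} state=resume
```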

The following roles/groups are currently fully functional:
- `resolv_conf`: all functionality
- `etc_hosts`: all functionality
- `nfs`: client functionality only
- `manila`: all functionality
- `basic_users`: all functionality, assumes home directory already exists on
shared storage
- `eessi`: all functionality, assumes `cvmfs_config` is the same on control
node and all compute nodes.
- `openhpc`: all functionality

# Development/debugging

To develop/debug this without actually having to build an image:


1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
additionally configure the control node to export compute hostvars over NFS.
Check the cluster is up.

2. Reimage the compute nodes:

ansible-playbook --limit compute ansible/adhoc/rebuild.yml

3. Add metadata to a compute node, e.g. via Horizon, to turn on compute-init
playbook functionality.

4. Fake an image build to deploy the compute-init playbook:

ansible-playbook ansible/fatimage.yml --tags compute_init

NB: This will also re-export the compute hostvars, as the nodes are not
in the builder group, which conveniently means any changes made to that
play also get picked up.

5. Fake a reimage of compute to run ansible-init and the compute-init playbook:

On compute node where metadata was added:

[root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init
[root@rl9-compute-0 rocky]# systemctl status ansible-init

Use `systemctl status ansible-init` to view stdout/stderr from Ansible.

Steps 4/5 can be repeated with changes to the compute-init script. If required,
reimage the compute node(s) first as in step 2 and/or add additional metadata
as in step 3.


# Design notes
- Duplicating code from roles into the `compute-init` script is unfortunate, but
does allow developing this functionality without wider changes to the
appliance.

- In general, we don't want to rely on the NFS export at runtime, so the
compute-init script should e.g. copy files from this mount as soon as possible. TODO:

- There are a couple of approaches to supporting existing roles using `compute-init`:

1. The control node copies files resulting from the role into the cluster
export, and compute-init copies them to local disk. Only works if the files
are not host-specific. Examples: etc_hosts, eessi config?

2. Re-implement the role. Works if the role vars are not too complicated
(else they all need to be duplicated in compute-init). Could also only
support certain subsets of role functionality or variables.
Examples: resolv_conf, stackhpc.openhpc

- Some variables are defined using hostvars from other nodes, which aren't
available with the current approach:

```
[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
"grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
"grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
"mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",
"nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}",
"openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}",
"openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}",
"openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}",
"openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}",
"prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}",
"{{ hostvars[groups['freeipa_server'].0].ansible_host }}"
```
More generally, there is nothing to stop any group var depending on a
"{{ hostvars[] }}" interpolation ...
Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern
for compute nodes - both of these indirect via `api_address` to
`inventory_hostname`. This has been worked around by replacing them with
"{{ groups['control'] | first }}", which does resolve to the control node's
inventory hostname when templating.
Note that although `groups` is defined in the templated hostvars, when the
hostvars are loaded using `include_vars:` it is ignored, as it is a
"magic variable" determined by Ansible itself and cannot be set.