merge conflicts
wtripp180901 committed Jan 2, 2025
2 parents 533d7c5 + a769015 commit 8c98e16
Showing 32 changed files with 827 additions and 51 deletions.
21 changes: 13 additions & 8 deletions .github/workflows/doca.yml → .github/workflows/extra.yml
@@ -1,4 +1,4 @@
name: Test DOCA extra build
name: Test extra build
on:
workflow_dispatch:
push:
@@ -7,16 +7,18 @@ on:
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- '.github/workflows/doca'
- 'ansible/roles/cuda/**'
- '.github/workflows/extra.yml'
pull_request:
paths:
- 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
- 'ansible/roles/doca/**'
- '.github/workflows/doca'
- 'ansible/roles/cuda/**'
- '.github/workflows/extra.yml'

jobs:
doca:
name: doca-build
name: extra-build
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
cancel-in-progress: true
@@ -25,12 +27,14 @@ jobs:
fail-fast: false # allow other matrix jobs to continue even if one fails
matrix: # build RL8, RL9
build:
- image_name: openhpc-doca-RL8
- image_name: openhpc-extra-RL8
source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
inventory_groups: doca
- image_name: openhpc-doca-RL9
inventory_groups: doca,cuda
volume_size: 30 # needed for cuda
- image_name: openhpc-extra-RL9
source_image_name_key: RL9
inventory_groups: doca
inventory_groups: doca,cuda
volume_size: 30 # needed for cuda
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
@@ -95,6 +99,7 @@ jobs:
-var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
-var "image_name=${{ matrix.build.image_name }}" \
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
-var "volume_size=${{ matrix.build.volume_size }}" \
openstack.pkr.hcl
- name: Get created image names from manifest
2 changes: 2 additions & 0 deletions ansible/.gitignore
@@ -58,6 +58,8 @@ roles/*
!roles/squid/**
!roles/tuned/
!roles/tuned/**
!roles/compute_init/
!roles/compute_init/**
!roles/k3s/
!roles/k3s/**
!roles/k9s/
3 changes: 1 addition & 2 deletions ansible/cleanup.yml
@@ -66,5 +66,4 @@
slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"

- name: Show image summary
debug:
var: image_info
command: cat /var/lib/image/image.json
25 changes: 24 additions & 1 deletion ansible/extras.yml
@@ -24,8 +24,9 @@
gather_facts: yes
tags: cuda
tasks:
- import_role:
- include_role:
name: cuda
tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"

- name: Persist hostkeys across rebuilds
# Must be after filesystems.yml (for storage)
@@ -37,10 +38,32 @@
- import_role:
name: persist_hostkeys


- name: Setup NFS export for compute node configuration
hosts: compute_init:!builder
# NB: has to be after eessi and os-manila-mount
tags: compute_init
become: yes
name: Export hostvars
tasks:
- include_role:
name: compute_init
tasks_from: export.yml

- name: Install k9s
become: yes
hosts: k9s
tags: k9s
tasks:
- import_role:
name: k9s

- hosts: extra_packages
become: yes
tags:
- extra_packages
tasks:
- name: Install additional packages
dnf:
name: "{{ appliances_extra_packages }}"
when: appliances_mode != 'configure' or appliances_extra_packages_during_configure
22 changes: 20 additions & 2 deletions ansible/fatimage.yml
@@ -29,6 +29,14 @@

- import_playbook: bootstrap.yml

- hosts: doca
become: yes
gather_facts: yes
tasks:
- name: Install NVIDIA DOCA
import_role:
name: doca

- name: Run post-bootstrap.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
@@ -65,6 +73,16 @@

- import_playbook: extras.yml

# TODO: is this the right place?
- name: Install compute_init script
hosts: compute_init
tags: compute_init # tagged to allow running on cluster instances for dev
become: yes
tasks:
- include_role:
name: compute_init
tasks_from: install.yml

- hosts: builder
become: yes
gather_facts: yes
@@ -220,15 +238,15 @@
import_role:
name: doca

- import_playbook: disable-repos.yml

- name: Run post.yml hook
vars:
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
hook_path: "{{ appliances_environment_root }}/hooks/post.yml"
import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}"
when: hook_path | exists

- import_playbook: disable-repos.yml

- hosts: builder
become: yes
gather_facts: yes
4 changes: 3 additions & 1 deletion ansible/filter_plugins/utils.py
@@ -49,7 +49,9 @@ def to_ood_regex(items):
return '|'.join(r)

def appliances_repo_to_subpath(repo_entry):
return repo_entry['path']+'/'+repo_entry['timestamp']
""" Take an element from appliances_pulp_repos and convert it to a pulp path. This assumes that the remote and local pulp structures are the same
"""
return repo_entry['path'] + '/' + repo_entry['timestamp']

class FilterModule(object):
''' Ansible core jinja2 filters '''
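(As context for the docstring added above: a hedged usage sketch of this filter from a playbook task. The repo entry values below are purely illustrative, not taken from the appliance defaults.)

```yaml
# Illustrative only - appliances_repo_to_subpath is the custom filter from ansible/filter_plugins/utils.py
- name: Show the pulp subpath for an example repo entry
  ansible.builtin.debug:
    msg: "{{ {'path': 'rocky/9.4/BaseOS/x86_64/os', 'timestamp': '20240101T000000'} | appliances_repo_to_subpath }}"
  # -> "rocky/9.4/BaseOS/x86_64/os/20240101T000000"
```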
4 changes: 2 additions & 2 deletions ansible/roles/cluster_infra/templates/resources.tf.j2
@@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" {
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
{% endif %}
{% endfor %}
k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
k3s_token = "{{ k3s_token }}"
}
}
@@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}"
{% endif %}
{% endfor %}
k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4
k3s_token = "{{ k3s_token }}"
}
}
130 changes: 130 additions & 0 deletions ansible/roles/compute_init/README.md
@@ -0,0 +1,130 @@
# EXPERIMENTAL: compute-init

Experimental / in-progress functionality to allow compute nodes to rejoin the
cluster after a reboot.

To enable this, add compute nodes (or a subset of them) into the `compute_init`
group.
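
For example, a minimal sketch in YAML inventory form (for illustration only; a real environment may instead use the appliance's existing groups files):

```yaml
# Illustrative only: make every compute node a member of compute_init.
compute_init:
  children:
    compute:
```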

This works as follows:
1. During image build, an ansible-init playbook and supporting files
(e.g. templates, filters, etc) are installed.
2. Cluster instances are created as usual; the above compute-init playbook does
not run.
3. The `site.yml` playbook is run as usual to configure all the instances into
a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS
share is created on the control node containing:
- an /etc/hosts file for the cluster
- Hostvars for each compute node
4. On reboot of a compute node, ansible-init runs the compute-init playbook
which:
a. Checks whether the `enable_compute` metadata flag is set, and exits if
not.
b. Tries to mount the above `/exports/cluster` NFS share from the control
node, and exits if it cannot.
c. Configures itself using the exported hostvars, depending on the
`enable_*` flags set in metadata.
d. Issues an `scontrol` command to resume the node (because Slurm will
consider it as "unexpectedly rebooted").

The check in 4b. above is what prevents the compute-init script from trying
to configure the node before the services on the control node are available
(which requires running the site.yml playbook).
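
As a rough sketch only (not the actual playbook contents), the per-boot logic described in step 4 is along these lines; the metadata lookup variable and control node address are placeholders:

```yaml
# Illustrative sketch - not the real compute-init playbook.
- name: End play unless the enable_compute metadata flag is set
  ansible.builtin.meta: end_play
  when: not (os_metadata.meta.enable_compute | default(false) | bool)  # os_metadata is a placeholder

- name: Attempt to mount the cluster share exported by the control node
  ansible.posix.mount:
    src: "{{ control_node_ip }}:/exports/cluster"  # placeholder address
    path: /mnt/cluster
    fstype: nfs
    state: mounted
  register: _cluster_mount
  ignore_errors: true

- name: End play if the control node export is not available yet
  ansible.builtin.meta: end_play
  when: _cluster_mount is failed

# ... configure the node from the exported hostvars, per the enable_* flags ...

- name: Resume the node in Slurm after the "unexpected reboot"
  ansible.builtin.command: scontrol update nodename={{ ansible_hostname }} state=resume
```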

The following roles/groups are currently fully functional:
- `resolv_conf`: all functionality
- `etc_hosts`: all functionality
- `nfs`: client functionality only
- `manila`: all functionality
- `basic_users`: all functionality, assumes home directory already exists on
shared storage
- `eessi`: all functionality, assumes `cvmfs_config` is the same on control
node and all compute nodes.
- `openhpc`: all functionality

# Development/debugging

To develop/debug this without actually having to build an image:


1. Deploy a cluster using tofu and ansible/site.yml as normal. This will
additionally configure the control node to export compute hostvars over NFS.
Check the cluster is up.

2. Reimage the compute nodes:

ansible-playbook --limit compute ansible/adhoc/rebuild.yml

3. Add metadata to a compute node, e.g. via Horizon, to turn on compute-init
playbook functionality.

4. Fake an image build to deploy the compute-init playbook:

ansible-playbook ansible/fatimage.yml --tags compute_init

NB: This will also re-export the compute hostvars, as the nodes are not
in the builder group, which conveniently means any changes made to that
play also get picked up.

5. Fake a reimage of compute to run ansible-init and the compute-init playbook:

On compute node where metadata was added:

[root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init
[root@rl9-compute-0 rocky]# systemctl status ansible-init

Use `systemctl status ansible-init` to view stdout/stderr from Ansible.

Steps 4/5 can be repeated with changes to the compute-init script. If required,
reimage the compute node(s) first as in step 2 and/or add additional metadata
as in step 3.


# Design notes
- Duplicating code from roles into the `compute-init` script is unfortunate, but
does allow developing this functionality without wider changes to the
appliance.

- In general, we don't want to rely on the NFS export at runtime, so the
compute-init script should e.g. copy files from this mount as soon as possible. TODO:

- There are a couple of approaches to supporting existing roles using `compute-init`:

1. The control node copies files resulting from the role into the cluster
export, and compute-init copies them to local disk. Only works if the files
are not host-specific. Examples: etc_hosts, eessi config?

2. Re-implement the role. Works if the role vars are not too complicated
(else they all need to be duplicated in compute-init). Could also only
support certain subsets of role functionality or variables.
Examples: resolv_conf, stackhpc.openhpc

- Some variables are defined using hostvars from other nodes, which aren't
available with the current approach:

```
[root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
"grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
"grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
"mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",
"nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}",
"openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}",
"openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}",
"openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}",
"openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}",
"prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}",
"{{ hostvars[groups['freeipa_server'].0].ansible_host }}"
```
More generally, there is nothing to stop any group var depending on a
"{{ hostvars[] }}" interpolation ...
Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern
for compute nodes - both of these indirect via `api_address` to
`inventory_hostname`. This has been worked around by replacing them with
"{{ groups['control'] | first }}", which does resolve to the control node's
inventory hostname when templating.
Note that although `groups` is defined in the templated hostvars, when the
hostvars are loaded using `include_vars:` it is ignored, as it is a
"magic variable" determined by Ansible itself and cannot be set.