diff --git a/.github/workflows/doca.yml b/.github/workflows/extra.yml similarity index 89% rename from .github/workflows/doca.yml rename to .github/workflows/extra.yml index cfd3bb982..dece242ce 100644 --- a/.github/workflows/doca.yml +++ b/.github/workflows/extra.yml @@ -1,4 +1,4 @@ -name: Test DOCA extra build +name: Test extra build on: workflow_dispatch: push: @@ -7,16 +7,18 @@ on: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' pull_request: paths: - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - - '.github/workflows/doca' + - 'ansible/roles/cuda/**' + - '.github/workflows/extra.yml' jobs: doca: - name: doca-build + name: extra-build concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS cancel-in-progress: true @@ -25,12 +27,14 @@ jobs: fail-fast: false # allow other matrix jobs to continue even if one fails matrix: # build RL8, RL9 build: - - image_name: openhpc-doca-RL8 + - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json - inventory_groups: doca - - image_name: openhpc-doca-RL9 + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda + - image_name: openhpc-extra-RL9 source_image_name_key: RL9 - inventory_groups: doca + inventory_groups: doca,cuda + volume_size: 30 # needed for cuda env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack @@ -95,6 +99,7 @@ jobs: -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ + -var "volume_size=${{ matrix.build.volume_size }}" \ openstack.pkr.hcl - name: Get created image names from manifest diff --git a/ansible/.gitignore b/ansible/.gitignore index 6c4f32017..a7197ff4c 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -58,6 +58,8 @@ roles/* !roles/squid/** !roles/tuned/ !roles/tuned/** +!roles/compute_init/ +!roles/compute_init/** !roles/k3s/ !roles/k3s/** !roles/k9s/ diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 3f059d157..670a99b29 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -66,5 +66,4 @@ slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" - name: Show image summary - debug: - var: image_info + command: cat /var/lib/image/image.json diff --git a/ansible/extras.yml b/ansible/extras.yml index 107f85252..6bb141109 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -24,8 +24,9 @@ gather_facts: yes tags: cuda tasks: - - import_role: + - include_role: name: cuda + tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" - name: Persist hostkeys across rebuilds # Must be after filesystems.yml (for storage) @@ -37,6 +38,18 @@ - import_role: name: persist_hostkeys + +- name: Setup NFS export for compute node configuration + hosts: compute_init:!builder + # NB: has to be after eeesi and os-manila-mount + tags: compute_init + become: yes + name: Export hostvars + tasks: + - include_role: + name: compute_init + tasks_from: export.yml + - name: Install k9s become: yes hosts: k9s @@ -44,3 +57,13 @@ tasks: - import_role: name: k9s + +- hosts: extra_packages + become: yes + tags: + - extra_packages + tasks: + - name: Install additional packages + 
dnf: + name: "{{ appliances_extra_packages }}" + when: appliances_mode != 'configure' or appliances_extra_packages_during_configure diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 55e56e612..9f1e9107c 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -29,6 +29,14 @@ - import_playbook: bootstrap.yml +- hosts: doca + become: yes + gather_facts: yes + tasks: + - name: Install NVIDIA DOCA + import_role: + name: doca + - name: Run post-bootstrap.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -65,6 +73,16 @@ - import_playbook: extras.yml +# TODO: is this the right place? +- name: Install compute_init script + hosts: compute_init + tags: compute_init # tagged to allow running on cluster instances for dev + become: yes + tasks: + - include_role: + name: compute_init + tasks_from: install.yml + - hosts: builder become: yes gather_facts: yes @@ -220,8 +238,6 @@ import_role: name: doca -- import_playbook: disable-repos.yml - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" @@ -229,6 +245,8 @@ import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists +- import_playbook: disable-repos.yml + - hosts: builder become: yes gather_facts: yes diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 9559d0fee..1187b3c4b 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -49,7 +49,9 @@ def to_ood_regex(items): return '|'.join(r) def appliances_repo_to_subpath(repo_entry): - return repo_entry['path']+'/'+repo_entry['timestamp'] + """ Take an element from appliances_pulp_repos and convert it to a pulp path. This assumes that the remote and local pulp structures are the same + """ + return repo_entry['path'] + '/' + repo_entry['timestamp'] class FilterModule(object): ''' Ansible core jinja2 filters ''' diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index 453f01a7e..69d001105 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -399,7 +399,7 @@ resource "openstack_compute_instance_v2" "login" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } @@ -565,7 +565,7 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" { ansible_init_coll_{{ loop.index0 }}_source = "{{ collection.source }}" {% endif %} {% endfor %} - k3s_server = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 + control_address = openstack_compute_instance_v2.control.network[0].fixed_ip_v4 k3s_token = "{{ k3s_token }}" } } diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md new file mode 100644 index 000000000..77a127245 --- /dev/null +++ b/ansible/roles/compute_init/README.md @@ -0,0 +1,130 @@ +# EXPERIMENTAL: compute-init + +Experimental / in-progress functionality to allow compute nodes to rejoin the +cluster after a reboot. + +To enable this add compute nodes (or a subset of them into) the `compute_init` +group. + +This works as follows: +1. During image build, an ansible-init playbook and supporting files +(e.g. templates, filters, etc) are installed. +2. 
Cluster instances are created as usual; the above compute-init playbook does +not run. +3. The `site.yml` playbook is run as usual to configure all the instances into +a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS +share is created on the control node containing: + - an /etc/hosts file for the cluster + - Hostvars for each compute node +4. On reboot of a compute node, ansible-init runs the compute-init playbook +which: + a. Checks whether the `enable_compute` metadata flag is set, and exits if + not. + b. Tries to mount the above `/exports/cluster` NFS share from the control + node, and exits if it cannot. + c. Configures itself using the exported hostvars, depending on the + `enable_*` flags set in metadata. + d. Issues an `scontrol` command to resume the node (because Slurm will + consider it as "unexpectedly rebooted"). + +The check in 4b. above is what prevents the compute-init script from trying +to configure the node before the services on the control node are available +(which requires running the site.yml playbook). + +The following roles/groups are currently fully functional: +- `resolv_conf`: all functionality +- `etc_hosts`: all functionality +- `nfs`: client functionality only +- `manila`: all functionality +- `basic_users`: all functionality, assumes home directory already exists on + shared storage +- `eessi`: all functionality, assumes `cvmfs_config` is the same on control + node and all compute nodes. +- `openhpc`: all functionality + +# Development/debugging + +To develop/debug this without actually having to build an image: + + +1. Deploy a cluster using tofu and ansible/site.yml as normal. This will + additionally configure the control node to export compute hostvars over NFS. + Check the cluster is up. + +2. Reimage the compute nodes: + + ansible-playbook --limit compute ansible/adhoc/rebuild.yml + +3. Add metadata to a compute node e.g. via Horizon to turn on compute-init + playbook functionality. + +4. Fake an image build to deploy the compute-init playbook: + + ansible-playbook ansible/fatimage.yml --tags compute_init + + NB: This will also re-export the compute hostvars, as the nodes are not + in the builder group, which conveniently means any changes made to that + play also get picked up. + +5. Fake a reimage of compute to run ansible-init and the compute-init playbook: + + On compute node where metadata was added: + + [root@rl9-compute-0 rocky]# rm -f /var/lib/ansible-init.done && systemctl restart ansible-init + [root@rl9-compute-0 rocky]# systemctl status ansible-init + + Use `systemctl status ansible-init` to view stdout/stderr from Ansible. + +Steps 4/5 can be repeated with changes to the compute script. If required, +reimage the compute node(s) first as in step 2 and/or add additional metadata +as in step 3. + + +# Design notes +- Duplicating code in roles into the `compute-init` script is unfortunate, but + does allow developing this functionality without wider changes to the + appliance. + +- In general, we don't want to rely on NFS export. So should e.g. copy files + from this mount ASAP in the compute-init script. TODO: + +- There are a couple of approaches to supporting existing roles using `compute-init`: + + 1. Control node copies files resulting from role into cluster exports, + compute-init copies to local disk. Only works if files are not host-specific + Examples: etc_hosts, eessi config? + + 2. Re-implement the role. 
Works if the role vars are not too complicated
+   (else they all need to be duplicated in compute-init). Could also only
+   support certain subsets of role functionality or variables.
+   Examples: resolv_conf, stackhpc.openhpc
+
+- Some variables are defined using hostvars from other nodes, which aren't
+  available via the current approach:
+
+  ```
+  [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml
+  "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}",
+  "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}",
+  "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}",
+  "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}",
+  "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}",
+  "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}",
+  "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}",
+  "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}",
+  "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}",
+  "{{ hostvars[groups['freeipa_server'].0].ansible_host }}"
+  ```
+
+  More generally, there is nothing to stop any group var depending on a
+  "{{ hostvars[] }}" interpolation ...
+
+  Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern
+  for compute nodes - both of these indirect via `api_address` to
+  `inventory_hostname`. This has been worked around by replacing these with
+  "{{ groups['control'] | first }}", which does resolve to the control node's
+  inventory hostname when templating.
+
+  Note that although `groups` is defined in the templated hostvars, when
+  the hostvars are loaded using `include_vars:` it is ignored as it is a
+  "magic variable" determined by ansible itself and cannot be set.
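As a supplement to step 3 of the development workflow above, the metadata flags can also be set from the command line rather than via Horizon. The following is an untested sketch wrapping the `openstack` CLI in an ad-hoc play; the node name and the particular `enable_*` flags are illustrative only (they correspond to the flags read by `compute-init.yml`), and OpenStack credentials (e.g. `OS_CLOUD`) are assumed to be available:

```yaml
# Illustrative sketch only: enable selected compute-init functionality on one node
- hosts: localhost
  gather_facts: false
  tasks:
    - name: Set compute-init metadata flags on a compute node
      ansible.builtin.command: >
        openstack server set
        --property enable_compute=true
        --property enable_etc_hosts=true
        --property enable_nfs=true
        rl9-compute-0
```

Equivalent flags exist for the other roles listed above, e.g. `enable_resolv_conf`, `enable_manila`, `enable_basic_users` and `enable_eessi`.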
diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml new file mode 100644 index 000000000..c7a9048b4 --- /dev/null +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -0,0 +1,285 @@ +--- + +- name: Compute node initialisation + hosts: localhost + become: yes + vars: + os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" + server_node_ip: "{{ os_metadata.meta.control_address }}" + enable_compute: "{{ os_metadata.meta.enable_compute | default(false) | bool }}" + enable_resolv_conf: "{{ os_metadata.meta.enable_resolv_conf | default(false) | bool }}" + enable_etc_hosts: "{{ os_metadata.meta.enable_etc_hosts | default(false) | bool }}" + enable_nfs: "{{ os_metadata.meta.enable_nfs | default(false) | bool }}" + enable_manila: "{{ os_metadata.meta.enable_manila | default(false) | bool }}" + enable_basic_users: "{{ os_metadata.meta.enable_basic_users | default(false) | bool }}" + enable_eessi: "{{ os_metadata.meta.enable_eessi | default(false) | bool }}" + + # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects + resolv_conf_nameservers: [] + + nfs_client_mnt_point: "/mnt" + nfs_client_mnt_options: + nfs_client_mnt_state: mounted + nfs_configurations: + nfs_enable: + clients: false + + # openhpc: no defaults required + + os_manila_mount_shares: [] + os_manila_mount_ceph_conf_path: /etc/ceph + os_manila_mount_state: mounted + os_manila_mount_opts: + - x-systemd.device-timeout=30 + - x-systemd.mount-timeout=30 + - noatime + - _netdev # prevents mount blocking early boot before networking available + - rw + + basic_users_groups: [] + basic_users_manage_homedir: false # homedir must already exist on shared filesystem + basic_users_userdefaults: + state: present + create_home: "{{ basic_users_manage_homedir }}" + generate_ssh_key: "{{ basic_users_manage_homedir }}" + ssh_key_comment: "{{ item.name }}" + basic_users_users: [] + + tasks: + - block: + - name: Report skipping initialization if not compute node + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization: Metadata enable_compute is not true" + + - meta: end_play + when: not enable_compute + + - name: Ensure the mount directory exists + file: + path: /mnt/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= # is sensitive + + - name: Mount /mnt/cluster + mount: + path: /mnt/cluster + src: "{{ server_node_ip }}:/exports/cluster" + fstype: nfs + opts: ro,sync + state: mounted + register: _mount_mnt_cluster + ignore_errors: true + # TODO: add some retries here? + + - block: + - name: Report skipping initialization if cannot mount nfs + # meta: end_play produces no output + debug: + msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}" + + - meta: end_play + when: _mount_mnt_cluster.failed + + - name: Load hostvars from NFS + # this is higher priority than vars block = normal ansible's hostvars + include_vars: + file: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" # can't use inventory_hostname + + # TODO: should /mnt/cluster now be UNMOUNTED to avoid future hang-ups? 
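+    # One possible answer to the TODO above (untested sketch, not wired in): once the
+    # required files have been copied off the share, the mount could be dropped so the
+    # NFS export is not a lasting runtime dependency, e.g.:
+    #   - name: Unmount /mnt/cluster once required files have been copied
+    #     ansible.posix.mount:
+    #       path: /mnt/cluster
+    #       state: unmounted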
+ + - name: Configure resolve.conf + block: + - name: Set nameservers in /etc/resolv.conf + ansible.builtin.template: + src: resolv.conf.j2 + dest: /etc/resolv.conf + owner: root + group: root + mode: u=rw,og=r + + - name: Disable NetworkManager control of resolv.conf + ansible.builtin.copy: + src: files/NetworkManager-dns-none.conf + dest: /etc/NetworkManager/conf.d/90-dns-none.conf + owner: root + group: root + mode: u=rw,og=r + register: _copy_nm_config + + - name: Reload NetworkManager + ansible.builtin.systemd: + name: NetworkManager + state: reloaded + when: _copy_nm_config.changed | default(false) + when: enable_resolv_conf + + - name: Copy cluster /etc/hosts + copy: + src: /mnt/cluster/hosts + dest: /etc/hosts + owner: root + group: root + mode: 0644 + when: enable_etc_hosts + + # NFS client mount + - name: If nfs-clients is present + include_tasks: tasks/nfs-clients.yml + when: + - enable_nfs + - nfs_enable.clients | bool or ('nfs_enable' in item and item.nfs_enable.clients | bool) + loop: "{{ nfs_configurations }}" + + - name: Manila mounts + block: + - name: Read manila share info from nfs file + include_vars: + file: /mnt/cluster/manila_share_info.yml + no_log: true # contains secrets + + - name: Ensure Ceph configuration directory exists + ansible.builtin.file: + path: "{{ os_manila_mount_ceph_conf_path }}" + state: directory + mode: "0755" + owner: root + group: root + + - name: Configure ceph.conf using os_manila_mount_host + ansible.builtin.template: + src: ceph.conf.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.conf" + owner: root + group: root + mode: "0600" + + - name: Ensure mount directory exists + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Write Ceph client keyring + ansible.builtin.template: + src: ceph.keyring.j2 + dest: "{{ os_manila_mount_ceph_conf_path }}/ceph.client.{{ item.share_user }}.keyring" + mode: "0600" + owner: root + group: root + loop: "{{ os_manila_mount_share_info }}" + loop_control: + label: "{{ item.share_name }}" + + - name: Mount the Ceph share + ansible.posix.mount: + path: "{{ item[0].mount_path }}" + src: "{{ item[1].host }}:{{ item[1].export }}" + fstype: ceph + opts: "name={{ item[1].share_user }},{{ (item[0].mount_opts | default(os_manila_mount_opts)) | join(',') }}" + # NB share_user is looked up here in case of autodetection + state: "{{ item[0].mount_state | default(os_manila_mount_state) }}" + loop: "{{ os_manila_mount_shares | zip(os_manila_mount_share_info) }}" + loop_control: + label: "{{ item[0].share_name }}" + + - name: Ensure mounted directory has correct permissions + ansible.builtin.file: + path: "{{ item.mount_path }}" + state: directory + owner: "{{ item.mount_user | default(omit) }}" + group: "{{ item.mount_group | default(omit) }}" + mode: "{{ item.mount_mode | default(omit) }}" + loop: "{{ os_manila_mount_shares }}" + loop_control: + label: "{{ item.share_name }}" + when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] + when: + - enable_manila + - os_manila_mount_shares | length > 0 + + - name: Basic users + block: + - name: Create groups + ansible.builtin.group: "{{ item }}" + loop: "{{ basic_users_groups }}" + + - name: Create users + user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() 
}}" + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }} [{{ item.state | default('present') }}]" + register: basic_users_info + + - name: Write sudo rules + blockinfile: + path: /etc/sudoers.d/80-{{ item.name}}-user + block: "{{ item.sudo }}" + create: true + loop: "{{ basic_users_users }}" + loop_control: + label: "{{ item.name }}" + when: "'sudo' in item" + when: enable_basic_users + + - name: EESSI + block: + - name: Copy cvmfs config + copy: + src: /mnt/cluster/cvmfs/default.local + dest: /etc/cvmfs/default.local + owner: root + group: root + mode: 0644 + + - name: Ensure CVMFS config is setup + command: + cmd: "cvmfs_config setup" + when: enable_eessi + + # NB: don't need conditional block on enable_compute as have already exited + # if not the case + - name: Write Munge key + copy: + content: "{{ openhpc_munge_key }}" + dest: "/etc/munge/munge.key" + owner: munge + group: munge + mode: 0400 + + - name: Set slurmctld location for configless operation + lineinfile: + path: /etc/sysconfig/slurmd + line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" + regexp: "^SLURMD_OPTIONS=" + create: yes + owner: root + group: root + mode: 0644 + + - name: Ensure Munge service state + service: + name: munge + enabled: true + state: started + + - name: Ensure slurmd service state + service: + name: slurmd + enabled: true + state: started + + - name: Ensure node is resumed + # TODO: consider if this is always safe for all job states? + command: scontrol update state=resume nodename={{ ansible_hostname }} + register: _scontrol_update + failed_when: + - _scontrol_update.rc > 0 + - "'slurm_update error: Invalid node state specified' not in _scontrol_update.stderr" diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml new file mode 100644 index 000000000..12b648f6e --- /dev/null +++ b/ansible/roles/compute_init/tasks/export.yml @@ -0,0 +1,67 @@ +- name: Ensure the /exports/cluster directory exists + file: + path: /exports/cluster + state: directory + owner: root + group: root + mode: u=rwX,go= + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy /etc/hosts to /exports/cluster + copy: + src: /etc/hosts + dest: /exports/cluster/hosts + owner: root + group: root + mode: u=rw,go= + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Create hostvars directory + file: + path: /exports/cluster/hostvars/{{ inventory_hostname }}/ + state: directory + mode: u=rwX,go= + # TODO: owner,mode,etc + delegate_to: "{{ groups['control'] | first }}" + +- name: Template out hostvars + template: + src: hostvars.yml.j2 + dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml + mode: u=rw,go= + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy manila share info to /exports/cluster + copy: + content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}" + dest: /exports/cluster/manila_share_info.yml + run_once: true + delegate_to: "{{ groups['control'] | first }}" + when: os_manila_mount_share_info is defined + vars: + os_manila_mount_share_info_var: + os_manila_mount_share_info: "{{ os_manila_mount_share_info }}" + +- name: Ensure /exports/cluster/cvmfs directory exists + file: + path: /exports/cluster/cvmfs + state: directory + owner: root + group: root + mode: 0755 + run_once: true + delegate_to: "{{ groups['control'] | first }}" + +- name: Copy EESSI CVMFS config to /exports/cluster + copy: + 
src: /etc/cvmfs/default.local + dest: /exports/cluster/cvmfs/default.local + owner: root + group: root + mode: 0644 + remote_src: true + run_once: true + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml new file mode 100644 index 000000000..bbcbf133f --- /dev/null +++ b/ansible/roles/compute_init/tasks/install.yml @@ -0,0 +1,53 @@ +--- + +- name: Ensure directories exist + file: + path: "/etc/ansible-init/playbooks/{{ item }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - templates + - files + - library + - filter_plugins + - tasks + +- name: Inject files from roles + copy: + src: '{{ item.src }}' + dest: '/etc/ansible-init/playbooks/{{ item.dest }}' + owner: root + group: root + mode: 0644 + loop: + - src: ../../resolv_conf/templates/resolv.conf.j2 + dest: templates/resolv.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.conf.j2 + dest: templates/ceph.conf.j2 + - src: ../../stackhpc.os-manila-mount/templates/ceph.keyring.j2 + dest: templates/ceph.keyring.j2 + - src: ../../resolv_conf/files/NetworkManager-dns-none.conf + dest: files/NetworkManager-dns-none.conf + - src: ../../basic_users/filter_plugins/filter_keys.py + dest: filter_plugins/filter_keys.py + - src: ../../stackhpc.nfs/tasks/nfs-clients.yml + dest: tasks/nfs-clients.yml + +- name: Add filter_plugins to ansible.cfg + lineinfile: + path: /etc/ansible-init/ansible.cfg + line: "filter_plugins = /etc/ansible-init/filter_plugins" + state: present + owner: root + group: root + mode: 0644 + +- name: Add compute initialisation playbook + copy: + src: compute-init.yml + dest: /etc/ansible-init/playbooks/1-compute-init.yml + owner: root + group: root + mode: 0644 diff --git a/ansible/roles/compute_init/templates/hostvars.yml.j2 b/ansible/roles/compute_init/templates/hostvars.yml.j2 new file mode 100644 index 000000000..7d4351b44 --- /dev/null +++ b/ansible/roles/compute_init/templates/hostvars.yml.j2 @@ -0,0 +1 @@ +{{ hostvars[inventory_hostname] | to_nice_json }} \ No newline at end of file diff --git a/ansible/roles/cuda/README.md b/ansible/roles/cuda/README.md index 141e7b80d..be6439cd5 100644 --- a/ansible/roles/cuda/README.md +++ b/ansible/roles/cuda/README.md @@ -1,6 +1,6 @@ # cuda -Install NVIDIA CUDA. The CUDA binaries are added to the PATH for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. +Install NVIDIA drivers and optionally CUDA packages. CUDA binaries are added to the `$PATH` for all users, and the [NVIDIA persistence daemon](https://docs.nvidia.com/deploy/driver-persistence/index.html#persistence-daemon) is enabled. ## Prerequisites @@ -8,8 +8,8 @@ Requires OFED to be installed to provide required kernel-* packages. ## Role Variables -- `cuda_distro`: Optional. Default `rhel8`. -- `cuda_repo`: Optional. Default `https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo` -- `cuda_driver_stream`: Optional. The default value `default` will, on first use of this role, enable the dkms-flavour `nvidia-driver` DNF module stream with the current highest version number. The `latest-dkms` stream is not enabled, and subsequent runs of the role will *not* change the enabled stream, even if a later version has become available. Changing this value once an `nvidia-driver` stream has been enabled raises an error. 
If an upgrade of the `nvidia-driver` module is required, the currently-enabled stream and all packages should be manually removed. +- `cuda_repo_url`: Optional. URL of `.repo` file. Default is upstream for appropriate OS/architecture. +- `cuda_nvidia_driver_stream`: Optional. Version of `nvidia-driver` stream to enable. This controls whether the open or proprietary drivers are installed and the major version. Changing this once the drivers are installed does not change the version. - `cuda_packages`: Optional. Default: `['cuda', 'nvidia-gds']`. +- `cuda_package_version`: Optional. Default `latest` which will install the latest packages if not installed but won't upgrade already-installed packages. Use `'none'` to skip installing CUDA. - `cuda_persistenced_state`: Optional. State of systemd `nvidia-persistenced` service. Values as [ansible.builtin.systemd:state](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/systemd_module.html#parameter-state). Default `started`. diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 33a25d9b4..05f1e093d 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,6 @@ -cuda_distro: "rhel{{ ansible_distribution_major_version }}" -cuda_repo: "https://developer.download.nvidia.com/compute/cuda/repos/{{ cuda_distro }}/x86_64/cuda-{{ cuda_distro }}.repo" -cuda_driver_stream: default -cuda_package_version: 'latest' +cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" +cuda_nvidia_driver_stream: '560-open' # 565-open has problems with cuda packages +cuda_package_version: '12.6.3-1' cuda_packages: - "cuda{{ ('-' + cuda_package_version) if cuda_package_version != 'latest' else '' }}" - nvidia-gds diff --git a/ansible/roles/cuda/tasks/main.yml b/ansible/roles/cuda/tasks/install.yml similarity index 60% rename from ansible/roles/cuda/tasks/main.yml rename to ansible/roles/cuda/tasks/install.yml index 22f8e9e8e..51c92a0d3 100644 --- a/ansible/roles/cuda/tasks/main.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,7 +1,7 @@ # Based on https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#redhat8-installation -- name: Check for OFED +- name: Check for OFED/DOCA command: cmd: dnf list --installed rdma-core register: _dnf_rdma_core @@ -10,41 +10,53 @@ - name: Assert OFED installed assert: that: "'mlnx' in _dnf_rdma_core.stdout" - fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED installed?" + fail_msg: "Did not find 'mlnx' in installed rdma-core package, is OFED/DOCA installed?" 
- name: Install cuda repo get_url: - dest: "/etc/yum.repos.d/cuda-{{ cuda_distro }}.repo" - url: "{{ cuda_repo }}" + dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" + url: "{{ cuda_repo_url }}" - name: Check if nvidia driver module is enabled - shell: - cmd: dnf module list --enabled nvidia-driver + ansible.builtin.command: dnf module list --enabled nvidia-driver changed_when: false failed_when: false register: _cuda_driver_module_enabled - name: Enable nvidia driver module - ansible.builtin.command: "dnf module enable -y nvidia-driver:open-dkms" + ansible.builtin.command: "dnf module enable -y nvidia-driver:{{ cuda_nvidia_driver_stream }}" register: _cuda_driver_module_enable when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" changed_when: "'Nothing to do' not in _cuda_driver_module_enable.stdout" +- name: Check if nvidia driver module is installed + ansible.builtin.command: dnf module list --installed nvidia-driver + changed_when: false + failed_when: false + register: _cuda_driver_module_installed + - name: Install nvidia drivers ansible.builtin.command: dnf module install -y nvidia-driver register: _cuda_driver_install - when: "'No matching Modules to list' in _cuda_driver_module_enabled.stderr" + when: "'No matching Modules to list' in _cuda_driver_module_installed.stderr" changed_when: "'Nothing to do' not in _cuda_driver_install.stdout" +- name: Check kernel has not been modified + assert: + that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. kernel-devel-matched + fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" + - name: Install cuda packages ansible.builtin.dnf: name: "{{ cuda_packages }}" + when: cuda_package_version != 'none' register: cuda_package_install - name: Add cuda binaries to path lineinfile: path: /etc/profile.d/sh.local line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon systemd: @@ -60,3 +72,4 @@ - name: Wait for hosts to be reachable wait_for_connection: sleep: 15 + when: cuda_package_install.changed diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml new file mode 100644 index 000000000..c16a48c6f --- /dev/null +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -0,0 +1,5 @@ +- name: Ensure NVIDIA Persistence Daemon state + systemd: + name: nvidia-persistenced + enabled: true + state: "{{ cuda_persistenced_state }}" diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 8ee0e6114..b9b82f1c4 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -3,7 +3,7 @@ vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" k3s_token: "{{ os_metadata.meta.k3s_token }}" - k3s_server_name: "{{ os_metadata.meta.k3s_server }}" + k3s_server_name: "{{ os_metadata.meta.control_address }}" service_name: "{{ 'k3s-agent' if k3s_server_name is defined else 'k3s' }}" tasks: - name: Ensure password directory exists diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index c6a4f3f9f..365265df0 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -17,7 +17,7 @@ This uses the [osc.ood](https://github.com/OSC/ood-ansible) Ansible role to prov ### General - `openondemand_clusters`: Required. 
Synonym for [osc.ood: clusters](https://github.com/OSC/ood-ansible#clusters) role variable. -- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. +- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed. ### Authentication See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentation/latest/authentication/overview.html) for an overview of the authentication process. @@ -77,7 +77,7 @@ The Open Ondemand portal can proxy other servers. Variables: to proxy: - All "compute" nodes, e.g. for Open Ondemand interactive apps such as remote desktop and Jupyter notebook server. - The Grafana server - note a link to Grafana is always added to the Open Ondemand dashboard. - + The exact pattern depends on inventory hostnames / partitions / addresses. - `openondemand_node_proxy_directives`: Optional, default ''. Multiline string to insert into Apache directives definition for `node_uri` ([docs](https://osc.github.io/ood-documentation/master/reference/files/ood-portal-yml.html#configure-reverse-proxy)). 
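As an illustration of the proxy regex described in the hunk above, a hypothetical site might set something like the following in its environment group_vars. The cluster name, hostnames and file path are invented; the variable name is assumed to be `openondemand_host_regex`, as referenced elsewhere in this appliance:

```yaml
# e.g. environments/$SITE_ENV/inventory/group_vars/all/openondemand.yml (illustrative path)
# assumes compute nodes are named mycluster-compute-<N> and Grafana runs on mycluster-control
openondemand_host_regex: '(mycluster-compute-\d+)|(mycluster-control)'
```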
diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index fc23a4489..081307b6a 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -7,7 +7,7 @@ pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" -pulp_site_target_facts: "{{ hostvars[groups['builder'][0]]['ansible_facts'] }}" +pulp_site_target_facts: "{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}" pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md new file mode 100644 index 000000000..dae840d95 --- /dev/null +++ b/docs/experimental/compute-init.md @@ -0,0 +1,113 @@ +# compute-init + +See the role README.md + +# Results/progress + +Without any metadata: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: activating (start) since Fri 2024-12-13 20:41:16 UTC; 1min 45s ago + Main PID: 16089 (ansible-init) + Tasks: 8 (limit: 10912) + Memory: 99.5M + CPU: 11.687s + CGroup: /system.slice/ansible-init.service + ├─16089 /usr/lib/ansible-init/bin/python /usr/bin/ansible-init + ├─16273 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml + ├─16350 /usr/lib/ansible-init/bin/python3.9 /usr/lib/ansible-init/bin/ansible-playbook --connection local --inventory 127.0.0.1, /etc/ansible-init/playbooks/1-compute-init.yml + ├─16361 /bin/sh -c "/usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py && sleep 0" + ├─16362 /usr/bin/python3 /root/.ansible/tmp/ansible-tmp-1734122485.9542894-16350-45936546411977/AnsiballZ_mount.py + ├─16363 /usr/bin/mount /mnt/cluster + └─16364 /sbin/mount.nfs 192.168.10.12:/exports/cluster /mnt/cluster -o ro,sync + + Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] + Dec 13 20:41:24 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Report skipping initialization if not compute node] ********************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: skipping: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Ensure the mount directory exists] *************************************** + Dec 13 20:41:25 rl9-compute-0.rl9.invalid python3[16346]: ansible-file Invoked with path=/mnt/cluster state=directory owner=root group=root mode=u=rwX,go= recurse=False force=False follow=True modification_time_format=%Y%m%d%H%M.%S access> + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: changed: [127.0.0.1] + Dec 13 20:41:25 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [Mount /mnt/cluster] ****************************************************** + Dec 13 20:41:26 rl9-compute-0.rl9.invalid python3[16362]: ansible-mount 
Invoked with path=/mnt/cluster src=192.168.10.12:/exports/cluster fstype=nfs opts=ro,sync state=mounted boot=True dump=0 passno=0 backup=False fstab=None + [root@rl9-compute-0 rocky]# systemctl status ansible-init + +Added metadata via horizon: + + compute_groups ["compute"] + + +OK: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 20:43:31 UTC; 33s ago + Process: 16089 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 16089 (code=exited, status=0/SUCCESS) + CPU: 13.003s + + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: ok: [127.0.0.1] => { + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: "msg": "Skipping compute initialization as cannot mount exports/cluster share" + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: } + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: TASK [meta] ******************************************************************** + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: PLAY RECAP ********************************************************************* + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16273]: 127.0.0.1 : ok=4 changed=1 unreachable=0 failed=0 skipped=1 rescued=0 ignored=1 + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] executing remote playbooks for stage - post + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 20:43:31 rl9-compute-0.rl9.invalid ansible-init[16089]: [INFO] ansible-init completed successfully + Dec 13 20:43:31 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. + +Now run site.yml, then restart ansible-init again: + + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 20:50:10 UTC; 11s ago + Process: 18921 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 18921 (code=exited, status=0/SUCCESS) + CPU: 8.240s + + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [Report skipping initialization if cannot mount nfs] ********************** + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: TASK [meta] ******************************************************************** + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: skipping: [127.0.0.1] + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: PLAY RECAP ********************************************************************* + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[19110]: 127.0.0.1 : ok=3 changed=1 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] executing remote playbooks for stage - post + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 20:50:10 rl9-compute-0.rl9.invalid ansible-init[18921]: [INFO] ansible-init completed successfully + Dec 13 20:50:10 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. 
+ [root@rl9-compute-0 rocky]# ls /mnt/cluster/host + hosts hostvars/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- + rl9-compute-0/ rl9-compute-1/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute- + rl9-compute-0/ rl9-compute-1/ + [root@rl9-compute-0 rocky]# ls /mnt/cluster/hostvars/rl9-compute-0/ + hostvars.yml + +This commit - shows that hostvars have loaded: + + [root@rl9-compute-0 rocky]# systemctl status ansible-init + ● ansible-init.service + Loaded: loaded (/etc/systemd/system/ansible-init.service; enabled; preset: disabled) + Active: active (exited) since Fri 2024-12-13 21:06:20 UTC; 5s ago + Process: 27585 ExecStart=/usr/bin/ansible-init (code=exited, status=0/SUCCESS) + Main PID: 27585 (code=exited, status=0/SUCCESS) + CPU: 8.161s + + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: TASK [Demonstrate hostvars have loaded] **************************************** + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: ok: [127.0.0.1] => { + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: "prometheus_version": "2.27.0" + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: } + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: PLAY RECAP ********************************************************************* + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27769]: 127.0.0.1 : ok=5 changed=0 unreachable=0 failed=0 skipped=2 rescued=0 ignored=0 + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] executing remote playbooks for stage - post + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] writing sentinel file /var/lib/ansible-init.done + Dec 13 21:06:20 rl9-compute-0.rl9.invalid ansible-init[27585]: [INFO] ansible-init completed successfully + Dec 13 21:06:20 rl9-compute-0.rl9.invalid systemd[1]: Finished ansible-init.service. diff --git a/docs/operations.md b/docs/operations.md index a20d7f10c..4bebe1b3f 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -63,17 +63,30 @@ This is a usually a two-step process: Deploying the additional nodes and applying these changes requires rerunning both Terraform and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster). # Adding Additional Packages -Packages from any enabled DNF repositories (which always includes EPEL, PowerTools and OpenHPC) can be added to all nodes by defining a list `openhpc_packages_extra` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. For example: - - # environments/foo-base/inventory/group_vars/all/openhpc.yml: - openhpc_packages_extra: +By default, the following utility packages are installed during build: +- htop +- nano +- screen +- tmux +- wget +- bind-utils +- net-tools +- postfix +- git +- latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4) + +Additional packages from any DNF repositories which are enabled during build (which always includes EPEL, PowerTools and OpenHPC) can be added to the image by defining a list `appliances_extra_packages_other` (defaulted to the empty list in the common environment) in e.g. `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. 
For example:
+
+```yaml
+    # environments/foo-base/inventory/group_vars/all/defaults.yml:
+    appliances_extra_packages_other:
       - somepackage
       - anotherpackage
+```
 
 The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the [OpenHPC releases page](https://github.com/openhpc/ohpc/releases/)). Note "user-facing" OpenHPC packages such as compilers, mpi libraries etc. include corresponding `lmod` modules.
 
-To add these packages to the current cluster, run the same command as for [Reconfiguring Slurm](#Reconfiguring-Slurm). TODO: describe what's required to add these to site-specific images.
+If you wish to install packages during runtime, the `site.yml` playbook should be run with `appliances_extra_packages_during_configure` overridden to `true` and `cluster` should be added as a child of the `dnf_repos` group in order to temporarily re-enable DNF repositories during runtime (WARNING: this should only be done if using an unauthenticated local Pulp server. If using StackHPC Ark directly, doing this WILL leak credentials to users). If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images.
 
 There are various Ansible modules which might be useful for this:
 - `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory.
diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml
new file mode 100644
index 000000000..59f935873
--- /dev/null
+++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml
@@ -0,0 +1,7 @@
+os_manila_mount_shares_arcus:
+  - share_name: slurm-v2-home
+    mount_path: /project
+  - share_name: slurm-scratch
+    mount_path: /scratch
+
+os_manila_mount_shares: "{{ os_manila_mount_shares_arcus if ci_cloud == 'ARCUS' else [] }}"
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 5e5acebeb..8a9e3b66a 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,6 +1,6 @@
 {
   "cluster_image": {
-    "RL8": "openhpc-RL8-241218-0900-a99d8be6",
-    "RL9": "openhpc-RL9-241218-0859-a99d8be6"
+    "RL8": "openhpc-RL8-241220-1131-a2dde143",
+    "RL9": "openhpc-RL9-241220-1131-a2dde143"
   }
 }
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index fa1fdf767..8fe1eb402 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -80,7 +80,28 @@ appliances_local_users_default:
 appliances_local_users_extra: [] # see format of appliances_local_users_default above
 appliances_local_users: "{{ appliances_local_users_default + appliances_local_users_extra }}"
 
-###########################################################################################
+################## bootstrap: extra package installs ######################################
+
+appliances_extra_packages_default:
+  - htop
+  - nano
+  - screen
+  - tmux
+  - wget
+  - bind-utils
+  - net-tools
+  - postfix
+  - git
+  - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}"
+
+
+appliances_extra_packages_during_configure: false
+
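+# Illustrative note: per docs/operations.md, installing extra packages at runtime
+# (rather than only at image build) would mean overriding the variable above to true
+# in a site environment and adding `cluster` as a child of the `dnf_repos` group, e.g.:
+#   appliances_extra_packages_during_configure: true
+# Only do this with an unauthenticated local Pulp server (see the Ark credentials
+# warning in docs/operations.md).
+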
+appliances_extra_packages_other: [] + +appliances_extra_packages: "{{ appliances_extra_packages_default + appliances_extra_packages_other }}" + +###################### ark repo timestamps ################################################### appliances_pulp_repos: baseos: diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index bd340b190..45b7c6967 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -3,7 +3,7 @@ # See: https://github.com/stackhpc/ansible-role-cluster-nfs # for variable definitions -nfs_server_default: "{{ hostvars[groups['control'] | first ].internal_address }}" +nfs_server_default: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init nfs_configurations: - comment: Export /exports/home from Slurm control node as /home @@ -15,3 +15,9 @@ nfs_configurations: nfs_server: "{{ nfs_server_default }}" nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" + + - comment: Export /exports/cluster from Slurm control node + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: false + nfs_export: "/exports/cluster" diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index cf2762f17..e3d20b9c3 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -13,7 +13,7 @@ openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' openhpc_slurmdbd_mysql_database: slurm_acct_db openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm -openhpc_slurm_control_host: "{{ hostvars[groups['control'].0].api_address }}" +openhpc_slurm_control_host: "{{ groups['control'] | first }}" # avoid using hostvars for compute-init openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_slurm_partitions: - name: "compute" diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 062276f76..cbc69d800 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -136,6 +136,9 @@ freeipa_client [ansible_init] # Hosts to run linux-anisble-init +[compute_init] +# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on + [k3s] # Hosts to run k3s server/agent @@ -145,6 +148,9 @@ freeipa_client [lustre] # Hosts to run lustre client +[extra_packages] +# Hosts to install specified additional packages on + [dnf_repos:children] # Hosts to replace system repos with Pulp repos # Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index ba5cbc08d..878bebbf3 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -82,6 +82,10 @@ openhpc # Hosts to run ansible-init cluster +[compute_init:children] +# EXPERIMENTAL: Compute hosts to enable joining cluster on boot on +compute + [k3s:children] # Hosts to run k3s server/agent openhpc @@ -92,3 +96,7 @@ control [lustre] # Hosts to run lustre client + +[extra_packages:children] +# Hosts to install specified additional packages on +cluster diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf 
b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf index eb2139eba..14c728a5a 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute.tf @@ -16,6 +16,6 @@ module "compute" { key_pair = var.key_pair environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf index e64a2162c..7a2a706a6 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/nodes.tf @@ -47,7 +47,7 @@ resource "openstack_compute_instance_v2" "compute" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = var.k3s_server + control_address = var.control_address } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf index 9d2c2e47c..3655c9e65 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/compute/variables.tf @@ -72,7 +72,7 @@ variable "k3s_token" { type = string } -variable "k3s_server" { - description = "Name/address of k3s server" +variable "control_address" { + description = "Name/address of control node" type = string } diff --git a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf index bfbd1c532..8ea8cabcb 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/terraform/nodes.tf @@ -126,7 +126,7 @@ resource "openstack_compute_instance_v2" "login" { metadata = { environment_root = var.environment_root k3s_token = var.k3s_token - k3s_server = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] + control_address = [for n in openstack_compute_instance_v2.control["control"].network: n.fixed_ip_v4 if n.access_network][0] } user_data = <<-EOF