From 50fc320be89db6e5884323830eb5c548ddbb8199 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 8 Jan 2025 10:13:38 +0000 Subject: [PATCH 1/2] Update ceph to use ark packages and move RL9 to ceph reef (#519) * Release train support for ceph repos * bump images * Update requirements.yml * bumped rocky 9 ceph repos to reef * updated rl9 ceph version number * bump images * reverted to upstream ceph versions * Update requirements.yml * comment --- ansible/roles/dnf_repos/defaults/main.yml | 3 +++ ansible/roles/pulp_site/defaults/main.yml | 2 ++ .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- environments/common/inventory/group_vars/all/defaults.yml | 8 +++++++- .../common/inventory/group_vars/all/os-manila-mount.yml | 3 +++ requirements.yml | 2 +- 6 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/os-manila-mount.yml diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 841631890..6d41046ec 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -30,6 +30,9 @@ dnf_repos_default_repolist: - file: "{{ dnf_repos_version_filenames.extras }}" name: extras base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" +- file: ceph + name: Ceph + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_openhpc_repolist: - name: OpenHPC diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index c0b191336..c549dac53 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -26,6 +26,8 @@ pulp_site_rpm_info: subpath: "{{ 
appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" - name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 943a2dfbd..9c72b07ce 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250106-0916-f8603056", - "RL9": "openhpc-RL9-250106-0916-f8603056" + "RL8": "openhpc-RL8-250107-1534-b03caaf3", + "RL9": "openhpc-RL9-250107-1535-b03caaf3" } } diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e052eb709..e26bc3018 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -165,4 +165,10 @@ appliances_pulp_repos: '9': path: OpenHPC/3/updates/EL_9 timestamp: 20241218T154614 - + ceph: + '8': + timestamp: 20231104T015751 + path: centos/8-stream/storage/x86_64/ceph-quincy + '9': + timestamp: 20240923T233036 + path: centos/9-stream/storage/x86_64/ceph-reef diff --git a/environments/common/inventory/group_vars/all/os-manila-mount.yml 
b/environments/common/inventory/group_vars/all/os-manila-mount.yml new file mode 100644 index 000000000..6b25d62cb --- /dev/null +++ b/environments/common/inventory/group_vars/all/os-manila-mount.yml @@ -0,0 +1,3 @@ +# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +os_manila_mount_ceph_rpm_repos: [] diff --git a/requirements.yml b/requirements.yml index 7e71bb904..71adbc6e5 100644 --- a/requirements.yml +++ b/requirements.yml @@ -21,7 +21,7 @@ roles: version: v3.1.5 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount - version: v24.11.0 # Support ceph quincy for RL9 + version: v25.1.1 collections: - name: containers.podman From 781c2d474848309dbe42bb4ca83343b1aad3b621 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:45:03 +0000 Subject: [PATCH 2/2] Add more information re. configuring production sites (#508) * add lots of info to production docs * Production docs tweaks from review Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> * add prod docs comment re login FIPs --------- Co-authored-by: Scott Davidson <49713135+sd109@users.noreply.github.com> --- docs/production.md | 150 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 5 deletions(-) diff --git a/docs/production.md b/docs/production.md index 7219ee7fc..c1b139994 100644 --- a/docs/production.md +++ b/docs/production.md @@ -1,9 +1,149 @@ # Production Deployments -This page contains some brief notes about differences between the default/demo configuration, as described in the main [README.md](../README.md) and production-ready deployments. +This page contains some brief notes about differences between the default/demo +configuration (as described in the main [README.md](../README.md)) and +production-ready deployments. 
+ +- Get it agreed up front what the cluster names will be. Changing this later + requires instance deletion/recreation. + +- At least three environments should be created: + - `site`: site-specific base environment + - `production`: production environment + - `staging`: staging environment + + A `dev` environment should also be created if considered required, or this + can be left until later. + + These can all be produced using the cookiecutter instructions, but the + `production` and `staging` environments will need their + `environments/$ENV/ansible.cfg` file modifying so that they point to the + `site` environment: + + ```ini + inventory = ../common/inventory,../site/inventory,inventory + ``` + +- To avoid divergence of configuration all possible overrides for group/role +vars should be placed in `environments/site/inventory/group_vars/all/*.yml` +unless the value really is environment-specific (e.g. DNS names for +`openondemand_servername`). + +- Where possible hooks should also be placed in `environments/site/hooks/` +and referenced from the `staging` and `production` environments, e.g.: + + ```yaml + # environments/production/hooks/pre.yml: + - name: Import parent hook + import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" + ``` + +- OpenTofu configurations should be defined in the `site` environment and used + as a module from the other environments. This can be done with the + cookie-cutter generated configurations: + - Delete the *contents* of the cookie-cutter generated `terraform/` directories + from the `production` and `staging` environments. + - Create a `main.tf` in those directories which uses `site/terraform/` as a + [module](https://opentofu.org/docs/language/modules/), e.g. : + + ``` + ... + module "cluster" { + source = "../../site/terraform/" + + cluster_name = "foo" + ... + } + ``` + + Note that: + - Environment-specific variables (`cluster_name`) should be hardcoded + into the module block. 
+ - Environment-independent variables (e.g. maybe `cluster_net` if the + same is used for staging and production) should be set as *defaults* + in `environments/site/terraform/variables.tf`, and then don't need to + be passed in to the module. + +- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates + a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. + To ensure staging environments are a good model for production this should + generally be moved into the `site` environment. It should be encrypted + using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) + and then committed to the repository. + +- Ensure created instances have accurate/synchronised time. For VM instances + this is usually provided by the hypervisor, but if not (or for bare metal + instances) it may be necessary to configure or proxy `chronyd` via an + environment hook. + +- The cookiecutter provided OpenTofu configurations define resources for home and + state volumes. The former may not be required if the cluster's `/home` is + provided from an external filesystem (or Manila). In any case, in at least + the production environment, and probably also in the staging environment, + the volumes should be manually created and the resources changed to [data + resources](https://opentofu.org/docs/language/data-sources/). This ensures that even if the cluster is deleted via tofu, the + volumes will persist. + + For a development environment, having volumes under tofu control via volume + resources is usually appropriate as there may be many instantiations + of this environment. + +- Enable `etc_hosts` templating: + + ```yaml + # environments/site/inventory/groups: + [etc_hosts:children] + cluster + ``` -- Create a site environment. Usually at least production, staging and possibly development environments are required. 
To avoid divergence of configuration these should all have an `inventory` path referencing a shared, site-specific base environment. Where possible hooks should also be placed in this site-specific environment. -- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. To ensure staging environments are a good model for production this should generally be moved into the site-specific environment. It can be be encrypted using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) and then committed to the repository. -- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal instances) it may be necessary to configure or proxy `chronyd` via an environment hook. -- Remove production volumes from OpenTofu control. In the default OpenTofu configuration, deleting the resources also deletes the volumes used for persistent state and home directories. This is usually undesirable for production, so these resources should be removed from the OpenTofu configurations and manually deployed once. However note that for development environments leaving them under OpenTofu control is usually best. - Configure Open OpenOndemand - see [specific documentation](openondemand.README.md). + +- Modify `environments/site/terraform/nodes.tf` to provide fixed IPs for at least + the control node, and (if not using FIPs) the login node(s): + + ``` + resource "openstack_networking_port_v2" "control" { + ... + fixed_ip { + subnet_id = data.openstack_networking_subnet_v2.cluster_subnet.id + ip_address = var.control_ip_address + } + } + ``` + + Note the variable `control_ip_address` is new. + + Using fixed IPs will require either using admin credentials or policy changes. + +- If floating IPs are required for login nodes, modify the OpenTofu configurations + appropriately. 
+ +- Enable persisting login node hostkeys so users do not get annoying ssh warning + messages on reimage: + + ```yaml + # environments/site/inventory/groups: + [persist_hostkeys:children] + login + ``` + And configure NFS to include exporting the state directory to these hosts: + + ```yaml + # environments/common/inventory/group_vars/all/nfs.yml: + nfs_configurations: + # ... potentially, /home definition from common environment + - comment: Export state directory to login nodes + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['login'] }}" + nfs_server: "{{ nfs_server_default }}" + nfs_export: "/var/lib/state" + nfs_client_mnt_point: "/var/lib/state" + ``` + See [issue 506](https://github.com/stackhpc/ansible-slurm-appliance/issues/506). + +- Consider whether mapping of baremetal nodes to ironic nodes is required. See + [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). + +- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473) + may help identify any site-specific configuration.