Skip to content

Commit

Permalink
Fork accelerator builds to use the EL + kickstart build (#2398)
Browse files Browse the repository at this point in the history
* Fork accelerator builds to use the EL + kickstart build

More native than the derivative images we were building

* Update forked path

Forgot to save this file the first time

* Cleanup preview image builds, update nvidia version ref

* Remove old preview builds using derivative images
* Adjust EL builds for Rocky to use "latest" instead of "550"

* Update workflow name

Missed a reference to 550

* Update image descriptions

Not LTS, but just latest

* Fix repo status

Deprecated, not used
  • Loading branch information
jjerger authored Nov 9, 2024
1 parent 440b988 commit 15e9f65
Show file tree
Hide file tree
Showing 5 changed files with 323 additions and 80 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
time period is set.
*/}}
{
"Name": "rocky-linux-8-optimized-gcp-with-nvidia-550",
"Name": "rocky-linux-8-optimized-gcp-nvidia-latest",
{{$work_project := printf "%q" "gce-image-builder" -}}
{{$endpoint := `"https://www.googleapis.com/compute/alpha/projects/"` -}}
{{$delete_after := `"24h*30*4"` -}}
Expand All @@ -33,9 +33,9 @@
{{$time := trimPrefix .publish_version "v"}}
"Images": [
{
"Prefix": "rocky-linux-8-optimized-gcp-with-nvidia-550",
"Family": "rocky-linux-8-optimized-gcp-with-nvidia-550",
"Description": "Rocky Linux 8 optimized for GCP with Nvidia 550 built on {{$time}}",
"Prefix": "rocky-linux-8-optimized-gcp-nvidia-latest",
"Family": "rocky-linux-8-optimized-gcp-nvidia-latest",
"Description": "Rocky Linux 8 optimized for GCP with latest Nvidia driver built on {{$time}}",
"Architecture": "X86_64",
"Licenses": [
"https://www.googleapis.com/compute/v1/projects/accelerator-preview-images/global/licenses/accelerator-preview-image",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"Name": "rocky-linux-8-optimized-gcp-with-nvidia-550",
"Name": "rocky-linux-8-optimized-gcp-nvidia-latest",
"Project": "gce-image-builder",
"Zone": "us-central1-b",
"GCSPath": "gs://gce-image-build-bucket/daisy/${USERNAME}",
Expand All @@ -24,6 +24,10 @@
"Value": "${OUTSPATH}/export-image.sbom.json",
"Description": "SBOM final export destination, copies in place by default"
},
"installer_iso": {
"Required": true,
"Description": "The Rocky Linux 8 installer ISO to build from."
},
"sbom_util_gcs_root": {
"Value": "",
"Description": "The root gcs bucket for sbomutil, if using sbomutil to generate the SBOM."
Expand All @@ -37,44 +41,34 @@
"build": {
"TimeOut": "60m",
"IncludeWorkflow": {
"Path": "${workflow_root}/image_build/enterprise_linux/rocky_linux_8_optimized_gcp_with_nvidia_550.wf.json",
"Path": "${workflow_root}/image_build/enterprise_linux/rocky_linux_8_optimized_gcp_with_nvidia_latest.wf.json",
"Vars": {
"build_date": "${build_date}"
"build_date": "${build_date}",
"installer_iso": "${installer_iso}"
}
}
},
"create-disk": {
"CreateDisks": [
{
"Name": "disk-rocky-linux-8-optimized-gcp-with-nvidia-550",
"SourceImage": "rocky-linux-8-optimized-gcp-with-nvidia-550-v${build_date}",
"SizeGb": "30",
"Type": "pd-ssd"
}
]
},
"export-image": {
"Timeout": "60m",
"IncludeWorkflow": {
"Path": "${workflow_root}/export/disk_export.wf.json",
"Vars": {
"destination": "${gcs_url}",
"sbom_destination": "${sbom_destination}",
"source_disk": "disk-rocky-linux-8-optimized-gcp-with-nvidia-550",
"source_disk": "el-install-disk",
"sbom_util_gcs_root": "${sbom_util_gcs_root}",
"sha256_txt": "${sha256_txt}"
}
}
},
"cleanup-image": {
"DeleteResources": {
"Images": ["rocky-linux-8-optimized-gcp-with-nvidia-550-v${build_date}"]
"Images": ["rocky-linux-8-optimized-gcp-nvidia-latest-v${build_date}"]
}
}
},
"Dependencies": {
"create-disk": ["build"],
"export-image": ["create-disk"],
"export-image": ["build"],
"cleanup-image": ["export-image"]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
# rocky-linux-8-optimized-gcp-options.cfg
# Installer and base-system options for this build: install source and extra
# repos, network, services, and accounts.

### Anaconda installer configuration.
# Install in text mode.
text --non-interactive
# The install tree is the Rocky cloud-kernel SIG repo. kernel and kernel-core
# are excluded from BaseOS so those packages are not taken from there.
url --url="https://dl.rockylinux.org/pub/sig/8/cloud/x86_64/cloud-kernel"
repo --name=BaseOS --baseurl="https://dl.rockylinux.org/pub/rocky/8/BaseOS/x86_64/os" --excludepkgs="kernel,kernel-core"
repo --name=AppStream --baseurl="https://dl.rockylinux.org/pub/rocky/8/AppStream/x86_64/os"
repo --name=PowerTools --baseurl="https://dl.rockylinux.org/pub/rocky/8/PowerTools/x86_64/os"
# Power the machine off when the installation finishes.
poweroff

# Network configuration
network --bootproto=dhcp --device=link

### Installed system configuration.
firewall --enabled
services --enabled=sshd,rngd --disabled=sshd-keygen@
skipx
timezone --utc UTC --ntpservers=metadata.google.internal
# Root gets "*" as its crypted password and the account is locked.
rootpw --iscrypted --lock *
firstboot --disabled
# Locked placeholder account; a later %post script removes it with userdel.
user --name=gce --lock
### Disk configuration.
# Disk configuration is done by including a separate file with disk configuration, otherwise anaconda will try to validate that the disk exists before we configure udev rules.
%pre --interpreter=/usr/bin/bash
# Copy the GCE disk-naming udev rule and its NVMe id helper from the installer
# media into the installer environment, and make the helper executable.
# (Presumably these create the /dev/disk/by-id/google-* names used below.)
cp /run/install/isodir/65-gce-disk-naming.rules /etc/udev/rules.d/
cp /run/install/isodir/google_nvme_id /usr/lib/udev/
chmod +x /usr/lib/udev/google_nvme_id
# Wait for coldplug events from boot to settle, or we won't generate new events for the reload/trigger
udevadm settle
# Load the newly copied rules, then re-trigger device events and wait for them.
udevadm control --reload
udevadm trigger --settle
# Write the bootloader and partitioning commands to /tmp/disk-config; the
# %include that follows this section pulls that file into the kickstart.
tee -a /tmp/disk-config << EOM
# build_installer.py will replace with the id of the install disk to avoid race conditions
bootloader --boot-drive=/dev/disk/by-id/google-el-install-disk --timeout=0 --append="net.ifnames=0 biosdevname=0 scsi_mod.use_blk_mq=Y"
# EFI partitioning, creates a GPT partitioned disk.
clearpart --drives=/dev/disk/by-id/google-el-install-disk --all
part /boot/efi --size=200 --fstype=efi --ondrive=/dev/disk/by-id/google-el-install-disk
part / --size=100 --grow --ondrive=/dev/disk/by-id/google-el-install-disk --label=root --fstype=xfs
EOM
%end
%include /tmp/disk-config

# packages.cfg
# Contains a list of packages to be installed, or not, on all flavors.
# The %package command begins the package selection section of kickstart.
# Packages can be specified by group, or package name. @Base and @Core are
# always selected by default so they do not need to be specified.

%packages
# Packages to install in addition to the defaults.
acpid
dhcp-client
dnf-automatic
net-tools
openssh-server
python3
rng-tools
tar
vim
# A leading "-" excludes the package from the installation. The list below
# drops subscription-manager, several hardware utilities and daemons, and a
# set of firmware packages.
-subscription-manager
-alsa-utils
-b43-fwcutter
-dmraid
-eject
-gpm
-irqbalance
-microcode_ctl
-smartmontools
-aic94xx-firmware
-atmel-firmware
-b43-openfwwf
-bfa-firmware
-ipw2100-firmware
-ipw2200-firmware
-ivtv-firmware
-iwl100-firmware
-iwl1000-firmware
-iwl3945-firmware
-iwl4965-firmware
-iwl5000-firmware
-iwl5150-firmware
-iwl6000-firmware
-iwl6000g2a-firmware
-iwl6050-firmware
-kernel-firmware
-libertas-usb8388-firmware
-ql2100-firmware
-ql2200-firmware
-ql23xx-firmware
-ql2400-firmware
-ql2500-firmware
-rt61pci-firmware
-rt73usb-firmware
-xorg-x11-drv-ati-firmware
-zd1211-firmware
%end

%post
# Write the yum repository definitions into the installed system.
# Each heredoc below is appended (tee -a) to the named .repo file.

# Google repos: both sections go into /etc/yum.repos.d/google-cloud.repo.
tee -a /etc/yum.repos.d/google-cloud.repo << EOM
[google-compute-engine]
name=Google Compute Engine
baseurl=https://packages.cloud.google.com/yum/repos/google-compute-engine-el8-x86_64-stable
enabled=1
gpgcheck=1
repo_gpgcheck=0
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg
https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
EOM
tee -a /etc/yum.repos.d/google-cloud.repo << EOM
[google-cloud-sdk]
name=Google Cloud SDK
baseurl=https://packages.cloud.google.com/yum/repos/cloud-sdk-el8-x86_64
enabled=1
gpgcheck=1
repo_gpgcheck=0
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg
https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
EOM

# Rocky Linux Cloud Kernel repo.
# The binary repo is enabled; the source repo is added but left disabled.
tee -a /etc/yum.repos.d/Rocky-CloudKernel.repo << EOM
[cloud-kernel]
name=Rocky Linux 8 - Cloud Kernel
baseurl=https://dl.rockylinux.org/pub/sig/8/cloud/x86_64/cloud-kernel
enabled=1
gpgcheck=1
gpgkey=https://dl.rockylinux.org/pub/sig/8/cloud/x86_64/cloud-kernel/RPM-GPG-KEY-Rocky-SIG-Cloud
priority=-1
EOM
tee -a /etc/yum.repos.d/Rocky-CloudKernel.repo << EOM
[cloud-kernel-source]
name=Rocky Linux 8 - Cloud Kernel Source
baseurl=https://dl.rockylinux.org/pub/sig/8/cloud/source/cloud-kernel
enabled=0
gpgcheck=1
gpgkey=https://dl.rockylinux.org/pub/sig/8/cloud/x86_64/cloud-kernel/RPM-GPG-KEY-Rocky-SIG-Cloud
priority=-1
EOM
# Be sure we don't get kernels from the BaseOS repo
# NOTE(review): the line is appended to the end of Rocky-BaseOS.repo, so it
# becomes part of whichever section is last in that file - confirm that this is
# the section the exclude is meant for.
tee -a /etc/yum.repos.d/Rocky-BaseOS.repo << EOM
exclude=kernel*
EOM
%end
# Google Compute Engine kickstart config for Enterprise Linux 8.
# Error handler: write a failure marker to the serial port (/dev/ttyS0) and
# halt the machine.
%onerror
echo "Build Failed!" > /dev/ttyS0
shutdown -h now
%end

%post --erroronfail
# Main post-install script. It imports GPG keys, installs the GCE guest
# packages and Cloud SDK, builds the NVIDIA and Mellanox drivers, hardens the
# SSH configuration, updates all packages, enables dnf-automatic, and rebuilds
# the initramfs.
# Trace every command and send all output to the serial port.
set -x
exec &> /dev/ttyS0
# Delete the dummy user account.
userdel -r gce

# Import all RPM GPG keys.
curl -o /etc/pki/rpm-gpg/google-rpm-package-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
curl -o /etc/pki/rpm-gpg/google-key.gpg https://packages.cloud.google.com/yum/doc/yum-key.gpg
curl -o /etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-SIG-Cloud https://dl.rockylinux.org/pub/sig/8/cloud/x86_64/cloud-kernel/RPM-GPG-KEY-Rocky-SIG-Cloud
rpm --import /etc/pki/rpm-gpg/*

# Configure the network for GCE.
# Given that GCE users typically control the firewall at the network API level,
# we want to leave the standard Linux firewall setup enabled but all-open.
firewall-offline-cmd --set-default-zone=trusted

cat >>/etc/dhcp/dhclient.conf <<EOL
# Set the dhclient retry interval to 10 seconds instead of 5 minutes.
retry 10;
EOL

# Set google-compute-engine config for EL8.
cat >>/etc/default/instance_configs.cfg.distro << EOL
# Disable boto plugin setup.
[InstanceSetup]
set_boto_config = false
EOL

# Install GCE guest packages.
dnf install -y google-compute-engine google-osconfig-agent gce-disk-expand

# Install the Cloud SDK package.
dnf install -y google-cloud-cli

# Install Accelerator components: nvidia and mellanox drivers
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
dnf install -y gcc make kernel-devel kernel
test -f /var/tmp/kernel-upgrade-done || sh -c 'touch /var/tmp/kernel-upgrade-done'
# This script runs under the installer's kernel, so $(uname -r) names the
# installer kernel rather than the kernel just installed into the image.
# Build the drivers against the newest installed kernel-devel tree instead.
accel_kver="$(ls -1 /usr/src/kernels | sort -V | tail -n1)"
curl -L -o nvidia.run https://us.download.nvidia.com/tesla/550.90.12/NVIDIA-Linux-x86_64-550.90.12.run
chmod +x ./nvidia.run
# DKMS - not suitable for prod
./nvidia.run -s --kernel-name="${accel_kver}" --kernel-source-path="/usr/src/kernels/${accel_kver}/"
# Do not leave the driver installer in the image.
rm -f ./nvidia.run
dnf install -y createrepo gdb-headless libtool autoconf rpm-build kernel-rpm-macros patch automake wget lsof tk gcc-gfortran tcl pciutils
wget https://content.mellanox.com/ofed/MLNX_OFED-23.10-3.2.2.0/MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64.tgz
tar xf MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64.tgz
cd MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64
# Point the OFED kernel-support build at the same installed kernel.
./mlnxofedinstall --guest --force --skip-distro-check --add-kernel-support --kernel "${accel_kver}" --kernel-sources "/usr/src/kernels/${accel_kver}"
cd ..
rm -rf MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64 MLNX_OFED_LINUX-23.10-3.2.2.0-rhel8.9-x86_64.tgz

# Copy the kickstart used for this install to /tmp/anaconda-ks.cfg.
cp /run/install/ks.cfg /tmp/anaconda-ks.cfg

# Remove files which shouldn't make it into the image. It's possible these files
# will not exist.
rm -f /etc/boto.cfg /etc/udev/rules.d/70-persistent-net.rules

# Remove ens4 config from installer.
rm -f /etc/sysconfig/network-scripts/ifcfg-ens4

# Disable password authentication by default.
sed -i -e '/^PasswordAuthentication /s/ yes$/ no/' /etc/ssh/sshd_config

# Set ServerAliveInterval and ClientAliveInterval to prevent SSH
# disconnections. The pattern match is tuned to each source config file.
# The $'...' quoting syntax tells the shell to expand escape characters.
sed -i -e $'/^\tServerAliveInterval/d' /etc/ssh/ssh_config
sed -i -e $'/^Host \\*$/a \\\tServerAliveInterval 420' /etc/ssh/ssh_config
sed -i -e '/ClientAliveInterval/s/^.*/ClientAliveInterval 420/' /etc/ssh/sshd_config

# Disable root login via SSH by default.
sed -i -e '/PermitRootLogin yes/s/^.*/PermitRootLogin no/' /etc/ssh/sshd_config

# Update all packages.
dnf -y update

# Make changes to dnf automatic.conf
# Apply updates for security (RHEL) by default. NOTE this will not work in CentOS.
sed -i 's/upgrade_type =.*/upgrade_type = security/' /etc/dnf/automatic.conf
sed -i 's/apply_updates =.*/apply_updates = yes/' /etc/dnf/automatic.conf
# Enable the DNF automatic timer service.
systemctl enable dnf-automatic.timer

# Cleanup this repo- we don't want to continue updating with it.
# Depending which repos are used in build, one or more of these files will not
# exist.
rm -f /etc/yum.repos.d/google-cloud-unstable.repo \
/etc/yum.repos.d/google-cloud-staging.repo

# Clean up the cache for smaller images.
dnf clean all
rm -fr /var/cache/dnf/*

# Blacklist the floppy module.
echo "blacklist floppy" > /etc/modprobe.d/blacklist-floppy.conf
restorecon /etc/modprobe.d/blacklist-floppy.conf

# Generate initramfs from latest kernel instead of the running kernel.
kver="$(ls -t /lib/modules | head -n1)"
dracut -f --kver="${kver}"

# Fix selinux contexts on /etc/resolv.conf.
restorecon /etc/resolv.conf
%end

# Cleanup.
# Runs outside the chroot (--nochroot), so the installed system's /tmp is
# addressed as /mnt/sysimage/tmp. Output is logged to the serial port.
%post --nochroot --log=/dev/ttyS0
set -x
# Empty the installed system's /tmp.
rm -Rf /mnt/sysimage/tmp/*
%end
Loading

0 comments on commit 15e9f65

Please sign in to comment.