Skip to content

Commit

Permalink
feat: collect log from provisioning rosa HCP
Browse files Browse the repository at this point in the history
  • Loading branch information
psturc committed Dec 11, 2024
1 parent 1917411 commit 3ea8889
Show file tree
Hide file tree
Showing 2 changed files with 217 additions and 0 deletions.
3 changes: 3 additions & 0 deletions tasks/rosa/hosted-cp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Tekton Task: Rosa HCP

Please read/update [rosa-hcp tasks docs](../../../docs/qe-available-tasks/rosa.md) from docs folder.
214 changes: 214 additions & 0 deletions tasks/rosa/hosted-cp/rosa-hcp-provision/0.2/rosa-hcp-provision.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
apiVersion: tekton.dev/v1
kind: Task
metadata:
name: rosa-hcp-provision
spec:
description: |
The `rosa-hcp-provision` task automates the creation and provisioning of an ephemeral OpenShift cluster using Red Hat OpenShift on AWS (ROSA) with Hosted Control Planes (HCP).
The task takes several parameters, including the OpenShift version, AWS machine type, and cluster name, to configure and deploy the cluster on AWS.
It uses credentials stored in a Kubernetes secret for authentication and configuration of AWS and ROSA.
Once the cluster is provisioned, the task outputs a login command to access the newly created cluster, which can be used in subsequent pipeline steps.
results:
- name: ocp-login-command
description: Command to log in to the newly ephemeral OpenShift cluster.
params:
- name: ocp-version
type: string
description: The version of the OpenShift Container Platform (OCP) to deploy. This will be used to fetch the corresponding HCP version for deployment.
- name: cluster-name
type: string
description: The unique name of the OpenShift cluster to be created.
- name: machine-type
type: string
description: The AWS EC2 instance type to be used for the worker nodes of the OpenShift cluster (e.g., m5.xlarge).
- name: replicas
type: string
description: The number of worker nodes to provision in the cluster. Defaults to 3 worker nodes.
default: '3'
- name: konflux-test-infra-secret
type: string
description: The name of the Kubernetes secret that contains AWS and ROSA configuration credentials needed for cluster provisioning.
- name: cloud-credential-key
type: string
description: The key within the secret where AWS ROSA configurations (e.g., credentials, roles) are stored.
- name: oci-container
type: string
description: The ORAS container registry URI where artifacts will be stored.
volumes:
- name: konflux-test-infra-volume
secret:
secretName: "$(params.konflux-test-infra-secret)"
steps:
- name: provision
image: quay.io/konflux-qe-incubator/konflux-qe-tools:latest
onError: continue
volumeMounts:
- name: konflux-test-infra-volume
mountPath: /usr/local/konflux-test-infra
workingDir: /workspace/cluster-provision
env:
- name: CLUSTER_NAME
value: "$(params.cluster-name)"
- name: OCP_VERSION
value: "$(params.ocp-version)"
- name: MACHINE_TYPE
value: "$(params.machine-type)"
script: |
set -o errexit
set -o nounset
set -o pipefail
export ROSA_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY BILLING_ACCOUNT_ID AWS_OIDC_CONFIG_ID OPERATOR_ROLES_PREFIX \
SUBNET_IDS INSTALL_ROLE_ARN SUPPORT_ROLE_ARN WORKER_ROLE_ARN REGION
ROSA_TOKEN=$(jq -r '.aws["rosa-hcp"]["rosa-token"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
AWS_ACCESS_KEY_ID=$(jq -r '.aws["access-key-id"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
AWS_SECRET_ACCESS_KEY=$(jq -r '.aws["access-key-secret"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
BILLING_ACCOUNT_ID=$(jq -r '.aws["aws-account-id"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
AWS_OIDC_CONFIG_ID=$(jq -r '.aws["rosa-hcp"]["aws-oidc-config-id"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
OPERATOR_ROLES_PREFIX=$(jq -r '.aws["rosa-hcp"]["operator-roles-prefix"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
SUBNET_IDS=$(jq -r '.aws["rosa-hcp"]["subnets-ids"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
INSTALL_ROLE_ARN=$(jq -r '.aws["rosa-hcp"]["install-role-arn"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
SUPPORT_ROLE_ARN=$(jq -r '.aws["rosa-hcp"]["support-role-arn"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
WORKER_ROLE_ARN=$(jq -r '.aws["rosa-hcp"]["worker-role-arn"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
REGION=$(jq -r '.aws["region"]' /usr/local/konflux-test-infra/$(params.cloud-credential-key))
main() {
config_aws_creds() {
printf "INFO: Configure AWS Credentials...\n"
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID"
aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY"
aws configure set region "$REGION"
}
print_debug_info() {
printf "INFO: Print debug info......\n"
rosa --region "$REGION" describe cluster --cluster="$CLUSTER_NAME"
}
# Even the cluster is shown ready on ocm side, and the cluster operators are available, some of the cluster operators are still progressing.
check_clusteroperators() {
local STATUS_LOG="co_status.log"
local max_attempts=10
local attempt
echo "[INFO] Checking cluster operators' status..."
# retrying to get clusteroperator. Makes sense in case master nodes are not ready
for attempt in $(seq 1 $max_attempts); do
echo "[INFO] Attempt $attempt/$max_attempts"
if kubectl get clusteroperators -A > "$STATUS_LOG" 2>&1; then
cat "$STATUS_LOG"
echo "[INFO] Cluster operators are accessible."
break
fi
echo "[INFO] Attempt $attempt failed, retrying in 10 seconds..."
sleep 10
done
if [ $attempt -eq $max_attempts ]; then
echo "[ERROR] All attempts to access cluster operators failed. Check $STATUS_LOG for details."
cat "$STATUS_LOG"
return 1
fi
echo "[INFO] Waiting for cluster operators to be in 'Progressing=false' state..."
kubectl wait clusteroperators --all --for=condition=Progressing=false --timeout=60m > "$STATUS_LOG" 2>&1
cat "$STATUS_LOG"
}
get_hcp_full_version() {
rosa_output=$(rosa list version --channel-group stable --region "$REGION" --hosted-cp -o json)
raw_id=$(echo "$rosa_output" | jq -r "[.[].raw_id | select(startswith(\"$OCP_VERSION\"))] | max")
HCP_FULL_VERSION="$raw_id"
if [ -z "$HCP_FULL_VERSION" ]; then
echo "Failed to get the HCP full version of $OCP_VERSION" >&2
exit 1
fi
}
deploy_cluster() {
printf "INFO: Log in to your Red Hat account...\n"
config_aws_creds
rosa login --token="$ROSA_TOKEN"
printf "INFO: Create ROSA with HCP cluster...\n"
get_hcp_full_version
rosa create cluster --cluster-name "$CLUSTER_NAME" \
--sts --mode=auto --oidc-config-id "$AWS_OIDC_CONFIG_ID" \
--operator-roles-prefix "$OPERATOR_ROLES_PREFIX" --region "$REGION" --version "$HCP_FULL_VERSION" \
--role-arn "$INSTALL_ROLE_ARN" \
--support-role-arn "$SUPPORT_ROLE_ARN" \
--worker-iam-role "$WORKER_ROLE_ARN" \
--compute-machine-type "$MACHINE_TYPE" \
--subnet-ids="$SUBNET_IDS" \
--billing-account "$BILLING_ACCOUNT_ID" \
--replicas $(params.replicas) \
--tags konflux-ci:true,creation-date:$(date -u +"%Y-%m-%d"),cluster-type:rosa-hcp \
--hosted-cp -y
printf "INFO: Track the progress of the cluster creation...\n"
rosa logs install --cluster="$CLUSTER_NAME" --region "$REGION" --watch
printf "INFO: ROSA with HCP cluster is ready, create a cluster admin account for accessing the cluster\n"
admin_output="$(rosa create admin --region "$REGION" --cluster="$CLUSTER_NAME")"
# Get the admin account credentials and API server URL
admin_user="$(echo "$admin_output" | grep -oP '(?<=--username ).*(?= --password)')"
admin_pass="$(echo "$admin_output" | grep -oP '(?<=--password ).*')"
api_url="$(echo "$admin_output" | grep -oP '(?<=oc login ).*(?= --username)')"
printf "INFO: Storing login command...\n"
echo "oc login $api_url --username $admin_user --password $admin_pass" > $(results.ocp-login-command.path)
# Use the admin account to login to the cluster in a loop until the account is active.
printf "INFO: Check if it's able to login to OCP cluster...\n"
max_retries=10
retries=0
while ! oc login "$api_url" --username "$admin_user" --password "$admin_pass" >/dev/null 2>&1; do
if [ "$retries" -eq "$max_retries" ]; then
echo "ERROR: Failed to login the cluster." >&2
print_debug_info
exit 1
fi
sleep 60
retries=$((retries + 1))
echo "Retried $retries times..."
done
#Workaround: Check if apiserver is ready by calling kubectl get nodes
printf "INFO: Check if apiserver is ready...\n"
if ! timeout 300s bash -c "while ! kubectl get nodes >/dev/null 2>/dev/null; do printf '.'; sleep 10; done"; then
echo "ERROR: API server is not ready" >&2
exit 1
fi
check_clusteroperators
}
deploy_cluster
}
main 2>&1 | tee cluster-provision.log
- name: secure-push-oci
ref:
resolver: git
params:
- name: url
value: https://github.com/konflux-ci/tekton-integration-catalog.git
- name: revision
value: main
- name: pathInRepo
value: stepactions/secure-push-oci/0.1/secure-push-oci.yaml
params:
- name: workdir-path
value: /workspace/cluster-provision
- name: oci-ref
value: $(params.oci-container)
- name: credentials-volume-name
value: konflux-test-infra-volume
- name: fail-if-any-step-failed
ref:
resolver: git
params:
- name: url
value: https://github.com/konflux-ci/tekton-integration-catalog.git
- name: revision
value: main
- name: pathInRepo
value: stepactions/fail-if-any-step-failed/0.1/fail-if-any-step-failed.yaml

0 comments on commit 3ea8889

Please sign in to comment.