feat: collect log from provisioning rosa HCP

konflux-ci · Dec 11, 2024 · 3ea8889 · 3ea8889
1 parent 1917411
commit 3ea8889
Show file tree

Hide file tree

Showing 2 changed files with 217 additions and 0 deletions.
diff --git a/tasks/rosa/hosted-cp/README.md b/tasks/rosa/hosted-cp/README.md
@@ -0,0 +1,3 @@
+# Tekton Task: Rosa HCP
+
+Please read and update the [rosa-hcp tasks docs](../../../docs/qe-available-tasks/rosa.md) in the docs folder.
diff --git a/tasks/rosa/hosted-cp/rosa-hcp-provision/0.2/rosa-hcp-provision.yaml b/tasks/rosa/hosted-cp/rosa-hcp-provision/0.2/rosa-hcp-provision.yaml
@@ -0,0 +1,214 @@
+# Tekton Task that provisions an ephemeral ROSA HCP cluster, exposes an `oc login` command as a
+# task result, and captures the provisioning output in a log file that a later step pushes to OCI.
+apiVersion: tekton.dev/v1
+kind: Task
+metadata:
+  name: rosa-hcp-provision
+spec:
+  description: |
+    The `rosa-hcp-provision` task automates the creation and provisioning of an ephemeral OpenShift cluster using Red Hat OpenShift on AWS (ROSA) with Hosted Control Planes (HCP).
+    The task takes several parameters, including the OpenShift version, AWS machine type, and cluster name, to configure and deploy the cluster on AWS.
+    It uses credentials stored in a Kubernetes secret for authentication and configuration of AWS and ROSA.
+    Once the cluster is provisioned, the task outputs a login command to access the newly created cluster, which can be used in subsequent pipeline steps.
+  results:
+    - name: ocp-login-command
+      description: Command to log in to the newly created ephemeral OpenShift cluster.
+  # Inputs supplied by the caller; `replicas` is the only parameter with a default.
+  params:
+    # Used as a prefix filter on the versions returned by `rosa list version`.
+    - name: ocp-version
+      type: string
+      description: The version of the OpenShift Container Platform (OCP) to deploy. This will be used to fetch the corresponding HCP version for deployment.
+    - name: cluster-name
+      type: string
+      description: The unique name of the OpenShift cluster to be created.
+    - name: machine-type
+      type: string
+      description: The AWS EC2 instance type to be used for the worker nodes of the OpenShift cluster (e.g., m5.xlarge).
+    # Declared as a string; the value is substituted verbatim into `rosa create cluster --replicas`.
+    - name: replicas
+      type: string
+      description: The number of worker nodes to provision in the cluster. Defaults to 3 worker nodes.
+      default: '3'
+    # Name of the secret that backs the `konflux-test-infra-volume` volume below.
+    - name: konflux-test-infra-secret
+      type: string
+      description: The name of the Kubernetes secret that contains AWS and ROSA configuration credentials needed for cluster provisioning.
+    # File name under /usr/local/konflux-test-infra that the provision script parses with jq.
+    - name: cloud-credential-key
+      type: string
+      description: The key within the secret where AWS ROSA configurations (e.g., credentials, roles) are stored.
+    # Forwarded to the secure-push-oci step as its `oci-ref` parameter.
+    - name: oci-container
+      type: string
+      description: The ORAS container registry URI where artifacts will be stored.
+  # Secret volume mounted by the provision step and passed by name to the secure-push-oci step.
+  volumes:
+    - name: konflux-test-infra-volume
+      secret:
+        secretName: "$(params.konflux-test-infra-secret)"
+  steps:
+    # Step 1: create the cluster. The script's output is also written to cluster-provision.log
+    # inside workingDir.
+    - name: provision
+      # NOTE(review): `:latest` is a mutable tag; consider pinning a digest for reproducible runs.
+      image: quay.io/konflux-qe-incubator/konflux-qe-tools:latest
+      # A failure here does not stop the task, so the following steps still run.
+      # Presumably the final fail-if-any-step-failed step then surfaces the failure - confirm
+      # against that StepAction.
+      onError: continue
+      volumeMounts:
+        - name: konflux-test-infra-volume
+          mountPath: /usr/local/konflux-test-infra
+      workingDir: /workspace/cluster-provision
+      # Task params exposed to the script as environment variables.
+      env:
+        - name: CLUSTER_NAME
+          value: "$(params.cluster-name)"
+        - name: OCP_VERSION
+          value: "$(params.ocp-version)"
+        - name: MACHINE_TYPE
+          value: "$(params.machine-type)"
+      script: |
+        #!/usr/bin/env bash
+        # Run under bash explicitly: without a shebang Tekton falls back to /bin/sh, while this
+        # script relies on bash features such as `set -o pipefail` and `local`.
+        set -o errexit
+        set -o nounset
+        set -o pipefail
+
+        # JSON file with the AWS/ROSA settings, provided by the mounted secret volume.
+        CRED_FILE="/usr/local/konflux-test-infra/$(params.cloud-credential-key)"
+
+        # read_cred <jq-filter>
+        # Prints the value selected by <jq-filter> from CRED_FILE.
+        # `jq -e` turns a missing (null) field into a non-zero exit status, so the script stops
+        # with a clear message instead of carrying on with the literal string "null".
+        # Only the filter and the file path are logged, never the value.
+        read_cred() {
+            local value
+            if ! value=$(jq -er "$1" "$CRED_FILE"); then
+                echo "ERROR: Missing or unreadable credential field $1 in $CRED_FILE" >&2
+                exit 1
+            fi
+            printf '%s' "$value"
+        }
+
+        export ROSA_TOKEN AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY BILLING_ACCOUNT_ID AWS_OIDC_CONFIG_ID OPERATOR_ROLES_PREFIX \
+              SUBNET_IDS INSTALL_ROLE_ARN SUPPORT_ROLE_ARN WORKER_ROLE_ARN REGION
+
+        ROSA_TOKEN=$(read_cred '.aws["rosa-hcp"]["rosa-token"]')
+        AWS_ACCESS_KEY_ID=$(read_cred '.aws["access-key-id"]')
+        AWS_SECRET_ACCESS_KEY=$(read_cred '.aws["access-key-secret"]')
+        BILLING_ACCOUNT_ID=$(read_cred '.aws["aws-account-id"]')
+        AWS_OIDC_CONFIG_ID=$(read_cred '.aws["rosa-hcp"]["aws-oidc-config-id"]')
+        OPERATOR_ROLES_PREFIX=$(read_cred '.aws["rosa-hcp"]["operator-roles-prefix"]')
+        SUBNET_IDS=$(read_cred '.aws["rosa-hcp"]["subnets-ids"]')
+        INSTALL_ROLE_ARN=$(read_cred '.aws["rosa-hcp"]["install-role-arn"]')
+        SUPPORT_ROLE_ARN=$(read_cred '.aws["rosa-hcp"]["support-role-arn"]')
+        WORKER_ROLE_ARN=$(read_cred '.aws["rosa-hcp"]["worker-role-arn"]')
+        REGION=$(read_cred '.aws["region"]')
+
+        main() {
+          # Stores the access key, secret key and region in the AWS CLI configuration.
+          config_aws_creds() {
+              local entry
+              printf "INFO: Configure AWS Credentials...\n"
+              for entry in \
+                  "aws_access_key_id=$AWS_ACCESS_KEY_ID" \
+                  "aws_secret_access_key=$AWS_SECRET_ACCESS_KEY" \
+                  "region=$REGION"; do
+                  # Split at the first '=' only, so a value that itself contains '=' stays intact.
+                  aws configure set "${entry%%=*}" "${entry#*=}"
+              done
+          }
+
+          # Prints the `rosa describe cluster` output for CLUSTER_NAME to help diagnose failures.
+          print_debug_info() {
+              printf "INFO: Print debug info......\n"
+              rosa \
+                  --region "$REGION" \
+                  describe cluster \
+                  --cluster="$CLUSTER_NAME"
+          }
+
+          # Even when the cluster is shown as ready on the OCM side and the cluster operators are
+          # available, some of the cluster operators may still be progressing.
+          #
+          # Polls `kubectl get clusteroperators` up to max_attempts times, 10 seconds apart, until
+          # the call succeeds, then waits up to 60 minutes for every cluster operator to report
+          # Progressing=false. The output of each kubectl call is captured in co_status.log and
+          # echoed.
+          # Returns non-zero when the operators never become accessible or when the wait fails.
+          check_clusteroperators() {
+              local STATUS_LOG="co_status.log"
+              local max_attempts=10
+              local attempt
+              local reachable=false
+              local wait_status=0
+              echo "[INFO] Checking cluster operators' status..."
+              # Retry, because the API may not serve clusteroperators right away.
+              for attempt in $(seq 1 "$max_attempts"); do
+                  echo "[INFO] Attempt $attempt/$max_attempts"
+                  if kubectl get clusteroperators -A > "$STATUS_LOG" 2>&1; then
+                      cat "$STATUS_LOG"
+                      echo "[INFO] Cluster operators are accessible."
+                      reachable=true
+                      break
+                  fi
+                  # There is nothing left to retry after the final attempt, so skip the sleep.
+                  if [ "$attempt" -lt "$max_attempts" ]; then
+                      echo "[INFO] Attempt $attempt failed, retrying in 10 seconds..."
+                      sleep 10
+                  fi
+              done
+              # Decide on an explicit flag: the loop counter equals max_attempts both when every
+              # attempt failed and when the last attempt succeeded.
+              if [ "$reachable" != true ]; then
+                  echo "[ERROR] All attempts to access cluster operators failed. Check $STATUS_LOG for details."
+                  cat "$STATUS_LOG"
+                  return 1
+              fi
+              echo "[INFO] Waiting for cluster operators to be in 'Progressing=false' state..."
+              # Capture the status instead of letting errexit abort here, so the captured output
+              # is still printed when the wait fails or times out.
+              kubectl wait clusteroperators --all --for=condition=Progressing=false --timeout=60m > "$STATUS_LOG" 2>&1 || wait_status=$?
+              cat "$STATUS_LOG"
+              return "$wait_status"
+          }
+
+          # Resolves OCP_VERSION to the newest version in the stable channel whose raw_id starts
+          # with it, and stores the result in the global HCP_FULL_VERSION (read by deploy_cluster).
+          # Exits with status 1 when no listed version matches.
+          get_hcp_full_version() {
+              local rosa_output
+              rosa_output=$(rosa list version --channel-group stable --region "$REGION" --hosted-cp -o json)
+              # - OCP_VERSION is passed with --arg, so it is treated as data and never parsed as
+              #   part of the jq program.
+              # - max_by compares the dot-separated components numerically, so 4.15.10 ranks above
+              #   4.15.9 (a plain string `max` would pick 4.15.9). Non-numeric components count as 0.
+              # - `// empty` turns the null produced for "no match" into empty output; otherwise
+              #   jq -r would print the string "null" and the check below would never trigger.
+              HCP_FULL_VERSION=$(echo "$rosa_output" | jq -r --arg prefix "$OCP_VERSION" \
+                  '[.[].raw_id | select(startswith($prefix))] | (max_by(split(".") | map(tonumber? // 0)) // empty)')
+              if [ -z "$HCP_FULL_VERSION" ]; then
+                  echo "Failed to get the HCP full version of $OCP_VERSION" >&2
+                  exit 1
+              fi
+          }
+
+          # Provisions the cluster end to end:
+          #   1. configures the AWS CLI and logs in to ROSA,
+          #   2. creates the ROSA HCP cluster and follows the install logs,
+          #   3. creates a cluster admin and writes the `oc login` command to the task result,
+          #   4. retries the login (up to 10 retries, 60 seconds apart),
+          #   5. waits up to 300 seconds for `kubectl get nodes` to succeed, then waits for the
+          #      cluster operators to settle.
+          # Any failure exits the script with a non-zero status.
+          deploy_cluster() {
+              local admin_output admin_user admin_pass api_url
+              local max_retries=10
+              local retries=0
+
+              printf "INFO: Log in to your Red Hat account...\n"
+              config_aws_creds
+              rosa login --token="$ROSA_TOKEN"
+
+              printf "INFO: Create ROSA with HCP cluster...\n"
+              get_hcp_full_version
+              rosa create cluster --cluster-name "$CLUSTER_NAME" \
+                  --sts --mode=auto --oidc-config-id "$AWS_OIDC_CONFIG_ID" \
+                  --operator-roles-prefix "$OPERATOR_ROLES_PREFIX" --region "$REGION" --version "$HCP_FULL_VERSION" \
+                  --role-arn "$INSTALL_ROLE_ARN" \
+                  --support-role-arn "$SUPPORT_ROLE_ARN" \
+                  --worker-iam-role "$WORKER_ROLE_ARN" \
+                  --compute-machine-type "$MACHINE_TYPE" \
+                  --subnet-ids="$SUBNET_IDS" \
+                  --billing-account "$BILLING_ACCOUNT_ID" \
+                  --replicas "$(params.replicas)" \
+                  --tags konflux-ci:true,creation-date:$(date -u +"%Y-%m-%d"),cluster-type:rosa-hcp \
+                  --hosted-cp -y
+
+              printf "INFO: Track the progress of the cluster creation...\n"
+              rosa logs install --cluster="$CLUSTER_NAME" --region "$REGION" --watch
+
+              printf "INFO: ROSA with HCP cluster is ready, create a cluster admin account for accessing the cluster\n"
+              # The output is captured rather than printed because it contains the admin password.
+              admin_output="$(rosa create admin --region "$REGION" --cluster="$CLUSTER_NAME")"
+
+              # Get the admin account credentials and API server URL.
+              # `|| true` keeps errexit/pipefail from aborting silently when grep finds no match;
+              # the check below reports the problem instead.
+              admin_user="$(echo "$admin_output" | grep -oP '(?<=--username ).*(?= --password)' || true)"
+              admin_pass="$(echo "$admin_output" | grep -oP '(?<=--password ).*' || true)"
+              api_url="$(echo "$admin_output" | grep -oP '(?<=oc login ).*(?= --username)' || true)"
+              if [ -z "$admin_user" ] || [ -z "$admin_pass" ] || [ -z "$api_url" ]; then
+                  echo "ERROR: Failed to parse the login command from the 'rosa create admin' output." >&2
+                  print_debug_info
+                  exit 1
+              fi
+
+              printf "INFO: Storing login command...\n"
+              echo "oc login $api_url --username $admin_user --password $admin_pass" > "$(results.ocp-login-command.path)"
+
+              # Use the admin account to login to the cluster in a loop until the account is active.
+              printf "INFO: Check if it's able to login to OCP cluster...\n"
+              while ! oc login "$api_url" --username "$admin_user" --password "$admin_pass" >/dev/null 2>&1; do
+                  if [ "$retries" -eq "$max_retries" ]; then
+                      echo "ERROR: Failed to login the cluster." >&2
+                      print_debug_info
+                      exit 1
+                  fi
+                  sleep 60
+                  retries=$((retries + 1))
+                  echo "Retried $retries times..."
+              done
+
+              # Workaround: Check if apiserver is ready by calling kubectl get nodes
+              printf "INFO: Check if apiserver is ready...\n"
+              if ! timeout 300s bash -c "while ! kubectl get nodes >/dev/null 2>/dev/null; do printf '.'; sleep 10; done"; then
+                  echo "ERROR: API server is not ready" >&2
+                  exit 1
+              fi
+              check_clusteroperators
+          }
+
+          deploy_cluster
+        }
+        # Mirror everything main prints (stdout and stderr) into cluster-provision.log in the
+        # working directory, which is the same path the secure-push-oci step receives as
+        # workdir-path. With pipefail set at the top of the script, a failure inside main still
+        # fails this pipeline even though tee succeeds.
+        main 2>&1 | tee cluster-provision.log
+    # Step 2: runs the secure-push-oci StepAction, fetched through the git resolver.
+    # It receives the provision step's working directory and the `oci-container` param;
+    # presumably it uploads that directory to the OCI reference - confirm against the StepAction.
+    - name: secure-push-oci
+      ref:
+        resolver: git
+        params:
+          - name: url
+            value: https://github.com/konflux-ci/tekton-integration-catalog.git
+          # NOTE(review): `main` is a moving reference; consider pinning a commit SHA.
+          - name: revision
+            value: main
+          - name: pathInRepo
+            value: stepactions/secure-push-oci/0.1/secure-push-oci.yaml
+      params:
+        - name: workdir-path
+          value: /workspace/cluster-provision
+        - name: oci-ref
+          value: $(params.oci-container)
+        # Name of the task-level secret volume declared under `volumes`.
+        - name: credentials-volume-name
+          value: konflux-test-infra-volume
+    # Step 3: runs the fail-if-any-step-failed StepAction, fetched through the git resolver.
+    # Presumably it fails the task when an earlier step failed (the provision step uses
+    # `onError: continue`) - confirm against the StepAction.
+    - name: fail-if-any-step-failed
+      ref:
+        resolver: git
+        params:
+          - name: url
+            value: https://github.com/konflux-ci/tekton-integration-catalog.git
+          # NOTE(review): `main` is a moving reference; consider pinning a commit SHA.
+          - name: revision
+            value: main
+          - name: pathInRepo
+            value: stepactions/fail-if-any-step-failed/0.1/fail-if-any-step-failed.yaml
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Tekton Task: Rosa HCP

		Please read and update the [rosa-hcp tasks docs](../../../docs/qe-available-tasks/rosa.md) in the docs folder.