[Impacted Area Based PR testing] Roll out T0 and T1 PR checkers. (#16598)

What is the motivation for this PR? In PRs #15666 and #16403, we only partially rolled out the T0 and T1 PR checkers out of concern for resource utilization, since these checkers require over 20 instances and needed to run in parallel with the legacy PR checkers. After a period of observation, we have confirmed the stability of the new system. In this PR, we complete the rollout of the remaining T0 and T1 PR checkers and officially deprecate the old PR checkers. At the same time, we have added all test scripts to PR testing, and scripts are now gathered through pytest marks, so the onboarding PR checkers are no longer needed. How did you do it? In this PR, we complete the rollout of the remaining T0 and T1 PR checkers and officially deprecate the old PR checkers. How did you verify/test it? Tested by the pipeline itself, to see whether the PR checkers pass successfully.
sonic-net · Jan 22, 2025 · aac9677 · aac9677
1 parent d9d3854
commit aac9677
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 115 deletions.
diff --git a/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml b/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
@@ -7,6 +7,10 @@ parameters:
     type: string
     default: ""
 
+  - name: PREPARE_TIME
+    type: number
+    default: 30
+
 steps:
 - script: |
     set -x
@@ -64,7 +68,7 @@ steps:
     pip install azure-kusto-data
     pip install azure-kusto-data azure-identity
 
-    INSTANCE_NUMBER=$(python ./.azure-pipelines/impacted_area_testing/calculate_instance_number.py --scripts $(SCRIPTS) --topology ${{ parameters.TOPOLOGY }} --branch ${{ parameters.BUILD_BRANCH }})
+    INSTANCE_NUMBER=$(python ./.azure-pipelines/impacted_area_testing/calculate_instance_number.py --scripts $(SCRIPTS) --topology ${{ parameters.TOPOLOGY }} --branch ${{ parameters.BUILD_BRANCH }} --prepare_time ${{ parameters.PREPARE_TIME }})
 
     if [[ $? -ne 0 ]]; then
       echo "##vso[task.complete result=Failed;]Get instances number fails."

diff --git a/.azure-pipelines/impacted_area_testing/calculate_instance_number.py b/.azure-pipelines/impacted_area_testing/calculate_instance_number.py
@@ -64,7 +64,7 @@ def get_access_token():
         raise Exception(f"Failed to get token after {MAX_GET_TOKEN_RETRY_TIMES} attempts")
 
 
-def main(scripts, topology, branch):
+def main(scripts, topology, branch, prepare_time):
     ingest_cluster = os.getenv("TEST_REPORT_QUERY_KUSTO_CLUSTER_BACKUP")
     access_token = get_access_token()
 
@@ -107,33 +107,37 @@ def main(scripts, topology, branch):
             # To get the result for a single time, we need to divide by five
             # If response.primary_results is None, which means where is no historical data in Kusto,
             # we will use the default 1800s for a script.
+            running_time = row["TotalRuntime"]
             actual_count = row["ActualCount"]
 
             # There is no relevant records in Kusto
-            if actual_count == 0:
+            if running_time == 0:
                 average_running_time = 1800
             else:
-                average_running_time = row["TotalRuntime"] / actual_count
+                average_running_time = running_time / actual_count
 
         total_running_time += average_running_time
         scripts_running_time[script] = average_running_time
     logging.info(f"Time for each test script: {scripts_running_time}")
     logging.info(f"Total running time: {total_running_time}")
     # Total running time is calculated by seconds, divide by 60 to get minutes
-    # For one instance, we plan to assign 90 minutes to run test scripts
+    # Our goal is to limit the whole PR testing into 120 minutes
+    # As we need some time to prepare testbeds, the prepare time should be subtracted.
     # Obtain the number of instances by rounding up the calculation.
     # To prevent unexpected situations, we set the maximum number of instance
-    print(min(math.ceil(total_running_time / 60 / 90), MAX_INSTANCE_NUMBER))
+    print(min(math.ceil(total_running_time / 60 / (120 - prepare_time)), MAX_INSTANCE_NUMBER))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--topology", help="The topology of testplan", type=str, default="")
     parser.add_argument("--scripts", help="Test scripts to be executed", type=str, default="")
     parser.add_argument("--branch", help="Test branch", type=str, default="")
+    parser.add_argument("--prepare_time", help="Time for preparing testbeds", type=int, default=30)
     args = parser.parse_args()
 
     scripts = args.scripts
     topology = args.topology
     branch = args.branch
-    main(scripts, topology, branch)
+    prepare_time = args.prepare_time
+    main(scripts, topology, branch, prepare_time)
diff --git a/.azure-pipelines/impacted_area_testing/get_test_scripts.py b/.azure-pipelines/impacted_area_testing/get_test_scripts.py
@@ -101,15 +101,7 @@ def collect_scripts_by_topology_type(features: str, location: str) -> dict:
         except Exception as e:
             raise Exception('Exception occurred while trying to get topology in {}, error {}'.format(s, e))
 
-    test_scripts = {k: v for k, v in test_scripts_per_topology_checker.items() if v}
-
-    # This is just for the first stage of rolling out
-    # To avoid the overuse of resource, we will ignore the PR which modifies the common part.
-    if features == "":
-        test_scripts.pop("t0_checker")
-        test_scripts.pop("t1_checker")
-
-    return test_scripts
+    return {k: v for k, v in test_scripts_per_topology_checker.items() if v}
 
 
 def main(features, location):

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -78,19 +78,19 @@ stages:
   - job: get_impacted_area
     displayName: "Get impacted area"
     timeoutInMinutes: 240
-    continueOnError: true
+    continueOnError: false
     pool: sonic-ubuntu-1c
     steps:
       - template: .azure-pipelines/impacted_area_testing/get-impacted-area.yml
 
   - job: impacted_area_t0_elastictest
-    displayName: "impacted-area-kvmtest-t0 by Elastictest - optional"
+    displayName: "impacted-area-kvmtest-t0 by Elastictest"
     dependsOn: get_impacted_area
     condition: contains(dependencies.get_impacted_area.outputs['SetVariableTask.PR_CHECKERS'], 't0_checker')
     variables:
       TEST_SCRIPTS: $[ dependencies.get_impacted_area.outputs['SetVariableTask.TEST_SCRIPTS'] ]
     timeoutInMinutes: 240
-    continueOnError: true
+    continueOnError: false
     pool: sonic-ubuntu-1c
     steps:
       - template: .azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
@@ -106,7 +106,6 @@ stages:
           MAX_WORKER: $(INSTANCE_NUMBER)
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
           MGMT_BRANCH: "master"
-          STOP_ON_FAILURE: "False"
 
   - job: impacted_area_t0_2vlans_elastictest
     displayName: "impacted-area-kvmtest-t0_2vlans by Elastictest"
@@ -134,19 +133,21 @@ stages:
           MGMT_BRANCH: "master"
 
   - job: impacted_area_t1_lag_elastictest
-    displayName: "impacted-area-kvmtest-t1-lag by Elastictest - optional"
+    displayName: "impacted-area-kvmtest-t1-lag by Elastictest"
     dependsOn: get_impacted_area
     condition: contains(dependencies.get_impacted_area.outputs['SetVariableTask.PR_CHECKERS'], 't1_checker')
     variables:
       TEST_SCRIPTS: $[ dependencies.get_impacted_area.outputs['SetVariableTask.TEST_SCRIPTS'] ]
     timeoutInMinutes: 240
-    continueOnError: true
+    continueOnError: false
     pool: sonic-ubuntu-1c
     steps:
       - template: .azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
         parameters:
           TOPOLOGY: t1
           BUILD_BRANCH: $(BUILD_BRANCH)
+          # 50 mins for preparing testbed, 30 mins for pre-test and post-test
+          PREPARE_TIME: 80
 
       - template: .azure-pipelines/run-test-elastictest-template.yml
         parameters:
@@ -156,7 +157,6 @@ stages:
           MAX_WORKER: $(INSTANCE_NUMBER)
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
           MGMT_BRANCH: "master"
-          STOP_ON_FAILURE: "False"
 
   - job: impacted_area_dualtor_elastictest
     displayName: "impacted-area-kvmtest-dualtor by Elastictest"
@@ -172,6 +172,8 @@ stages:
         parameters:
           TOPOLOGY: dualtor
           BUILD_BRANCH: $(BUILD_BRANCH)
+          # 30 mins for preparing testbed, 30 mins for pre-test and 20 mins for post-test
+          PREPARE_TIME: 80
 
       - template: .azure-pipelines/run-test-elastictest-template.yml
         parameters:
@@ -222,6 +224,7 @@ stages:
         parameters:
           TOPOLOGY: t0-sonic
           BUILD_BRANCH: $(BUILD_BRANCH)
+          PREPARE_TIME: 40
 
       - template: .azure-pipelines/run-test-elastictest-template.yml
         parameters:
@@ -259,68 +262,6 @@ stages:
           MGMT_BRANCH: "master"
 
 # Below is the original PR checkers
-  - job: t0_elastictest
-    displayName: "kvmtest-t0 by Elastictest"
-    timeoutInMinutes: 240
-    continueOnError: false
-    pool: sonic-ubuntu-1c
-    steps:
-    - template: .azure-pipelines/run-test-elastictest-template.yml
-      parameters:
-        TOPOLOGY: t0
-        MIN_WORKER: $(T0_INSTANCE_NUM)
-        MAX_WORKER: $(T0_INSTANCE_NUM)
-        KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-        MGMT_BRANCH: "master"
-
-  - job: t1_lag_elastictest
-    displayName: "kvmtest-t1-lag by Elastictest"
-    timeoutInMinutes: 240
-    continueOnError: false
-    pool: sonic-ubuntu-1c
-    steps:
-    - template: .azure-pipelines/run-test-elastictest-template.yml
-      parameters:
-        TOPOLOGY: t1-lag
-        MIN_WORKER: $(T1_LAG_INSTANCE_NUM)
-        MAX_WORKER: $(T1_LAG_INSTANCE_NUM)
-        KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-        MGMT_BRANCH: "master"
-
-  - job: onboarding_elastictest_t0
-    displayName: "onboarding t0 testcases by Elastictest - optional"
-    timeoutInMinutes: 240
-    continueOnError: true
-    pool: sonic-ubuntu-1c
-    steps:
-      - template: .azure-pipelines/run-test-elastictest-template.yml
-        parameters:
-          TOPOLOGY: t0
-          STOP_ON_FAILURE: "False"
-          RETRY_TIMES: 0
-          MIN_WORKER: $(T0_ONBOARDING_SONIC_INSTANCE_NUM)
-          MAX_WORKER: $(T0_ONBOARDING_SONIC_INSTANCE_NUM)
-          KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: "master"
-          TEST_SET: onboarding_t0
-
-  - job: onboarding_elastictest_t1
-    displayName: "onboarding t1 testcases by Elastictest - optional"
-    timeoutInMinutes: 240
-    continueOnError: true
-    pool: sonic-ubuntu-1c
-    steps:
-      - template: .azure-pipelines/run-test-elastictest-template.yml
-        parameters:
-          TOPOLOGY: t1-lag
-          STOP_ON_FAILURE: "False"
-          RETRY_TIMES: 0
-          MIN_WORKER: $(T1_LAG_ONBOARDING_INSTANCE_NUM)
-          MAX_WORKER: $(T1_LAG_ONBOARDING_INSTANCE_NUM)
-          KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: "master"
-          TEST_SET: onboarding_t1
-
   - job: onboarding_multi_asic_elastictest_t1
     displayName: "onboarding t1 testcases for kvmtest-multi-asic-t1-lag by Elastictest - optional"
     timeoutInMinutes: 240
@@ -337,33 +278,3 @@ stages:
           NUM_ASIC: 4
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
           MGMT_BRANCH: "master"
-
-#  - job: onboarding_elastictest_dualtor
-#    displayName: "onboarding dualtor testcases by Elastictest - optional"
-#    timeoutInMinutes: 240
-#    continueOnError: true
-#    pool: sonic-ubuntu-1c
-#    steps:
-#      - template: .azure-pipelines/run-test-elastictest-template.yml
-#        parameters:
-#          TOPOLOGY: dualtor
-#          STOP_ON_FAILURE: "False"
-#          RETRY_TIMES: 0
-#          MIN_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
-#          MAX_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
-#          KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-#          MGMT_BRANCH: "master"
-#          TEST_SET: onboarding_dualtor
-
-#  - job: wan_elastictest
-#    displayName: "kvmtest-wan by Elastictest"
-#    timeoutInMinutes: 240
-#    continueOnError: false
-#    pool: sonic-ubuntu-1c
-#    steps:
-#      - template: .azure-pipelines/run-test-elastictest-template.yml
-#        parameters:
-#          TOPOLOGY: wan-pub
-#          MIN_WORKER: $(WAN_INSTANCE_NUM)
-#          MAX_WORKER: $(WAN_INSTANCE_NUM)
-#          COMMON_EXTRA_PARAMS: "--skip_sanity "