Merge branch 'master' into support_qos_kvm

sonic-net · Jan 6, 2025 · 7b9ccbe · 7b9ccbe
2 parents f5a7557 + 169c7be
commit 7b9ccbe
Show file tree

Hide file tree

Showing 77 changed files with 10,169 additions and 433 deletions.
diff --git a/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml b/.azure-pipelines/impacted_area_testing/calculate-instance-numbers.yml
@@ -0,0 +1,76 @@
+parameters:
+  - name: TOPOLOGY
+    type: string
+    default: ""
+
+  - name: BUILD_BRANCH
+    type: string
+    default: ""
+
+steps:
+# Step: pick the test scripts of the requested topology out of the TEST_SCRIPTS
+# pipeline variable and publish them as the comma-separated SCRIPTS variable.
+- script: |
+    set -x
+
+    sudo apt-get update && sudo apt-get install -y jq
+
+    # TEST_SCRIPTS is read as a JSON object keyed by topology name; select the
+    # entry that belongs to this topology (compact, raw output).
+    TEST_SCRIPTS=$(echo '$(TEST_SCRIPTS)' | jq -r -c '."${{ parameters.TOPOLOGY }}"')
+
+    # NOTE(review): a topology key that is missing from the JSON makes jq print
+    # "null" with a zero exit status, so this check presumably only catches
+    # malformed JSON - confirm whether the missing-key case needs handling.
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get test scripts of specfic topology fails."
+      exit 1
+    fi
+
+    # Flatten the JSON array into "a,b,c" and expose it to the following steps.
+    SCRIPTS=$(echo "$TEST_SCRIPTS" | jq -r '. | join(",")')
+    echo -n "##vso[task.setvariable variable=SCRIPTS]$SCRIPTS"
+  displayName: "Get ${{ parameters.TOPOLOGY }} test scripts"
+
+# Step: make sure the Azure CLI (`az`) is available on the agent, installing it
+# from the Microsoft apt repository when it is missing.
+- script: |
+    set -x
+
+    # Check if azure cli is installed. If not, try to install it
+    if ! command -v az; then
+      echo "Azure CLI is not installed. Trying to install it..."
+
+      # Every apt-get call passes DPkg::Lock::Timeout=600 so that it waits for
+      # the dpkg lock instead of failing when another process holds it.
+      echo "Get packages needed for the installation process"
+      sudo apt-get -o DPkg::Lock::Timeout=600 update
+      sudo apt-get -o DPkg::Lock::Timeout=600 -y install apt-transport-https ca-certificates curl gnupg lsb-release
+
+      echo "Download and install the Microsoft signing key"
+      sudo mkdir -p /etc/apt/keyrings
+      curl -sLS https://packages.microsoft.com/keys/microsoft.asc |
+        gpg --dearmor | sudo tee /etc/apt/keyrings/microsoft.gpg > /dev/null
+      sudo chmod go+r /etc/apt/keyrings/microsoft.gpg
+
+      echo "Add the Azure CLI software repository"
+      AZ_DIST=$(lsb_release -cs)
+      # The continuation lines of this multi-line string sit at the block
+      # scalar's base indentation on purpose, so they are written to the
+      # .sources file without leading spaces.
+      echo "Types: deb
+    URIs: https://packages.microsoft.com/repos/azure-cli/
+    Suites: ${AZ_DIST}
+    Components: main
+    Architectures: $(dpkg --print-architecture)
+    Signed-by: /etc/apt/keyrings/microsoft.gpg" | sudo tee /etc/apt/sources.list.d/azure-cli.sources
+
+      echo "Update repository information and install the azure-cli package"
+      sudo apt-get -o DPkg::Lock::Timeout=600 update
+      sudo apt-get -o DPkg::Lock::Timeout=600 -y install azure-cli
+    else
+      echo "Azure CLI is already installed"
+    fi
+  displayName: "Install azure-cli"
+
+# Step: run calculate_instance_number.py for this topology and publish its
+# stdout (a single integer) as the INSTANCE_NUMBER pipeline variable.
+- script: |
+    set -x
+
+    # Install the script's Python dependencies in one go (azure-kusto-data was
+    # previously installed twice).
+    pip install azure-kusto-data azure-identity
+
+    # The arguments are quoted so that an empty or whitespace-only value is
+    # still passed as one argument instead of breaking argument parsing.
+    INSTANCE_NUMBER=$(python ./.azure-pipelines/impacted_area_testing/calculate_instance_number.py --scripts "$(SCRIPTS)" --topology "${{ parameters.TOPOLOGY }}" --branch "${{ parameters.BUILD_BRANCH }}")
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get instances number fails."
+      exit 1
+    fi
+
+    echo "$INSTANCE_NUMBER"
+    echo -n "##vso[task.setvariable variable=INSTANCE_NUMBER]$INSTANCE_NUMBER"
+  displayName: "Calculate instance number"
diff --git a/.azure-pipelines/impacted_area_testing/calculate_instance_number.py b/.azure-pipelines/impacted_area_testing/calculate_instance_number.py
@@ -0,0 +1,142 @@
+import os
+import argparse
+import math
+import logging
+from constant import PR_CHECKER_TOPOLOGY_NAME, MAX_INSTANCE_NUMBER, MAX_GET_TOKEN_RETRY_TIMES
+from azure.kusto.data import KustoConnectionStringBuilder, KustoClient
+
+logging.basicConfig(level=logging.INFO)
+
+
+def parse_list_from_str(s):
+    """Split a comma-separated string into a list of trimmed, non-empty items.
+
+    Azure Pipelines cannot pass an empty parameter, so a single space ' ' is
+    used as a stand-in for "empty"; whitespace-only input is therefore treated
+    the same as empty input.
+
+    Returns None when the input is empty (or falsy), otherwise the list of
+    items with surrounding whitespace removed and blank items dropped.
+    """
+    text = s.strip() if isinstance(s, str) else s
+    if not text:
+        return None
+
+    trimmed_items = (piece.strip() for piece in text.split(','))
+    return [item for item in trimmed_items if item]
+
+
+def _run_az_command_with_retry(command, action, require_output=False):
+    """Run an Azure CLI command, retrying up to MAX_GET_TOKEN_RETRY_TIMES times.
+
+    Args:
+        command: The command as a list of arguments (executed without a shell).
+        action: Short description used in log and error messages.
+        require_output: When True, empty stdout counts as a failed attempt.
+
+    Returns:
+        The command's stdout with surrounding whitespace removed.
+
+    Raises:
+        Exception: If every attempt failed.
+    """
+    # Imported here so the helper is self-contained and does not rely on the
+    # module-level import block.
+    import subprocess
+
+    last_error = None
+    for attempt in range(1, MAX_GET_TOKEN_RETRY_TIMES + 1):
+        try:
+            # check=True turns a non-zero exit status into an exception, which
+            # is what makes a failed attempt detectable in the first place.
+            completed = subprocess.run(command, stdout=subprocess.PIPE,
+                                       universal_newlines=True, check=True)
+            output = completed.stdout.strip()
+            if require_output and not output:
+                raise ValueError("command printed nothing to stdout")
+            return output
+        except subprocess.CalledProcessError as error:
+            # Only keep the exit status: the full error text embeds the command line.
+            last_error = f"exit status {error.returncode}"
+        except (OSError, ValueError) as error:
+            last_error = repr(error)
+
+        logging.warning("Failed to %s (attempt %d of %d): %s",
+                        action, attempt, MAX_GET_TOKEN_RETRY_TIMES, last_error)
+
+    raise Exception(
+        f"Failed to {action} after {MAX_GET_TOKEN_RETRY_TIMES} attempts. "
+        f"Last error: {last_error}"
+    )
+
+
+def get_access_token():
+    """Log in with the managed identity and return a Kusto access token.
+
+    The identity is read from the SONIC_AUTOMATION_UMI environment variable.
+    Both the login and the token request are retried up to
+    MAX_GET_TOKEN_RETRY_TIMES times before giving up.
+
+    Returns:
+        The access token for https://api.kusto.windows.net, without the
+        trailing newline printed by the Azure CLI.
+
+    Raises:
+        Exception: If the login or the token request keeps failing.
+    """
+    managed_identity_id = os.environ.get("SONIC_AUTOMATION_UMI")
+
+    # 1. Run az login with re-try
+    # str() mirrors the previous string formatting when the variable is unset.
+    _run_az_command_with_retry(
+        ["az", "login", "--identity", "--username", str(managed_identity_id)],
+        "az login",
+    )
+
+    # 2. Get access token with re-try; an empty token is treated as a failure
+    return _run_az_command_with_retry(
+        ["az", "account", "get-access-token",
+         "--resource", "https://api.kusto.windows.net",
+         "--query", "accessToken", "-o", "tsv"],
+        "get token",
+        require_output=True,
+    )
+
+
+def main(scripts, topology, branch):
+    """Print the number of test instances needed to run the given scripts.
+
+    For every script, the historical runtime recorded in Kusto for baseline PR
+    test plans of the same topology and branch is averaged; the sum of these
+    averages is converted into a number of instances, capped at
+    MAX_INSTANCE_NUMBER, and printed to stdout.
+
+    Args:
+        scripts: Comma-separated test script paths.
+        topology: PR checker topology type, a key of PR_CHECKER_TOPOLOGY_NAME.
+        branch: Test branch whose history is queried.
+
+    Raises:
+        RuntimeError: If the Kusto cluster or the access token is unavailable.
+        Exception: If no scripts are given, the topology is unknown, or Kusto
+            cannot be reached or queried.
+    """
+    # Runtime (seconds) assumed for a script without usable history in Kusto.
+    default_script_runtime = 1800
+    # Minutes of test execution planned for a single instance.
+    minutes_per_instance = 90
+
+    ingest_cluster = os.getenv("TEST_REPORT_QUERY_KUSTO_CLUSTER_BACKUP")
+    access_token = get_access_token()
+
+    if not ingest_cluster or not access_token:
+        raise RuntimeError(
+            "Could not load Kusto Credentials from environment")
+
+    # parse_list_from_str returns None for empty input; fail with a clear
+    # message instead of a TypeError when iterating below.
+    scripts = parse_list_from_str(scripts)
+    if not scripts:
+        raise Exception("No test scripts are provided, can not calculate instance number.")
+
+    if topology not in PR_CHECKER_TOPOLOGY_NAME:
+        raise Exception(
+            "Unknown topology '{}', expected one of {}".format(topology, sorted(PR_CHECKER_TOPOLOGY_NAME)))
+    # Topology name recorded in Kusto and the marker contained in the test plan name.
+    kusto_topology = PR_CHECKER_TOPOLOGY_NAME[topology][0]
+    test_plan_marker = PR_CHECKER_TOPOLOGY_NAME[topology][1]
+
+    try:
+        kcsb = KustoConnectionStringBuilder.with_aad_application_token_authentication(ingest_cluster,
+                                                                                      access_token)
+        client = KustoClient(kcsb)
+    except Exception as e:
+        raise Exception("Connect to kusto fails, error {}".format(e)) from e
+
+    scripts_running_time = {}
+    total_running_time = 0
+
+    for script in scripts:
+        # As baseline test is the universal set of PR test
+        # we get the historical running time of one script here
+        # We take up to 5 matching records and calculate the average running time
+        # NOTE(review): the `order by` sits inside the joined sub-query, so `take 5`
+        # presumably does not guarantee the 5 most recent records - confirm.
+        query = "V2TestCases " \
+                "| join kind=inner" \
+                "(TestPlans " \
+                "| where TestPlanType == 'PR' and Result == 'FINISHED' " \
+                f"and Topology == '{kusto_topology}' " \
+                f"and TestBranch == '{branch}' and TestPlanName contains '{test_plan_marker}' " \
+                "and TestPlanName contains '_BaselineTest_' and UploadTime > ago(7d)" \
+                "| order by UploadTime desc) on TestPlanId " \
+                f"| where FilePath == '{script}' " \
+                "| where Result !in ('failure', 'error') " \
+                "| take 5" \
+                "| summarize ActualCount = count(), TotalRuntime = sum(Runtime)"
+        try:
+            response = client.execute("SonicTestData", query)
+        except Exception as e:
+            raise Exception("Query results from Kusto fails, error {}".format(e)) from e
+
+        # Start from the default so that a script without any result row never
+        # reuses the value computed for the previous script.
+        average_running_time = default_script_runtime
+        for row in response.primary_results[0]:
+            # The row aggregates the matched records; divide the total runtime
+            # by the number of records to get the runtime of a single run.
+            actual_count = row["ActualCount"]
+            if actual_count:
+                average_running_time = row["TotalRuntime"] / actual_count
+
+        total_running_time += average_running_time
+        scripts_running_time[script] = average_running_time
+
+    logging.info(f"Time for each test script: {scripts_running_time}")
+    logging.info(f"Total running time: {total_running_time}")
+    # Total running time is calculated by seconds, divide by 60 to get minutes
+    # Obtain the number of instances by rounding up the calculation.
+    # To prevent unexpected situations, we set the maximum number of instance
+    print(min(math.ceil(total_running_time / 60 / minutes_per_instance), MAX_INSTANCE_NUMBER))
+
+
+if __name__ == '__main__':
+    # Command-line entry point: every option is an optional string that
+    # defaults to "" and is forwarded unchanged to main().
+    arg_parser = argparse.ArgumentParser()
+    for option, description in (
+        ("--topology", "The topology of testplan"),
+        ("--scripts", "Test scripts to be executed"),
+        ("--branch", "Test branch"),
+    ):
+        arg_parser.add_argument(option, help=description, type=str, default="")
+    cli_args = arg_parser.parse_args()
+
+    main(cli_args.scripts, cli_args.topology, cli_args.branch)
diff --git a/.azure-pipelines/impacted_area_testing/constant.py b/.azure-pipelines/impacted_area_testing/constant.py
@@ -0,0 +1,28 @@
+# Now, we only have below types of PR checker
+# - dpu
+# - dualtor-t0
+# - multi-asic-t1-lag
+# - t0
+# - t0-2vlans
+# - t0-sonic
+# - t1-lag
+# The short topology type names below are the keys of PR_CHECKER_TOPOLOGY_NAME.
+PR_TOPOLOGY_TYPE = ["t0", "t0-2vlans", "t0-sonic", "t1", "t1-multi-asic", "dpu", "dualtor"]
+
+# Script file names to leave out.
+# NOTE(review): presumably consumed by get_test_scripts.py - confirm.
+EXCLUDE_TEST_SCRIPTS = [
+    "test_posttest.py",
+    "test_pretest.py"
+]
+
+# The mapping of topology type in PR test and topology recorded in kusto and the name of PR test.
+# Each value is [topology recorded in kusto, marker contained in the test plan name].
+PR_CHECKER_TOPOLOGY_NAME = {
+    "t0": ["t0", "_kvmtest-t0_"],
+    "t0-2vlans": ["t0", "_kvmtest-t0-2vlans_"],
+    "t0-sonic": ["t0-64-32", "_kvmtest-t0-sonic_"],
+    "t1": ["t1-lag", "_kvmtest-t1-lag_"],
+    "t1-multi-asic": ["t1-8-lag", "_kvmtest-multi-asic-t1-lag_"],
+    "dpu": ["dpu", "_kvmtest-dpu_"],
+    "dualtor": ["dualtor", "_kvmtest-dualtor-t0_"]
+}
+
+# Upper bound applied to the calculated number of test instances.
+MAX_INSTANCE_NUMBER = 25
+# Maximum number of attempts for `az login` and for fetching the access token.
+MAX_GET_TOKEN_RETRY_TIMES = 3
diff --git a/.azure-pipelines/impacted_area_testing/get-impacted-area.yml b/.azure-pipelines/impacted_area_testing/get-impacted-area.yml
@@ -0,0 +1,76 @@
+steps:
+# Step: collect the directories touched by this change (relative to
+# origin/master) and publish them, space separated, as DIFF_FOLDERS.
+- script: |
+    set -x
+
+    git fetch --all
+
+    # Run git diff on its own: inside a pipeline, $? only reflects the last
+    # stage, so a git failure would otherwise go unnoticed.
+    CHANGED_FILES=$(git diff origin/master HEAD --name-only)
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get diff folders fails."
+      exit 1
+    fi
+
+    # Reduce the changed files to their unique parent directories.
+    DIFF_FOLDERS=$(printf '%s\n' "$CHANGED_FILES" | xargs -n1 dirname | sort -u | tr '\n' ' ')
+    echo -n "##vso[task.setvariable variable=DIFF_FOLDERS]$DIFF_FOLDERS"
+
+  continueOnError: false
+  displayName: "Get diff folders"
+
+# Step: turn the changed folders into the list of impacted features, resolve
+# the test scripts per PR checker and publish PR_CHECKERS / TEST_SCRIPTS as
+# output variables.
+- script: |
+    set -x
+
+    pip install PyYAML
+    pip install natsort
+
+    sudo apt-get install -y jq
+
+    # An empty FINAL_FEATURES means "run all test scripts".
+    FINAL_FEATURES=""
+    IFS=' ' read -ra FEATURES_LIST <<< "$(DIFF_FOLDERS)"
+    for FEATURE in "${FEATURES_LIST[@]}"
+    do
+      # If changes contain the common part in tests folder, the scope of PR testing is all test scripts.
+      if [[ "$FEATURE" == *tests/common* ]]; then
+        FINAL_FEATURES=""
+        break
+
+      # If changes are limited to a specific feature, the scope of PR testing is the impacted area.
+      # Only paths that really start with the tests folder qualify; a folder
+      # that merely contains "tests" somewhere in its path must not match.
+      elif [[ "$FEATURE" == tests || "$FEATURE" == tests/* ]]; then
+        # Cut the feature path
+        if [[ $FEATURE == */*/* ]]; then
+            FEATURE=$(echo "$FEATURE" | cut -d'/' -f1-2)
+        fi
+
+        FEATURE=${FEATURE#tests/}
+
+        if [[ -z "$FINAL_FEATURES" ]]; then
+          FINAL_FEATURES="$FEATURE"
+        # Compare whole comma-delimited entries so that a feature whose name is
+        # a substring of an already listed one is still added.
+        elif [[ ! ",$FINAL_FEATURES," == *",$FEATURE,"* ]]; then
+          FINAL_FEATURES="$FINAL_FEATURES,$FEATURE"
+        fi
+
+      # If changes are related to other folders except tests, we also consider them as common part.
+      # The scope of PR testing is all test scripts.
+      else
+        FINAL_FEATURES=""
+        break
+      fi
+    done
+
+    TEST_SCRIPTS=$(python ./.azure-pipelines/impacted_area_testing/get_test_scripts.py --features ${FINAL_FEATURES} --location tests)
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get test scripts fails."
+      exit 1
+    fi
+
+    # The keys of the returned JSON object are the PR checkers to run.
+    PR_CHECKERS=$(echo "${TEST_SCRIPTS}" | jq -c 'keys')
+
+    if [[ $? -ne 0 ]]; then
+      echo "##vso[task.complete result=Failed;]Get valid PR checkers fails."
+      exit 1
+    fi
+
+    echo "##vso[task.setvariable variable=PR_CHECKERS;isOutput=true]$PR_CHECKERS"
+    echo "##vso[task.setvariable variable=TEST_SCRIPTS;isOutput=true]$TEST_SCRIPTS"
+  name: SetVariableTask
+  continueOnError: false
+  displayName: "Get impacted area"