Skip to content

Commit

Permalink
Fix for redis memory check failure after link flap and also sometimes…
Browse files Browse the repository at this point in the history
… cpu usage high failure (#15732)

Description of PR
The Redis memory check result is not a stable value when using "redis-cli info memory | grep used_memory_human". Even on a stable system (BGP converged, no port flapping, etc.), the above check can report memory usage that differs by more than 0.2M between runs.

The following is CLI output from 202405 and 202205.

202405:
admin@ixre-egl-board30: redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'
2.64
admin@ixre-egl-board30: redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'
2.74
admin@ixre-egl-board30: redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'
2.52

202205:
admin@ixre-egl-board64: redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'
6.02
admin@ixre-egl-board64: redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'
6.26
admin@ixre-egl-board64: redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'
6.14

We can see that 202405 has some memory optimization for Redis and does not use as much memory as 202205. With the lower baseline, a 0.2M difference in memory usage could easily reach the 5% memory usage threshold in 202405.

The solution is to compare the average Redis memory usage before and after the link flap: query the usage 5 times at 5-second intervals and take the average. The threshold is also raised from 5% to 10%. With this fix, the Redis memory check no longer fails on 202405 after link flap.

This commit also provides a fix for the orchagent CPU utilization check that sometimes failed after link flap. The reason is that in a scaling setup (34k routes) orchagent takes more time to calm down.

Summary:
Fixes # (issue) #15733

Approach
What is the motivation for this PR?
Fix test failures

How did you verify/test it?
OC tests run with the fix. Did not see the test failed.

co-authorized by: jianquanye@microsoft.com
  • Loading branch information
wumiaont authored and yejianquan committed Dec 9, 2024
1 parent e8ad4a8 commit 63bd5f3
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 43 deletions.
38 changes: 38 additions & 0 deletions tests/platform_tests/link_flap/link_flap_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import time
import logging
import random
import time

from tests.common.platform.device_utils import fanout_switch_port_lookup
from tests.common.utilities import wait_until
Expand Down Expand Up @@ -236,3 +237,40 @@ def check_bgp_routes(dut, start_time_ipv4_route_counts, start_time_ipv6_route_co
incr_ipv4_route_counts = abs(int(float(start_time_ipv4_route_counts)) - int(float(routesv4)))
incr_ipv6_route_counts = abs(int(float(start_time_ipv6_route_counts)) - int(float(routesv6)))
return incr_ipv4_route_counts < MAX_DIFF and incr_ipv6_route_counts < MAX_DIFF


def get_avg_redis_mem_usage(duthost, interval, num_times):
    """
    Return the average Redis memory usage sampled over a period of time.

    Redis memory usage is not a stable value; it fluctuates even when the
    device is in a stable state. 202205 has larger Redis memory usage (~5.5M),
    so a fluctuation of 0.2M is not an issue. With 202405 the Redis memory
    usage is optimized (~2.5M) and a 0.2M fluctuation could make the test fail
    if the memory threshold is 5%. Averaging several samples smooths that out.

    Args:
        duthost: DUT host object
        interval: time in seconds to wait between two consecutive queries
        num_times: number of times to query; must be at least 1

    Returns:
        float: mean of the sampled ``used_memory_human`` values.

    Raises:
        ValueError: if num_times is less than 1, or if the command output
            cannot be converted to a float.
    """
    if num_times < 1:
        raise ValueError("num_times must be at least 1, got {}".format(num_times))

    logger.info("Checking average redis memory usage")
    # NOTE: the sed expression only strips an "M" suffix. If Redis reports the
    # value with another unit, the output is not a bare number and float()
    # raises ValueError.
    cmd = r"redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'"
    samples = []
    for attempt in range(num_times):
        # Wait only between samples; sleeping after the last one adds nothing.
        if attempt:
            time.sleep(interval)
        samples.append(float(duthost.shell(cmd)["stdout"].strip()))
    return sum(samples) / num_times


def validate_redis_memory_increase(tbinfo, start_mem, end_mem):
    """
    Check that Redis memory usage did not grow beyond the allowed percentage.

    The allowed increase is 15% when tbinfo["topo"]["type"] is "m0" or "mx",
    and 10% for every other topology type.

    Args:
        tbinfo: testbed info dict; tbinfo["topo"]["type"] selects the threshold
        start_mem: Redis memory usage measured before the test
        end_mem: Redis memory usage measured after the test, in the same unit

    Returns:
        bool: True if memory did not increase, or increased by less than the
        threshold. False if the increase reached the threshold, or if memory
        grew from a non-positive baseline (no meaningful percentage exists).
    """
    # Calculate diff in Redis memory
    incr_redis_memory = end_mem - start_mem
    logging.info("Redis memory usage difference: %f", incr_redis_memory)

    # Check redis memory only if it is increased else default to pass
    if incr_redis_memory > 0.0:
        if start_mem <= 0:
            # A percentage relative to a zero/negative baseline is undefined;
            # report a failure instead of raising ZeroDivisionError.
            logging.warning("Invalid Redis memory baseline %s, cannot compute percentage increase",
                            start_mem)
            return False
        percent_incr_redis_memory = (incr_redis_memory / start_mem) * 100
        # Log as a float: "%d" would truncate e.g. 9.9 to 9.
        logging.info("Redis Memory percentage Increase: %.2f", percent_incr_redis_memory)
        incr_redis_memory_threshold = 15 if tbinfo["topo"]["type"] in ["m0", "mx"] else 10
        return percent_incr_redis_memory < incr_redis_memory_threshold
    return True
34 changes: 12 additions & 22 deletions tests/platform_tests/link_flap/test_cont_link_flap.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from tests.common.helpers.assertions import pytest_assert, pytest_require
from tests.common import port_toggle
from tests.platform_tests.link_flap.link_flap_utils import build_test_candidates, toggle_one_link,\
check_orch_cpu_utilization, check_bgp_routes
check_orch_cpu_utilization, check_bgp_routes, get_avg_redis_mem_usage, validate_redis_memory_increase
from tests.common.utilities import wait_until
from tests.common.devices.eos import EosHost
from tests.common.devices.sonic import SonicHost
Expand Down Expand Up @@ -64,7 +64,7 @@ def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hws
3.) Watch for memory (show system-memory), FRR daemons memory(vtysh -c "show memory bgp/zebra"),
orchagent CPU Utilization and Redis_memory.
Pass Criteria: All routes must be re-learned with < 5% increase in Redis/FRR memory usage and
Pass Criteria: All routes must be re-learned with < 10% increase in Redis/FRR memory usage and
ORCH agent CPU consumption below threshold after 3 mins after stopping flaps.
"""
duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
Expand All @@ -76,9 +76,8 @@ def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hws
logging.info("Memory Status at start: %s", memory_output)

# Record Redis Memory at start
start_time_redis_memory = duthost.shell(
r"redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'")["stdout"]
logging.info("Redis Memory: %s M", start_time_redis_memory)
start_time_redis_memory = get_avg_redis_mem_usage(duthost, 5, 5)
logging.info("Redis Memory: %f M", start_time_redis_memory)

# Record ipv4 route counts at start
sumv4, sumv6 = duthost.get_ip_route_summary(skip_kernel_tunnel=True)
Expand Down Expand Up @@ -207,26 +206,17 @@ def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hws
logging.info("Orchagent PID {0} CPU Util at end: {1}".format(pid, util))

# Record Redis Memory at end
end_time_redis_memory = duthost.shell(
r"redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'")["stdout"]
logging.info("Redis Memory at start: %s M", start_time_redis_memory)
logging.info("Redis Memory at end: %s M", end_time_redis_memory)

# Calculate diff in Redis memory
incr_redis_memory = float(end_time_redis_memory) - float(start_time_redis_memory)
logging.info("Redis absolute difference: %d", incr_redis_memory)

# Check redis memory only if it is increased else default to pass
if incr_redis_memory > 0.0:
percent_incr_redis_memory = (incr_redis_memory / float(start_time_redis_memory)) * 100
logging.info("Redis Memory percentage Increase: %d", percent_incr_redis_memory)
incr_redis_memory_threshold = 10 if tbinfo["topo"]["type"] in ["m0", "mx"] else 5
pytest_assert(percent_incr_redis_memory < incr_redis_memory_threshold,
"Redis Memory Increase more than expected: {}".format(percent_incr_redis_memory))
end_time_redis_memory = get_avg_redis_mem_usage(duthost, 5, 5)
logging.info("Redis Memory at start: %f M", start_time_redis_memory)
logging.info("Redis Memory at end: %f M", end_time_redis_memory)

result = validate_redis_memory_increase(tbinfo, start_time_redis_memory, end_time_redis_memory)
pytest_assert(result, "Redis Memory Increases more than expected: start {}, end {}"
.format(start_time_redis_memory, end_time_redis_memory))

# Orchagent CPU should consume < orch_cpu_threshold at last.
logging.info("watch orchagent CPU utilization when it goes below %d", orch_cpu_threshold)
pytest_assert(wait_until(45, 2, 0, check_orch_cpu_utilization, duthost, orch_cpu_threshold),
pytest_assert(wait_until(120, 5, 0, check_orch_cpu_utilization, duthost, orch_cpu_threshold),
"Orch CPU utilization {} > orch cpu threshold {} after link flap"
.format(duthost.shell("show processes cpu | grep orchagent | awk '{print $9}'")["stdout"],
orch_cpu_threshold))
33 changes: 12 additions & 21 deletions tests/platform_tests/link_flap/test_link_flap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import logging
import pytest

from tests.platform_tests.link_flap.link_flap_utils import toggle_one_link, check_orch_cpu_utilization
from tests.platform_tests.link_flap.link_flap_utils import toggle_one_link, check_orch_cpu_utilization, \
get_avg_redis_mem_usage, validate_redis_memory_increase
from tests.common.platform.device_utils import fanout_switch_port_lookup
from tests.common.helpers.assertions import pytest_assert
from tests.common.utilities import wait_until
Expand Down Expand Up @@ -36,9 +37,8 @@ def test_link_flap(request, duthosts, rand_one_dut_hostname, tbinfo, fanouthosts
logger.info("Memory Status at start: %s", memory_output)

# Record Redis Memory at start
start_time_redis_memory = duthost.shell(
r"redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'")["stdout"]
logger.info("Redis Memory: %s M", start_time_redis_memory)
start_time_redis_memory = get_avg_redis_mem_usage(duthost, 5, 5)
logging.info("Redis Memory: %f M", start_time_redis_memory)

# Make Sure Orch CPU < orch_cpu_threshold before starting test.
logger.info("Make Sure orchagent CPU utilization is less that %d before link flap", orch_cpu_threshold)
Expand Down Expand Up @@ -75,26 +75,17 @@ def test_link_flap(request, duthosts, rand_one_dut_hostname, tbinfo, fanouthosts
logger.info("Orchagent CPU Util at end: %s", orch_cpu)

# Record Redis Memory at end
end_time_redis_memory = duthost.shell(
r"redis-cli info memory | grep used_memory_human | sed -e 's/.*:\(.*\)M/\1/'")["stdout"]
logger.info("Redis Memory at start: %s M", start_time_redis_memory)
logger.info("Redis Memory at end: %s M", end_time_redis_memory)

# Calculate diff in Redis memory
incr_redis_memory = float(end_time_redis_memory) - float(start_time_redis_memory)
logger.info("Redis absolute difference: %d", incr_redis_memory)

# Check redis memory only if it is increased else default to pass
if incr_redis_memory > 0.0:
percent_incr_redis_memory = (incr_redis_memory / float(start_time_redis_memory)) * 100
logger.info("Redis Memory percentage Increase: %d", percent_incr_redis_memory)
incr_redis_memory_threshold = 10 if tbinfo["topo"]["type"] in ["m0", "mx"] else 5
pytest_assert(percent_incr_redis_memory < incr_redis_memory_threshold,
"Redis Memory Increase more than expected: {}".format(percent_incr_redis_memory))
end_time_redis_memory = get_avg_redis_mem_usage(duthost, 5, 5)
logging.info("Redis Memory at start: %f M", start_time_redis_memory)
logging.info("Redis Memory at end: %f M", end_time_redis_memory)

result = validate_redis_memory_increase(tbinfo, start_time_redis_memory, end_time_redis_memory)
pytest_assert(result, "Redis Memory increases more than expected: start {}, end {}"
.format(start_time_redis_memory, end_time_redis_memory))

# Orchagent CPU should consume < orch_cpu_threshold at last.
logger.info("watch orchagent CPU utilization when it goes below %d", orch_cpu_threshold)
pytest_assert(wait_until(45, 2, 0, check_orch_cpu_utilization, duthost, orch_cpu_threshold),
pytest_assert(wait_until(120, 5, 0, check_orch_cpu_utilization, duthost, orch_cpu_threshold),
"Orch CPU utilization {} > orch cpu threshold {} before link flap"
.format(duthost.shell("show processes cpu | grep orchagent | awk '{print $9}'")["stdout"],
orch_cpu_threshold))

0 comments on commit 63bd5f3

Please sign in to comment.