test(performance): Add 90% storage utilization scaleout/scalein test #9787

Open: wants to merge 1 commit into master
2 changes: 2 additions & 0 deletions configurations/nemesis/FullStorageUtilizationNemesis.yaml
@@ -0,0 +1,2 @@
nemesis_class_name: 'FullStorageUtilizationNemesis'
user_prefix: 'full-storage-utilization'
1 change: 1 addition & 0 deletions data_dir/nemesis_classes.yml
@@ -27,6 +27,7 @@
- EndOfQuotaNemesis
- EnospcAllNodesMonkey
- EnospcMonkey
- FullStorageUtilizationNemesis
- GrowShrinkClusterNemesis
- HardRebootNodeMonkey
- LoadAndStreamMonkey
@@ -0,0 +1,12 @@
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

perfRegressionParallelPipeline(
backend: "aws",
availability_zone: 'a',
test_name: "performance_regression_test.PerformanceRegressionTest",
test_config: """["test-cases/performance/perf-regression-latency-i4i_2xlarge-elasticity-90-percent.yaml", "configurations/disable_kms.yaml"]""",
sub_tests: ["test_latency_mixed_with_nemesis"],
)
6 changes: 6 additions & 0 deletions performance_regression_test.py
@@ -218,6 +218,12 @@ def _stop_load_when_nemesis_threads_end(self):
extra_time_to_expiration=60):
self.loaders.kill_stress_thread()

def stop_load_during_nemesis(self):
with EventsSeverityChangerFilter(new_severity=Severity.NORMAL,  # killing the stress thread otherwise raises a Critical event
event_class=CassandraStressEvent,
extra_time_to_expiration=60):
self.loaders.kill_stress_thread()

@optional_stage('perf_preload_data')
def preload_data(self, compaction_strategy=None):
# if test require a pre-population of data
99 changes: 96 additions & 3 deletions sdcm/nemesis.py
@@ -225,6 +225,7 @@ class Nemesis: # pylint: disable=too-many-instance-attributes,too-many-public-m
manager_operation: bool = False # flag that signals that the nemesis uses scylla manager
delete_rows: bool = False # A flag denotes a nemesis deletes partitions/rows, generating tombstones.
zero_node_changes: bool = False
full_storage_utilization: bool = False

def __init__(self, tester_obj, termination_event, *args, nemesis_selector=None, **kwargs): # pylint: disable=unused-argument
for name, member in inspect.getmembers(self, lambda x: inspect.isfunction(x) or inspect.ismethod(x)):
@@ -2104,11 +2105,11 @@ def _truncate_cmd_timeout_suffix(self, truncate_timeout): # pylint: disable=no-
# NOTE: 'self' is used by the 'scylla_versions' decorator
return ''

- def disrupt_truncate(self):
- keyspace_truncate = 'ks_truncate'
+ def disrupt_truncate(self, full_storage_utilization=False):
+ keyspace_truncate = 'ks_truncate' if not full_storage_utilization else 'refill_keyspace'
table = 'standard1'

- self._prepare_test_table(ks=keyspace_truncate)
+ self._prepare_test_table(ks=keyspace_truncate) if not full_storage_utilization else None

# In order to workaround issue #4924 when truncate timeouts, we try to flush before truncate.
with adaptive_timeout(Operations.FLUSH, self.target_node, timeout=HOUR_IN_SEC * 2):
@@ -4362,6 +4363,86 @@ def _shrink_cluster(self, rack=None, new_nodes: list[BaseNode] | None = None):
self.log.info("Cluster shrink finished. Current number of data nodes %s", num_of_nodes)
InfoEvent(message=f'Cluster shrink finished. Current number of data nodes {num_of_nodes}').publish()

@target_data_nodes
def disrupt_full_storage_utilization(self):
"""
Wait for steady state, then scale out the cluster by adding new nodes.
Finally, scale in the cluster by removing nodes while maintaining high storage utilization.
"""
sleep_time_between_ops = self.cluster.params.get('nemesis_sequence_sleep_between_ops')
if not self.has_steady_run and sleep_time_between_ops:
self.steady_state_latency()
self.has_steady_run = True

new_nodes = self.scaleout_at_full_storage(rack=None)
new_nodes = new_nodes if self.tester.params.get('nemesis_grow_shrink_instance_type') else None
self.scalein_to_reach_full_storage(rack=None, new_nodes=new_nodes)

def scaleout_at_full_storage(self, rack=None):
"""
Perform a cluster scale-out while storage utilization is around 90%.

Add the first new node, then stop the current load and refill data back to the 90% utilization level.
Start a mixed workload and add the remaining new node(s).
"""
add_nodes_number = self.tester.params.get('nemesis_add_node_cnt')
InfoEvent(message=f"Start grow cluster by {add_nodes_number} data nodes").publish()
new_nodes = []
stress_queue = None
for idx in range(add_nodes_number):
# if rack is not specified, round-robin racks to spread nodes evenly
rack_idx = rack if rack is not None else idx % self.cluster.racks_count
if idx == 0:
new_nodes += self.add_new_nodes(count=1, rack=rack_idx,
instance_type=self.tester.params.get('nemesis_grow_shrink_instance_type'))
# The currently running c-s load must be stopped before refilling data
self.tester.stop_load_during_nemesis()
self.tester.wait_no_compactions_running()
self.log.info("Started: refill data to 90")
refill_90_percent = self.tester.params.get('stress_cmd_w')
stress_queue = self.tester.run_stress_thread(
stress_cmd=refill_90_percent, stress_num=1, stats_aggregate_cmds=False)
self.tester.get_stress_results(
queue=stress_queue, store_results=False)
self.log.info("Completed: refill data to 90")
self.tester.wait_no_compactions_running()
self.log.info("Completed: refill data to 90 - no compactions running..proceed")
stress_cmd = self.tester.params.get('stress_cmd_m')
stress_queue = self.tester.run_stress_thread(
stress_cmd=stress_cmd, stress_num=1, stats_aggregate_cmds=False, duration=180)
# wait for c-s to start
time.sleep(120)
else:
new_nodes += self.add_new_nodes(count=1, rack=rack_idx,
instance_type=self.tester.params.get('nemesis_grow_shrink_instance_type'))
self.log.info("Finish cluster grow")
time.sleep(self.interval)
return new_nodes

def scalein_to_reach_full_storage(self, rack=None, new_nodes: list[BaseNode] | None = None):
"""
Perform a cluster scale-in while maintaining high storage utilization.

Decommission the first node, then truncate the refill keyspace to bring the storage load back to 90%.
Continue decommissioning the remaining node(s).
"""
nodes_count = self.tester.params.get('nemesis_add_node_cnt')
InfoEvent(message=f'Start shrink cluster by {nodes_count} nodes').publish()
self.log.info("Start shrink cluster by %s nodes", nodes_count)
for idx in range(nodes_count):
self._decommission_nodes(
1,
rack,
is_seed=None if self._is_it_on_kubernetes() else DefaultValue,
dc_idx=self.target_node.dc_idx,
exact_nodes=[new_nodes[idx]],
)
self.disrupt_truncate(full_storage_utilization=True) if idx == 0 else None

num_of_nodes = len(self.cluster.data_nodes)
self.log.info("Cluster shrink finished. Current number of data nodes %s", num_of_nodes)
InfoEvent(message=f'Cluster shrink finished. Current number of data nodes {num_of_nodes}').publish()

# TODO: add support for the 'LocalFileSystemKeyProviderFactory' and 'KmipKeyProviderFactory' key providers
# TODO: add encryption for a table with large partitions?

@@ -5626,6 +5707,18 @@ def disrupt(self):
self.disrupt_grow_shrink_cluster()


class FullStorageUtilizationNemesis(Nemesis):
"""
This nemesis performs cluster scale-out and scale-in operations while the cluster is at around 90% storage utilization.
It measures latency under mixed-load stress during these operations.
"""
disruptive = False
full_storage_utilization = True

def disrupt(self):
self.disrupt_full_storage_utilization()


class AddRemoveRackNemesis(Nemesis):
disruptive = True
kubernetes = True
@@ -0,0 +1,48 @@
test_duration: 3000
prepare_write_cmd: [
"cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..300000000",
"cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=300000001..600000000",
"cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=600000001..900000000",
"cassandra-stress write no-warmup cl=ALL n=250000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=900000001..1200000000",
"cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1200000001..1500000000",
]

stress_cmd_w: "cassandra-stress write no-warmup cl=ALL n=335544320 -schema 'keyspace=refill_keyspace replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=300 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1500000001..1835544321"
stress_cmd_m: "cassandra-stress mixed no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=300 fixed=16875/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..600000000,300000000,6000000)'"


n_db_nodes: 3
nemesis_add_node_cnt: 2
n_loaders: 4
n_monitor_nodes: 1
nemesis_grow_shrink_instance_type: 'i4i.2xlarge'

instance_type_loader: 'c6i.2xlarge'
instance_type_monitor: 't3.large'
instance_type_db: 'i4i.2xlarge'

nemesis_class_name: 'FullStorageUtilizationNemesis'
nemesis_interval: 30
nemesis_sequence_sleep_between_ops: 10

user_prefix: 'fullstorage-utilization-test'

round_robin: true
append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1'
backtrace_decoding: false
print_kernel_callstack: true

store_perf_results: true
email_recipients: ["scylla-perf-results@scylladb.com"]
# use_prepared_loaders: true
use_hdr_cs_histogram: true
email_subject_postfix: 'fullstorage utilization test'
nemesis_double_load_during_grow_shrink_duration: 0
parallel_node_operations: false

append_scylla_yaml:
enable_tablets: true
auto_snapshot: false

stress_image:
cassandra-stress: 'scylladb/cassandra-stress:3.17.0'
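
A rough sizing cross-check for the refill command (an editor's back-of-the-envelope sketch, not part of the change; the variable names below are illustrative only): each cassandra-stress row here carries 8 columns of 128 bytes, about 1 KiB of payload, so the n=335544320 rows written by stress_cmd_w correspond to roughly 320 GiB of raw data, or roughly 960 GiB once replicated with replication_factor=3, presumably the amount needed to bring utilization back toward the 90% target after the first new node joins.

# Back-of-the-envelope sizing for stress_cmd_w (payload only; ignores keys,
# compression and on-disk overhead).
rows = 335_544_320                 # n=335544320 in stress_cmd_w
row_bytes = 8 * 128                # -col 'size=FIXED(128) n=FIXED(8)'
replication_factor = 3             # replication_factor=3 in the schema option
raw_gib = rows * row_bytes / 2**30
print(f"~{raw_gib:.0f} GiB raw payload, ~{raw_gib * replication_factor:.0f} GiB replicated")
# -> ~320 GiB raw payload, ~960 GiB replicated

The mixed workload in stress_cmd_m draws its keys from dist=gauss(1..600000000,300000000,6000000), i.e. centered on row 300M with a standard deviation of 6M, so the latency measurement mostly reads and writes data that was laid down during the prepare phase.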
2 changes: 1 addition & 1 deletion unit_tests/test_nemesis_sisyphus.py
@@ -80,7 +80,7 @@ def test_list_all_available_nemesis(generate_file=True):
disruption_list, disruptions_dict, disruption_classes = sisyphus.get_list_of_disrupt_methods(
subclasses_list=subclasses, export_properties=True)

- assert len(disruption_list) == 90
+ assert len(disruption_list) == 91

if generate_file:
with open(sct_abs_path('data_dir/nemesis.yml'), 'w', encoding="utf-8") as outfile1: