test(performance): Add 90% storage utilization scaleout/scalein test
Perform a cluster scale-out at around 90% disk utilization, then scale
the cluster back in by decommissioning the newly added nodes, while
measuring latency under mixed-load stress.

#9156

Signed-off-by: Lakshmipathi.Ganapathi <lakshmipathi.ganapathi@scylladb.com>
Lakshmipathi committed Jan 23, 2025
1 parent ebb835a commit d055087
Showing 7 changed files with 166 additions and 4 deletions.
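
At a glance, the new FullStorageUtilizationNemesis scales the cluster out while it sits at roughly 90% disk utilization, refills data, runs a mixed workload, and then scales back in by decommissioning the added nodes. The sketch below is a condensed, illustrative rendering of that flow only, not the SCT implementation: the stub callables are hypothetical placeholders for the tester/cluster calls made by the real scaleout_at_full_storage() and scalein_to_reach_full_storage() methods added to sdcm/nemesis.py further down, and only the ordering mirrors the diff.

# Illustrative flow sketch; stubs are hypothetical stand-ins for SCT API calls.

def scale_out_at_full_storage(add_node, stop_load, refill_to_90, start_mixed_load, node_count=2):
    """Add the first node, refill to ~90% utilization, start the mixed load, add the rest."""
    new_nodes = [add_node()]
    stop_load()         # the running cassandra-stress load is stopped before refilling
    refill_to_90()      # stress_cmd_w writes into the 'refill_keyspace' keyspace
    start_mixed_load()  # stress_cmd_m keeps running while the remaining node(s) join
    new_nodes += [add_node() for _ in range(node_count - 1)]   # node_count mirrors nemesis_add_node_cnt
    return new_nodes


def scale_in_to_full_storage(decommission, truncate_refill_keyspace, new_nodes):
    """Decommission the added nodes; truncate the refill keyspace after the first one."""
    for idx, node in enumerate(new_nodes):
        decommission(node)
        if idx == 0:
            truncate_refill_keyspace()  # keeps utilization near 90% after the first decommission


if __name__ == "__main__":
    nodes = scale_out_at_full_storage(
        add_node=lambda: print("add i4i.2xlarge data node") or "new-node",
        stop_load=lambda: print("stop current cassandra-stress load"),
        refill_to_90=lambda: print("refill to ~90% with stress_cmd_w"),
        start_mixed_load=lambda: print("start stress_cmd_m mixed workload"),
    )
    scale_in_to_full_storage(
        decommission=lambda node: print(f"decommission {node}"),
        truncate_refill_keyspace=lambda: print("truncate refill_keyspace"),
        new_nodes=nodes,
    )
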
2 changes: 2 additions & 0 deletions configurations/nemesis/FullStorageUtilizationNemesis.yaml
@@ -0,0 +1,2 @@
nemesis_class_name: 'FullStorageUtilizationNemesis'
user_prefix: 'full-storage-utilization'
1 change: 1 addition & 0 deletions data_dir/nemesis_classes.yml
@@ -27,6 +27,7 @@
- EndOfQuotaNemesis
- EnospcAllNodesMonkey
- EnospcMonkey
- FullStorageUtilizationNemesis
- GrowShrinkClusterNemesis
- HardRebootNodeMonkey
- LoadAndStreamMonkey
@@ -0,0 +1,12 @@
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

perfRegressionParallelPipeline(
    backend: "aws",
    availability_zone: 'a',
    test_name: "performance_regression_test.PerformanceRegressionTest",
    test_config: """["test-cases/performance/perf-regression-latency-i4i_2xlarge-elasticity-90-percent.yaml", "configurations/disable_kms.yaml"]""",
    sub_tests: ["test_latency_mixed_with_nemesis"],
)
6 changes: 6 additions & 0 deletions performance_regression_test.py
@@ -218,6 +218,12 @@ def _stop_load_when_nemesis_threads_end(self):
                                         extra_time_to_expiration=60):
            self.loaders.kill_stress_thread()

    def stop_load_during_nemesis(self):
        with EventsSeverityChangerFilter(new_severity=Severity.NORMAL, # killing stress creates Critical error
                                         event_class=CassandraStressEvent,
                                         extra_time_to_expiration=60):
            self.loaders.kill_stress_thread()

    @optional_stage('perf_preload_data')
    def preload_data(self, compaction_strategy=None):
        # if test require a pre-population of data
99 changes: 96 additions & 3 deletions sdcm/nemesis.py
@@ -225,6 +225,7 @@ class Nemesis: # pylint: disable=too-many-instance-attributes,too-many-public-m
    manager_operation: bool = False # flag that signals that the nemesis uses scylla manager
    delete_rows: bool = False # A flag denotes a nemesis deletes partitions/rows, generating tombstones.
    zero_node_changes: bool = False
    full_storage_utilization: bool = False

    def __init__(self, tester_obj, termination_event, *args, nemesis_selector=None, **kwargs): # pylint: disable=unused-argument
        for name, member in inspect.getmembers(self, lambda x: inspect.isfunction(x) or inspect.ismethod(x)):
@@ -2104,11 +2105,11 @@ def _truncate_cmd_timeout_suffix(self, truncate_timeout): # pylint: disable=no-
        # NOTE: 'self' is used by the 'scylla_versions' decorator
        return ''

-   def disrupt_truncate(self):
-       keyspace_truncate = 'ks_truncate'
+   def disrupt_truncate(self, full_storage_utilization=False):
+       keyspace_truncate = 'ks_truncate' if not full_storage_utilization else 'refill_keyspace'
        table = 'standard1'

-       self._prepare_test_table(ks=keyspace_truncate)
+       self._prepare_test_table(ks=keyspace_truncate) if not full_storage_utilization else None

        # In order to workaround issue #4924 when truncate timeouts, we try to flush before truncate.
        with adaptive_timeout(Operations.FLUSH, self.target_node, timeout=HOUR_IN_SEC * 2):
@@ -4362,6 +4363,86 @@ def _shrink_cluster(self, rack=None, new_nodes: list[BaseNode] | None = None):
self.log.info("Cluster shrink finished. Current number of data nodes %s", num_of_nodes)
InfoEvent(message=f'Cluster shrink finished. Current number of data nodes {num_of_nodes}').publish()

@target_data_nodes
def disrupt_full_storage_utilization(self):
"""
Wait for steady state. Then Scale out cluster by adding new nodes.
Finally Scale in cluster by removing nodes while maintaining high storage utilization.
"""
sleep_time_between_ops = self.cluster.params.get('nemesis_sequence_sleep_between_ops')
if not self.has_steady_run and sleep_time_between_ops:
self.steady_state_latency()
self.has_steady_run = True

new_nodes = self.scaleout_at_full_storage(rack=None)
new_nodes = new_nodes if self.tester.params.get('nemesis_grow_shrink_instance_type') else None
self.scalein_to_reach_full_storage(rack=None, new_nodes=new_nodes)

def scaleout_at_full_storage(self, rack=None):
"""
Performs cluster scale out when storage utilization is around 90%.
It adds first new node, then refills data to 90% utilization level after stopping current load.
Start mixed workload and add remaining new node.
"""
add_nodes_number = self.tester.params.get('nemesis_add_node_cnt')
InfoEvent(message=f"Start grow cluster by {add_nodes_number} data nodes").publish()
new_nodes = []
stress_queue = None
for idx in range(add_nodes_number):
# if rack is not specified, round-robin racks to spread nodes evenly
rack_idx = rack if rack is not None else idx % self.cluster.racks_count
if idx == 0:
new_nodes += self.add_new_nodes(count=1, rack=rack_idx,
instance_type=self.tester.params.get('nemesis_grow_shrink_instance_type'))
# Before starting refilling data current c-s need to be stopped
self.tester.stop_load_during_nemesis()
self.tester.wait_no_compactions_running()
self.log.info("Started: refill data to 90")
refill_90_percent = self.tester.params.get('stress_cmd_w')
stress_queue = self.tester.run_stress_thread(
stress_cmd=refill_90_percent, stress_num=1, stats_aggregate_cmds=False)
self.tester.get_stress_results(
queue=stress_queue, store_results=False)
self.log.info("Completed: refill data to 90")
self.tester.wait_no_compactions_running()
self.log.info("Completed: refill data to 90 - no compactions running..proceed")
stress_cmd = self.tester.params.get('stress_cmd_m')
stress_queue = self.tester.run_stress_thread(
stress_cmd=stress_cmd, stress_num=1, stats_aggregate_cmds=False, duration=180)
# wait for c-s to start
time.sleep(120)
else:
new_nodes += self.add_new_nodes(count=1, rack=rack_idx,
instance_type=self.tester.params.get('nemesis_grow_shrink_instance_type'))
self.log.info("Finish cluster grow")
time.sleep(self.interval)
return new_nodes

def scalein_to_reach_full_storage(self, rack=None, new_nodes: list[BaseNode] | None = None):
"""
Performs cluster scale in while maintaining high storage utilization
Decommission first node. Then perform truncate after first node decommission to
maintain 90% storage load. Continue decomission of the other node.
"""
nodes_count = self.tester.params.get('nemesis_add_node_cnt')
InfoEvent(message=f'Start shrink cluster by {nodes_count} nodes').publish()
self.log.info("Start shrink cluster by %s nodes", nodes_count)
for idx in range(nodes_count):
self._decommission_nodes(
1,
rack,
is_seed=None if self._is_it_on_kubernetes() else DefaultValue,
dc_idx=self.target_node.dc_idx,
exact_nodes=[new_nodes[idx]],
)
self.disrupt_truncate(full_storage_utilization=True) if idx == 0 else None

num_of_nodes = len(self.cluster.data_nodes)
self.log.info("Cluster shrink finished. Current number of data nodes %s", num_of_nodes)
InfoEvent(message=f'Cluster shrink finished. Current number of data nodes {num_of_nodes}').publish()

# TODO: add support for the 'LocalFileSystemKeyProviderFactory' and 'KmipKeyProviderFactory' key providers
# TODO: add encryption for a table with large partitions?

@@ -5626,6 +5707,18 @@ def disrupt(self):
        self.disrupt_grow_shrink_cluster()


class FullStorageUtilizationNemesis(Nemesis):
    """
    This nemesis performs cluster scale-out and scale-in operations while the cluster is at around 90% storage utilization.
    It measures latency under mixed-load stress during these operations.
    """
    disruptive = False
    full_storage_utilization = True

    def disrupt(self):
        self.disrupt_full_storage_utilization()


class AddRemoveRackNemesis(Nemesis):
    disruptive = True
    kubernetes = True
@@ -0,0 +1,48 @@
test_duration: 3000
prepare_write_cmd: [
  "cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..300000000",
  "cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=300000001..600000000",
  "cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=600000001..900000000",
  "cassandra-stress write no-warmup cl=ALL n=250000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=900000001..1200000000",
  "cassandra-stress write no-warmup cl=ALL n=300000000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1200000001..1500000000",
]

stress_cmd_w: "cassandra-stress write no-warmup cl=ALL n=335544320 -schema 'keyspace=refill_keyspace replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=300 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1500000001..1835544321"
stress_cmd_m: "cassandra-stress mixed no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=300 fixed=16875/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..600000000,300000000,6000000)'"


n_db_nodes: 3
nemesis_add_node_cnt: 2
n_loaders: 4
n_monitor_nodes: 1
nemesis_grow_shrink_instance_type: 'i4i.2xlarge'

instance_type_loader: 'c6i.2xlarge'
instance_type_monitor: 't3.large'
instance_type_db: 'i4i.2xlarge'

nemesis_class_name: 'FullStorageUtilizationNemesis'
nemesis_interval: 30
nemesis_sequence_sleep_between_ops: 10

user_prefix: 'fullstorage-utilization-test'

round_robin: true
append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1'
backtrace_decoding: false
print_kernel_callstack: true

store_perf_results: true
email_recipients: ["scylla-perf-results@scylladb.com"]
# use_prepared_loaders: true
use_hdr_cs_histogram: true
email_subject_postfix: 'fullstorage utilization test'
nemesis_double_load_during_grow_shrink_duration: 0
parallel_node_operations: false

append_scylla_yaml:
  enable_tablets: true
  auto_snapshot: false

stress_image:
  cassandra-stress: 'scylladb/cassandra-stress:3.17.0'
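
For context on the 90% target, here is a back-of-envelope payload calculation based on the stress commands above. It is a rough sketch under stated assumptions: it counts raw column payload only (keys, sstable metadata, compression and compaction overhead are ignored), takes the three i4i.2xlarge nodes as providing 1,875 GB of local NVMe each, and ignores rebalancing onto the nodes added during the nemesis.

ROW_BYTES = 8 * 128                                          # -col 'size=FIXED(128) n=FIXED(8)'
PREPARE_ROWS = 3 * 300_000_000 + 250_000_000 + 300_000_000   # the five prepare_write_cmd runs
REFILL_ROWS = 335_544_320                                    # stress_cmd_w into 'refill_keyspace'
NODE_DISK_GB = 1_875                                         # i4i.2xlarge local NVMe (assumed)
RF, NODES = 3, 3                                             # replication_factor=3, n_db_nodes: 3

prepare_gb_per_node = PREPARE_ROWS * ROW_BYTES * RF / NODES / 1e9
refill_gb_total = REFILL_ROWS * ROW_BYTES / 1e9
print(f"prepare payload per node: ~{prepare_gb_per_node:,.0f} GB "
      f"(~{prepare_gb_per_node / NODE_DISK_GB:.0%} of {NODE_DISK_GB} GB)")
print(f"refill payload (total):   ~{refill_gb_total:,.0f} GB")

On this accounting the prepare phase alone lands around 79% of each node's disk and the refill adds roughly another 340 GB, which is broadly consistent with the test's stated ~90% target once real on-disk overhead and the topology changes during the nemesis are taken into account.
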
2 changes: 1 addition & 1 deletion unit_tests/test_nemesis_sisyphus.py
@@ -80,7 +80,7 @@ def test_list_all_available_nemesis(generate_file=True):
    disruption_list, disruptions_dict, disruption_classes = sisyphus.get_list_of_disrupt_methods(
        subclasses_list=subclasses, export_properties=True)

-   assert len(disruption_list) == 90
+   assert len(disruption_list) == 91

    if generate_file:
        with open(sct_abs_path('data_dir/nemesis.yml'), 'w', encoding="utf-8") as outfile1:
