Skip to content

Commit

Permalink
fix(tests): remove sync_empty_state (near#12473)
Browse files Browse the repository at this point in the history
This test can fail with nightly features and block PR merges:
https://nayduck.nearone.org/#/test/231517. The reason it fails is that
it sets up a node with a real rocksdb database in a temp dir, and
creating snapshots actually takes several seconds even with small state
because [this
loop](https://github.com/near/nearcore/blob/8e0b26f161b43ef03e41342cd08e1363dfa23db9/core/store/src/db/rocksdb.rs#L475)
takes a while. So the node trying to state sync can't request parts from
the other node because it doesn't have a snapshot yet, and it doesn't
retry its request until long after the current epoch is over and the
other node has already deleted that snapshot. The fact that that loop in
`create_checkpoint()` takes so long is something that perhaps we could
look into, but it's not an issue in practice with long epoch lengths, so
it's not a super high priority thing that should be causing test
failures.

Fix it by just deleting that test and modifying the test loop state sync
test to get the same conditions where we have fewer accounts than shards
so that one of them is empty. (Although it should be noted that with
nightly features, the state won't actually be empty since there'll be
`BandwidthSchedulerState` data in the shards without accounts)
  • Loading branch information
marcelo-gonzalez authored Nov 18, 2024
1 parent 1a3382c commit 7dfdfb0
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 151 deletions.
79 changes: 58 additions & 21 deletions integration-tests/src/test_loop/tests/state_sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,7 @@ use std::collections::HashMap;

const EPOCH_LENGTH: BlockHeightDelta = 40;

struct ShardAccounts {
boundary_accounts: Vec<String>,
accounts: Vec<Vec<(AccountId, Nonce)>>,
}

fn generate_accounts(num_shards: usize) -> ShardAccounts {
let accounts_per_shard = 5;

fn get_boundary_accounts(num_shards: usize) -> Vec<String> {
if num_shards > 27 {
todo!("don't know how to include more than 27 shards yet!");
}
Expand All @@ -41,7 +34,11 @@ fn generate_accounts(num_shards: usize) -> ShardAccounts {
}
boundary_accounts.push(boundary_account);
}
boundary_accounts
}

fn generate_accounts(boundary_accounts: &[String]) -> Vec<Vec<(AccountId, Nonce)>> {
let accounts_per_shard = 5;
let mut accounts = Vec::new();
let mut account_base = "0";
for a in boundary_accounts.iter() {
Expand All @@ -58,23 +55,24 @@ fn generate_accounts(num_shards: usize) -> ShardAccounts {
.collect::<Vec<_>>(),
);

ShardAccounts { boundary_accounts, accounts }
accounts
}

struct TestState {
env: TestLoopEnv,
accounts: Vec<Vec<(AccountId, Nonce)>>,
accounts: Option<Vec<Vec<(AccountId, Nonce)>>>,
}

fn setup_initial_blockchain(
num_validators: usize,
num_shards: usize,
generate_shard_accounts: bool,
chunks_produced: HashMap<ShardId, Vec<bool>>,
) -> TestState {
let builder = TestLoopBuilder::new();

let num_block_producer_seats = 1;
let num_chunk_producer_seats = num_shards;
let num_validators = std::cmp::max(num_block_producer_seats, num_chunk_producer_seats);
let validators = (0..num_validators)
.map(|i| {
let account_id = format!("node{}", i);
Expand All @@ -88,7 +86,9 @@ fn setup_initial_blockchain(
.collect::<Vec<_>>();
let clients = validators.iter().map(|v| v.account_id.clone()).collect::<Vec<_>>();

let ShardAccounts { boundary_accounts, accounts } = generate_accounts(num_shards);
let boundary_accounts = get_boundary_accounts(num_shards);
let accounts =
if generate_shard_accounts { Some(generate_accounts(&boundary_accounts)) } else { None };

let mut genesis_builder = TestGenesisBuilder::new();
genesis_builder
Expand All @@ -111,9 +111,11 @@ fn setup_initial_blockchain(
// This part is the only reference to state sync at all in this test, since all we check is that the blockchain
// progresses for a few epochs, meaning that state sync must have been successful.
.shuffle_shard_assignment_for_chunk_producers(true);
for accounts in accounts.iter() {
for (account, _nonce) in accounts.iter() {
genesis_builder.add_user_account_simple(account.clone(), 10000 * ONE_NEAR);
if let Some(accounts) = accounts.as_ref() {
for accounts in accounts.iter() {
for (account, _nonce) in accounts.iter() {
genesis_builder.add_user_account_simple(account.clone(), 10000 * ONE_NEAR);
}
}
}
let (genesis, epoch_config_store) = genesis_builder.build();
Expand Down Expand Up @@ -198,7 +200,7 @@ fn send_txs_between_shards(
/// runs the network and sends transactions at the beginning of each epoch. At the end the condition we're
/// looking for is just that a few epochs have passed, because that should only be possible if state sync was successful
/// (which will be required because we enable chunk producer shard shuffling on this chain)
fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Vec<Vec<(AccountId, Nonce)>>) {
fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Option<Vec<Vec<(AccountId, Nonce)>>>) {
let handle = env.datas[0].client_sender.actor_handle();
let client = &env.test_loop.data.get(&handle).client;
let mut tip = client.chain.head().unwrap();
Expand Down Expand Up @@ -228,7 +230,9 @@ fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Vec<Vec<(AccountId, Nonce
if epoch_id_switches > 2 {
break;
}
send_txs_between_shards(&mut env.test_loop, &env.datas, &mut accounts);
if let Some(accounts) = accounts.as_mut() {
send_txs_between_shards(&mut env.test_loop, &env.datas, accounts);
}
}
tip = new_tip;
}
Expand All @@ -242,7 +246,9 @@ fn run_test(state: TestState) {
* u32::try_from(EPOCH_LENGTH).unwrap_or(u32::MAX)
+ Duration::seconds(2);

send_txs_between_shards(&mut env.test_loop, &env.datas, &mut accounts);
if let Some(accounts) = accounts.as_mut() {
send_txs_between_shards(&mut env.test_loop, &env.datas, accounts);
}

env.test_loop.run_until(
|data| {
Expand All @@ -260,17 +266,42 @@ fn run_test(state: TestState) {

#[derive(Debug)]
struct StateSyncTest {
num_validators: usize,
num_shards: usize,
// If true, generate several extra accounts per shard. We have a test with this disabled
// to test state syncing shards without any account data
generate_shard_accounts: bool,
chunks_produced: &'static [(ShardId, &'static [bool])],
}

static TEST_CASES: &[StateSyncTest] = &[
// The first two make no modifications to chunks_produced, and all chunks should be produced. This is the normal case
StateSyncTest { num_shards: 2, chunks_produced: &[] },
StateSyncTest { num_shards: 4, chunks_produced: &[] },
StateSyncTest {
num_validators: 2,
num_shards: 2,
generate_shard_accounts: true,
chunks_produced: &[],
},
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[],
},
// In this test we have 2 validators and 4 shards, and we don't generate any extra accounts.
// That makes 3 accounts ncluding the "near" account. This means at least one shard will have no
// accounts in it, so we check that corner case here.
StateSyncTest {
num_validators: 2,
num_shards: 4,
generate_shard_accounts: false,
chunks_produced: &[],
},
// Now we miss some chunks at the beginning of the epoch
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[
(ShardId::new(0), &[false]),
(ShardId::new(1), &[true]),
Expand All @@ -279,11 +310,15 @@ static TEST_CASES: &[StateSyncTest] = &[
],
},
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[(ShardId::new(0), &[true, false]), (ShardId::new(1), &[true, false])],
},
StateSyncTest {
num_validators: 4,
num_shards: 4,
generate_shard_accounts: true,
chunks_produced: &[
(ShardId::new(0), &[false, true]),
(ShardId::new(2), &[true, false, true]),
Expand All @@ -298,7 +333,9 @@ fn slow_test_state_sync_current_epoch() {
for t in TEST_CASES.iter() {
tracing::info!("run test: {:?}", t);
let state = setup_initial_blockchain(
t.num_validators,
t.num_shards,
t.generate_shard_accounts,
t.chunks_produced
.iter()
.map(|(shard_id, produced)| (*shard_id, produced.to_vec()))
Expand Down Expand Up @@ -357,7 +394,7 @@ fn spam_state_sync_header_reqs(env: &mut TestLoopEnv) {
fn slow_test_state_request() {
init_test_logger();

let TestState { mut env, .. } = setup_initial_blockchain(4, HashMap::default());
let TestState { mut env, .. } = setup_initial_blockchain(4, 4, false, HashMap::default());

spam_state_sync_header_reqs(&mut env);
env.shutdown_and_drain_remaining_events(Duration::seconds(3));
Expand Down
128 changes: 0 additions & 128 deletions integration-tests/src/tests/client/sync_state_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -292,134 +292,6 @@ fn ultra_slow_test_sync_state_nodes_multishard() {
});
}

/// Start a validator that validators four shards. Since we only have 3 accounts one shard must have
/// empty state. Start another node that does state sync. Check state sync on empty state works.
#[test]
fn ultra_slow_test_sync_empty_state() {
heavy_test(|| {
init_integration_logger();

let mut genesis = Genesis::test_sharded_new_version(
vec!["test1".parse().unwrap(), "test2".parse().unwrap()],
1,
vec![1, 1, 1, 1],
);
genesis.config.epoch_length = 20;

let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap());
let dir1 = _dir1.clone();
let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap());
let dir2 = _dir2.clone();

run_actix(async move {
let (port1, port2) =
(tcp::ListenerAddr::reserve_for_test(), tcp::ListenerAddr::reserve_for_test());

// State sync triggers when header head is two epochs in the future.
// Produce more blocks to make sure that state sync gets triggered when the second node starts.
let state_sync_horizon = 10;
let block_header_fetch_horizon = 1;
let block_fetch_horizon = 1;

let mut near1 = load_test_config("test1", port1, genesis.clone());
near1.client_config.min_num_peers = 0;
near1.client_config.min_block_production_delay = Duration::milliseconds(200);
near1.client_config.max_block_production_delay = Duration::milliseconds(400);

let nearcore::NearNode { view_client: view_client1, .. } =
start_with_config(dir1.path(), near1).expect("start_with_config");

let view_client2_holder = Arc::new(RwLock::new(None));
let arbiters_holder = Arc::new(RwLock::new(vec![]));
let arbiters_holder2 = arbiters_holder;

WaitOrTimeoutActor::new(
Box::new(move |_ctx| {
if view_client2_holder.read().unwrap().is_none() {
let view_client2_holder2 = view_client2_holder.clone();
let arbiters_holder2 = arbiters_holder2.clone();
let genesis2 = genesis.clone();
let dir2 = dir2.clone();

let actor = view_client1.send(GetBlock::latest().with_span_context());
let actor = actor.then(move |res| {
match &res {
Ok(Ok(b)) if b.header.height >= state_sync_horizon + 1 => {
let mut view_client2_holder2 =
view_client2_holder2.write().unwrap();
let mut arbiters_holder2 = arbiters_holder2.write().unwrap();

if view_client2_holder2.is_none() {
let mut near2 = load_test_config("test2", port2, genesis2);
near2.network_config.peer_store.boot_nodes =
convert_boot_nodes(vec![("test1", *port1)]);
near2.client_config.min_num_peers = 1;
near2.client_config.min_block_production_delay =
Duration::milliseconds(200);
near2.client_config.max_block_production_delay =
Duration::milliseconds(400);
near2.client_config.block_header_fetch_horizon =
block_header_fetch_horizon;
near2.client_config.block_fetch_horizon =
block_fetch_horizon;
near2.client_config.tracked_shards = vec![ShardId::new(0)]; // Track all shards.

let nearcore::NearNode {
view_client: view_client2,
arbiters,
..
} = start_with_config(dir2.path(), near2)
.expect("start_with_config");
*view_client2_holder2 = Some(view_client2);
*arbiters_holder2 = arbiters;
}
}
Ok(Ok(b)) if b.header.height <= state_sync_horizon => {
println!("FIRST STAGE {}", b.header.height)
}
Err(_) => return future::ready(()),
_ => {}
};
future::ready(())
});
actix::spawn(actor);
}

if let Some(view_client2) = &*view_client2_holder.write().unwrap() {
let actor = view_client2.send(GetBlock::latest().with_span_context());
let actor = actor.then(|res| {
match &res {
Ok(Ok(b)) if b.header.height >= 40 => System::current().stop(),
Ok(Ok(b)) if b.header.height < 40 => {
println!("SECOND STAGE {}", b.header.height)
}
Ok(Err(e)) => {
println!("SECOND STAGE ERROR1: {:?}", e);
return future::ready(());
}
Err(e) => {
println!("SECOND STAGE ERROR2: {:?}", e);
return future::ready(());
}
_ => {
assert!(false);
}
};
future::ready(())
});
actix::spawn(actor);
}
}),
100,
600000,
)
.start();
});
drop(_dir1);
drop(_dir2);
});
}

#[test]
// FIXME(#9650): locks should not be held across await points, allowed currently only because the
// lint started triggering during a toolchain bump.
Expand Down
2 changes: 0 additions & 2 deletions nightly/expensive.txt
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,6 @@ expensive integration-tests integration_tests tests::client::process_blocks::ult
expensive integration-tests integration_tests tests::client::process_blocks::ultra_slow_test_gc_after_state_sync --features nightly
expensive integration-tests integration_tests tests::client::process_blocks::ultra_slow_test_process_block_after_state_sync
expensive integration-tests integration_tests tests::client::process_blocks::ultra_slow_test_process_block_after_state_sync --features nightly
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_empty_state
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_empty_state --features nightly
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_state_dump
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_state_dump --features nightly
expensive integration-tests integration_tests tests::client::sync_state_nodes::ultra_slow_test_sync_state_nodes
Expand Down

0 comments on commit 7dfdfb0

Please sign in to comment.