refactor our test separation and how we run them (near#12458)

This fundamentally removes the differentiation between "integration" and "unit" tests from the test runner perspective, as well as removes the reliance on the "expensive-tests" compile-time feature. Instead tests are categorized by the underlying reason for which we had the separation in the first place -- test execution speed. These are now annotated with `slow_test_` and `ultraslow_test_` prefixes, and their execution time is enforced by time limits in `nextest.toml`. By using `nextest` filters we can cleanly filter out tests from these three categories to be run in various contexts. For instance, maybe quick checks locally don't need to run the slow tests every time -- `just nextest` by default now takes just 9 seconds (on my machine) to run all the tests. Whereas previously (due to e.g. estimator smoke test) it would take at least a minute. Then a `just nextest-slow` is added in the rarer cases where one might want to run the full suite that runs on CI. CI will run these mildly slower tests for each PR. Then what remains are the ultraslow tests which only run on nayduck and can take quite some time to complete. These can now be trivially run locally as well with `just nextest-all` without necessarily rebuilding the entire project or "unignoring" tests that are marked as ignored for other reasons (like for instance because they're just broken.)
saketh-are · Nov 18, 2024 · 31a2ba0 · 31a2ba0
1 parent 6dab9ee
commit 31a2ba0
Show file tree

Hide file tree

Showing 63 changed files with 521 additions and 672 deletions.
diff --git a/.config/nextest.toml b/.config/nextest.toml
@@ -1,24 +1,45 @@
+# By default tests should not take more than 5 seconds to execute and will get killed after 15
+# seconds.
+#
+# If that happens, instead of making the period longer here, change the test name to one of the
+# next test "speed tiers" by using `slow_test_` or `ultra_slow_test` prefix for the test's name.
 [profile.default]
-slow-timeout = { period = "60s", terminate-after = 3, grace-period = "0s" }
+slow-timeout = { period = "5s", terminate-after = 3, grace-period = "1s" }
+default-filter = "not (test(/^(.*::slow_test|slow_test)/) | test(/^(.*::ultra_slow_test|ultra_slow_test)/))"
+final-status-level = "slow"
 
+# Tests that take a longer time to execute. `slow_test*` tests are given 1m30s to execute.
 [[profile.default.overrides]]
-filter = 'test(test_full_estimator)'
-slow-timeout = { period = "10m", terminate-after = 3 }
-retries = 0
-threads-required = 2
+filter = 'test(/^(.*::slow_test|slow_test)/)'
+slow-timeout = { period = "30s", terminate-after = 3, grace-period = "1s" }
+
+# Tests that take longer than the heat death of the universe of time to execute. Consider making
+# the test faster, but if you must `ultra_slow_test*` tests are given 1h to execute. These are
+# generally left for nayduck to execute.
+[[profile.default.overrides]]
+filter = 'test(/^(.*::ultra_slow_test|ultra_slow_test)/)'
+slow-timeout = { period = "30m", terminate-after = 2, grace-period = "1s" }
+
 
 # Unfortunately no support for inheriting profiles yet:
 # https://github.com/nextest-rs/nextest/issues/387
 [profile.ci]
-slow-timeout = { period = "120s", terminate-after = 5 }
+slow-timeout = { period = "5s", terminate-after = 5, grace-period = "1s" }
+default-filter = "not test(/^(.*::ultra_slow_test|ultra_slow_test)/)"
 # Try a few times before failing the whole test suite on a potentially spurious tests.
 # The hope is that people will fix the spurious tests as they encounter them locally...
 retries = { backoff = "fixed", count = 3, delay = "1s" }
 failure-output = "final"
 fail-fast = false
 
+# Tests that take a longer time to execute. `slow_test*` tests are given 1m30s to execute.
+[[profile.ci.overrides]]
+filter = 'test(/^(.*::slow_test|slow_test)/)'
+slow-timeout = { period = "30s", terminate-after = 3, grace-period = "1s" }
+
+# Tests that take longer than the heat death of the universe of time to execute. Consider making
+# the test faster, but if you must `ultra_slow_test*` tests are given 1h to execute. These are
+# generally left for nayduck to execute.
 [[profile.ci.overrides]]
-filter = 'test(test_full_estimator)'
-slow-timeout = { period = "10m", terminate-after = 3 }
-retries = 0
-threads-required = 2
+filter = 'test(/^(.*::ultra_slow_test|ultra_slow_test)/)'
+slow-timeout = { period = "30m", terminate-after = 2, grace-period = "1s" }
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -27,19 +27,16 @@ jobs:
             id: linux
             os: ubuntu-22.04-16core
             type: stable
-            runs_integ_tests: true
             upload_profraws: true
           - name: Linux Nightly
             id: linux-nightly
             os: ubuntu-22.04-16core
             type: nightly
-            runs_integ_tests: true
             upload_profraws: true
           - name: MacOS
             id: macos
             os: macos-latest-xlarge
             type: stable
-            runs_integ_tests: false
             # TODO: Currently only computing linux coverage, because the MacOS runners
             # have files at a different path and thus comes out duplicated.
             upload_profraws: false
@@ -53,18 +50,11 @@ jobs:
           tool: just,cargo-nextest,cargo-llvm-cov
 
       # Run the tests:
-      - run: mkdir -p coverage/profraw/{unit,integration,binaries}
+      - run: mkdir -p coverage/profraw/{unit,binaries}
       # - Run the unit tests, retrieving the coverage information
-      - run: just codecov-ci "nextest-unit ${{ matrix.type }}"
+      - run: just codecov-ci "nextest-slow ${{ matrix.type }}"
       - run: mv coverage/codecov/{new,unit-${{matrix.id}}}.json
       - run: mv coverage/profraw/{new,unit/${{matrix.id}}}.tar.zst
-      # - Run the integration tests, retrieving the coverage information
-      - run: just codecov-ci "nextest-integration ${{ matrix.type }}"
-        if: matrix.runs_integ_tests
-      - run: mv coverage/codecov/{new,integration-${{matrix.id}}}.json
-        if: matrix.runs_integ_tests
-      - run: mv coverage/profraw/{new,integration/${{matrix.id}}}.tar.zst
-        if: matrix.runs_integ_tests
 
       # Cleanup the target directory, leaving only stuff interesting to llvm-cov, and tarball it
       - run: just tar-bins-for-coverage-ci
@@ -348,8 +338,6 @@ jobs:
         include:
           - type: unit
             profraws: unit
-          - type: integration
-            profraws: unit integration
     steps:
       - uses: actions/checkout@v4
         with:
@@ -423,21 +411,6 @@ jobs:
           files: unit-macos.json
           fail_ci_if_error: true
           flags: unittests,macos
-      - uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
-        with:
-          files: integration-linux.json
-          fail_ci_if_error: true
-          flags: integration-tests,linux
-      - uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
-        with:
-          files: integration-linux-nightly.json
-          fail_ci_if_error: true
-          flags: integration-tests,linux-nightly
-      # - uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
-      #   with:
-      #     files: integration-macos.json
-      #     fail_ci_if_error: true
-      #     flags: integration-tests,macos
       - uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
         with:
           files: py-backward-compat.json
@@ -463,7 +436,7 @@ jobs:
           files: py-upgradability.json
           fail_ci_if_error: true
           flags: pytests,upgradability,linux
-  
+
   windows_public_libraries_check:
     name: "Windows check for building public libraries"
     runs-on: "windows-latest"

diff --git a/Justfile b/Justfile
@@ -1,18 +1,14 @@
 # FIXME: some of these tests don't work very well on MacOS at the moment. Should fix
 # them at earliest convenience :)
 # Also in addition to this, the `nextest-integration` test is currently disabled on macos
-with_macos_excludes := if os() == "macos" {
-    "--exclude node-runtime --exclude runtime-params-estimator --exclude near-network --exclude estimator-warehouse"
-} else {
-    ""
-}
-# On MacOS, not all structs are collected by `inventory`. Non-incremental build fixes that.
-# See https://github.com/dtolnay/inventory/issues/52.
-with_macos_incremental := if os() == "macos" {
-    "CARGO_INCREMENTAL=0"
+platform_excludes := if os() == "macos" {
+    "--exclude node-runtime --exclude runtime-params-estimator --exclude near-network --exclude estimator-warehouse --exclude integration-tests"
+} else if os() == "windows" {
+    "--exclude node-runtime --exclude runtime-params-estimator --exclude near-network --exclude estimator-warehouse --exclude integration-tests"
 } else {
     ""
 }
+
 nightly_flags := "--features nightly,test_features"
 public_libraries := "-p near-primitives -p near-crypto -p near-jsonrpc-primitives -p near-chain-configs -p near-primitives-core"
 
@@ -39,46 +35,23 @@ test-ci *FLAGS: check-cargo-fmt \
 # tests that are as close to CI as possible, but not exactly the same code
 test-extra: check-lychee
 
-# all cargo tests, TYPE is "stable" or "nightly"
-nextest TYPE *FLAGS: (nextest-unit TYPE FLAGS) (nextest-integration TYPE FLAGS)
-
-# cargo unit tests, TYPE is "stable" or "nightly"
-nextest-unit TYPE *FLAGS:
-    RUSTFLAGS="-D warnings" \
+# all cargo tests,
+# TYPE is one of "stable", "nightly"
+nextest TYPE *FLAGS:
+    env RUSTFLAGS="-D warnings" \
     cargo nextest run \
         --locked \
         --workspace \
-        --exclude integration-tests \
         --cargo-profile dev-release \
         {{ ci_hack_nextest_profile }} \
-        {{ with_macos_excludes }} \
+        {{ platform_excludes }} \
         {{ if TYPE == "nightly" { nightly_flags } \
            else if TYPE == "stable" { "" } \
            else { error("TYPE is neighter 'nightly' nor 'stable'") } }} \
         {{ FLAGS }}
 
-# cargo integration tests, TYPE is "stable" or "nightly"
-[linux]
-nextest-integration TYPE *FLAGS:
-    RUSTFLAGS="-D warnings" \
-    cargo nextest run \
-        --locked \
-        --package integration-tests \
-        --cargo-profile dev-release \
-        {{ ci_hack_nextest_profile }} \
-        {{ if TYPE == "nightly" { nightly_flags } \
-           else if TYPE == "stable" { "" } \
-           else { error("TYPE is neither 'nightly' nor 'stable'") } }} \
-        {{ FLAGS }}
-# Note: when re-enabling this on macos, ci.yml will need to be adjusted to report code coverage again
-[macos]
-nextest-integration TYPE *FLAGS:
-    @echo "Nextest integration tests are currently disabled on macos!"
-
-[windows]
-nextest-integration TYPE *FLAGS:
-    @echo "Nextest integration tests are currently disabled on windows!"
-
+nextest-slow TYPE *FLAGS: (nextest TYPE "--ignore-default-filter -E 'default() + test(/^(.*::slow_test|slow_test)/)'" FLAGS)
+nextest-all TYPE *FLAGS: (nextest TYPE "--ignore-default-filter -E 'all()'" FLAGS)
 
 doctests:
     cargo test --doc
@@ -164,22 +137,18 @@ check-lychee:
              else { "Note: 'Too Many Requests' errors are allowed here but not in CI, set GITHUB_TOKEN to check them" } }}
 
 # check tools/protocol-schema-check/res/protocol_schema.toml
-check-protocol-schema: install-rustc-nightly
+# On MacOS, not all structs are collected by `inventory`. Non-incremental build fixes that.
+# See https://github.com/dtolnay/inventory/issues/52.
+protocol_schema_env := "CARGO_TARGET_DIR=" + justfile_directory() + "/target/schema-check RUSTC_BOOTSTRAP=1 RUSTFLAGS='--cfg enable_const_type_id' CARGO_INCREMENTAL=0"
+check-protocol-schema:
     # Below, we *should* have been used `cargo +nightly ...` instead of
     # `RUSTC_BOOTSTRAP=1`. However, the env var appears to be more stable.
     # `nightly` builds are updated daily and may be broken sometimes, e.g.
     # https://github.com/rust-lang/rust/issues/130769.
     #
     # If there is an issue with the env var, fall back to `cargo +nightly ...`.
-
-    # Test that checker is not broken
-    RUSTC_BOOTSTRAP=1 RUSTFLAGS="--cfg enable_const_type_id" \
-        cargo test -p protocol-schema-check --profile dev-artifacts
-
-    # Run the checker
-    RUSTC_BOOTSTRAP=1 RUSTFLAGS="--cfg enable_const_type_id" \
-        {{ with_macos_incremental }} \
-        cargo run -p protocol-schema-check --profile dev-artifacts
+    env {{protocol_schema_env}} cargo test -p protocol-schema-check --profile dev-artifacts
+    env {{protocol_schema_env}} cargo run -p protocol-schema-check --profile dev-artifacts
 
 check_build_public_libraries:
     cargo check {{public_libraries}}
diff --git a/chain/chain/src/tests/doomslug.rs b/chain/chain/src/tests/doomslug.rs
@@ -284,8 +284,7 @@ fn one_iter(
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_fuzzy_doomslug_liveness_and_safety() {
+fn ultra_slow_test_fuzzy_doomslug_liveness_and_safety() {
     for (time_to_gst_millis, height_goal) in
         &[(0, 200), (1000, 200), (10000, 300), (100000, 400), (500000, 500)]
     {

diff --git a/chain/chain/src/tests/garbage_collection.rs b/chain/chain/src/tests/garbage_collection.rs
@@ -350,8 +350,7 @@ fn test_gc_remove_fork_small() {
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_gc_remove_fork_large() {
+fn ultra_slow_test_gc_remove_fork_large() {
     test_gc_remove_fork_common(20)
 }
 
@@ -397,8 +396,7 @@ fn test_gc_not_remove_fork_small() {
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_gc_not_remove_fork_large() {
+fn ultra_slow_test_gc_not_remove_fork_large() {
     test_gc_not_remove_fork_common(20)
 }
 
@@ -416,7 +414,7 @@ fn test_gc_not_remove_longer_fork() {
 
 // This test creates forks from genesis
 #[test]
-fn test_gc_forks_from_genesis() {
+fn slow_test_gc_forks_from_genesis() {
     for fork_length in 1..=10 {
         let chains = vec![
             SimpleChain { from: 0, length: 101, is_removed: false },
@@ -450,7 +448,7 @@ fn test_gc_forks_from_genesis() {
 }
 
 #[test]
-fn test_gc_overlap() {
+fn slow_test_gc_overlap() {
     for max_changes in 1..=20 {
         let chains = vec![
             SimpleChain { from: 0, length: 101, is_removed: false },
@@ -483,8 +481,7 @@ fn test_gc_boundaries_small() {
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_gc_boundaries_large() {
+fn ultra_slow_test_gc_boundaries_large() {
     test_gc_boundaries_common(20)
 }
 
@@ -507,13 +504,12 @@ fn test_gc_random_common(runs: u64) {
 }
 
 #[test]
-fn test_gc_random_small() {
+fn slow_test_gc_random_small() {
     test_gc_random_common(3);
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_gc_random_large() {
+fn ultra_slow_test_gc_random_large() {
     test_gc_random_common(25);
 }
 
@@ -545,8 +541,7 @@ fn test_gc_pine_small() {
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_gc_pine() {
+fn ultra_slow_test_gc_pine() {
     for max_changes in 1..=20 {
         let mut chains = vec![SimpleChain { from: 0, length: 101, is_removed: false }];
         for i in 1..100 {
@@ -578,8 +573,7 @@ fn test_gc_star_small() {
 }
 
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
-fn test_gc_star_large() {
+fn ultra_slow_test_gc_star_large() {
     test_gc_star_common(20)
 }
 
@@ -834,9 +828,8 @@ fn test_clear_old_data_fixed_height() {
 
 /// Test that `gc_blocks_limit` works properly
 #[test]
-#[cfg_attr(not(feature = "expensive_tests"), ignore)]
 #[allow(unreachable_code)]
-fn test_clear_old_data_too_many_heights() {
+fn ultra_slow_test_clear_old_data_too_many_heights() {
     // TODO(#10634): panics on `clear_data` -> `clear_resharding_data` ->
     // `MockEpochManager::is_next_block_epoch_start` apparently because
     // epoch manager is not updated at all. Should we fix it together with

diff --git a/chain/client/src/sync/header.rs b/chain/client/src/sync/header.rs
@@ -651,7 +651,7 @@ mod test {
     /// the peer doesn't get banned. (specifically, that the expected height downloaded gets properly
     /// adjusted for time passed)
     #[test]
-    fn test_slow_header_sync() {
+    fn slow_test_slow_header_sync() {
         let network_adapter = Arc::new(MockPeerManagerAdapter::default());
         let highest_height = 1000;
 
@@ -751,7 +751,7 @@ mod test {
     }
 
     #[test]
-    fn test_sync_from_very_behind() {
+    fn slow_test_sync_from_very_behind() {
         let mock_adapter = Arc::new(MockPeerManagerAdapter::default());
         let mut header_sync = HeaderSync::new(
             Clock::real(),

diff --git a/chain/client/src/tests/bug_repros.rs b/chain/client/src/tests/bug_repros.rs
@@ -30,7 +30,7 @@ use near_primitives::block::Block;
 use near_primitives::transaction::SignedTransaction;
 
 #[test]
-fn repro_1183() {
+fn slow_test_repro_1183() {
     init_test_logger();
     run_actix(async {
         let connectors: Arc<RwLock<Vec<ActorHandlesForTesting>>> = Arc::new(RwLock::new(vec![]));
@@ -169,7 +169,7 @@ fn repro_1183() {
 }
 
 #[test]
-fn test_sync_from_archival_node() {
+fn slow_test_sync_from_archival_node() {
     init_test_logger();
     let vs = ValidatorSchedule::new().num_shards(4).block_producers_per_epoch(vec![vec![
         "test1".parse().unwrap(),
@@ -279,7 +279,7 @@ fn test_sync_from_archival_node() {
 }
 
 #[test]
-fn test_long_gap_between_blocks() {
+fn slow_test_long_gap_between_blocks() {
     init_test_logger();
     let vs = ValidatorSchedule::new()
         .num_shards(2)