# Cargo.toml

# For optimal build performance, the implementation is split into multiple
# crates:
#
# - A "common" crate featuring the logic shared by all floating-point operations
# - A crate per floating-point operation that we may want to benchmark
# - A "subwoofer" root crate that receives configuration (via cargo features)
#   and orchestrates benchmark runs accordingly
#   * ...which defines one benchmark per supported floating-point type
#
# These crates are grouped in a workspace where common properties are defined...

[workspace]
# Member crates, one directory each, kept in alphabetical order
members = [
    "addsub",
    "common",
    "div_denominator_min",
    "div_numerator_max",
    "fma_addend_max",
    "fma_full_max_mul",
    "fma_multiplier_bidi",
    "max",
    "mul_max",
    "sqrt_positive_max",
]

[workspace.package]
# Package metadata defined once for the whole workspace. The root "subwoofer"
# package below inherits each of these keys via `<key>.workspace = true`.
authors = ["Hadrien G. <knights_of_ni@gmx.com>"]
categories = ["command-line-utilities", "hardware-support", "science"]
edition = "2021"
keywords = ["benchmark", "cpu", "denormals", "floating-point", "subnormals"]
license = "MPL-2.0"
repository = "https://github.com/HadrienG2/subwoofer.git"
version = "1.0.0"

[workspace.dependencies]
# Dependency versions shared across the workspace, sorted alphabetically.
# Crates opt in with `<name>.workspace = true`.
common = { path = "./common" }
criterion = { version = "0.5", default-features = false }
hwlocality = "1.0.0-alpha"
pessimize = { version = "2.0", features = ["nightly"] }
rand = "0.8.5"
target-features = "0.1"

[profile.bench]
# Sadly needed for pessimize::hide() to be compiled efficiently. Without it, no
# amount of inlining and LTO can save rustc nightly 2024-12-06 from spilling
# accumulators to memory in benchmarks with memory data sources. It is not clear
# to me why that is the case, most likely multiple codegen units get in the way
# of some important whole-program optimization.
codegen-units = 1
# Good for perf profiling: keeps only the line tables, which is enough to map
# profiler samples back to source lines without emitting full debug info
debug = "line-tables-only"

[profile.bench.package."*"]
# Only workspace members need top optimizations, dependencies can use less
# aggressive optimization settings in order to improve build performance
#
# The "*" override applies to every dependency that is not a workspace member.
codegen-units = 256
# NOTE(review): Cargo documents incremental compilation as only being used for
# workspace members and path dependencies, so this setting presumably has no
# effect on registry dependencies - confirm whether it is still needed.
incremental = true
opt-level = 2

[profile.release]
# May slightly improve build and runtime perf
#
# NOTE(review): Cargo's profile documentation states that tests and benchmarks
# ignore the `panic` setting, so this presumably only takes effect for plain
# `--release` builds, not for `cargo bench` - confirm.
panic = "abort"

# ...and everything below this point is about the "subwoofer" root crate

[package]
name = "subwoofer"
version.workspace = true
authors.workspace = true
# Benchmark targets are declared explicitly in the [[bench]] sections of this
# manifest rather than being auto-discovered
autobenches = false
categories.workspace = true
edition.workspace = true
keywords.workspace = true
license.workspace = true
repository.workspace = true
description = "Assessing the impact of subnormals on your CPU's performance"

[features]
# By default, we enable cargo bench support and the quick "check" profile.
#
# If you are running this benchmark on hardware which you only have temporary
# access to, consider --all-features to check everything that the benchmark can
# possibly measure, at the expense of extremely long compilation and execution
# time (we're talking about multiple days of execution).
#
# More fine-grained options are described below.
default = ["cargo_bench_support", "check"]

### High-level profiles ###

# Run enough benchmarks to tell if subnormals are a problem on your hardware
#
# This configuration exercises all supported hardware operations for scalar
# floating-point data in the L1 cache, at a coarse subnormal frequency
# granularity.
#
# This configuration runs relatively quickly (<1h) and should be enough to
# qualitatively tell whether subnormals affect floating-point performance on
# your hardware, and if so which operations are affected. It is not enough to
# precisely tell how much the hardware operations are affected, whether the
# impact is type-dependent, how it depends on subnormal frequency, etc.
check = [
    "bench_addsub",              # ADD/SUB (both are assumed to behave the same)
    "bench_max",                 # MAX is assumed to be same as MIN
    "bench_mul_max",             # Includes subnormal MUL -> subnormal MAX
    "bench_sqrt_positive_max",   # Includes subnormal SQRT -> maybe-subnormal MAX
    "bench_div_numerator_max",   # Includes DIV with subnormal numerator -> subnormal MAX
    "bench_div_denominator_min", # Includes DIV with subnormal denominator -> infinite MIN
    "bench_fma_full_max_mul",    # Includes subnormal FMA -> subnormal MAX -> normal MUL
]

# Run enough benchmarks to check how good/bad the generated code is
#
# This configuration makes sure that all qualitatively different code paths are
# generated and exercised by cargo bench, so that they appear in the output of
# profilers like perf or can be analyzed by static analysis tools like Cutter or
# MAQAO. It does not increase the number of execution configurations, so it is
# not suitable for quantitative analysis.
measure_codegen = ["bench_all", "register_data_sources", "simd"]

# Run enough benchmarks to quantitatively assess the perf impact of subnormals
#
# This configuration contains everything needed to quantitatively assess the
# worst-case impact of subnormals on typical hardware. It builds upon
# "measure_codegen" and additionally enables a finer subnormal frequency
# resolution than the default one.
measure = [
    "measure_codegen",
    "subnormal_freq_resolution_1in64",
]

### Fine-grained options ###

# Enable support for cargo bench
#
# Without this, "cargo bench" runs do not generate criterion reports, and one
# needs to use "cargo criterion" instead. Disabling this feature speeds up the
# criterion build, because benchmarks then do not need to bundle a full report
# generator.
cargo_bench_support = ["criterion/cargo_bench_support", "criterion/plotters", "criterion/rayon"]

# Full set of supported microbenchmarks
#
# In the comments below, "x%subnormal" refers to the configured share of
# subnormal values among the inputs of the operation.
bench_all = [
    "bench_addsub",              # ADD/SUB with x%subnormal input, normal result
    "bench_max",                 # MAX with x%subnormal input, normal result, assumed to behave just like MIN
    "bench_mul_max",             # MUL with x%subnormal input & result -> MAX with x%subnormal input
    "bench_sqrt_positive_max",   # SQRT with x%subnormal input, x²%subnormal result -> MAX with x²%subnormal input
    "bench_div_numerator_max",   # DIV with x%subnormal numerator, normal denominator -> x%subnormal MAX
    "bench_div_denominator_min", # DIV with x%subnormal denominator, normal numerator -> x%infinite MIN
    # NOTE(review): the comment on the next entry says MIN although the feature
    # is named *_max - confirm which one is right.
    "bench_fma_addend_max",      # FMA with x%subnormal addend, normal multiplier & result -> normal MIN
    "bench_fma_multiplier_bidi", # FMA with x%subnormal multiplier, normal addend & result
    "bench_fma_full_max_mul",    # FMA with x%subnormal multiplier and addend, x²%subnormal result -> MAX with x²%subnormal input -> normal MUL
]

# Fine-grained control over benchmark selection. Lets you speed up compilation
# when debugging a specific benchmark. Use with --no-default-features.
#
# Each bench_xyz feature does nothing but enable the optional "xyz" dependency,
# i.e. the workspace crate of the same name.
bench_addsub = ["dep:addsub"]
bench_div_denominator_min = ["dep:div_denominator_min"]
bench_div_numerator_max = ["dep:div_numerator_max"]
bench_fma_addend_max = ["dep:fma_addend_max"]
bench_fma_full_max_mul = ["dep:fma_full_max_mul"]
bench_fma_multiplier_bidi = ["dep:fma_multiplier_bidi"]
bench_max = ["dep:max"]
bench_mul_max = ["dep:mul_max"]
bench_sqrt_positive_max = ["dep:sqrt_positive_max"]

# By default, we only run benchmarks from data in the L1 cache, because that's
# where we expect the maximal subnormal impact outside of perhaps the
# in-registers configuration, and it's less artificial and less likely to be
# messed up by CPU µarch details than the in-registers configuration.
#
# Use this feature to test at all levels of the memory hierarchy instead, at the
# expense of a large increase in compilation and execution time...
more_data_sources = ["register_data_sources", "more_memory_data_sources"]

# ...or use these finer-grained features if you want to be more specific
#
# These are pure flags: they enable no dependency and no other feature.
# NOTE(review): presumably checked via cfg(feature = "...") in the crate
# sources, which are not visible from this manifest - confirm.
register_data_sources = []
more_memory_data_sources = []

# By default, we only run benchmarks with minimal instruction-level parallelism
# (a configuration which is latency-bound for all operations other than SQRT),
# with ~maximal ILP (hopefully throughput-bound), and with half the maximal ILP
# (as a sanity check, see below).
#
# This feature lets you cover all power-of-two degrees of ILP instead. Use it
# when you observe that the intermediate ILP configuration runs faster than the
# maximal ILP configuration, which suggests that the maximal ILP configuration
# is running into a codegen or microarchitectural bottleneck, e.g. it thrashes
# the CPU's instruction cache because there is too much code. In this situation,
# it is better to try all ILP configurations to make sure that you do cover the
# ILP configuration of highest runtime performance.
#
# The price to pay is a moderate increase in execution time.
more_ilp_configurations = []

# By default, we only check subnormal behavior in scalars.
#
# Enable this feature to also check vectors of all-normals or all-subnormals.
# This lets you check if the subnormal fallback logic is vectorized or scalar.
#
# The price to pay is a large increase in compilation and execution time.
#
# This forwards to the "simd" feature of the "common" crate, and it is required
# by all the vector benchmark targets declared at the end of this manifest.
simd = ["common/simd"]

# By default, we check with [0, 25, 50, 75, 100]% of subnormal inputs.
#
# If you notice that the subnormals-induced slowdown does not follow a simple
# pattern (e.g. linearly grows to a maximum at 100%, or linearly grows from 0%
# to 50% then linearly decays from 50% to 100%), then you should try enabling
# one of the following features to more precisely probe the overhead vs
# subnormal occurrence frequency curve.
#
# The suggested way to tune this for your hardware, if you have enough time, is:
#
# 1. Start with the maximal supported resolution configuration.
# 2. Check out the criterion report, find out how many data points you truly
#    need to faithfully probe the overhead vs subnormals frequency curve (e.g.
#    to sample the maximal overhead point with good precision).
# 3. If you need to re-run the benchmark on the same hardware later on, use only
#    this number of data points to reduce execution time.
#
# The price to pay for increasing subnormal frequency resolution is a
# multiplicative increase in execution time on benchmarks with memory inputs.
subnormal_freq_resolution_1in8 = []
subnormal_freq_resolution_1in16 = []
subnormal_freq_resolution_1in32 = []
subnormal_freq_resolution_1in64 = []
subnormal_freq_resolution_1in128 = []
# TODO: Add configurations and adjust "measure" configuration if we find
#       hardware where <1% resolution is still not enough

[dependencies]
# Dependencies of the root "subwoofer" crate, sorted alphabetically.
#
# Each per-operation benchmark crate is optional and only gets built when the
# matching bench_* feature enables it through the "dep:" syntax. The other
# dependencies take their version and features from [workspace.dependencies].
addsub = { path = "./addsub", optional = true }
common.workspace = true
criterion.workspace = true
div_denominator_min = { path = "./div_denominator_min", optional = true }
div_numerator_max = { path = "./div_numerator_max", optional = true }
fma_addend_max = { path = "./fma_addend_max", optional = true }
fma_full_max_mul = { path = "./fma_full_max_mul", optional = true }
fma_multiplier_bidi = { path = "./fma_multiplier_bidi", optional = true }
hwlocality.workspace = true
max = { path = "./max", optional = true }
mul_max = { path = "./mul_max", optional = true }
pessimize.workspace = true
rand.workspace = true
sqrt_positive_max = { path = "./sqrt_positive_max", optional = true }

[lib]
# Keep "cargo bench" from treating the library target as a benchmark: only the
# explicitly declared [[bench]] targets should be run.
bench = false

# One benchmark target per supported floating-point type.
#
# All targets set `harness = false`, which disables the default libtest
# benchmark harness so that the benchmark can provide its own entry point.
# NOTE(review): presumably that entry point comes from criterion - confirm
# against the benches/ sources.
#
# The scalar f32 and f64 targets are always available, whereas the vector
# targets are only built when the "simd" feature is enabled.
# NOTE(review): the xNN suffix presumably denotes the number of SIMD lanes -
# confirm.
[[bench]]
name = "f32"
harness = false

[[bench]]
name = "f32x04"
harness = false
required-features = ["simd"]

[[bench]]
name = "f32x08"
harness = false
required-features = ["simd"]

[[bench]]
name = "f32x16"
harness = false
required-features = ["simd"]

[[bench]]
name = "f64"
harness = false

[[bench]]
name = "f64x02"
harness = false
required-features = ["simd"]

[[bench]]
name = "f64x04"
harness = false
required-features = ["simd"]

[[bench]]
name = "f64x08"
harness = false
required-features = ["simd"]