diff --git a/tuner/tuner/libtuner.py b/tuner/tuner/libtuner.py index 95c488cc4..63c9c5fbc 100644 --- a/tuner/tuner/libtuner.py +++ b/tuner/tuner/libtuner.py @@ -221,9 +221,12 @@ def validate_devices(user_devices: list[str]) -> None: ) -def validate_benchmark_results( +def get_valid_benchmark_results( benchmark_results: list[BenchmarkResult], ) -> list[BenchmarkResult]: + """ + Filter the benchmark_results list to return list with finite `time` values. + """ filtered_benchmark_results = [r for r in benchmark_results if math.isfinite(r.time)] if len(filtered_benchmark_results) == 0: logging.error("No successful candidate benchmarks.") @@ -231,23 +234,29 @@ def validate_benchmark_results( return filtered_benchmark_results -def map_baseline_by_device(baseline_results: list[BenchmarkResult]) -> dict[str, float]: - return {r.device_id: r.time for r in baseline_results} +def check_baseline_devices_uniqueness(baseline_results: list[BenchmarkResult]) -> bool: + seen = set() + for result in baseline_results: + if result.device_id in seen: + return False + seen.add(result.device_id) + return True -def validate_baselines_device_ids_match( - first_baseline_by_device: dict[str, float], - second_baseline_by_device: dict[str, float], -) -> bool: - return first_baseline_by_device.keys() == second_baseline_by_device.keys() +def map_baseline_by_device(baseline_results: list[BenchmarkResult]) -> dict[str, float]: + return {r.device_id: r.time for r in baseline_results} -def validate_baseline_regression( - first_baseline_by_device: dict[str, float], - second_baseline_by_device: dict[str, float], +def detect_baseline_regression( + first_baseline_results: list[BenchmarkResult], + second_baseline_results: list[BenchmarkResult], ) -> list[str]: + """ + Detects performance regressions between two sets of baseline results. + """ regression_device_ids = [] - + first_baseline_by_device = map_baseline_by_device(first_baseline_results) + second_baseline_by_device = map_baseline_by_device(second_baseline_results) for device_id in first_baseline_by_device: if device_id not in second_baseline_by_device: continue @@ -899,7 +908,7 @@ def select_best_benchmark_results( baseline_results: list[BenchmarkResult], num_candidates: Optional[int], ) -> list[BenchmarkResult]: - filtered_candidate_results = validate_benchmark_results(candidate_results) + filtered_candidate_results = get_valid_benchmark_results(candidate_results) if len(filtered_candidate_results) == 0: logging.error("No successful candidate benchmarks.") return [] @@ -972,8 +981,8 @@ def benchmark( tuning_client=tuning_client, candidate_trackers=candidate_trackers, ) - - baseline_times_by_device = map_baseline_by_device(baseline_results) + if not check_baseline_devices_uniqueness(baseline_results): + logging.warning("Duplicate device IDs detected in the first baseline results.") candidate_indices = [i for i in compiled_candidates if i != 0] candidate_results = benchmark_candidates( @@ -992,15 +1001,17 @@ def benchmark( tuning_client=tuning_client, candidate_trackers=candidate_trackers, ) - post_baseline_times_by_device = map_baseline_by_device(post_baseline_results) - if validate_baselines_device_ids_match( - baseline_times_by_device, post_baseline_times_by_device - ): + if not check_baseline_devices_uniqueness: + logging.warning("Duplicate device IDs detected in the second baseline results.") + + first_baseline_by_device = map_baseline_by_device(baseline_results) + second_baseline_by_device = map_baseline_by_device(post_baseline_results) + if first_baseline_by_device.keys() != second_baseline_by_device.keys(): logging.warning("Device ID mismatch between baseline runs.") - regression_devices = validate_baseline_regression( - baseline_times_by_device, post_baseline_results + regression_devices = detect_baseline_regression( + baseline_results, post_baseline_results ) if regression_devices: logging.warning( diff --git a/tuner/tuner/libtuner_test.py b/tuner/tuner/libtuner_test.py index cda738470..00c661a42 100644 --- a/tuner/tuner/libtuner_test.py +++ b/tuner/tuner/libtuner_test.py @@ -238,57 +238,71 @@ def test_validate_benchmark_results(): libtuner.BenchmarkResult(0, math.inf, "hip://0"), ] - result = libtuner.validate_benchmark_results(benchmark_results) + result = libtuner.get_valid_benchmark_results(benchmark_results) assert result == [] benchmark_results = [ libtuner.BenchmarkResult(0, math.inf, "hip://0"), libtuner.BenchmarkResult(0, 0.1, "hip://1"), ] - result = libtuner.validate_benchmark_results(benchmark_results) + result = libtuner.get_valid_benchmark_results(benchmark_results) assert len(result) == 1 assert result[0].candidate_id == 0 assert result[0].time == 0.1 assert result[0].device_id == "hip://1" -def test_validate_baselines_device_id_match(): - first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0} - second_baseline = {"hip://1": 1500.0, "hip://2": 2500.0} - - result = libtuner.validate_baselines_device_ids_match( - first_baseline, second_baseline - ) - assert result is False - - first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0} - second_baseline = {"hip://0": 1500.0, "hip://1": 2500.0} +def test_check_baseline_devices_uniqueness(): + baseline_results = [ + libtuner.BenchmarkResult(0, 1000.0, "hip://0"), + libtuner.BenchmarkResult(0, 2000.0, "hip://1"), + libtuner.BenchmarkResult(0, 3000.0, "hip://2"), + ] + assert libtuner.check_baseline_devices_uniqueness(baseline_results) - result = libtuner.validate_baselines_device_ids_match( - first_baseline, second_baseline - ) - assert result is True + baseline_results = [ + libtuner.BenchmarkResult(0, 1000.0, "hip://0"), + libtuner.BenchmarkResult(0, 2000.0, "hip://0"), + libtuner.BenchmarkResult(0, 3000.0, "hip://2"), + ] + assert not libtuner.check_baseline_devices_uniqueness(baseline_results) -def test_validate_baseline_regression(): - first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0} - second_baseline = {"hip://0": 1100.0, "hip://1": 1900.0} - regression_devices = libtuner.validate_baseline_regression( +def test_detect_baseline_regression(): + first_baseline = [ + libtuner.BenchmarkResult(0, 1000.0, "hip://0"), + libtuner.BenchmarkResult(0, 2000.0, "hip://1"), + ] + second_baseline = [ + libtuner.BenchmarkResult(0, 1100.0, "hip://0"), + libtuner.BenchmarkResult(0, 1900.0, "hip://1"), + ] + regression_devices = libtuner.detect_baseline_regression( first_baseline, second_baseline ) assert regression_devices == ["hip://0"] - first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0} - second_baseline = {"hip://0": 1000.0, "hip://1": 2000.0} - - regression_devices = libtuner.validate_baseline_regression( + first_baseline = [ + libtuner.BenchmarkResult(0, 1000.0, "hip://0"), + libtuner.BenchmarkResult(0, 2000.0, "hip://1"), + ] + second_baseline = [ + libtuner.BenchmarkResult(0, 1000.0, "hip://0"), + libtuner.BenchmarkResult(0, 1000.0, "hip://1"), + ] + regression_devices = libtuner.detect_baseline_regression( first_baseline, second_baseline ) assert regression_devices == [] - first_baseline = {"hip://0": 1000.0, "hip://1": 2000.0} - second_baseline = {"hip://0": 1100.0} - regression_devices = libtuner.validate_baseline_regression( + first_baseline = [ + libtuner.BenchmarkResult(0, 1000.0, "hip://0"), + libtuner.BenchmarkResult(0, 2000.0, "hip://1"), + ] + second_baseline = [ + libtuner.BenchmarkResult(0, 1100.0, "hip://0"), + ] + regression_devices = libtuner.detect_baseline_regression( first_baseline, second_baseline ) assert regression_devices == ["hip://0"]