From 4aa08f2870b3ecd40808e57888a9d26c1d424c78 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 12 Nov 2024 14:59:29 -0500 Subject: [PATCH] CDNA1/2 data tiling (#19100) CDNA1/2 machines are going to be in use for a while, and adding data tiling support for an architecture is just a matter of populating those 3 optional fields in `TargetWgpDetails`, and the corresponding `gpu_materialize_encoding_gfx***.mlir` test adds some coverage around the intrinsics that is useful beyond data tiling. --------- Signed-off-by: Benoit Jacob --- .../Codegen/Common/GPU/test/BUILD.bazel | 2 + .../Codegen/Common/GPU/test/CMakeLists.txt | 2 + .../test/gpu_materialize_encoding_gfx908.mlir | 60 +++++++++ .../test/gpu_materialize_encoding_gfx90a.mlir | 119 ++++++++++++++++++ .../Dialect/GPU/TargetUtils/KnownTargets.cpp | 10 +- 5 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx908.mlir create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx90a.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index 608ff2eb5672..12431f01799a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -30,6 +30,8 @@ iree_lit_test_suite( "gpu_infer_memory_space.mlir", "gpu_lower_to_ukernels.mlir", "gpu_combine_value_barriers.mlir", + "gpu_materialize_encoding_gfx908.mlir", + "gpu_materialize_encoding_gfx90a.mlir", "gpu_materialize_encoding_gfx942.mlir", "gpu_materialize_encoding_gfx1100.mlir", "gpu_nested_layout_contract_amdgpu.mlir", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index ba0a75149df8..ae65754bafa9 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -27,6 +27,8 @@ iree_lit_test_suite( "gpu_infer_memory_space.mlir" "gpu_lower_to_ukernels.mlir" "gpu_materialize_encoding_gfx1100.mlir" + "gpu_materialize_encoding_gfx908.mlir" + "gpu_materialize_encoding_gfx90a.mlir" "gpu_materialize_encoding_gfx942.mlir" "gpu_nested_layout_contract_amdgpu.mlir" "gpu_nested_layout_vector_distribution.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx908.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx908.mlir new file mode 100644 index 000000000000..f2ad507dd3dd --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx908.mlir @@ -0,0 +1,60 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" \ +// RUN: --iree-gpu-test-target=gfx908 \ +// RUN: --split-input-file %s | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +func.func @matmul_lowering_MFMA_i32_16x16x16_i8() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_MFMA_i32_16x16x16_i8 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx90a.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx90a.mlir new file mode 100644 index 000000000000..1a9085944784 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding_gfx90a.mlir @@ -0,0 +1,119 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" \ +// RUN: --iree-gpu-test-target=gfx90a \ +// RUN: --split-input-file %s | FileCheck %s + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +func.func @matmul_lowering_MFMA_f32_16x16x8_bf16() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_MFMA_f32_16x16x8_bf16 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +func.func @matmul_lowering_MFMA_f64_16x16x4_f64() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_MFMA_f64_16x16x4_f64 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index e198e216ece7..1ce6f12d9ac8 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -189,7 +189,10 @@ const WgpDetails *getCDNA2WgpDetails() { {1024, 1024, 1024}, 1024, 64 * 1024, - {0x7fffffff, 0x7fffffff, 0x7fffffff}}; + {0x7fffffff, 0x7fffffff, 0x7fffffff}, + /*maxLoadInstructionBits=*/128, + /*simdsPerWgp=*/4, + /*vgprSpaceBits=*/256 * 32}; return &cdna2Wgp; } @@ -209,7 +212,10 @@ const WgpDetails *getCDNA1WgpDetails() { {1024, 1024, 1024}, 1024, 64 * 1024, - {0x7fffffff, 0x7fffffff, 0x7fffffff}}; + {0x7fffffff, 0x7fffffff, 0x7fffffff}, + /*maxLoadInstructionBits=*/128, + /*simdsPerWgp=*/4, + /*vgprSpaceBits=*/256 * 32}; return &cdna1Wgp; }