# Patch summary (reviewer notes; text ahead of the first "diff --git" header is not part of any hunk).
#
# 1. compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp, setWarpReductionConfig:
#    - adds `numDynamicDims`, computed as llvm::count_if(bounds, ShapedType::isDynamic);
#    - the branch that sets reductionTileSizes[reductionDims[0]] to
#      target.getPreferredSubgroupSize() is now guarded by `numDynamicDims > 0`
#      instead of `numDynamicReductionDims`, and its comment is reworded from
#      "dynamic reduction dims" to "dynamic dims" to match.
#    - the existing `numDynamicReductionDims > 1` check appears only as unchanged context.
#
# 2. compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir:
#    - appends test @dynamic_parallel_dims: a linalg.generic with iterator types
#      ["parallel", "parallel", "reduction"] over tensor<4x?x4096xf16>, where the
#      dynamic dimension is d1 (a parallel iterator) and the reduction dim is static (4096).
#    - adds CHECK and CDNA3 expectations for its lowering_config / translation_info.
#
# 3. compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir:
#    - replaces the two CHECK-DAG lines ahead of @i4_dequant_matvec.
#
# NOTE(review): this copy of the patch looks damaged and should be re-fetched
# from the original commit before applying or reviewing details:
#   - all hunks are collapsed onto two physical lines, so line structure and
#     indentation are lost;
#   - `SmallVector workgroupTileSizes` / `SmallVector reductionTileSizes` have no
#     template arguments, and every `#iree_codegen.lowering_config` /
#     `#iree_codegen.translation_info` attribute has no body -- presumably
#     angle-bracketed text was stripped; confirm against upstream;
#   - in the gpu_set_num_workgroups.mlir hunk the removed and added CHECK-DAG
#     lines are textually identical here, so the actual change is not visible;
#   - the config_matvec.mlir hunk header declares 34 new-side lines, but by my
#     count only 3 context + 29 added lines are present, and `#[[TRANSLATION]]`
#     is referenced in the new CHECK/CDNA3 lines without a visible capture
#     (`#[[TRANSLATION:.+]]`) -- presumably the two defining lines were lost.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 23e5cbb13e27..67e7236c2e21 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -1577,6 +1577,7 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target, return failure(); } } + int numDynamicDims = llvm::count_if(bounds, ShapedType::isDynamic); // Distribution of multi-dim masked writes currently aren't fully supported. if (numDynamicReductionDims > 1) { @@ -1617,9 +1618,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target, size_t numLoops = partitionedLoops.empty() ? 0 : partitionedLoops.back() + 1; SmallVector workgroupTileSizes(numLoops, 1); - // Without any bounds on dynamic reduction dims, we need specialization to + // Without any bounds on dynamic dims, we need specialization to // get peak performance. For now, just use the warp size. - if (numDynamicReductionDims) { + if (numDynamicDims > 0) { SmallVector reductionTileSizes(op.getNumLoops(), 0); int64_t preferredSubgroupSize = target.getPreferredSubgroupSize(); reductionTileSizes[reductionDims[0]] = preferredSubgroupSize; diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir index 5262c3460fad..1e5dbf63f2f9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir @@ -273,3 +273,34 @@ func.func @not_vmt() { // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: linalg.generic // CHECK-SAME: lowering_config = #[[$CONFIG]] + +// ----- + +func.func @dynamic_parallel_dims(%dynsize : index, %input : tensor<4x?x4096xf16>) -> tensor<4x?xf32> { + %cst = arith.constant 0.0 : f32 + %0 = tensor.empty(%dynsize) : tensor<4x?xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x?xf32>) -> 
tensor<4x?xf32> + %2 = linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"]} + ins(%input : tensor<4x?x4096xf16>) outs(%1 : tensor<4x?xf32>) { + ^bb0(%in: f16, %out: f32): + %3 = arith.extf %in : f16 to f32 + %4 = arith.addf %3, %out : f32 + linalg.yield %4 : f32 + } -> tensor<4x?xf32> + return %2 : tensor<4x?xf32> +} +// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// CHECK: func @dynamic_parallel_dims +// CHECK-SAME: translation_info = #[[TRANSLATION]] +// CHECK: linalg.generic +// CHECK-SAME: lowering_config = #[[CONFIG]] + +// CDNA3-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// CDNA3: func @dynamic_parallel_dims +// CDNA3-SAME: translation_info = #[[TRANSLATION]] +// CDNA3: linalg.generic +// CDNA3-SAME: lowering_config = #[[CONFIG]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir index 6145ebd9688f..feb0e2766303 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir @@ -743,8 +743,8 @@ func.func @i4_dequant_matvec() { return } -// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config -// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info // CHECK-LABEL: func.func @i4_dequant_matvec() // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: linalg.generic