From 0c2f51bc9deb09dbc80e69b09995e631cab21c8a Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Tue, 13 Aug 2024 18:22:48 -0500 Subject: [PATCH] [LLVMGPU] Drop WorkgroupSpecializationPass (#18212) This pass creates control flow in case the tile and distribute phase results in dynamic dimensions. However `IREEComprehensiveBufferizePass` is capable of handling this later on. --- .../compiler/Codegen/Common/GPU/BUILD.bazel | 1 - .../Codegen/Common/GPU/CMakeLists.txt | 1 - .../compiler/Codegen/Common/GPU/Passes.td | 9 - .../GPU/WorkgroupSpecializationPass.cpp | 166 ------------------ .../compiler/Codegen/Common/test/BUILD.bazel | 1 - .../Codegen/Common/test/CMakeLists.txt | 1 - .../Common/test/workgroup_specialization.mlir | 150 ---------------- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 3 - .../compiler/Codegen/LLVMGPU/test/BUILD.bazel | 1 - .../Codegen/LLVMGPU/test/CMakeLists.txt | 1 - ...orkgroup_specialization_pipeline_test.mlir | 114 ------------ 11 files changed, 448 deletions(-) delete mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp delete mode 100644 compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir delete mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index 91895b38e642..da454087781c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -75,7 +75,6 @@ iree_compiler_cc_library( "Passes.cpp", "VectorReductionToGPU.cpp", "WorkgroupReordering.cpp", - "WorkgroupSpecializationPass.cpp", ], hdrs = [ "GPUPatterns.h", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index 8905673860f7..e22fa0306556 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -73,7 +73,6 @@ iree_cc_library( "Passes.cpp" "VectorReductionToGPU.cpp" "WorkgroupReordering.cpp" - "WorkgroupSpecializationPass.cpp" DEPS ::PassHeaders ::PassesIncGen diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index 88c3d24b3ae4..36507fdd0a41 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -217,13 +217,4 @@ def VectorReductionToGPUPass : ]; } -def WorkgroupSpecializationPass : - InterfacePass<"iree-codegen-workgroup-specialization", "mlir::FunctionOpInterface"> { - let summary = "Specialize workgroup distribution loops"; - let dependentDialects = [ - "::mlir::affine::AffineDialect", "::mlir::linalg::LinalgDialect", - "::mlir::scf::SCFDialect", "::mlir::tensor::TensorDialect", - ]; -} - #endif // IREE_CODEGEN_COMMON_GPU_PASSES diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp deleted file mode 100644 index ccb966b92edb..000000000000 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupSpecializationPass.cpp +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2022 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -//=== WorkgroupSpecializationPass.cpp ------------------------------------===// -// -// This pass specializes the workgroup distribution loops with the tile sizes. -// -// For example, it converts -// -// %tileSizeY = affine.min ... -// %tileSizeX = affine.min ... -// the_op with bounded tile sizes (The tensor is of dynamic shape.) -// -// into -// -// %tileSizeY = affine.min ... -// %tileSizeX = affine.min ... -// %cmp0 = arith.cmpi %worksizeY, %tilesizeY -// %cmp1 = arith.cmpi %worksizeX, %tilesizeX -// %cond = arith.and %cmp0, %cmp1 -// scf.if %cond -// operation with the static shape with the main tile sizes -// else -// original nested loops with dynamic shaped op -// -//===---------------------------------------------------------------------===// - -#include "iree/compiler/Codegen/Common/GPU/Passes.h" -#include "iree/compiler/Codegen/Utils/Utils.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/SCF/IR/SCF.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/Interfaces/FunctionInterfaces.h" - -#define DEBUG_TYPE "iree-codegen-workgroup-specialization" - -namespace mlir::iree_compiler { - -#define GEN_PASS_DEF_WORKGROUPSPECIALIZATIONPASS -#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" - -namespace { -static llvm::cl::opt clEnableWorkgroupSpecialization( - "iree-codegen-enable-workgroup-specialization", - llvm::cl::desc("Enable workgroup specialization."), llvm::cl::init(true)); - -static std::optional -getConstantLowerBound(affine::AffineMinOp affineMinOp) { - for (AffineExpr expr : affineMinOp.getMap().getResults()) { - if (auto cst = dyn_cast(expr)) { - return cst.getValue(); - } - } - return std::nullopt; -} - -// Specialize the distributed function with the main tile sizes. -// -// Transformed output -// cond = (boundedTileSizeY != TileX) && (boundedTileSizeX != TileY) && ... -// scf.if cond -// distribution loops with static shapes with the tile size -// else -// distribution loops with dynamic shapes with the tile size -// -// Steps: -// 1. Walk the code and collect affine.min that only depend on workgroup.id -// and have one constant result. -// 2. Move those at the top of the function -// 3. Create a condition that ANDs all the affineMin == constant -// 4. Splice the rest of the block and clone into a specialized if/else -static void specializeFunction(mlir::FunctionOpInterface funcOp) { - SmallVector minSizeOps; - SmallVector ids; - funcOp.walk([&minSizeOps, &ids](Operation *op) { - if (auto affineMin = dyn_cast(op)) { - for (Value operand : affineMin->getOperands()) { - if (!operand.getDefiningOp()) { - return WalkResult::advance(); - } - ids.push_back(operand.getDefiningOp()); - } - if (!getConstantLowerBound(affineMin)) { - return WalkResult::advance(); - } - minSizeOps.push_back(affineMin); - } - return WalkResult::advance(); - }); - if (minSizeOps.empty()) { - return; - } - - auto loc = funcOp.getLoc(); - Block *block = &(*funcOp.getBlocks().begin()); - - OpBuilder builder(funcOp->getContext()); - OpBuilder::InsertionGuard guard(builder); - // Move ops at the top of the function. This is always correct as those only - // depends on workgroup ids. - for (affine::AffineMinOp affineMin : llvm::reverse(minSizeOps)) { - affineMin->moveBefore(&block->front()); - } - for (Operation *id : llvm::reverse(ids)) { - id->moveBefore(&block->front()); - } - builder.setInsertionPointAfter(minSizeOps.back()); - // create a condition for scf.if - Value cond; - SmallVector constantOps; // ConstantIndexOps for tile sizes - for (unsigned i = 0, e = minSizeOps.size(); i != e; ++i) { - affine::AffineMinOp minOp = minSizeOps[i]; - int64_t lowerBound = *getConstantLowerBound(minOp); - // Generate a compare op that checks the dynamic size is equal to the - // constant main tile size. - Value constant = builder.create(loc, lowerBound); - constantOps.push_back(constant); - Value cmp = builder.create(loc, arith::CmpIPredicate::eq, - minOp, constant); - cond = cond ? builder.create(loc, cond, cmp) : cmp; - } - - // generate scf.if %cond - auto ifOp = builder.create(loc, cond, /*withElseRegion=*/true); - - // Transfer the original body to the scf.else body. - auto origBodyBegin = ++Block::iterator(ifOp); - auto origBodyEnd = --block->end(); // yield - - Block *elseBlock = ifOp.elseBlock(); - elseBlock->getOperations().splice(elseBlock->begin(), block->getOperations(), - origBodyBegin, origBodyEnd); - // Clone the else block into the then block. minOps are replaced during the - // cloning. - auto b = ifOp.getThenBodyBuilder(); - IRMapping bvm; - for (unsigned i = 0, e = minSizeOps.size(); i != e; ++i) { - if (minSizeOps[i]) { - bvm.map(minSizeOps[i], constantOps[i]); - } - } - for (auto &blockOp : elseBlock->without_terminator()) { - b.clone(blockOp, bvm); - } - return; -} - -struct WorkgroupSpecializationPass final - : impl::WorkgroupSpecializationPassBase { - void runOnOperation() override { - if (!clEnableWorkgroupSpecialization) - return; - - FunctionOpInterface funcOp = getOperation(); - specializeFunction(funcOp); - } -}; - -} // namespace -} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel index 9651d49fbb11..7b47488e42d8 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel @@ -78,7 +78,6 @@ iree_lit_test_suite( "vectorize_memref_copy.mlir", "vectorize_tensor_pad.mlir", "vector_layout_analysis.mlir", - "workgroup_specialization.mlir", ], include = ["*.mlir"], exclude = [ diff --git a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt index d2b97e2e0a8e..adfafb20567f 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt @@ -74,7 +74,6 @@ iree_lit_test_suite( "vector_layout_analysis.mlir" "vectorize_memref_copy.mlir" "vectorize_tensor_pad.mlir" - "workgroup_specialization.mlir" TOOLS FileCheck iree-opt diff --git a/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir deleted file mode 100644 index 1654fce2b5d5..000000000000 --- a/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir +++ /dev/null @@ -1,150 +0,0 @@ -// RUN: iree-opt --iree-codegen-enable-workgroup-specialization --pass-pipeline="builtin.module(func.func(iree-codegen-workgroup-specialization),canonicalize,cse)" --split-input-file %s | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -#config = #iree_codegen.lowering_config -#map = affine_map<()[s0] -> (s0 * 64)> -#map1 = affine_map<()[s0] -> (s0 * -64 + 123, 64)> -#map2 = affine_map<()[s0] -> (s0 * -64 + 789, 64)> -func.func @matmul_tensors() { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.min #map1()[%workgroup_id_y] - %5 = affine.apply #map()[%workgroup_id_x] - %6 = affine.min #map2()[%workgroup_id_x] - %7 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [%4, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %8 = flow.dispatch.tensor.load %1, offsets = [0, %5], sizes = [456, %6], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<456x?xf32> - %9 = tensor.empty(%4, %6) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.matmul {lowering_config = #config} ins(%7, %8 : tensor, tensor<456x?xf32>) outs(%10 : tensor) -> tensor - flow.dispatch.tensor.store %11, %2, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : tensor -> !flow.dispatch.tensor> - return -} - -// CHECK: func.func @matmul_tensors() -// CHECK: %[[C64:.+]] = arith.constant 64 : index -// CHECK: %[[CMP0:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index -// CHECK: %[[CMP1:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index -// CHECK: %[[COND:.+]] = arith.andi %[[CMP0]], %[[CMP1]] : i1 -// CHECK: scf.if %[[COND]] { -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<64x456xf32>, tensor<456x64xf32>) outs(%{{.+}} : tensor<64x64xf32>) -> tensor<64x64xf32> -// CHECK: } else { -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor<456x?xf32>) outs(%{{.+}} : tensor) -> tensor - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -#config = #iree_codegen.lowering_config -#map = affine_map<()[s0] -> (s0 * 64)> -#map1 = affine_map<()[s0] -> (s0 * -64 + 123, 64)> -#map2 = affine_map<()[s0] -> (s0 * -64 + 789, 64)> -#map3 = affine_map<(d0, d1) -> (d0, d1)> -func.func @add_tensors() { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.min #map1()[%workgroup_id_y] - %5 = affine.apply #map()[%workgroup_id_x] - %6 = affine.min #map2()[%workgroup_id_x] - %7 = flow.dispatch.tensor.load %0, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %8 = flow.dispatch.tensor.load %1, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %9 = tensor.empty(%4, %6) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) attrs = {lowering_config = #config} { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor - flow.dispatch.tensor.store %11, %2, offsets = [%3, %5], sizes = [%4, %6], strides = [1, 1] : tensor -> !flow.dispatch.tensor> - return -} - -// CHECK: func.func @add_tensors() -// CHECK: %[[C64:.+]] = arith.constant 64 : index -// CHECK: %[[CMP0:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index -// CHECK: %[[CMP1:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index -// CHECK: %[[COND:.+]] = arith.andi %[[CMP0]], %[[CMP1]] : i1 -// CHECK: scf.if %[[COND]] { -// CHECK: linalg.generic -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<64x64xf32>, tensor<64x64xf32>) outs(%{{.+}} : tensor<64x64xf32>) -// CHECK: } else { -// CHECK: linalg.generic -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor, tensor) outs(%{{.+}} : tensor) - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -#config = #iree_codegen.lowering_config -#map = affine_map<()[s0] -> (s0 * 2)> -#map1 = affine_map<()[s0] -> (s0 * 256)> -#map2 = affine_map<()[s0] -> (s0 * -256 + 30522, 256)> -#map3 = affine_map<(d0, d1) -> (d0, d1)> -#map4 = affine_map<(d0, d1) -> (d1)> -func.func @unaligned_partial_loop() { - %c512 = arith.constant 512 : index - %c786944 = arith.constant 786944 : index - %c265458176 = arith.constant 265458176 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c786944) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c265458176) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %4 = affine.apply #map()[%workgroup_id_y] - %5 = affine.apply #map1()[%workgroup_id_x] - %6 = affine.min #map2()[%workgroup_id_x] - %7 = flow.dispatch.tensor.load %0, offsets = [%4, 0], sizes = [2, 768], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x768xf32> - %8 = flow.dispatch.tensor.load %1, offsets = [0, %5], sizes = [768, %6], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<768x?xf32> - %9 = tensor.empty(%6) : tensor<2x?xf32> - %10 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%9 : tensor<2x?xf32>) -> tensor<2x?xf32> - %11 = linalg.matmul {lowering_config = #config} ins(%7, %8 : tensor<2x768xf32>, tensor<768x?xf32>) outs(%10 : tensor<2x?xf32>) -> tensor<2x?xf32> - %12 = flow.dispatch.tensor.load %2, offsets = [%5], sizes = [%6], strides = [1] : !flow.dispatch.tensor> -> tensor - %13 = tensor.empty(%6) : tensor<2x?xf32> - %14 = linalg.generic {indexing_maps = [#map3, #map4, #map3], iterator_types = ["parallel", "parallel"]} ins(%11, %12 : tensor<2x?xf32>, tensor) outs(%13 : tensor<2x?xf32>) attrs = {lowering_config = #config} { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %15 = arith.addf %in, %in_0 : f32 - linalg.yield %15 : f32 - } -> tensor<2x?xf32> - flow.dispatch.tensor.store %14, %3, offsets = [%4, %5], sizes = [2, %6], strides = [1, 1] : tensor<2x?xf32> -> !flow.dispatch.tensor> - return -} - -// CHECK: func.func @unaligned_partial_loop() -// CHECK: %[[C256:.+]] = arith.constant 256 : index -// CHECK: %[[COND:.+]] = arith.cmpi eq, %{{.+}}, %[[C256]] : index -// CHECK: scf.if %[[COND]] { -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x768xf32>, tensor<768x256xf32>) outs(%{{.+}} : tensor<2x256xf32>) -// CHECK: } else { -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : tensor<2x768xf32>, tensor<768x?xf32>) outs(%{{.+}} : tensor<2x?xf32>) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 250645e1694d..c4dde0d81721 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -270,7 +270,6 @@ void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager) { tileAndDistributeToWorkgroup(funcPassManager); funcPassManager.addPass(createCanonicalizerPass()); - funcPassManager.addPass(createWorkgroupSpecializationPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); @@ -456,7 +455,6 @@ void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager, tileAndDistributeToWorkgroup(funcPassManager); funcPassManager.addPass(createCanonicalizerPass()); - funcPassManager.addPass(createWorkgroupSpecializationPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); @@ -653,7 +651,6 @@ void addGPUTransposePassPipeline(OpPassManager &funcPassManager, tileAndDistributeToWorkgroup(funcPassManager); funcPassManager.addPass(createCanonicalizerPass()); - funcPassManager.addPass(createWorkgroupSpecializationPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 67a84393a918..3ed0f3692fb0 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -77,7 +77,6 @@ iree_lit_test_suite( "vector_lowering.mlir", "vector_to_gpu.mlir", "winograd_pipeline_test.mlir", - "workgroup_specialization_pipeline_test.mlir", ], include = ["*.mlir"], # tensor_dialect_*_spec is a an MLIR file that specifies a diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 46366b4d9fa6..692ce93d2916 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -73,7 +73,6 @@ iree_lit_test_suite( "vector_lowering.mlir" "vector_to_gpu.mlir" "winograd_pipeline_test.mlir" - "workgroup_specialization_pipeline_test.mlir" TOOLS FileCheck iree-opt diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir deleted file mode 100644 index 73ffa190ec99..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir +++ /dev/null @@ -1,114 +0,0 @@ -// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s - -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -hal.executable private @forward_dispatch_116 { - hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @forward_dispatch_116_matmul_128x30522x768 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @forward_dispatch_116_matmul_128x30522x768() { - %c512 = arith.constant 512 : index - %c786944 = arith.constant 786944 : index - %c265458176 = arith.constant 265458176 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c786944) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c265458176) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 768], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x768xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [768, 30522], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<768x30522xf32> - %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [30522], strides = [1] : !flow.dispatch.tensor> -> tensor<30522xf32> - %7 = tensor.empty() : tensor<128x30522xf32> - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<128x30522xf32>) -> tensor<128x30522xf32> - %9 = linalg.matmul ins(%4, %5 : tensor<128x768xf32>, tensor<768x30522xf32>) outs(%8 : tensor<128x30522xf32>) -> tensor<128x30522xf32> - %10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9, %6 : tensor<128x30522xf32>, tensor<30522xf32>) outs(%7 : tensor<128x30522xf32>) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - %11 = arith.addf %arg0, %arg1 : f32 - linalg.yield %11 : f32 - } -> tensor<128x30522xf32> - flow.dispatch.tensor.store %10, %3, offsets = [0, 0], sizes = [128, 30522], strides = [1, 1] : tensor<128x30522xf32> -> !flow.dispatch.tensor> - return - } - } - } -} - -// The specialized workgroup should have vector operations. - -// CHECK-LABEL: func.func @forward_dispatch_116_matmul_128x30522x768 -// CHECK: arith.cmpi eq -// CHECK: scf.if -// CHECK: vector.transfer_read -// CHECK: vector.transfer_read -// CHECK: vector.contract -// CHECK: vector.transfer_read -// CHECK: vector.broadcast -// CHECK: vector.transfer_write -// CHECK: else -// CHECK-NOT: vector.transfer - - -// ----- - -#pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> - ]> -]> -#map = affine_map<(d0) -> (d0)> -#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -hal.executable private @vectorized_dispatch_0 { - hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { - hal.executable.export public @vectorized_dispatch_0_generic_102401 ordinal(0) layout(#pipeline_layout) { - ^bb0(%arg0: !hal.device, %arg1: index): - %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 - hal.return %x, %y, %z : index, index, index - } - builtin.module { - func.func @vectorized_dispatch_0_generic_102401() { - %c0 = arith.constant 0 : index - %cst = arith.constant -3.000000e+00 : f32 - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [102401], strides = [1] : !flow.dispatch.tensor> -> tensor<102401xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [102401], strides = [1] : !flow.dispatch.tensor> -> tensor<102401xf32> - %5 = tensor.empty() : tensor<102401xf32> - %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%3, %4 : tensor<102401xf32>, tensor<102401xf32>) outs(%5 : tensor<102401xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %7 = math.fma %cst, %in, %in_0 : f32 - linalg.yield %7 : f32 - } -> tensor<102401xf32> - flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [102401], strides = [1] : tensor<102401xf32> -> !flow.dispatch.tensor> - return - } - } - } -} - -// CHECK-LABEL: func.func @vectorized_dispatch_0_generic_102401 -// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[c256:.*]] = arith.constant 256 : index -// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index -// CHECK: %[[BLKX:.*]] = hal.interface.workgroup.id[0] : index -// CHECK: %[[BLKX2:.*]] = affine.min #{{.+}}()[%[[BLKX]]] -// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[BLKX2]], %[[c256]] : index -// CHECK: scf.if %[[CMP]] -// CHECK: %[[ARR:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type> -// CHECK: %[[ARR2:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type> -// CHECK: %[[TIDX:.*]] = gpu.thread_id x -// CHECK: %[[AFF:.*]] = affine.apply #{{.+}}(%[[TIDX]])[%[[BLKX]]] -// CHECK: vector.transfer_read %[[ARR]][%[[AFF]]], %[[cst]] {in_bounds = [true]} : memref<102401xf32, #hal.descriptor_type>, vector<4xf32> -// CHECK: vector.transfer_read %[[ARR2]][%[[AFF]]], %[[cst]] {in_bounds = [true]} : memref<102401xf32, #hal.descriptor_type>, vector<4xf32>