[GPU][DT] Add support for GPU data-tiling E2E tests. (iree-org#18591)
The revision introduces GPUMaterializeHostEncodingPass for early
materialization. It is mainly for testing purposes while the codegen for
encodings is still in progress. An experimental flag
(`iree-global-opt-experimental-rocm-data-tiling`) is introduced to
provide a path for GPU data-tiling e2e tests.

In the tests, an `optimization_barrier` op is inserted between the
`set_encoding` and `unset_encoding` ops. The compiler could otherwise be
clever enough to cancel the data-layout transformation, so the barrier
serves as a hint to keep it; see the sketch after the list below. To
support the e2e path, the revision adds two additional changes:

1. Implement the `MaterializeOptimizationBarrierOp` pattern, which replaces
the barrier op with the same op on the materialized shapes.
2. Implement the fallback for the unset_encoding materialization pattern.
Currently only f32.f32.f32 and i8.i8.i32 mfma ops are supported. The
wmma ops are not supported, so the codegen turns encodings into no-ops
on targets that do not yet support the intrinsics, e.g., gfx1100.
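
A minimal sketch of the test pattern, assuming an f32 matmul-operand
encoding (the encoding attribute is abbreviated, and the exact op syntax
may vary across IREE versions):

    func.func @set_unset_roundtrip(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
      // Request the data-tiled layout for the tensor.
      %0 = iree_encoding.set_encoding %arg0
          : tensor<?x?xf32> -> tensor<?x?xf32, #iree_encoding.encoding<...>>
      // The barrier keeps the compiler from folding set/unset away.
      %1 = util.optimization_barrier %0
          : tensor<?x?xf32, #iree_encoding.encoding<...>>
      // Restore the row-major layout so results can be checked on the host.
      %2 = iree_encoding.unset_encoding %1
          : tensor<?x?xf32, #iree_encoding.encoding<...>> -> tensor<?x?xf32>
      return %2 : tensor<?x?xf32>
    }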

Note: this only tests set_encoding and unset_encoding. The gemm codegen
is still in progress.

---------

Signed-off-by: hanhanW <hanhan0912@gmail.com>
hanhanW authored Sep 25, 2024
1 parent 7290283 commit 0b29f7b
Showing 11 changed files with 467 additions and 27 deletions.
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -102,7 +102,9 @@ iree_compiler_cc_library(
"//compiler/src/iree/compiler/Codegen/Utils",
"//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils",
"//compiler/src/iree/compiler/Dialect/Encoding/IR",
"//compiler/src/iree/compiler/Dialect/HAL/Analysis",
"//compiler/src/iree/compiler/Dialect/HAL/IR",
"//compiler/src/iree/compiler/Dialect/Stream/Analysis",
"//compiler/src/iree/compiler/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:AMDGPUDialect",
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -133,7 +133,9 @@ iree_cc_library(
iree::compiler::Codegen::Utils
iree::compiler::Codegen::Utils::VectorOpUtils
iree::compiler::Dialect::Encoding::IR
iree::compiler::Dialect::HAL::Analysis
iree::compiler::Dialect::HAL::IR
iree::compiler::Dialect::Stream::Analysis
iree::compiler::Utils
PUBLIC
)
@@ -16,7 +16,9 @@
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
#include "iree/compiler/Dialect/HAL/Analysis/DeviceAnalysis.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/Stream/Analysis/Affinity.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
@@ -35,6 +37,7 @@
namespace mlir::iree_compiler {

#define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
#define GEN_PASS_DEF_GPUMATERIALIZEHOSTENCODINGPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

static bool hasIntrinsic(IREE::GPU::TargetAttr target,
@@ -130,6 +133,22 @@ materializeEncodingForTarget(RankedTensorType tensorType,
}

namespace {

// TODO(hanchung): Delete this pass and rely on tensor-based analysis to
// materialize encodings based on where tensors are used. This pass is not able
// to handle that.
struct GPUMaterializeHostEncodingPass
: public impl::GPUMaterializeHostEncodingPassBase<
GPUMaterializeHostEncodingPass> {
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<arith::ArithDialect, tensor::TensorDialect,
linalg::LinalgDialect, IREE::Encoding::IREEEncodingDialect,
IREE::GPU::IREEGPUDialect>();
}

void runOnOperation() override;
};

struct GPUMaterializeDeviceEncodingPass final
: impl::GPUMaterializeDeviceEncodingPassBase<
GPUMaterializeDeviceEncodingPass> {
@@ -248,8 +267,15 @@ struct GPUUnsetEncodingOpLoweringConversion
FailureOr<MaterializeEncodingInfo> maybeEncodingInfo =
converter->getEncodingInfo(unsetEncodingOp.getSource().getType());
if (failed(maybeEncodingInfo)) {
return rewriter.notifyMatchFailure(unsetEncodingOp,
"unhandled result encoding");
Value result = adaptor.getSource();
Type targetType =
getTypeConverter()->convertType(unsetEncodingOp.getSourceType());
if (targetType != result.getType()) {
result = rewriter.create<tensor::CastOp>(unsetEncodingOp.getLoc(),
targetType, result);
}
rewriter.replaceOp(unsetEncodingOp, result);
return success();
}

Location loc = unsetEncodingOp.getLoc();
@@ -400,12 +426,10 @@ class GPUConvertToMultiMma final
const MaterializeEncodingValueFn materializeEncodingValueFn;
};

} // namespace

void GPUMaterializeDeviceEncodingPass::runOnOperation() {
MLIRContext *ctx = &getContext();
FunctionOpInterface funcOp = getOperation();
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
static LogicalResult
materializeFuncOpEncodings(FunctionOpInterface funcOp,
IREE::HAL::ExecutableTargetAttr targetAttr) {
MLIRContext *ctx = funcOp.getContext();
{
RewritePatternSet patterns(ctx);
MaterializeEncodingTypeConverter typeConverter(materializeEncodingForTarget,
@@ -424,7 +448,7 @@ void GPUMaterializeDeviceEncodingPass::runOnOperation() {
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
if (failed(applyPartialConversion(funcOp, target, std::move(patterns)))) {
funcOp.emitOpError("materialization failed");
return signalPassFailure();
return failure();
}
}

@@ -436,9 +460,92 @@ void GPUMaterializeDeviceEncodingPass::runOnOperation() {
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
funcOp.emitOpError("folding patterns failed");
return failure();
}
}

return success();
}

static std::optional<SetVector<IREE::HAL::ExecutableTargetAttr>>
getFuncExecutableTargetAttrs(FunctionOpInterface funcOp,
IREE::Stream::AffinityAnalysis &affinityAnalysis,
IREE::HAL::DeviceAnalysis &deviceAnalysis) {
// Get a set of all unique affinities used by resources within the function.
SetVector<IREE::Stream::AffinityAttr> uniqueAffinityAttrs;
SmallVector<IREE::Stream::AffinityAttr> lookupAffinityAttrs;
funcOp.walk([&](Operation *op) {
if (affinityAnalysis.tryLookupExecutionAffinity(op, lookupAffinityAttrs)) {
uniqueAffinityAttrs.insert(lookupAffinityAttrs.begin(),
lookupAffinityAttrs.end());
}
lookupAffinityAttrs.clear();
});

// Resolve affinities to executable targets.
SetVector<IREE::HAL::ExecutableTargetAttr> executableTargetAttrs;
for (auto affinityAttr : uniqueAffinityAttrs) {
deviceAnalysis.gatherRequiredExecutableTargets(affinityAttr, funcOp,
executableTargetAttrs);
}
return executableTargetAttrs;
}

} // namespace

void GPUMaterializeHostEncodingPass::runOnOperation() {
auto moduleOp = getOperation();

// Run required analysis passes.
IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
if (failed(affinityAnalysis.run())) {
return signalPassFailure();
}
IREE::HAL::DeviceAnalysis deviceAnalysis(moduleOp);
if (failed(deviceAnalysis.run())) {
return signalPassFailure();
}

for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
// Gather the required executable targets for the function. Note that it's
// possible there are more required for ops nested within the function but
// this pass is a hack and can't handle that :shrug:.
auto executableTargets =
getFuncExecutableTargetAttrs(funcOp, affinityAnalysis, deviceAnalysis);
if (!executableTargets) {
funcOp.emitOpError()
<< "could not determine executable targets for the function";
return signalPassFailure();
} else if (executableTargets->empty()) {
// Probably no tensors.
continue;
}

// HACK: this pass is run on the host _but shouldn't be_. Because it's
// run on the host and IREE is a compiler capable of multi-targeting there
// may be multiple executable targets at any point in the host program.
// This pass can't handle that and assumes it's been checked earlier by
// spooky action at a distance. This needs to be fixed.
if (executableTargets->size() != 1) {
funcOp.emitOpError() << "has multiple executable targets and CPU data "
"tiling isn't built to support that";
return signalPassFailure();
}

// Materialize encodings within the function.
if (failed(
materializeFuncOpEncodings(funcOp, executableTargets->front()))) {
return signalPassFailure();
}
}
}

void GPUMaterializeDeviceEncodingPass::runOnOperation() {
FunctionOpInterface funcOp = getOperation();
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
if (failed(materializeFuncOpEncodings(funcOp, targetAttr))) {
return signalPassFailure();
}
}

} // namespace mlir::iree_compiler
5 changes: 5 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -197,6 +197,11 @@ def GPUApplyTilingLevelPass :
];
}

def GPUMaterializeHostEncodingPass :
Pass<"iree-codegen-gpu-materialize-host-encoding", "mlir::ModuleOp"> {
let summary = "Materialize the encodings for tensors as specified by the backend.";
}

def GPUMaterializeDeviceEncodingPass :
InterfacePass<"iree-codegen-gpu-materialize-device-encoding", "mlir::FunctionOpInterface"> {
let summary = "Materialize the encodings for tensors as specified by the backend.";
@@ -14,6 +14,7 @@
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -876,6 +877,27 @@ struct MaterializeOperation : public OpMaterializeEncodingPattern<OpTy> {
}
};

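// Rewrites a util.optimization_barrier whose operands carry tensor encodings
// into the same op on the type-converted operands. A sketch of the rewrite
// (the encoding attribute and materialized tile shape are illustrative):
//   %0 = util.optimization_barrier %enc
//       : tensor<?x?xf32, #iree_encoding.encoding<...>>
// becomes:
//   %0 = util.optimization_barrier %packed : tensor<?x?x16x16xf32>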
struct MaterializeOptimizationBarrierOp
: public OpMaterializeEncodingPattern<IREE::Util::OptimizationBarrierOp> {
using OpMaterializeEncodingPattern<
IREE::Util::OptimizationBarrierOp>::OpMaterializeEncodingPattern;

LogicalResult
matchAndRewrite(IREE::Util::OptimizationBarrierOp op,
IREE::Util::OptimizationBarrierOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
if (llvm::none_of(op.getOperandTypes(), [](Type type) -> bool {
auto tensorType = dyn_cast<RankedTensorType>(type);
return tensorType && tensorType.getEncoding();
})) {
return failure();
}
rewriter.replaceOpWithNewOp<IREE::Util::OptimizationBarrierOp>(
op, adaptor.getOperands());
return success();
}
};

/// Pattern to convert contraction operations.
class MaterializeContractionOp : public OpInterfaceConversionPattern<
mlir::linalg::ContractionOpInterface> {
@@ -953,12 +975,12 @@ void populateShapeIndependentMaterializeEncodingPatterns(
return resultType == typeConverter.convertType(resultType);
});

patterns.insert<MaterializeDPSOperation<linalg::FillOp>,
MaterializeOperation<tensor::EmptyOp>,
MaterializeFlowDispatchTensorLoadOp,
MaterializeFlowDispatchTensorStoreOp,
MaterializeInterfaceBindingEncoding>(
context, typeConverter, materializeEncodingValueFn);
patterns.insert<
MaterializeDPSOperation<linalg::FillOp>,
MaterializeOperation<tensor::EmptyOp>, MaterializeOptimizationBarrierOp,
MaterializeFlowDispatchTensorLoadOp, MaterializeFlowDispatchTensorStoreOp,
MaterializeInterfaceBindingEncoding>(context, typeConverter,
materializeEncodingValueFn);
};

} // namespace mlir::iree_compiler
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel
@@ -77,6 +77,7 @@ iree_compiler_cc_library(
":PassesIncGen",
"//compiler/src/iree/compiler/Codegen/Common",
"//compiler/src/iree/compiler/Codegen/Common/CPU:CommonCPUPasses",
"//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses",
"//compiler/src/iree/compiler/Dialect/Encoding/IR",
"//compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow",
"//compiler/src/iree/compiler/Dialect/Flow/IR",
@@ -92,6 +92,7 @@ iree_cc_library(
MLIRTransforms
iree::compiler::Codegen::Common
iree::compiler::Codegen::Common::CPU::CommonCPUPasses
iree::compiler::Codegen::Common::GPU::CommonGPUPasses
iree::compiler::Dialect::Encoding::IR
iree::compiler::Dialect::Flow::Conversion::TensorToFlow
iree::compiler::Dialect::Flow::IR
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/CPU/Passes.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Dialect/HAL/Analysis/DeviceAnalysis.h"
#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
@@ -23,6 +24,14 @@

namespace mlir::iree_compiler::GlobalOptimization {

// TODO: Remove the flag once the codegen can handle the late materialization
// path. This is mainly for testing.
static llvm::cl::opt<bool> clEnableExperimentalRocmDataTiling(
"iree-global-opt-experimental-rocm-data-tiling",
llvm::cl::desc("Enables data-tiling materializatino for rocm backends "
"(experimental)."),
llvm::cl::init(false));

#define GEN_PASS_DEF_MATERIALIZEHOMOGENEOUSENCODINGSPASS
#include "iree/compiler/GlobalOptimization/Passes.h.inc"

@@ -38,13 +47,9 @@ class MaterializeHomogeneousEncodingsPass
registry.insert<IREE::HAL::HALDialect, tensor::TensorDialect>();
}

void runNopPipeline(ModuleOp &moduleOp) {
OpPassManager passManager(moduleOp.getOperationName());
void addNopPipeline(OpPassManager &passManager) {
FunctionLikeNest(passManager).addPass(createMaterializeEncodingIntoNopPass);
FunctionLikeNest(passManager).addPass(createCanonicalizerPass);
if (failed(runPipeline(passManager, moduleOp))) {
return signalPassFailure();
}
}

void runOnOperation() override {
@@ -55,8 +60,13 @@

SetVector<IREE::HAL::ExecutableTargetAttr> executableTargets;
deviceAnalysis.gatherAllExecutableTargets(executableTargets);
OpPassManager passManager(moduleOp.getOperationName());
if (executableTargets.size() != 1) {
return runNopPipeline(moduleOp);
addNopPipeline(passManager);
if (failed(runPipeline(passManager, moduleOp))) {
return signalPassFailure();
}
return;
}

// TODO: vmvx has its own logic about supporting dynamic tile
@@ -67,13 +77,21 @@
return;
}

// Only llvm-cpu backends handle encodings for now, others just go with nop.
if (executableTarget.getBackend() != "llvm-cpu") {
return runNopPipeline(moduleOp);
// Only llvm-cpu and rocm backends handle encodings for now, others just go
// with nop.
if (executableTarget.getBackend() == "llvm-cpu") {
passManager.addPass(createCPUMaterializeHostEncodingPass());
} else if (clEnableExperimentalRocmDataTiling &&
executableTarget.getBackend() == "rocm") {
passManager.addPass(createGPUMaterializeHostEncodingPass());
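// Decompose the tensor.pack/unpack ops produced by materialization into
// reshape-based sequences (useOnlyReshapes=true), so this experimental
// host path does not need dedicated pack/unpack lowering.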
FunctionLikeNest(passManager).addPass([&]() {
return createDecomposePackUnPackOpsPass(/*tileOuterToOne=*/false,
/*useOnlyReshapes=*/true,
/*controlFn=*/std::nullopt);
});
} else {
addNopPipeline(passManager);
}

OpPassManager passManager(moduleOp.getOperationName());
passManager.addPass(createCPUMaterializeHostEncodingPass());
if (failed(runPipeline(passManager, moduleOp))) {
return signalPassFailure();
}
24 changes: 24 additions & 0 deletions tests/e2e/rocm_specific/BUILD.bazel
@@ -0,0 +1,24 @@
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Tests for end-to-end IREE support specific to the ROCm/HIP lowering.

load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")

package(
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)

iree_check_single_backend_test_suite(
name = "check_rocm_hip",
srcs = ["encoding.mlir"],
compiler_flags = [
"--iree-global-opt-experimental-rocm-data-tiling",
],
driver = "hip",
target_backend = "rocm",
)
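
For local experimentation outside the suite, the flag can be passed to the
compiler directly. A sketch, assuming a ROCm-enabled build (a GPU target/chip
flag appropriate to your IREE version may also be required):

    iree-compile --iree-hal-target-backends=rocm \
        --iree-global-opt-experimental-rocm-data-tiling \
        encoding.mlir -o encoding.vmfb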