[GPU][DT] Add support for GPU data-tiling E2E tests. (iree-org#18591)
The revision introduces GPUMaterializeHostEncodingPass for early
materialization. It is mainly for testing purposes while the codegen for
encodings is still in progress. An experimental flag
(`iree-global-opt-experimental-rocm-data-tiling`) is introduced to
provide a path for GPU data-tiling e2e tests.

In the tests, an `optimization_barrier` op is inserted between the
`set_encoding` and `unset_encoding` ops. The compiler could otherwise be
clever enough to cancel the data-layout transformation, so the barrier
serves as a hint to keep it; see the sketch after the list below. To
support the e2e path, the revision adds two additional changes:

1. Implement the `MaterializeOptimizationBarrierOp` pattern, which replaces
the barrier op with the same op on the materialized shapes.
2. Implement the fallback for the unset_encoding materialization pattern.
Currently only f32.f32.f32 and i8.i8.i32 mfma ops are supported. The
wmma ops are not supported, so the codegen turns encodings into no-ops
on targets that do not yet support the intrinsics, e.g., gfx1100.
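
A minimal sketch of the test pattern, assuming an f32 matmul-operand
encoding (the encoding attribute is abbreviated, and the exact op syntax
may vary across IREE versions):

    func.func @set_unset_roundtrip(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
      // Request the data-tiled layout for the tensor.
      %0 = iree_encoding.set_encoding %arg0
          : tensor<?x?xf32> -> tensor<?x?xf32, #iree_encoding.encoding<...>>
      // The barrier keeps the compiler from folding set/unset away.
      %1 = util.optimization_barrier %0
          : tensor<?x?xf32, #iree_encoding.encoding<...>>
      // Restore the row-major layout so results can be checked on the host.
      %2 = iree_encoding.unset_encoding %1
          : tensor<?x?xf32, #iree_encoding.encoding<...>> -> tensor<?x?xf32>
      return %2 : tensor<?x?xf32>
    }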

Note: this only tests set_encoding and unset_encoding. The gemm codegen
is still in progress.

---------

Signed-off-by: hanhanW <hanhan0912@gmail.com>
hanhanW authored Sep 25, 2024
1 parent 7290283 commit 0b29f7b
Showing 11 changed files with 467 additions and 27 deletions.
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -102,7 +102,9 @@ iree_compiler_cc_library(
"//compiler/src/iree/compiler/Codegen/Utils",
"//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils",
"//compiler/src/iree/compiler/Dialect/Encoding/IR",
"//compiler/src/iree/compiler/Dialect/HAL/Analysis",
"//compiler/src/iree/compiler/Dialect/HAL/IR",
"//compiler/src/iree/compiler/Dialect/Stream/Analysis",
"//compiler/src/iree/compiler/Utils",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:AMDGPUDialect",
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -133,7 +133,9 @@ iree_cc_library(
iree::compiler::Codegen::Utils
iree::compiler::Codegen::Utils::VectorOpUtils
iree::compiler::Dialect::Encoding::IR
iree::compiler::Dialect::HAL::Analysis
iree::compiler::Dialect::HAL::IR
iree::compiler::Dialect::Stream::Analysis
iree::compiler::Utils
PUBLIC
)
@@ -16,7 +16,9 @@
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
#include "iree/compiler/Dialect/HAL/Analysis/DeviceAnalysis.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/Stream/Analysis/Affinity.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
@@ -35,6 +37,7 @@
namespace mlir::iree_compiler {

#define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS
#define GEN_PASS_DEF_GPUMATERIALIZEHOSTENCODINGPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

static bool hasIntrinsic(IREE::GPU::TargetAttr target,
@@ -130,6 +133,22 @@ materializeEncodingForTarget(RankedTensorType tensorType,
}

namespace {

// TODO(hanchung): Delete this pass and rely on tensor-based analysis to
// materialize encodings based on where tensors are used. This pass is not able
// to handle that.
struct GPUMaterializeHostEncodingPass
: public impl::GPUMaterializeHostEncodingPassBase<
GPUMaterializeHostEncodingPass> {
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<arith::ArithDialect, tensor::TensorDialect,
linalg::LinalgDialect, IREE::Encoding::IREEEncodingDialect,
IREE::GPU::IREEGPUDialect>();
}

void runOnOperation() override;
};

struct GPUMaterializeDeviceEncodingPass final
: impl::GPUMaterializeDeviceEncodingPassBase<
GPUMaterializeDeviceEncodingPass> {
@@ -248,8 +267,15 @@ struct GPUUnsetEncodingOpLoweringConversion
FailureOr<MaterializeEncodingInfo> maybeEncodingInfo =
converter->getEncodingInfo(unsetEncodingOp.getSource().getType());
if (failed(maybeEncodingInfo)) {
return rewriter.notifyMatchFailure(unsetEncodingOp,
"unhandled result encoding");
Value result = adaptor.getSource();
Type targetType =
getTypeConverter()->convertType(unsetEncodingOp.getSourceType());
if (targetType != result.getType()) {
result = rewriter.create<tensor::CastOp>(unsetEncodingOp.getLoc(),
targetType, result);
}
rewriter.replaceOp(unsetEncodingOp, result);
return success();
}

Location loc = unsetEncodingOp.getLoc();
@@ -400,12 +426,10 @@ class GPUConvertToMultiMma final
const MaterializeEncodingValueFn materializeEncodingValueFn;
};

} // namespace

void GPUMaterializeDeviceEncodingPass::runOnOperation() {
MLIRContext *ctx = &getContext();
FunctionOpInterface funcOp = getOperation();
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
static LogicalResult
materializeFuncOpEncodings(FunctionOpInterface funcOp,
IREE::HAL::ExecutableTargetAttr targetAttr) {
MLIRContext *ctx = funcOp.getContext();
{
RewritePatternSet patterns(ctx);
MaterializeEncodingTypeConverter typeConverter(materializeEncodingForTarget,
@@ -424,7 +448,7 @@ void GPUMaterializeDeviceEncodingPass::runOnOperation() {
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
if (failed(applyPartialConversion(funcOp, target, std::move(patterns)))) {
funcOp.emitOpError("materialization failed");
return signalPassFailure();
return failure();
}
}

@@ -436,9 +460,92 @@ void GPUMaterializeDeviceEncodingPass::runOnOperation() {
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
funcOp.emitOpError("folding patterns failed");
return failure();
}
}

return success();
}

static std::optional<SetVector<IREE::HAL::ExecutableTargetAttr>>
getFuncExecutableTargetAttrs(FunctionOpInterface funcOp,
IREE::Stream::AffinityAnalysis &affinityAnalysis,
IREE::HAL::DeviceAnalysis &deviceAnalysis) {
// Get a set of all unique affinities used by resources within the function.
SetVector<IREE::Stream::AffinityAttr> uniqueAffinityAttrs;
SmallVector<IREE::Stream::AffinityAttr> lookupAffinityAttrs;
funcOp.walk([&](Operation *op) {
if (affinityAnalysis.tryLookupExecutionAffinity(op, lookupAffinityAttrs)) {
uniqueAffinityAttrs.insert(lookupAffinityAttrs.begin(),
lookupAffinityAttrs.end());
}
lookupAffinityAttrs.clear();
});

// Resolve affinities to executable targets.
SetVector<IREE::HAL::ExecutableTargetAttr> executableTargetAttrs;
for (auto affinityAttr : uniqueAffinityAttrs) {
deviceAnalysis.gatherRequiredExecutableTargets(affinityAttr, funcOp,
executableTargetAttrs);
}
return executableTargetAttrs;
}

} // namespace

void GPUMaterializeHostEncodingPass::runOnOperation() {
auto moduleOp = getOperation();

// Run required analysis passes.
IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
if (failed(affinityAnalysis.run())) {
return signalPassFailure();
}
IREE::HAL::DeviceAnalysis deviceAnalysis(moduleOp);
if (failed(deviceAnalysis.run())) {
return signalPassFailure();
}

for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
// Gather the required executable targets for the function. Note that it's
// possible there are more required for ops nested within the function but
// this pass is a hack and can't handle that :shrug:.
auto executableTargets =
getFuncExecutableTargetAttrs(funcOp, affinityAnalysis, deviceAnalysis);
if (!executableTargets) {
funcOp.emitOpError()
<< "could not determine executable targets for the function";
return signalPassFailure();
} else if (executableTargets->empty()) {
// Probably no tensors.
continue;
}

// HACK: this pass is run on the host _but shouldn't be_. Because it's
// run on the host and IREE is a compiler capable of multi-targeting there
// may be multiple executable targets at any point in the host program.
// This pass can't handle that and assumes it's been checked earlier by
// spooky action at a distance. This needs to be fixed.
if (executableTargets->size() != 1) {
funcOp.emitOpError() << "has multiple executable targets and CPU data "
"tiling isn't built to support that";
return signalPassFailure();
}

// Materialize encodings within the function.
if (failed(
materializeFuncOpEncodings(funcOp, executableTargets->front()))) {
return signalPassFailure();
}
}
}

void GPUMaterializeDeviceEncodingPass::runOnOperation() {
FunctionOpInterface funcOp = getOperation();
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
if (failed(materializeFuncOpEncodings(funcOp, targetAttr))) {
return signalPassFailure();
}
}

} // namespace mlir::iree_compiler
5 changes: 5 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -197,6 +197,11 @@ def GPUApplyTilingLevelPass :
];
}

def GPUMaterializeHostEncodingPass :
Pass<"iree-codegen-gpu-materialize-host-encoding", "mlir::ModuleOp"> {
let summary = "Materialize the encodings for tensors as specified by the backend.";
}

def GPUMaterializeDeviceEncodingPass :
InterfacePass<"iree-codegen-gpu-materialize-device-encoding", "mlir::FunctionOpInterface"> {
let summary = "Materialize the encodings for tensors as specified by the backend.";
@@ -14,6 +14,7 @@
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
#include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
#include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -876,6 +877,27 @@ struct MaterializeOperation : public OpMaterializeEncodingPattern<OpTy> {
}
};

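// Rewrites a util.optimization_barrier whose operands carry tensor encodings
// into the same op on the type-converted operands. A sketch of the rewrite
// (the encoding attribute and materialized tile shape are illustrative):
//   %0 = util.optimization_barrier %enc
//       : tensor<?x?xf32, #iree_encoding.encoding<...>>
// becomes:
//   %0 = util.optimization_barrier %packed : tensor<?x?x16x16xf32>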
struct MaterializeOptimizationBarrierOp
: public OpMaterializeEncodingPattern<IREE::Util::OptimizationBarrierOp> {
using OpMaterializeEncodingPattern<
IREE::Util::OptimizationBarrierOp>::OpMaterializeEncodingPattern;

LogicalResult
matchAndRewrite(IREE::Util::OptimizationBarrierOp op,
IREE::Util::OptimizationBarrierOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
if (llvm::none_of(op.getOperandTypes(), [](Type type) -> bool {
auto tensorType = dyn_cast<RankedTensorType>(type);
return tensorType && tensorType.getEncoding();
})) {
return failure();
}
rewriter.replaceOpWithNewOp<IREE::Util::OptimizationBarrierOp>(
op, adaptor.getOperands());
return success();
}
};

/// Pattern to convert contraction operations.
class MaterializeContractionOp : public OpInterfaceConversionPattern<
mlir::linalg::ContractionOpInterface> {
@@ -953,12 +975,12 @@ void populateShapeIndependentMaterializeEncodingPatterns(
return resultType == typeConverter.convertType(resultType);
});

patterns.insert<MaterializeDPSOperation<linalg::FillOp>,
MaterializeOperation<tensor::EmptyOp>,
MaterializeFlowDispatchTensorLoadOp,
MaterializeFlowDispatchTensorStoreOp,
MaterializeInterfaceBindingEncoding>(
context, typeConverter, materializeEncodingValueFn);
patterns.insert<
MaterializeDPSOperation<linalg::FillOp>,
MaterializeOperation<tensor::EmptyOp>, MaterializeOptimizationBarrierOp,
MaterializeFlowDispatchTensorLoadOp, MaterializeFlowDispatchTensorStoreOp,
MaterializeInterfaceBindingEncoding>(context, typeConverter,
materializeEncodingValueFn);
};

} // namespace mlir::iree_compiler
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/GlobalOptimization/BUILD.bazel
@@ -77,6 +77,7 @@ iree_compiler_cc_library(
":PassesIncGen",
"//compiler/src/iree/compiler/Codegen/Common",
"//compiler/src/iree/compiler/Codegen/Common/CPU:CommonCPUPasses",
"//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses",
"//compiler/src/iree/compiler/Dialect/Encoding/IR",
"//compiler/src/iree/compiler/Dialect/Flow/Conversion/TensorToFlow",
"//compiler/src/iree/compiler/Dialect/Flow/IR",
@@ -92,6 +92,7 @@ iree_cc_library(
MLIRTransforms
iree::compiler::Codegen::Common
iree::compiler::Codegen::Common::CPU::CommonCPUPasses
iree::compiler::Codegen::Common::GPU::CommonGPUPasses
iree::compiler::Dialect::Encoding::IR
iree::compiler::Dialect::Flow::Conversion::TensorToFlow
iree::compiler::Dialect::Flow::IR
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/CPU/Passes.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Dialect/HAL/Analysis/DeviceAnalysis.h"
#include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
@@ -23,6 +24,14 @@

namespace mlir::iree_compiler::GlobalOptimization {

// TODO: Remove the flag once the codegen can handle the late materialization
// path. This is mainly for testing.
static llvm::cl::opt<bool> clEnableExperimentalRocmDataTiling(
"iree-global-opt-experimental-rocm-data-tiling",
llvm::cl::desc("Enables data-tiling materializatino for rocm backends "
"(experimental)."),
llvm::cl::init(false));

#define GEN_PASS_DEF_MATERIALIZEHOMOGENEOUSENCODINGSPASS
#include "iree/compiler/GlobalOptimization/Passes.h.inc"

@@ -38,13 +47,9 @@ class MaterializeHomogeneousEncodingsPass
registry.insert<IREE::HAL::HALDialect, tensor::TensorDialect>();
}

void runNopPipeline(ModuleOp &moduleOp) {
OpPassManager passManager(moduleOp.getOperationName());
void addNopPipeline(OpPassManager &passManager) {
FunctionLikeNest(passManager).addPass(createMaterializeEncodingIntoNopPass);
FunctionLikeNest(passManager).addPass(createCanonicalizerPass);
if (failed(runPipeline(passManager, moduleOp))) {
return signalPassFailure();
}
}

void runOnOperation() override {
@@ -55,8 +60,13 @@

SetVector<IREE::HAL::ExecutableTargetAttr> executableTargets;
deviceAnalysis.gatherAllExecutableTargets(executableTargets);
OpPassManager passManager(moduleOp.getOperationName());
if (executableTargets.size() != 1) {
return runNopPipeline(moduleOp);
addNopPipeline(passManager);
if (failed(runPipeline(passManager, moduleOp))) {
return signalPassFailure();
}
return;
}

// TODO: vmvx has its own logic about supporting dynamic tile
@@ -67,13 +77,21 @@
return;
}

// Only llvm-cpu backends handle encodings for now, others just go with nop.
if (executableTarget.getBackend() != "llvm-cpu") {
return runNopPipeline(moduleOp);
// Only llvm-cpu and rocm backends handle encodings for now, others just go
// with nop.
if (executableTarget.getBackend() == "llvm-cpu") {
passManager.addPass(createCPUMaterializeHostEncodingPass());
} else if (clEnableExperimentalRocmDataTiling &&
executableTarget.getBackend() == "rocm") {
passManager.addPass(createGPUMaterializeHostEncodingPass());
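// Decompose the tensor.pack/unpack ops produced by materialization into
// reshape-based sequences (useOnlyReshapes=true), so this experimental
// host path does not need dedicated pack/unpack lowering.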
FunctionLikeNest(passManager).addPass([&]() {
return createDecomposePackUnPackOpsPass(/*tileOuterToOne=*/false,
/*useOnlyReshapes=*/true,
/*controlFn=*/std::nullopt);
});
} else {
addNopPipeline(passManager);
}

OpPassManager passManager(moduleOp.getOperationName());
passManager.addPass(createCPUMaterializeHostEncodingPass());
if (failed(runPipeline(passManager, moduleOp))) {
return signalPassFailure();
}
24 changes: 24 additions & 0 deletions tests/e2e/rocm_specific/BUILD.bazel
@@ -0,0 +1,24 @@
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Tests for end-to-end IREE support specific to the ROCm/HIP lowering.

load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")

package(
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)

iree_check_single_backend_test_suite(
name = "check_rocm_hip",
srcs = ["encoding.mlir"],
compiler_flags = [
"--iree-global-opt-experimental-rocm-data-tiling",
],
driver = "hip",
target_backend = "rocm",
)
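
For local experimentation outside the suite, the flag can be passed to the
compiler directly. A sketch, assuming a ROCm-enabled build (a GPU target/chip
flag appropriate to your IREE version may also be required):

    iree-compile --iree-hal-target-backends=rocm \
        --iree-global-opt-experimental-rocm-data-tiling \
        encoding.mlir -o encoding.vmfb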