[Codegen][GPU] Update greedy tile + fuse pipeline to generate mfma (iree-org#17617)

This adds intrinsic packing and reshape propagation patterns to
LLVMGPUTileAndFuse to allow generating mfma operations. It also adds a
few passes that invoke the patterns the pipeline needs to generate
(good) code:

1. PropagateReshapesByExpansion to propagate the reshapes introduced by
decomposing tensor.pack/unpack toward the edges of the kernel, in the
hope that the destination can line up properly (see the sketch after
this list).
2. IREE::GPU::PackToIntrinsics to pack based on the mma kind specified
by the lowering config.
3. IREE::GPU::DistributeMmaToLanes to distribute iree_gpu.multi_mma ops
to lanes, similar to another tiling level (a sketch follows the issues
list below).
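
As a rough illustration of (1), the propagation patterns bubble reshapes
out of the middle of the kernel so they can fold at the binding edges. A
minimal sketch with hypothetical shapes (not taken from the tests in
this change):

```mlir
// Before: the collapse left behind by pack/unpack decomposition is
// pinned between compute ops.
%c = tensor.collapse_shape %a [[0, 1], [2]]
    : tensor<4x16x32xf32> into tensor<64x32xf32>
%r = linalg.add ins(%c, %b : tensor<64x32xf32>, tensor<64x32xf32>)
                outs(%init : tensor<64x32xf32>) -> tensor<64x32xf32>

// After: the consumer is expanded instead, and the reshape now sits on
// the result at the edge of the kernel, where it can fold into the
// flow.dispatch.tensor.store destination.
%b_exp = tensor.expand_shape %b [[0, 1], [2]] output_shape [4, 16, 32]
    : tensor<64x32xf32> into tensor<4x16x32xf32>
%r_exp = linalg.add
    ins(%a, %b_exp : tensor<4x16x32xf32>, tensor<4x16x32xf32>)
    outs(%init_exp : tensor<4x16x32xf32>) -> tensor<4x16x32xf32>
%r = tensor.collapse_shape %r_exp [[0, 1], [2]]
    : tensor<4x16x32xf32> into tensor<64x32xf32>
```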

There are a few known outstanding issues.

1. We run `ConvertToDestinationPassingStyle` twice to re-link the kernel
destination with the body after decomposing `tensor.unpack`. This works
around EliminateEmptyTensors being unable to properly analyze
`flow.dispatch.tensor.store` ops with slicing behavior. Once workgroup
distribution is refactored to generate an scf.forall, this needs to be
revisited.
2. Lowering iree_gpu.shuffle_tensor to `tensor.insert_slice` is still
broken and will need to be reworked to support dynamic shapes.
3. Currently, because of the way the layout works, only MFMA_16x16x16
is supported. Supporting other layouts will require another level of
expansion to the intrinsic's implicit layout, followed by propagation of
those expand_shapes. This will likely need to happen after reduction
tiling, unless we want to teach tile + fuse to swap tensor.expand_shape
ops with tensor.extract_slice.
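
For reference on (3) of the pass list above, DistributeMmaToLanes
produces foralls mapped to `#iree_gpu.lane_id`, which GPUDistribute now
resolves directly to lane ids before handling thread mappings (see the
new gpu_distribute.mlir test below). A schematic sketch, not the exact
IR:

```mlir
// Input: a per-lane forall over a 64-wide subgroup.
scf.forall (%laneid) in (64) {
  // ... per-lane slice of the iree_gpu.multi_mma computation ...
} {mapping = [#iree_gpu.lane_id<0>]}

// After GPUDistribute: the loop is folded away and the induction
// variable becomes the physical lane id.
%laneid = gpu.lane_id
// ... inlined per-lane body ...
```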
qedawkins authored Jun 20, 2024
1 parent d01fb23 commit 9fd55d2
Showing 48 changed files with 1,210 additions and 158 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -129,6 +129,7 @@ iree_compiler_cc_library(
         "PassUtils.cpp",
         "Passes.cpp",
         "PolynomialApproximationPass.cpp",
+        "PropagateReshapesByExpansion.cpp",
         "ReconcileTranslationInfo.cpp",
         "RematerializeParallelOps.cpp",
         "RemoveTrivialLoops.cpp",
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -120,6 +120,7 @@ iree_cc_library(
     "PassUtils.cpp"
     "Passes.cpp"
     "PolynomialApproximationPass.cpp"
+    "PropagateReshapesByExpansion.cpp"
     "ReconcileTranslationInfo.cpp"
     "RematerializeParallelOps.cpp"
     "RemoveTrivialLoops.cpp"
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -89,6 +89,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Common:VectorLayoutAnalysis",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
+        "//compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms:GPUTransforms",
         "//compiler/src/iree/compiler/Codegen/Interfaces:PartitionableLoopsInterface",
         "//compiler/src/iree/compiler/Codegen/Transforms",
         "//compiler/src/iree/compiler/Codegen/Utils",
@@ -120,6 +120,7 @@ iree_cc_library(
     iree::compiler::Codegen::Common::VectorLayoutAnalysis
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
     iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
+    iree::compiler::Codegen::Dialect::GPU::Transforms::GPUTransforms
     iree::compiler::Codegen::Interfaces::PartitionableLoopsInterface
     iree::compiler::Codegen::Transforms
     iree::compiler::Codegen::Utils
@@ -8,7 +8,9 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLForwardCompat.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -65,11 +67,8 @@ collectTiledAndFusedOps(Operation *op,
 static LogicalResult
 applyTileAndFuseToEachRoot(RewriterBase &rewriter,
                            llvm::SmallDenseSet<TilingInterface> &payloadOps,
-                           bool threadTiling) {
+                           IREE::GPU::TilingLevel tilingLevel) {
   MLIRContext *context = rewriter.getContext();
-  unsigned tilingLevel =
-      threadTiling ? static_cast<unsigned>(IREE::GPU::TilingLevel::Thread)
-                   : static_cast<unsigned>(IREE::GPU::TilingLevel::Reduction);
   for (TilingInterface tilingInterfaceOp : payloadOps) {
     mlir::DominanceInfo dominanceInfo(tilingInterfaceOp);

@@ -87,7 +86,8 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,
     rewriter.setInsertionPoint(tilingInterfaceOp);
     SmallVector<OpFoldResult> tileSizes =
         getLoweringConfig(tilingInterfaceOp)
-            .getTilingLevelSizes(rewriter, tilingLevel, tilingInterfaceOp);
+            .getTilingLevelSizes(rewriter, llvm::to_underlying(tilingLevel),
+                                 tilingInterfaceOp);

     // Pad the tile sizes with zero.
     auto zero = rewriter.getIndexAttr(0);
@@ -101,7 +101,8 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,

     scf::SCFTilingOptions tilingOptions;
     tilingOptions.setTileSizes(tileSizes);
-    if (threadTiling) {
+    if (tilingLevel == IREE::GPU::TilingLevel::Thread ||
+        tilingLevel == IREE::GPU::TilingLevel::Subgroup) {
       tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);

       // TODO: Add some helpers to construct this based on the enum type rather
@@ -112,8 +113,14 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,
         if (!isConstantIntValue(size, 0)) {
           unsigned mappingId =
               static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx++;
-          mapping.push_back(gpu::GPUThreadMappingAttr::get(
-              context, static_cast<gpu::MappingId>(mappingId)));
+          if (tilingLevel == IREE::GPU::TilingLevel::Thread) {
+            mapping.push_back(gpu::GPUThreadMappingAttr::get(
+                context, static_cast<gpu::MappingId>(mappingId)));
+          } else {
+            // Else it must be subgroup tiling.
+            mapping.push_back(gpu::GPUWarpMappingAttr::get(
+                context, static_cast<gpu::MappingId>(mappingId)));
+          }
         }
       }
       tilingOptions.setMapping(mapping);
@@ -168,14 +175,13 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,
 static llvm::SmallDenseSet<TilingInterface>
 getTiledOps(Operation *funcOp, IREE::GPU::TilingLevel tilingLevel) {
   llvm::SmallDenseSet<TilingInterface> targets;
-  unsigned opaqueLevel = static_cast<unsigned>(tilingLevel);
+  unsigned opaqueLevel = llvm::to_underlying(tilingLevel);
   funcOp->walk([&](TilingInterface target) {
     // TODO: This would probably be easier with a lowering config interface
     // method that checks whether a particular level is tiled.
     if (IREE::Codegen::LoweringConfigAttrInterface loweringConfig =
             getLoweringConfig(target)) {
-      if (!loweringConfig.getStaticTilingLevelSizes(opaqueLevel, target)
-              .empty()) {
+      if (loweringConfig.hasTilingLevel(opaqueLevel)) {
         targets.insert(target);
       }
     }
@@ -187,18 +193,18 @@ void GPUApplyTilingLevelPass::runOnOperation() {
   FunctionOpInterface funcOp = getOperation();

   if (tilingLevel != IREE::GPU::TilingLevel::Reduction &&
-      tilingLevel != IREE::GPU::TilingLevel::Thread) {
+      tilingLevel != IREE::GPU::TilingLevel::Thread &&
+      tilingLevel != IREE::GPU::TilingLevel::Subgroup) {
     funcOp.emitError() << "unsupported tiling level: "
                        << IREE::GPU::stringifyEnum(tilingLevel) << "\n";
     return signalPassFailure();
   }

   llvm::SmallDenseSet<TilingInterface> targetOps =
       getTiledOps(funcOp, tilingLevel);
-  bool useThread = tilingLevel == IREE::GPU::TilingLevel::Thread;

   IRRewriter rewriter(funcOp);
-  if (failed(applyTileAndFuseToEachRoot(rewriter, targetOps, useThread))) {
+  if (failed(applyTileAndFuseToEachRoot(rewriter, targetOps, tilingLevel))) {
     funcOp.emitError() << "tiling of level "
                        << IREE::GPU::stringifyEnum(tilingLevel) << " failed\n";
     return signalPassFailure();
@@ -6,6 +6,7 @@

 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
@@ -23,6 +24,10 @@ struct GPUDistributePass final
     : impl::GPUDistributePassBase<GPUDistributePass> {
   void runOnOperation() override {
     auto funcOp = getOperation();
+    IRRewriter rewriter(funcOp->getContext());
+
+    // First map all lane level forall loops to lanes.
+    IREE::GPU::mapLaneForalls(rewriter, funcOp, /*insertBarrier=*/false);

     std::optional<SmallVector<int64_t>> workgroupSize =
         getWorkgroupSize(funcOp);
@@ -35,7 +40,6 @@ struct GPUDistributePass final
     // TODO: Don't hard code kCudaWarpSize here.
     int64_t subgroupSize = maybeSubgroupSize.value_or(kCudaWarpSize);

-    IRRewriter rewriter(funcOp->getContext());
     rewriter.setInsertionPointToStart(&funcOp.front());
     DiagnosedSilenceableFailure result =
         mlir::transform::gpu::mapNestedForallToThreadsImpl(
8 changes: 7 additions & 1 deletion compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -28,7 +28,11 @@ def GPUCreateFastSlowPathPass :
 def GPUDistributePass :
     InterfacePass<"iree-codegen-gpu-distribute", "mlir::FunctionOpInterface"> {
   let summary = "Pass to distribute scf.forall ops.";
-  let dependentDialects = ["::mlir::affine::AffineDialect", "::mlir::gpu::GPUDialect"];
+  let dependentDialects = [
+    "::mlir::affine::AffineDialect",
+    "::mlir::gpu::GPUDialect",
+    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect",
+  ];
 }

 def GPUDistributeSharedMemoryCopyPass :
@@ -152,6 +156,8 @@ def GPUApplyTilingLevelPass :
         clEnumValN(IREE::GPU::TilingLevel::Reduction, "reduction",
                    "Tile and fuse all annotated ops to serial loops"),
         clEnumValN(IREE::GPU::TilingLevel::Thread, "thread",
                    "Tile and fuse all annotated ops to threads"),
+        clEnumValN(IREE::GPU::TilingLevel::Subgroup, "subgroup",
+                   "Tile and fuse all annotated ops to threads")
       )}]>,
     ];
@@ -1,7 +1,8 @@
 // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level, canonicalize, cse))" %s | FileCheck %s
 // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=thread}, canonicalize, cse))" %s | FileCheck %s --check-prefix=THREAD
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP

-#config = #iree_gpu.lowering_config<{thread = [2, 16]}>
+#config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @add_tensor() {
@@ -35,6 +36,12 @@ module {
 // THREAD: scf.forall.in_parallel
 // THREAD: mapping = [#gpu.thread<linear_dim_0>, #gpu.thread<linear_dim_1>]

+// SUBGROUP-LABEL: func.func @add_tensor
+// SUBGROUP: scf.forall ({{.*}}) = (0, 0) to (64, 256) step (2, 16)
+// SUBGROUP: linalg.generic {{.*}} ins(%{{.*}}: tensor<2x16xf32>, tensor<2x16xf32>)
+// SUBGROUP: scf.forall.in_parallel
+// SUBGROUP: mapping = [#gpu.warp<linear_dim_0>, #gpu.warp<linear_dim_1>]
+
 // -----

 #config = #iree_gpu.lowering_config<{thread = [0, 16]}>
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute, cse))" %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute, cse))" %s --split-input-file | FileCheck %s

 #map = affine_map<()[s0] -> (s0 * 256)>
 #map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
@@ -43,3 +43,49 @@ module {
 // CHECK: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[OFF]]], %{{.*}} {in_bounds = [true]} : memref<1x256xf32, #{{.*}}>, vector<4xf32>
 // CHECK: %[[C:.*]] = arith.addf %[[A]], %[[B]] : vector<4xf32>
 // CHECK: vector.transfer_write %[[C]], %[[S]][%[[C0]], %[[C0]]] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #{{.*}}>
+
+// -----
+
+#map = affine_map<()[s0] -> (s0 * 256)>
+#map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
+#map2 = affine_map<(d0) -> (d0 * 4)>
+#translation = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1]>
+module {
+  func.func @add_tensor_lane_id() attributes {translation_info = #translation} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %c64 = arith.constant 64 : index
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
+    memref.assume_alignment %0, 64 : memref<233x1024xf32>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
+    memref.assume_alignment %1, 64 : memref<233x1024xf32>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
+    memref.assume_alignment %2, 64 : memref<233x1024xf32>
+    %workgroup_id_x = hal.interface.workgroup.id[0] : index
+    %workgroup_id_y = hal.interface.workgroup.id[1] : index
+    %3 = affine.apply #map()[%workgroup_id_x]
+    %subview = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+    %subview_0 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+    %subview_1 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+    scf.forall (%arg0) in (%c64) {
+      %4 = affine.apply #map2(%arg0)
+      %subview_2 = memref.subview %subview[0, %4] [1, 4] [1, 1] : memref<1x256xf32, #map1> to memref<1x4xf32, #map1>
+      %5 = vector.transfer_read %subview_0[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+      %6 = vector.transfer_read %subview_1[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+      %7 = arith.addf %5, %6 : vector<4xf32>
+      vector.transfer_write %7, %subview_2[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #map1>
+    } {mapping = [#iree_gpu.lane_id<0>]}
+    return
+  }
+}
+
+// CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)>
+// CHECK-LABEL: func.func @add_tensor_lane_id
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[TX:.*]] = gpu.lane_id
+// CHECK: %[[OFF:.*]] = affine.apply #[[$MAP]](%[[TX]])
+// CHECK: %[[S:.*]] = memref.subview %{{.*}}[0, %[[OFF]]] [1, 4] [1, 1] : memref<1x256xf32, #{{.*}}> to memref<1x4xf32, #{{.*}}>
+// CHECK: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[OFF]]], %{{.*}} {in_bounds = [true]} : memref<1x256xf32, #{{.*}}>, vector<4xf32>
+// CHECK: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[OFF]]], %{{.*}} {in_bounds = [true]} : memref<1x256xf32, #{{.*}}>, vector<4xf32>
+// CHECK: %[[C:.*]] = arith.addf %[[A]], %[[B]] : vector<4xf32>
+// CHECK: vector.transfer_write %[[C]], %[[S]][%[[C0]], %[[C0]]] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #{{.*}}>
4 changes: 4 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -242,6 +242,10 @@ std::unique_ptr<InterfacePass<FunctionOpInterface>> createPadDynamicAlloc();
 /// Pass to convert math operations to their polynomial approximation.
 std::unique_ptr<OperationPass<>> createPolynomialApproximationPass();

+/// Pass to propagate reshapes by expansion through all ops without explicit
+/// lowering configurations.
+std::unique_ptr<OperationPass<>> createPropagateReshapesByExpansionPass();
+
 /// Pass to reconcile TranslationInfo across multiple functions in a dispatch
 /// and set the appropriate values on the surrounding HAL ops.
 std::unique_ptr<OperationPass<IREE::HAL::ExecutableVariantOp>>
6 changes: 6 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -434,6 +434,12 @@ def PolynomialApproximationPass :
       "mlir::iree_compiler::createPolynomialApproximationPass()";
 }

+def PropagateReshapesByExpansionPass :
+    Pass<"iree-codegen-propagate-reshapes-by-expansion", ""> {
+  let summary = "Propagates reshaping operations by expansion.";
+  let constructor = "mlir::iree_compiler::createPropagateReshapesByExpansionPass()";
+}
+
 def RematerializeParallelOps :
     InterfacePass<"iree-codegen-rematerialize-parallel-ops", "mlir::FunctionOpInterface"> {
   let summary = "Pass to rematerialize and merge parallel ops into consumers.";
@@ -0,0 +1,78 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/PassDetail.h"
#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir::iree_compiler {

namespace {

struct PropagateReshapesByExpansionPass
    : public PropagateReshapesByExpansionPassBase<
          PropagateReshapesByExpansionPass> {
  void runOnOperation() override;
};
} // namespace

void PropagateReshapesByExpansionPass::runOnOperation() {
  MLIRContext *context = &getContext();

  {
    RewritePatternSet patterns(context);
    // Preemptively attempt to fold any reshapes into interface bindings if
    // possible to simplify subsequent reshape propagation.
    populateReshapeToInterfaceTensorPatterns(patterns);
    if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                            std::move(patterns)))) {
      return signalPassFailure();
    }
  }

  RewritePatternSet bubbleExpandShapePatterns(context);
  linalg::ControlFusionFn bubbleUpExpansionControlFn =
      [](OpOperand *fusedOperand) {
        Operation *producer = fusedOperand->get().getDefiningOp();
        Operation *consumer = fusedOperand->getOwner();

        // Block only if one of the operations has a lowering configuration
        // which means it likely expects tiling specific to its original shape.
        if (getLoweringConfig(producer) || getLoweringConfig(consumer)) {
          return false;
        }
        return true;
      };
  linalg::populateFoldReshapeOpsByExpansionPatterns(bubbleExpandShapePatterns,
                                                    bubbleUpExpansionControlFn);
  // Add patterns to do some additional cleanup (on top of canonicalizations
  // that can be done later) of reshape ops.
  tensor::populateFoldTensorEmptyPatterns(bubbleExpandShapePatterns);
  linalg::FillOp::getCanonicalizationPatterns(bubbleExpandShapePatterns,
                                              context);
  tensor::CollapseShapeOp::getCanonicalizationPatterns(
      bubbleExpandShapePatterns, context);
  tensor::EmptyOp::getCanonicalizationPatterns(bubbleExpandShapePatterns,
                                               context);
  tensor::ExpandShapeOp::getCanonicalizationPatterns(bubbleExpandShapePatterns,
                                                     context);
  populateReshapeToInterfaceTensorPatterns(bubbleExpandShapePatterns);

  if (failed(applyPatternsAndFoldGreedily(
          getOperation(), std::move(bubbleExpandShapePatterns)))) {
    getOperation()->emitOpError("Failed to propagate reshapes");
    return signalPassFailure();
  }
}

std::unique_ptr<OperationPass<>> createPropagateReshapesByExpansionPass() {
  return std::make_unique<PropagateReshapesByExpansionPass>();
}

} // namespace mlir::iree_compiler
@@ -66,6 +66,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Common:VectorLayoutAnalysis",
         "//compiler/src/iree/compiler/Codegen/Common/GPU:CommonGPUPasses",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
+        "//compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms:GPUTransforms",
         "//compiler/src/iree/compiler/Codegen/Interfaces:BufferizationInterfaces",
         "//compiler/src/iree/compiler/Codegen/Transforms",
         "//compiler/src/iree/compiler/Codegen/Utils",
@@ -70,6 +70,7 @@ iree_cc_library(
     iree::compiler::Codegen::Common::GPU::CommonGPUPasses
     iree::compiler::Codegen::Common::VectorLayoutAnalysis
     iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
+    iree::compiler::Codegen::Dialect::GPU::Transforms::GPUTransforms
     iree::compiler::Codegen::Interfaces::BufferizationInterfaces
     iree::compiler::Codegen::Transforms
     iree::compiler::Codegen::Utils