diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp index 0ef6e64d2c26..d72ac17b0e9e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp @@ -27,65 +27,6 @@ using VectorValue = TypedValue; namespace { -/// Given the state of the iterator, compute the indices of the original vector -/// that the current iterator state is iterating over. These indices are -/// parameterized by the thread grid. -static SmallVector computeSIMDIndex(const LayoutIterator::State &state, - LayoutAttr layout, Value laneId, - RewriterBase &rewriter) { - MLIRContext *ctx = layout.getContext(); - AffineExpr threadX, threadY, threadZ; - bindSymbols(ctx, threadX, threadY, threadZ); - - SmallVector simdIndex; - // Calculate the index for each dim separately. - for (PerDimLayoutAttr dimLayout : layout.getLayouts()) { - AffineExpr offset = getAffineConstantExpr(0, ctx); - AffineExpr stride = getAffineConstantExpr(1, ctx); - for (auto [label, shape] : llvm::reverse( - llvm::zip(dimLayout.getLabels(), dimLayout.getShapes()))) { - int64_t position = state.lookup(label.getValue()).getPosition(); - - switch (label.getValue()) { - case LayoutDimension::LANEX: - offset = offset + stride * threadX; - break; - case LayoutDimension::LANEY: - offset = offset + stride * threadY; - break; - case LayoutDimension::LANEZ: - offset = offset + stride * threadZ; - break; - default: - offset = offset + stride * getAffineConstantExpr(position, ctx); - break; - } - stride = stride * getAffineConstantExpr(shape, ctx); - } - - auto [laneDimX, laneDimY, laneDimZ] = layout.getLaneGrid(); - SmallVector laneGrid = { - rewriter.create(laneId.getLoc(), laneDimZ), - rewriter.create(laneId.getLoc(), laneDimY), - rewriter.create(laneId.getLoc(), laneDimX)}; - FailureOr> maybeReversedLaneGridVals = - affine::delinearizeIndex(rewriter, laneId.getLoc(), laneId, laneGrid); - assert(succeeded(maybeReversedLaneGridVals) && - "Failed to delinearize lane index"); - SmallVector laneGridVals = {(*maybeReversedLaneGridVals)[2], - (*maybeReversedLaneGridVals)[1], - (*maybeReversedLaneGridVals)[0]}; - - // Compute the index for the dim. - AffineMap indexMap = AffineMap::get(0, 3, offset); - Value index = rewriter.create( - rewriter.getUnknownLoc(), indexMap, laneGridVals); - simdIndex.push_back(index); - } - - return simdIndex; -} - struct DistributeConstants final : OpDistributionPattern { using OpDistributionPattern::OpDistributionPattern; @@ -192,338 +133,6 @@ getReducedPermutation(AffineMap permutationMap, return permutation; } -template -struct DistributeXferLayoutAttr : OpDistributionPattern { - static_assert(std::is_same::value || - std::is_same::value, - "expected vector::TransferReadOp or vector::TransferWriteOp"); - - DistributeXferLayoutAttr(MLIRContext *context, Value laneId, - PatternBenefit benefit = 1) - : OpDistributionPattern(context, benefit), laneId(laneId) {} - - VectorValue accessMemory(OpTy xferOp, VectorValue accumulator, - LayoutAttr vectorLayout, - PatternRewriter &rewriter) const { - // We need to take special consideration of the permutation map when - // lowering. When accessing memory, we use the memoryLayout, because that - // is how the data is accessed in memory. The data is stored in the vector - // according to vectorLayout. 
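[Note] The removed computeSIMDIndex walks each dimension's labels from fastest- to slowest-varying, accumulating `offset += position * stride` and substituting the delinearized lane id for the LANEX/LANEY/LANEZ labels. A minimal standalone sketch of that arithmetic (plain C++, illustrative names, not the IREE API):

```cpp
#include <cstdint>
#include <string>
#include <vector>

// One label of a per-dim layout, listed slowest-varying first.
struct DimLabel {
  std::string name;  // e.g. "BATCHX", "LANEX", "VECTORX"
  int64_t shape;     // extent of this label
  int64_t position;  // iterator position; for LANE* labels, the lane coordinate
};

// Accumulate offset += position * stride over the labels in reverse order,
// mirroring the per-dimension affine expression the removed pattern emitted.
int64_t simdIndexForDim(const std::vector<DimLabel> &labels) {
  int64_t offset = 0;
  int64_t stride = 1;
  for (auto it = labels.rbegin(); it != labels.rend(); ++it) {
    offset += it->position * stride;
    stride *= it->shape;
  }
  return offset;
}
```

The lane coordinates themselves came from delinearizing the flat lane id over the (Z, Y, X) lane grid, which is why the removed code reversed the delinearization results before applying the map.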
- llvm::SmallBitVector unusedDims; - SmallVector permutation = - getReducedPermutation(xferOp.getPermutationMap(), unusedDims); - LayoutAttr memoryLayout = - cast(vectorLayout.permute(permutation)); - - int loadWidth = getLoadStoreWidth(memoryLayout); - DenseMap steps; - steps[LayoutDimension::VECTORX] = loadWidth; - LayoutIterator iterator(vectorLayout, steps); - - iterator.apply([&](const LayoutIterator::State &state) { - SmallVector memoryIndices = getMemoryIndices( - state, memoryLayout, xferOp.getIndices(), unusedDims, rewriter); - SmallVector accIndices = state.computeSIMTIndex(); - accumulator = accessUnit(xferOp, memoryIndices, accIndices, accumulator, - vectorLayout, memoryLayout, rewriter); - }); - - return accumulator; - } - - SmallVector getMemoryIndices(const LayoutIterator::State &state, - LayoutAttr memoryLayout, - SmallVector indices, - llvm::SmallBitVector &projectedDims, - RewriterBase &rewriter) const { - SmallVector simdIndices = - computeSIMDIndex(state, memoryLayout, laneId, rewriter); - SmallVector memoryIndices(indices); - - // The memory layout has some projected leading dims that indices doesn't. - int currSimd = 0; - for (int i = 0, e = memoryIndices.size(); i < e; ++i) { - if (projectedDims[i]) { - continue; - } - - memoryIndices[i] = rewriter.create( - rewriter.getUnknownLoc(), memoryIndices[i], simdIndices[currSimd]); - ++currSimd; - } - - return memoryIndices; - } - - virtual VectorValue accessUnit(OpTy xferOp, SmallVector &memoryIndices, - SmallVector &accIndices, - VectorValue accumulator, - LayoutAttr vectorLayout, - LayoutAttr memoryLayout, - PatternRewriter &rewriter) const = 0; - - int getLoadStoreWidth(LayoutAttr layout) const { - PerDimLayoutAttr fastestChanging = layout.getLayouts().back(); - if (std::optional width = - fastestChanging.getShape(LayoutDimension::VECTORX)) { - return *width; - } - return 1; - } - - Value laneId; -}; - -struct DistributeTransferReadLayoutAttr final - : DistributeXferLayoutAttr { - using DistributeXferLayoutAttr::DistributeXferLayoutAttr; - - LogicalResult matchAndRewrite(vector::TransferReadOp readOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - LayoutAttr vectorLayout = - dyn_cast(signature[readOp.getResult()]); - if (!vectorLayout) { - return failure(); - } - - // TODO: Return failure if we need masking. 
- - Type elementType = readOp.getSource().getType().getElementType(); - auto vectorType = - VectorType::get(vectorLayout.getDistributedShape(), elementType); - Value zero = rewriter.create( - readOp.getLoc(), vectorType, rewriter.getZeroAttr(vectorType)); - VectorValue acc = cast(zero); - - VectorValue readVec = accessMemory(readOp, acc, vectorLayout, rewriter); - - replaceOpWithDistributedValues(rewriter, readOp, readVec); - return success(); - } - - VectorValue accessUnit(vector::TransferReadOp readOp, - SmallVector &memoryIndices, - SmallVector &accIndices, - VectorValue accumulator, LayoutAttr vectorLayout, - LayoutAttr memoryLayout, - PatternRewriter &rewriter) const override { - auto unitType = VectorType::get({getLoadStoreWidth(memoryLayout)}, - accumulator.getType().getElementType()); - VectorValue load = rewriter.create( - readOp.getLoc(), unitType, readOp.getSource(), memoryIndices); - return rewriter.create( - readOp.getLoc(), load, accumulator, accIndices, - SmallVector{1}); - } -}; - -struct DistributeTransferWriteLayoutAttr final - : DistributeXferLayoutAttr { - using DistributeXferLayoutAttr::DistributeXferLayoutAttr; - - LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - LayoutAttr vectorLayout = - dyn_cast(signature[writeOp.getVector()]); - if (!vectorLayout) { - return failure(); - } - - if (writeOp.getMask()) { - return failure(); - } - - accessMemory(writeOp, writeOp.getVector(), vectorLayout, rewriter); - - rewriter.eraseOp(writeOp); - return success(); - } - - VectorValue accessUnit(vector::TransferWriteOp writeOp, - SmallVector &memoryIndices, - SmallVector &accIndices, - VectorValue accumulator, LayoutAttr vectorLayout, - LayoutAttr memoryLayout, - PatternRewriter &rewriter) const override { - int width = getLoadStoreWidth(memoryLayout); - - SmallVector strides(accIndices.size(), 1); - SmallVector shapes(accIndices.size(), 1); - shapes[shapes.size() - 1] = width; - Value result = rewriter.create( - writeOp.getLoc(), getDistributed(rewriter, accumulator, vectorLayout), - accIndices, shapes, strides); - result = rewriter.create( - writeOp.getLoc(), result, - SmallVector(accIndices.size() - 1, 0)); - rewriter.create(writeOp.getLoc(), result, - writeOp.getSource(), memoryIndices); - - return accumulator; - } -}; - -struct DistributeReductions final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - DistributeReductions(MLIRContext *context, int64_t maxBitsPerShuffle) - : OpDistributionPattern(context), maxBitsPerShuffle(maxBitsPerShuffle) {} - - static constexpr int64_t kDefaultSubgroupSize = 32; - - // Do parallel reduction using butterfly shuffles. 
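[Note] As a rough illustration of the butterfly pattern that the removed doThreadGlobalReduction lowered to gpu.shuffle xor ops, here is a host-side simulation over an array of lane values (assuming an additive reduction and a power-of-two subgroup; this is not GPU code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Simulates the XOR-shuffle reduction across a subgroup: at each step every
// lane combines with the lane whose id differs by `step`, so after
// log2(laneSize) steps all participating lanes hold the same reduced value.
void butterflyReduce(std::vector<float> &laneVals, uint64_t shuffleOffset,
                     int64_t laneSize) {
  // laneVals.size() is the subgroup size, assumed to be a power of two that
  // covers shuffleOffset * laneSize.
  for (uint64_t step = shuffleOffset; step < shuffleOffset * laneSize;
       step <<= 1) {
    std::vector<float> partner(laneVals.size());
    for (size_t lane = 0; lane < laneVals.size(); ++lane)
      partner[lane] = laneVals[lane ^ step];  // analogue of gpu.shuffle xor
    for (size_t lane = 0; lane < laneVals.size(); ++lane)
      laneVals[lane] += partner[lane];        // combining kind: add
  }
}
```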
- Value doThreadGlobalReduction(Value result, uint64_t shuffleOffset, - int64_t laneSize, - vector::CombiningKind combiningKind, - int64_t entriesPerVector, Value mEmpty, - OpBuilder &rewriter, Location loc) const { - auto funcOp = result.getDefiningOp()->getParentOfType(); - std::optional maybeSubgroupSize = getSubgroupSize(funcOp); - if (!maybeSubgroupSize) - funcOp->emitWarning("No subgroup size specified, using default value = " + - Twine(kDefaultSubgroupSize)); - int64_t subgroupSize = maybeSubgroupSize.value_or(kDefaultSubgroupSize); - - Value mask; - assert(llvm::isPowerOf2_64(laneSize)); - for (uint64_t i = shuffleOffset; i < shuffleOffset * laneSize; i <<= 1) { - Value packed = packVectorToSupportedWidth(loc, rewriter, result); - auto shuffleOp = rewriter.create( - loc, packed, i, subgroupSize, gpu::ShuffleMode::XOR); - Value unpacked = - unpackToVector(loc, rewriter, shuffleOp.getShuffleResult(), - cast(result.getType())); - result = makeArithReduction(rewriter, loc, combiningKind, unpacked, - result, nullptr, mask); - } - - // Reduce packed vector with initial value. - Value reducedValue = rewriter.create( - loc, result, SmallVector{0}); - for (int i = 1; i < entriesPerVector; i++) { - Value next = rewriter.create(loc, result, - SmallVector{i}); - reducedValue = makeArithReduction(rewriter, loc, combiningKind, - reducedValue, next, nullptr, mask); - } - result = makeArithReduction(rewriter, loc, combiningKind, reducedValue, - mEmpty, nullptr, mask); - return result; - } - - // This pattern distributes reductions as follows: - // First, the data local to a specific thread is reduced. - // Then, the data between threads is reduced by emitting appropriate - // shuffle instructions. - // Currently, only 16 and 32 bit types are supported. - // TODO: Add ability to reduce n parallel dims together. - LogicalResult matchAndRewrite(vector::MultiDimReductionOp reductionOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - ArrayRef reductionDims = reductionOp.getReductionDims(); - // TODO: Add support for reductions along multiple dimensions. - if (reductionDims.size() > 1) - return failure(); - - VectorValue resultVec = dyn_cast(reductionOp.getResult()); - // TODO: Support results that are not vectors. - if (!resultVec) - return failure(); - LayoutAttr resultLayout = dyn_cast(signature[resultVec]); - if (!resultLayout) - return failure(); - - VectorValue source = reductionOp.getSource(); - ShapedType sourceType = llvm::cast(source.getType()); - // TODO: Add support for (n != 2)-D tensors. - if (sourceType.getRank() != 2) - return failure(); - - LayoutAttr sourceLayout = dyn_cast(signature[source]); - if (!sourceLayout) - return failure(); - - VectorValue acc = dyn_cast(reductionOp.getAcc()); - ShapedType accType = llvm::cast(acc.getType()); - Type elementType = accType.getElementType(); - int bitWidth = elementType.getIntOrFloatBitWidth(); - // TODO: Support additional bitwidths. 
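[Note] The 16/32-bit restriction below follows from packing elements into a single shuffle word: entriesPerVector = maxBitsPerShuffle / bitWidth, so two f16 values (or one f32) ride each 32-bit shuffle. A minimal sketch of the packing step (plain C++ standing in for the vector.bitcast the pattern emitted):

```cpp
#include <cstdint>

// Pack two 16-bit lanes into one 32-bit shuffle word, the scalar analogue of
// bitcasting vector<2xf16> to vector<1xi32> before gpu.shuffle.
uint32_t packTwoHalves(uint16_t lo, uint16_t hi) {
  return (static_cast<uint32_t>(hi) << 16) | lo;
}

// With maxBitsPerShuffle = 32: f16 packs 2 entries per word, f32 packs 1.
int64_t entriesPerVector(int64_t maxBitsPerShuffle, int64_t bitWidth) {
  return maxBitsPerShuffle / bitWidth;
}
```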
- if ((bitWidth != 16) && (bitWidth != 32)) - return failure(); - - Location loc = reductionOp.getLoc(); - auto storeVectorType = - VectorType::get(resultLayout.getDistributedShape(), elementType); - Value storeVec = rewriter.create( - loc, storeVectorType, rewriter.getZeroAttr(storeVectorType)); - - int reductionDim = reductionDims[0]; - int parallelDim = reductionDim ^ 1; - if (!sourceLayout.getLane(reductionDim)) - return failure(); - uint64_t shuffleOffset = sourceLayout.getShuffleOffset(reductionDim); - int64_t laneSize = sourceLayout.getLaneDim(reductionDim).value(); - if (!llvm::isPowerOf2_64(laneSize)) - return failure(); - vector::CombiningKind combiningKind = reductionOp.getKind(); - - auto reduceFn = [&](const LayoutIterator::State &state) { - SmallVector parallelSimtIndices = state.computeSIMTIndex(); - Value mEmpty = rewriter.create( - loc, getDistributed(rewriter, acc, resultLayout), - parallelSimtIndices); - - // Store one or more elements in packed vector depending on type. - int64_t entriesPerVector = maxBitsPerShuffle / bitWidth; - Value packedVector = rewriter.create( - loc, rewriter.getZeroAttr( - VectorType::get({entriesPerVector}, elementType))); - - int64_t index{0}; - Value result, mask; - // Thread-local reduction. - auto reduceLocalFn = [&](const LayoutIterator::State &state) { - SmallVector indices = state.computeSIMTIndex(); - Value element = rewriter.create( - loc, getDistributed(rewriter, source, sourceLayout), indices); - packedVector = rewriter.create( - loc, element, packedVector, SmallVector{index}); - index = (index + 1) % entriesPerVector; - // Reduce packed vector when full. - if (index == 0) { - result = result - ? makeArithReduction(rewriter, loc, combiningKind, - result, packedVector, nullptr, mask) - : packedVector; - } - }; - - LayoutIterator reductionIterator(sourceLayout, reductionDim); - reductionIterator.maybeFreezeAndConcatenate(state); - reductionIterator.apply(reduceLocalFn); - - // Thread-global reduction. - result = doThreadGlobalReduction(result, shuffleOffset, laneSize, - combiningKind, entriesPerVector, mEmpty, - rewriter, loc); - storeVec = rewriter.create(loc, result, storeVec, - parallelSimtIndices); - }; - - LayoutIterator parallelIterator(sourceLayout, parallelDim); - parallelIterator.apply(reduceFn); - replaceOpWithDistributedValues(rewriter, reductionOp, storeVec); - - return success(); - } - -private: - int64_t maxBitsPerShuffle; -}; - struct DistributeScfFor final : OpDistributionPattern { using OpDistributionPattern::OpDistributionPattern; @@ -625,402 +234,6 @@ struct DistributeScfFor final : OpDistributionPattern { } }; -struct DistributeTransposeLayoutAttr final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(vector::TransposeOp transposeOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - VectorValue value = transposeOp.getVector(); - VectorLayoutInterface layout = dyn_cast(signature[value]); - if (!layout) { - return rewriter.notifyMatchFailure(transposeOp, - "layout must be LayoutAttr"); - } - - /// Transpose only changes the notion of where the data carried by each - /// thread comes from in the SIMD vector. The data carried by each thread is - /// still the same, just iterated in a new permuted order. This iteration - /// information is carried by the layout. So, we can simply distribute - /// transpose to a no-op. 
Example: - /// - /// input: vector<2x4xf16> - /// - /// 0 0 1 1 - /// 2 2 3 3 - /// - /// after transpose, - /// - /// transp: vector<4x2xf16> - /// - /// 0 2 - /// 0 2 - /// 1 3 - /// 1 3 - /// - /// As it can be seen, each thread is still carrying the same data and - /// distributes to vector<2xf16>. - /// - /// The only difference is where this vector<2xf16> comes from and that - /// before transpose, this vector<2xf16> was representing the fastest - /// changing dimension, but after distribution it's not. - replaceOpWithDistributedValues(rewriter, transposeOp, - getDistributed(rewriter, value, layout)); - return success(); - } -}; - -struct DistributeBroadcastLayoutAttr final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(vector::BroadcastOp broadcastOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - - VectorValue source = dyn_cast(broadcastOp.getSource()); - if (!source) { - // TODO: Add support for scalar broadcasting. - return failure(); - } - - VectorValue vector = broadcastOp.getVector(); - LayoutAttr layout = dyn_cast(signature[vector]); - if (!layout) { - return failure(); - } - - VectorLayoutInterface sourceLayout = signature[source]; - - // We currently only support 1-D to 2-D broadcasting. - if (source.getType().getRank() != 1 || vector.getType().getRank() != 2) { - return failure(); - } - - int broadcastedDim = 0; - int parallelDim = 1; - - Type elementType = - llvm::cast(vector.getType()).getElementType(); - auto vectorType = - VectorType::get(layout.getDistributedShape(), elementType); - Location loc = broadcastOp.getLoc(); - Value accumulator = rewriter.create( - loc, vectorType, rewriter.getZeroAttr(vectorType)); - - // Iterate over the parallel dimension.; - LayoutIterator parallelIterator(layout, parallelDim); - parallelIterator.apply([&](const LayoutIterator::State ¶llelState) { - // Extract the value from source. - SmallVector sourceIndices = parallelState.computeSIMTIndex(); - Value value = rewriter.create( - loc, getDistributed(rewriter, source, sourceLayout), sourceIndices); - - // Broadcast value over the broadcasted dimension. - LayoutIterator broadcastIterator(layout, broadcastedDim); - broadcastIterator.maybeFreezeAndConcatenate(parallelState); - broadcastIterator.apply([&](const LayoutIterator::State &broadcastState) { - SmallVector resultIndices = broadcastState.computeSIMTIndex(); - - accumulator = rewriter.create(loc, value, accumulator, - resultIndices); - }); - }); - - replaceOpWithDistributedValues(rewriter, broadcastOp, accumulator); - return success(); - } -}; - -/// This pattern implements a distribution pattern for layout conflict -/// resolutions where the resolution is a simple vector reshape. -/// In most cases, layout conflicts will need to be resolved with a -/// trip to shared memory or shuffle instructions and in those scenarios -/// this pattern will not work. -/// -/// Below we outline some scenarios where this pattern will be useful: -/// - Unary Operators which are permutation invariant -/// Example: -/// Say the data for a single row is distributed among 2 threads as -/// 0 0 0 0 1 1 1 1 -/// and we have a layout conflict that requires the data to be -/// distributed as -/// 0 0 1 1 0 0 1 1 -/// and we are interested in computing an elementwise operation like exp -/// or trying to do a reduction along the row, then since the operations -/// are permutation invariant, we can treat the resolution as a vector -/// reshape. 
-/// - Binary Operators which are permutation invariant -/// Example: -/// Using the same example as above, say we are trying to do a dot product -/// between two vectors that have the above layout. As long as both -/// operands are permuted the same way, we will end up with the correct -/// sequence of multiplications and additions. -/// -struct DistributeLayoutConflictResolutions final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - VectorValue reshapeVector(Location loc, RewriterBase &rewriter, - VectorValue src, LayoutAttr ¤tLayout, - LayoutAttr &targetLayout, Type elementType) const { - - SmallVector targetShape = targetLayout.getDistributedShape(); - SmallVector currentShape = currentLayout.getDistributedShape(); - - auto newVectorType = VectorType::get(targetShape, elementType); - auto constantOp = rewriter.create( - loc, newVectorType, rewriter.getZeroAttr(newVectorType)); - auto newVector = dyn_cast(constantOp.getResult()); - - int64_t innermostDim = targetShape.size() - 1; - int64_t step = - std::min(targetShape[innermostDim], currentShape[innermostDim]); - DenseMap steps; - LayoutDimension vecDim = LayoutDimension::VECTORX; - steps[vecDim] = step; - LayoutIterator srcIterator(currentLayout, steps); - LayoutIterator targetIterator(targetLayout, steps); - - for (; !srcIterator.iterationComplete() && - !targetIterator.iterationComplete(); - ++srcIterator, ++targetIterator) { - SmallVector srcOffset = - srcIterator.getState().computeSIMTIndex(); - SmallVector targetOffset = - targetIterator.getState().computeSIMTIndex(); - SmallVector sliceSize(srcOffset.size(), 1); - sliceSize[sliceSize.size() - 1] = step; - SmallVector sliceStride(srcOffset.size(), 1); - Value slice = rewriter.create( - loc, src, srcOffset, sliceSize, sliceStride); - newVector = rewriter.create( - loc, slice, newVector, targetOffset, sliceStride); - } - return newVector; - } - - LogicalResult matchAndRewrite(IREE::VectorExt::ToLayoutOp resolutionOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - auto vector = cast(resolutionOp.getInput()); - auto result = cast(resolutionOp.getOutput()); - LayoutAttr currentLayout = dyn_cast(signature[vector]); - if (!currentLayout) - return failure(); - LayoutAttr targetLayout = dyn_cast(signature[result]); - if (!targetLayout) - return failure(); - - if (currentLayout == targetLayout) { - return rewriter.notifyMatchFailure( - resolutionOp, "Layout conversion is not a conflict."); - } - - SmallVector currentVecShape = currentLayout.getDistributedShape(); - SmallVector targetVecShape = targetLayout.getDistributedShape(); - if (currentVecShape.size() != targetVecShape.size()) - return failure(); - - auto numElements = [](ArrayRef vector) { - return std::accumulate(vector.begin(), vector.end(), 1, - std::multiplies()); - }; - if (numElements(currentVecShape) != numElements(targetVecShape)) - return failure(); - - if (currentLayout.hasLaneConflictWith(targetLayout)) { - return failure(); - } - - Type elementType = - llvm::cast(result.getType()).getElementType(); - Value newVector = - reshapeVector(resolutionOp.getLoc(), rewriter, - getDistributed(rewriter, vector, targetLayout), - currentLayout, targetLayout, elementType); - replaceOpWithDistributedValues(rewriter, resolutionOp, newVector); - return success(); - } -}; - -/// Pattern that allows us to write to shared memory -/// and read back to register with correct layouts. -/// especially used when we don't have an optimized way -/// to resolve the conflict. 
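[Note] For contrast with the shared-memory fallback below, the reshape-based resolution above only applies when both layouts distribute to the same per-thread element count and do not disagree on lane ownership. A minimal sketch of that eligibility test, mirroring the checks in the pattern above (plain C++, not the IREE API):

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Returns true when a layout conflict can be resolved as a pure per-thread
// vector reshape; otherwise a roundtrip through shared memory is needed.
bool canResolveByReshape(const std::vector<int64_t> &currentShape,
                         const std::vector<int64_t> &targetShape,
                         bool hasLaneConflict) {
  auto numElements = [](const std::vector<int64_t> &shape) {
    return std::accumulate(shape.begin(), shape.end(), int64_t(1),
                           std::multiplies<int64_t>());
  };
  return currentShape.size() == targetShape.size() &&
         numElements(currentShape) == numElements(targetShape) &&
         !hasLaneConflict;
}
```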
-struct DistributeLayoutConflictToSharedMemory final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(IREE::VectorExt::ToLayoutOp resolutionOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - auto loc = resolutionOp.getLoc(); - auto vector = cast(resolutionOp.getInput()); - auto result = cast(resolutionOp.getOutput()); - LayoutAttr currentLayout = dyn_cast(signature[vector]); - if (!currentLayout) { - return rewriter.notifyMatchFailure(resolutionOp, - "Source layout must be LayoutAttr."); - } - LayoutAttr targetLayout = dyn_cast(signature[result]); - if (!targetLayout) { - return rewriter.notifyMatchFailure(resolutionOp, - "Target layout must be LayoutAttr."); - } - - if (currentLayout == targetLayout) { - return rewriter.notifyMatchFailure( - resolutionOp, "Layout conversion is not a conflict."); - } - - SmallVector currentVecShape = currentLayout.getDistributedShape(); - SmallVector targetVecShape = targetLayout.getDistributedShape(); - if (currentVecShape.size() != targetVecShape.size()) { - return rewriter.notifyMatchFailure( - resolutionOp, - "Target's and source's distributed rank needs to match."); - } - - auto numElements = [](ArrayRef vector) { - return std::accumulate(vector.begin(), vector.end(), 1, - std::multiplies()); - }; - - if (numElements(currentVecShape) == numElements(targetVecShape) && - !currentLayout.hasLaneConflictWith(targetLayout)) { - // If the conditions suffice, we can skip the trip to shared memory - // and just use the default/more efficient layout conflict resolution - // distribution. - return rewriter.notifyMatchFailure(resolutionOp, - "Failing because condition suffice to " - "use better conflict resolutions."); - } - - // Compute Subgroup and Workgroup related information and offsets. - auto funcOp = resolutionOp->getParentOfType(); - if (!funcOp) { - return rewriter.notifyMatchFailure( - resolutionOp, "Expects a parent of type funcOp S.T we can compute " - "subgroup and workgroup related information."); - } - std::optional> workgroupSize = - getWorkgroupSize(funcOp); - std::optional subgroupSize = getSubgroupSize(funcOp); - if (!workgroupSize.has_value() || !subgroupSize.has_value()) { - return rewriter.notifyMatchFailure( - resolutionOp, "Expects workgroup/subgroup information to be " - "available to resolve conflict."); - } - int64_t flatThreadSize = ShapedType::getNumElements(workgroupSize.value()); - if (flatThreadSize % subgroupSize.value() != 0) - return failure(); - int64_t numSubgroups = flatThreadSize / subgroupSize.value(); - - // Define shapes and types needed to be roundtripped to shared-memory. - // The allocated shared-memory will stack subgroup data - // on fastest dimension. Hence, shape will be: - // [dim0, dim1, ..., subgroupCount * dimN] - - auto resolutionType = - llvm::dyn_cast_or_null(resolutionOp.getResult().getType()); - if (!resolutionType) { - return rewriter.notifyMatchFailure( - resolutionOp, - "Expects resolutionOp result to be of type vectorType."); - } - if (!resolutionType.hasStaticShape()) { - return rewriter.notifyMatchFailure( - resolutionOp, "Expects resolutionOp result to have static shape."); - } - auto paddedShape = SmallVector(resolutionType.getShape()); - int64_t vectorRank = resolutionType.getRank(); - paddedShape[vectorRank - 1] *= numSubgroups; - - // Offset and indexing computation such that subgroups can - // write and read to shared memory correctly and without conflicts. 
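[Note] The offsets computed in the code that follows amount to linearizing the 3-D thread id and then giving each subgroup its own slice of the padded fastest dimension so subgroups never overwrite each other. A standalone sketch of the same arithmetic (helper names are illustrative, not the IREE API):

```cpp
#include <cstdint>

struct SubgroupSlot {
  int64_t flatThreadId;
  int64_t subgroupOffset;
};

// flatThreadId = tidX + wgX * tidY + wgX * wgY * tidZ, and each subgroup
// starts writing at (flatThreadId / subgroupSize) * innerDimSize, matching the
// two affine.apply ops the pattern emitted.
SubgroupSlot computeSlot(int64_t tidX, int64_t tidY, int64_t tidZ,
                         int64_t wgSizeX, int64_t wgSizeY,
                         int64_t subgroupSize, int64_t innerDimSize) {
  int64_t flatThreadId = tidX + wgSizeX * tidY + wgSizeX * wgSizeY * tidZ;
  int64_t subgroupOffset = (flatThreadId / subgroupSize) * innerDimSize;
  return {flatThreadId, subgroupOffset};
}
```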
- AffineExpr d0, d1, d2, s0; - bindDims(rewriter.getContext(), d0, d1, d2); - bindSymbols(rewriter.getContext(), s0); - auto indexType = rewriter.getIndexType(); - Value threadX = - rewriter.create(loc, indexType, gpu::Dimension::x); - Value threadY = - rewriter.create(loc, indexType, gpu::Dimension::y); - Value threadZ = - rewriter.create(loc, indexType, gpu::Dimension::z); - Value flatThreadId = affine::makeComposedAffineApply( - rewriter, loc, - (d0 + workgroupSize.value()[0] * d1 + - (workgroupSize.value()[0] * workgroupSize.value()[1]) * d2), - {threadX, threadY, threadZ}); - Value subgroupOffset = affine::makeComposedAffineApply( - rewriter, loc, - s0.floorDiv(subgroupSize.value()) * - resolutionType.getShape()[vectorRank - 1], - {flatThreadId}); - - // Create shared memory to store the intermediate from src layout. - auto workgroupMemoryAddressSpace = Attribute(gpu::AddressSpaceAttr::get( - rewriter.getContext(), gpu::AddressSpace::Workgroup)); - MemRefType allocType = - MemRefType::get(paddedShape, resolutionType.getElementType(), - AffineMap(), workgroupMemoryAddressSpace); - auto alloc = rewriter.create(loc, allocType); - - SmallVector offsets(vectorRank, rewriter.getIndexAttr(0)); - SmallVector strides(vectorRank, rewriter.getIndexAttr(1)); - SmallVector shapes = llvm::to_vector( - llvm::map_range(resolutionType.getShape(), [&](int64_t dim) { - return OpFoldResult(rewriter.getIndexAttr(dim)); - })); - offsets[vectorRank - 1] = subgroupOffset; - auto subview = rewriter.create(loc, alloc, offsets, - shapes, strides); - - // Creating write/trip to shared memory using src layout. - Value c0 = rewriter.create(loc, 0); - SmallVector indices(resolutionType.getRank(), c0); - SmallVector inBounds(vectorRank, true); - auto write = rewriter.create(loc, vector, subview, - indices, inBounds); - // Insert gpu.barrier - rewriter.create(write.getLoc()); - - // Creating read from shared memory using dst layout. - // Read with offset starting from the warpIdx * OG fastest dim. - indices[vectorRank - 1] = subgroupOffset; - auto read = rewriter.create(loc, resolutionType, - alloc, indices); - - // Set layouts signature for write. - // We need to set the layout on the srcVector/first operand. - auto unitAttr = UnitAttr::get(rewriter.getContext()); - auto writeAttrs = SmallVector(write->getNumOperands(), unitAttr); - writeAttrs[0] = - currentLayout; // 1st operand is src which requires currentLayout. - ArrayAttr writeOperandsAttr = - ArrayAttr::get(rewriter.getContext(), writeAttrs); - ArrayAttr writeResultsAttr = ArrayAttr::get(rewriter.getContext(), {}); - setSignatureForRedistribution(rewriter, write.getOperation(), - writeOperandsAttr, writeResultsAttr); - - // Set layouts signature for read. - // We only need to set the layout on output. 
- ArrayAttr readOperandsAttr = ArrayAttr::get( - rewriter.getContext(), - SmallVector(read->getNumOperands(), unitAttr)); - ArrayAttr readResultsAttr = - ArrayAttr::get(rewriter.getContext(), {targetLayout}); - setSignatureForRedistribution(rewriter, read.getOperation(), - readOperandsAttr, readResultsAttr); - - rewriter.replaceOp(resolutionOp, read.getResult()); - return success(); - } -}; - struct DistributeTrivialLayoutConversions final : OpDistributionPattern { using OpDistributionPattern::OpDistributionPattern; @@ -1102,11 +315,6 @@ struct DistributeGather final : OpDistributionPattern { } // namespace -void populateGPUReductionDistributionPatterns(RewritePatternSet &patterns, - int64_t maxBitsPerShuffle) { - patterns.add(patterns.getContext(), maxBitsPerShuffle); -} - void populateGPUDistributionPatterns(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); // Elementwise patterns. @@ -1116,20 +324,4 @@ void populateGPUDistributionPatterns(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); } -void populateGPUDistributionLayoutAttrPatterns(Value laneId, - RewritePatternSet &patterns) { - patterns - .add( - patterns.getContext(), laneId); - patterns.add( - patterns.getContext()); -} - -// TODO: Need a new op/analysis to determine when this pattern is safe to use. -void populateGPULayoutResolutionDistributionPatterns( - RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} - }; // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h index 87303844853f..9e81014a4087 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h @@ -31,12 +31,6 @@ void populateDropSharedMemoryDeallocOpPatterns(RewritePatternSet &patterns); void populateGPUDistributionPatterns(RewritePatternSet &patterns); -void populateGPUDistributionLayoutAttrPatterns(Value laneId, - RewritePatternSet &patterns); - -void populateGPUReductionDistributionPatterns(RewritePatternSet &patterns, - int64_t maxBitsPerShuffle = 32); - void populateGPUDistributeNestedLayoutAttrPatterns( RewritePatternSet &patterns, Value threadId, int64_t subgroupSize, int64_t maxBitsPerShuffle = 32); @@ -46,9 +40,6 @@ void populateGPUDistributeNestedLayoutAttrPatterns( void populateGPUDistributeNestedLayoutContractAMDGPUPatterns( RewritePatternSet &patterns); -void populateGPULayoutResolutionDistributionPatterns( - RewritePatternSet &patterns); - } // namespace mlir::iree_compiler #endif // IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index c392eb783581..a503664ecef4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -1,46 +1,5 @@ // RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --canonicalize --cse %s | FileCheck %s -#layout = #iree_vector_ext.layout<<[VECTORY, LANEY], [4, 4]>, <[VECTORX, LANEX], [4, 4]>> - -// CHECK-LABEL: @distribute_elementwise_f16 -func.func @distribute_elementwise_f16(%a: vector<16x16xf16>, %b: vector<16x16xf16>, %denom: vector<16x16xf16>) -> vector<16x16xi1> { - %c0 = arith.constant 0 : index - %cst_0 = arith.constant 0.0 : f16 - // CHECK: %[[ROOT:.*]] = 
arith.constant dense<0.000000e+00> : vector<16xf16> - %root = arith.constant dense<0.0> : vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> - // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<16xf16> - // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] {{.*}} : vector<16xf16> - %c = arith.mulf %rootl, %b : vector<16x16xf16> - // CHECK-DAG: %[[DENOM:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<16xf16> - // CHECK-DAG: %[[DIVD:.*]] = arith.divf %[[C]], %[[DENOM]] {{.*}} : vector<16xf16> - %divd = arith.divf %c, %denom : vector<16x16xf16> - // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<16xf16> - // CHECK-DAG: %[[D:.*]] = arith.addf %[[DIVD]], %[[A]] fastmath {{.*}} : vector<16xf16> - %d = arith.addf %divd, %a fastmath : vector<16x16xf16> - // CHECK-DAG: %[[R:.*]] = arith.cmpf ult, %[[D]], %[[ROOT]] {{.*}} : vector<16xf16> - %r = arith.cmpf ult, %d, %root : vector<16x16xf16> - // CHECK: iree_vector_ext.to_simd %[[R]] : vector<16xi1> -> vector<16x16xi1> - return %r : vector<16x16xi1> -} - -// CHECK-LABEL: @distribute_elementwise_i32 -func.func @distribute_elementwise_i32(%a: vector<16x16xi32>, %b: vector<16x16xi32>) -> vector<16x16xi32> { - %c0 = arith.constant 0 : index - %cst_0 = arith.constant 0 : i32 - // CHECK: %[[ROOT:.*]] = arith.constant dense<2> : vector<16xi32> - %root = arith.constant dense<2> : vector<16x16xi32> - %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xi32> - // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[C:.*]] = arith.muli %[[B]], %[[ROOT]] {{.*}} : vector<16xi32> - %c = arith.muli %rootl, %b : vector<16x16xi32> - // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<16xi32> - %d = arith.addi %c, %a : vector<16x16xi32> - // CHECK: iree_vector_ext.to_simd %[[D]] : vector<16xi32> -> vector<16x16xi32> - return %d : vector<16x16xi32> -} - #nested = #iree_vector_ext.nested_layout< subgroup_tile = [2, 1, 1], batch_tile = [8, 2, 4], @@ -69,27 +28,35 @@ func.func @distribute_elementwise_nested_layout_f16(%a: vector<128x128x128xf16>, return %d : vector<128x128x128xf16> } +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [1, 1], + thread_strides = [1, 1] +> + // CHECK-LABEL: @distribute_scf_for func.func @distribute_scf_for(%a: vector<16x16xi32>, %b: vector<16x16xi32>) -> vector<16x16xi32> { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index %cst_0 = arith.constant 0 : i32 - // CHECK: %[[ROOT:.*]] = arith.constant dense<0> : vector<16xi32> + // CHECK: %[[ROOT:.*]] = arith.constant dense<0> : vector<1x1x1x1x16x16xi32> %root = arith.constant dense<0> : vector<16x16xi32> %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xi32> - // CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<16xi32>) + // CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<1x1x1x1x16x16xi32>) %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %rootl) -> (vector<16x16xi32>) { - // These should be ideally folded if canonicalization was ever ran. - // Canonicalization currently breaks other tests. 
If canonicalization - // is ever ran, this should be updated. - // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector<16xi32> + // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32> + // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector<1x1x1x1x16x16xi32> %c = arith.muli %arg0, %b : vector<16x16xi32> - // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<16xi32> + // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32> + // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<1x1x1x1x16x16xi32> %d = arith.addi %c, %a : vector<16x16xi32> - // CHECK: scf.yield %[[D]] : vector<16xi32> + // CHECK: scf.yield %[[D]] : vector<1x1x1x1x16x16xi32> scf.yield %d : vector<16x16xi32> } return %out : vector<16x16xi32> @@ -102,632 +69,3 @@ builtin.module attributes { transform.with_named_sequence } { transform.yield } } - -// ----- - -#layout_row_major = #iree_vector_ext.layout<<[BATCHX, LANEY], [2, 8]>, <[BATCHY, LANEX, VECTORX], [2, 1, 8]>> -#layout_col_major = #iree_vector_ext.layout<<[BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[BATCHY, LANEX], [2, 8]>> - -// CHECK-LABEL: @distribute_transfer_read_row_major -func.func @distribute_transfer_read_row_major(%alloc: memref<4x4xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0], %cst - {in_bounds = [false, false]} - : memref<4x4xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - // CHECK-COUNT-4: vector.load {{.*}}, vector<8xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_col_major -func.func @distribute_transfer_read_col_major(%alloc: memref<32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0], %cst - {in_bounds = [true, true]} - : memref<32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - // CHECK-COUNT-8: vector.load {{.*}}, vector<1xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_row_major_with_broadcast -func.func @distribute_transfer_read_row_major_with_broadcast(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - // CHECK-COUNT-4: vector.load {{.*}}, vector<8xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_col_major_with_broadcast -func.func @distribute_transfer_read_col_major_with_broadcast(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, 
d2, d3) -> (d2, d3)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - // CHECK-COUNT-8: vector.load {{.*}}, vector<1xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_row_major_transpose -func.func @distribute_transfer_read_row_major_transpose(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - // CHECK-COUNT-32: vector.load {{.*}}, vector<1xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_col_major_transpose -func.func @distribute_transfer_read_col_major_transpose(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - // CHECK-COUNT-2: vector.load {{.*}}, vector<4xf16> - func.return %rootl : vector<16x16xf16> -} - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -#layout_row_major = #iree_vector_ext.layout<<[BATCHX, LANEY], [2, 8]>, <[BATCHY, LANEX, VECTORX], [2, 1, 8]>> -#layout_col_major = #iree_vector_ext.layout<<[BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[BATCHY, LANEX], [2, 8]>> - -// TODO: Use affine min tricks based on the grid size to elide the mod. -// Note that this IR is invalid if subgroup size != 8. 
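[Note] The affine maps checked in the next test encode that, under #layout_row_major with [BATCHX, LANEY] = [2, 8], lane L writes the rows (L mod 8) and (L mod 8) + 8, one per BATCHX slice. A tiny illustrative sketch of that mapping (not part of the test):

```cpp
#include <cstdint>
#include <utility>

// Rows of the 16x16 vector written by a given lane: batch 0 writes row
// (laneId mod 8), batch 1 writes the same row offset by 8, matching the
// `s0 mod 8` and `s0 mod 8 + 8` affine maps in the CHECK lines below.
std::pair<int64_t, int64_t> rowsWrittenByLane(int64_t laneId) {
  int64_t base = laneId % 8;
  return {base, base + 8};
}
```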
- -func.func @distribute_transfer_write_row_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0] - {in_bounds = [true, true]} - : vector<16x16xf16>, memref<64x64xf16> - func.return -} -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 mod 8)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)> - -// CHECK-LABEL: @distribute_transfer_write_row_major -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[LANEID:.+]] = gpu.thread_id x -// CHECK: %[[VEC_LANE_Y:.+]] = affine.apply #[[$MAP0]]()[%[[LANEID]]] -// CHECK: %[[DIST_SRC_VEC:.+]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<2x2x8xf16> -// CHECK: %[[BATCH_0_0:.+]] = vector.extract %[[DIST_SRC_VEC]][0, 0] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_0_0]], %{{.*}}[%[[VEC_LANE_Y]], %[[C0]]] : memref<64x64xf16>, vector<8xf16> - -// CHECK: %[[NEXT_VEC_LANE_Y:.+]] = affine.apply #[[$MAP1]]()[%[[LANEID]]] -// CHECK: %[[BATCH_1_0:.+]] = vector.extract %[[DIST_SRC_VEC]][1, 0] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_1_0]], %{{.*}}[%[[NEXT_VEC_LANE_Y]], %[[C0]]] : memref<64x64xf16>, vector<8xf16> - -// CHECK: %[[BATCH_0_1:.+]] = vector.extract %[[DIST_SRC_VEC]][0, 1] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_0_1]], %{{.*}}[%[[VEC_LANE_Y]], %[[C8]]] : memref<64x64xf16>, vector<8xf16> - -// CHECK: %[[BATCH_1_1:.+]] = vector.extract %[[DIST_SRC_VEC]][1, 1] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_1_1]], %{{.*}}[%[[NEXT_VEC_LANE_Y]], %[[C8]]] : memref<64x64xf16>, vector<8xf16> - -func.func @distribute_transfer_write_col_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0] - {in_bounds = [true, true]} - : vector<16x16xf16>, memref<64x64xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_col_major -// CHECK-COUNT-8: vector.store {{.*}}, vector<1xf16> - -func.func @distribute_transfer_write_row_major_with_broadcast(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_row_major_with_broadcast -// CHECK-COUNT-4: vector.store {{.*}}, vector<8xf16> - -func.func @distribute_transfer_write_col_major_with_broadcast(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_col_major_with_broadcast -// CHECK-COUNT-8: vector.store {{.*}}, vector<1xf16> - -func.func 
@distribute_transfer_write_row_major_transpose(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_row_major_transpose -// CHECK-COUNT-32: vector.store {{.*}}, vector<1xf16> - -func.func @distribute_transfer_write_col_major_transpose(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_col_major_transpose -// CHECK-COUNT-2: vector.store {{.*}}, vector<4xf16> - - -func.func @distribute_transfer_write_with_non_contiguous_broadcast(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %a, %c0, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d3)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: func.func @distribute_transfer_write_with_non_contiguous_broadcast -// CHECK-SAME: %[[ROOT:.+]]: vector<16x16xf16>, %[[A:.+]]: index, %[[B:.+]]: index, %[[ALLOC:.+]]: memref<32x32x32x32xf16>) -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK-COUNT-4: vector.store %{{.+}}, %[[ALLOC]][%[[C0]], {{.+}}, %[[C0]], %{{.+}}] : memref<32x32x32x32xf16>, vector<8xf16> - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 4, 4]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]> -#layout2d = #iree_vector_ext.layout<#row_layout, #col_layout> -#layout1d = #iree_vector_ext.layout<#col_layout> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}> -#translation_info = #iree_codegen.translation_info -module { - func.func @distribute_reduction_f16(%source: vector<16x16xf16>, %init: vector<16xf16>) -> vector<16xf16> - attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} { - %sourcel = iree_vector_ext.to_layout %source to layout(#layout2d) : vector<16x16xf16> - %result = vector.multi_reduction , %sourcel, %init [0] - : vector<16x16xf16> to vector<16xf16> - func.return %result : vector<16xf16> - } -} -// CHECK: func.func @distribute_reduction_f16(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf16>, %[[ARG1:[a-zA-Z0-9_]+]]: vector<16xf16>) -> vector<16xf16> -// CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32 -// 
CHECK-DAG: %[[C64_I32:.+]] = arith.constant 64 : i32 -// CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32 -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2xf16> -// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<0.000000e+00> : vector<1xf16> -// CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf16> -> vector<1xf16> -// CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f16 from vector<1xf16> -// CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x4xf16> -// CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f16 from vector<1x1x4xf16> -// CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f16 into vector<2xf16> -// CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f16 from vector<1x1x4xf16> -// CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [1] : f16 into vector<2xf16> -// CHECK: %[[D7:.+]] = vector.extract %[[D2]][0, 0, 2] : f16 from vector<1x1x4xf16> -// CHECK: %[[D8:.+]] = vector.insert %[[D7]], %[[D6]] [0] : f16 into vector<2xf16> -// CHECK: %[[D9:.+]] = vector.extract %[[D2]][0, 0, 3] : f16 from vector<1x1x4xf16> -// CHECK: %[[D10:.+]] = vector.insert %[[D9]], %[[D8]] [1] : f16 into vector<2xf16> -// CHECK: %[[D11:.+]] = arith.maximumf %[[D6]], %[[D10]] : vector<2xf16> -// CHECK: %[[D12:.+]] = vector.bitcast %[[D11]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D13:.+]] = vector.extract %[[D12]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D13]], %[[C16_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D14:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32> -// CHECK: %[[D15:.+]] = vector.bitcast %[[D14]] : vector<1xi32> to vector<2xf16> -// CHECK: %[[D16:.+]] = arith.maximumf %[[D15]], %[[D11]] : vector<2xf16> -// CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D18:.+]] = vector.extract %[[D17]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT_1:.+]], %[[VALID_2:.+]] = gpu.shuffle xor %[[D18]], %[[C32_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D19:.+]] = vector.broadcast %[[SHUFFLERESULT_1]] : i32 to vector<1xi32> -// CHECK: %[[D20:.+]] = vector.bitcast %[[D19]] : vector<1xi32> to vector<2xf16> -// CHECK: %[[D21:.+]] = arith.maximumf %[[D20]], %[[D16]] : vector<2xf16> -// CHECK: %[[D22:.+]] = vector.extract %[[D21]][0] : f16 from vector<2xf16> -// CHECK: %[[D23:.+]] = vector.extract %[[D21]][1] : f16 from vector<2xf16> -// CHECK: %[[D24:.+]] = arith.maximumf %[[D22]], %[[D23]] : f16 -// CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f16 -// CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST_0]] [0] : f16 into vector<1xf16> -// CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf16> -> vector<16xf16> - -#executable_target_rocm_hsaco_fb2 = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}> -module { - func.func @distribute_reduction_f32(%source: vector<16x16xf32>, %init: vector<16xf32>) -> vector<16xf32> - attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} { - %sourcel = iree_vector_ext.to_layout %source to layout(#layout2d) : vector<16x16xf32> - %result = vector.multi_reduction , %sourcel, %init [0] - : vector<16x16xf32> to vector<16xf32> - func.return %result : vector<16xf32> - } -} -// CHECK: func.func @distribute_reduction_f32(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf32>, %[[ARG1:[a-zA-Z0-9_]+]]: vector<16xf32>) -> vector<16xf32> -// CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32 -// CHECK-DAG: 
%[[C64_I32:.+]] = arith.constant 64 : i32 -// CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32 -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1xf32> -// CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf32> -> vector<1xf32> -// CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f32 from vector<1xf32> -// CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf32> -> vector<1x1x4xf32> -// CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f32 from vector<1x1x4xf32> -// CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f32 into vector<1xf32> -// CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f32 from vector<1x1x4xf32> -// CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [0] : f32 into vector<1xf32> -// CHECK: %[[D7:.+]] = arith.maximumf %[[D4]], %[[D6]] : vector<1xf32> -// CHECK: %[[D8:.+]] = vector.extract %[[D2]][0, 0, 2] : f32 from vector<1x1x4xf32> -// CHECK: %[[D9:.+]] = vector.insert %[[D8]], %[[D6]] [0] : f32 into vector<1xf32> -// CHECK: %[[D10:.+]] = arith.maximumf %[[D7]], %[[D9]] : vector<1xf32> -// CHECK: %[[D11:.+]] = vector.extract %[[D2]][0, 0, 3] : f32 from vector<1x1x4xf32> -// CHECK: %[[D12:.+]] = vector.insert %[[D11]], %[[D9]] [0] : f32 into vector<1xf32> -// CHECK: %[[D13:.+]] = arith.maximumf %[[D10]], %[[D12]] : vector<1xf32> -// CHECK: %[[D14:.+]] = vector.bitcast %[[D13]] : vector<1xf32> to vector<1xi32> -// CHECK: %[[D15:.+]] = vector.extract %[[D14]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D15]], %[[C16_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D16:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32> -// CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<1xi32> to vector<1xf32> -// CHECK: %[[D18:.+]] = arith.maximumf %[[D17]], %[[D13]] : vector<1xf32> -// CHECK: %[[D19:.+]] = vector.bitcast %[[D18]] : vector<1xf32> to vector<1xi32> -// CHECK: %[[D20:.+]] = vector.extract %[[D19]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT_0:.+]], %[[VALID_1:.+]] = gpu.shuffle xor %[[D20]], %[[C32_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D21:.+]] = vector.broadcast %[[SHUFFLERESULT_0]] : i32 to vector<1xi32> -// CHECK: %[[D22:.+]] = vector.bitcast %[[D21]] : vector<1xi32> to vector<1xf32> -// CHECK: %[[D23:.+]] = arith.maximumf %[[D22]], %[[D18]] : vector<1xf32> -// CHECK: %[[D24:.+]] = vector.extract %[[D23]][0] : f32 from vector<1xf32> -// CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f32 -// CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST]] [0] : f32 into vector<1xf32> -// CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf32> -> vector<16xf32> - -#transpose_test_layout = #iree_vector_ext.layout<<[LANEY], [32]>, <[LANEX, VECTORX], [4, 4]>> -func.func @distribute_transpose(%mem: memref<32x32xf16>, %mem1: memref<32x32xf16>) -> vector<32x16xf16> { - // CHECK: func.func @distribute_transpose(%[[MEM:.*]]: memref<32x32xf16>, %[[MEM1:.*]]: memref<32x32xf16> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - // CHECK-COUNT-1: vector.load %[[MEM]] - // CHECK-COUNT-4: vector.load %[[MEM1]] - %a = vector.transfer_read %mem[%c0, %c0], %cst : memref<32x32xf16>, vector<32x16xf16> - %b = vector.transfer_read %mem1[%c0, %c0], %cst : memref<32x32xf16>, vector<16x32xf16> - // CHECK-NOT: vector.transpose - %b_t = vector.transpose %b, [1, 0] : vector<16x32xf16> to vector<32x16xf16> - // CHECK: %[[ADD:.*]] = arith.addf %{{.*}}, %{{.*}} : vector<4xf16> - %c = arith.addf %a, %b_t : 
vector<32x16xf16> - %cl = iree_vector_ext.to_layout %c to layout(#transpose_test_layout) : vector<32x16xf16> - // CHECK: iree_vector_ext.to_simd %[[ADD]] : vector<4xf16> -> vector<32x16xf16> - func.return %cl : vector<32x16xf16> -} - -#row_broadcast_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [2, 16]> -#col_broadcast_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [2, 4, 4]> -#layout_broadcast_1d = #iree_vector_ext.layout<#row_broadcast_layout> -#layout_broadcast_2d = #iree_vector_ext.layout<#row_broadcast_layout, #col_broadcast_layout> -#layout_broadcast_1d_t = #iree_vector_ext.layout<#col_broadcast_layout> -#layout_broadcast_2d_t = #iree_vector_ext.layout<#col_broadcast_layout, #row_broadcast_layout> - -func.func @distribute_broadcast_row_col(%source: vector<32xf32>) -> vector<32x32xf32> { - %result = vector.broadcast %source : vector<32xf32> to vector<32x32xf32> - %resultl = iree_vector_ext.to_layout %result to layout(#layout_broadcast_2d) : vector<32x32xf32> - // CHECK-DAG: %[[S00:.*]] = vector.extract %[[SOURCE:.*]][0, 0] - // CHECK-DAG: vector.insert %[[S00]], %{{.*}} [0, 0, 0] - // CHECK-DAG: vector.insert %[[S00]], %{{.*}} [1, 0, 0] - // CHECK-DAG: %[[S01:.*]] = vector.extract %[[ACC:.*]][0, 1] - // CHECK-DAG: vector.insert %[[S01]], %{{.*}} [0, 0, 1] - // CHECK-DAG: vector.insert %[[S01]], %{{.*}} [1, 0, 1] - // CHECK-DAG: %[[S02:.*]] = vector.extract %[[ACC:.*]][0, 2] - // CHECK-DAG: vector.insert %[[S02]], %{{.*}} [0, 0, 2] - // CHECK-DAG: vector.insert %[[S02]], %{{.*}} [1, 0, 2] - // CHECK-DAG: %[[S03:.*]] = vector.extract %[[ACC:.*]][0, 3] - // CHECK-DAG: vector.insert %[[S03]], %{{.*}} [0, 0, 3] - // CHECK-DAG: vector.insert %[[S03]], %{{.*}} [1, 0, 3] - - // CHECK-DAG: %[[S10:.*]] = vector.extract %[[SOURCE]][1, 0] - // CHECK-DAG: vector.insert %[[S10]], %{{.*}} [0, 1, 0] - // CHECK-DAG: vector.insert %[[S10]], %{{.*}} [1, 1, 0] - // CHECK-DAG: %[[S11:.*]] = vector.extract %[[ACC:.*]][1, 1] - // CHECK-DAG: vector.insert %[[S11]], %{{.*}} [0, 1, 1] - // CHECK-DAG: vector.insert %[[S11]], %{{.*}} [1, 1, 1] - // CHECK-DAG: %[[S12:.*]] = vector.extract %[[ACC:.*]][1, 2] - // CHECK-DAG: vector.insert %[[S12]], %{{.*}} [0, 1, 2] - // CHECK-DAG: vector.insert %[[S12]], %{{.*}} [1, 1, 2] - // CHECK-DAG: %[[S13:.*]] = vector.extract %[[ACC:.*]][1, 3] - // CHECK-DAG: vector.insert %[[S13]], %{{.*}} [0, 1, 3] - // CHECK-DAG: vector.insert %[[S13]], %{{.*}} [1, 1, 3] - func.return %resultl : vector<32x32xf32> -} - -func.func @distribute_broadcast_col_row(%source: vector<32xf32>) -> vector<32x32xf32> { - %result = vector.broadcast %source : vector<32xf32> to vector<32x32xf32> - %resultl = iree_vector_ext.to_layout %result to layout(#layout_broadcast_2d_t) : vector<32x32xf32> - // CHECK-DAG: %[[S0:.*]] = vector.extract %[[SOURCE:.*]][0] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 0] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 1] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 2] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 3] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 0] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 1] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 2] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 3] - - // CHECK-DAG: %[[S1:.*]] = vector.extract %[[SOURCE:.*]][1] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 0, 0] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 0, 1] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 0, 2] - // CHECK-DAG: vector.insert %[[S1]], 
%{{.*}} [1, 0, 3] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 0] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 1] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 2] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 3] - func.return %resultl : vector<32x32xf32> -} - -#layout_broadcast_vectory_1d = #iree_vector_ext.layout< - <[BATCHY, VECTORX], [1, 4]> -> - -#layout_broadcast_vectory_2d = #iree_vector_ext.layout< - <[BATCHX, VECTORY], [1, 4]>, - <[BATCHY, VECTORX], [1, 4]> -> - -// This test case checks if we distribute correct when we have vectorx frozen -// and we iterate on vectory. -// This previously caused a bug, since calculating SIMT index for broadcast -// needs to know the range of vectorx. -func.func @distribute_broadcast_vectory(%source: vector<4xf32>) -> vector<4x4xf32> { - %result = vector.broadcast %source : vector<4xf32> to vector<4x4xf32> - %resultl = iree_vector_ext.to_layout %result to layout(#layout_broadcast_vectory_2d) : vector<4x4xf32> - // CHECK-DAG: %[[S00:.*]] = vector.extract %[[SOURCE:.*]][0, 0] : f32 from vector<1x4xf32> - // CHECK-DAG: %[[S01:.*]] = vector.extract %[[SOURCE:.*]][0, 1] : f32 from vector<1x4xf32> - // CHECK-DAG: %[[S02:.*]] = vector.extract %[[SOURCE:.*]][0, 2] : f32 from vector<1x4xf32> - // CHECK-DAG: %[[S02:.*]] = vector.extract %[[SOURCE:.*]][0, 3] : f32 from vector<1x4xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 0] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 4] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 8] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 12] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 1] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 5] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 9] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 13] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 2] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 6] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 10] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 14] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 3] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 7] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 11] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 15] : f32 into vector<1x1x16xf32> - func.return %resultl : vector<4x4xf32> -} - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -// This test case checks that chained WMMA contraction is distributable. -// Let C0 = matmul(A0, B0), and OUT = matmul(A1, C0). 
- -// In this case, since the C-layout and the RHS-layout of WMMA have a lane conflict -// and a different number of elements per lane/thread, we expect the compiler to emit -// code that writes the data from C0 back to shared memory before loading it again -// in the RHS-layout from shared memory into registers. - -// We assume in this test that the IR has already been distributed at the subgroup level. - -#layoutA = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 1, 16]>> -#layoutB = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 1, 16]>> -#layoutC = #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>, <[ BATCHY, LANEX], [1, 16]>> - -#layoutA2 = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 1, 16]>> -#layoutB2 = #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [1, 1, 16]>, <[ BATCHY, LANEX], [1, 16]>> -#layoutC2 = #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>, <[ BATCHY, LANEX], [1, 16]>> - -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + (s0 floordiv 32) * 16)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 16)> -// CHECK-LABEL: func.func @resolve_wmma_layout_conflict_with_shared_memory -func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf16>, - %14 : vector<16x16xf16>, - %16 : vector<16x16xf32>, - %35 : vector<16x16xf16>, - %33 : vector<16x16xf32>) - -> vector<16x16xf32> - attributes {translation_info = #iree_codegen.translation_info} { - - %A = iree_vector_ext.to_layout %15 to layout(#layoutA) : vector<16x16xf16> - %B = iree_vector_ext.to_layout %14 to layout(#layoutB) : vector<16x16xf16> - %C = iree_vector_ext.to_layout %16 to layout(#layoutC) : vector<16x16xf32> - - %M1 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout} - %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> - - %TM1 = arith.truncf %M1 : vector<16x16xf32> to vector<16x16xf16> - - %A2 = iree_vector_ext.to_layout %35 to layout(#layoutA2) : vector<16x16xf16> - %B2 = iree_vector_ext.to_layout %TM1 to layout(#layoutB2) : vector<16x16xf16> - %C2 = iree_vector_ext.to_layout %33 to layout(#layoutC2) : vector<16x16xf32> - - %M2 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout} - %A2, %B2, %C2 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> - - func.return %M2 : vector<16x16xf32> -} -// CHECK-NOT: iree_vector_ext.layout_conflict_resolution -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index - -// CHECK: %[[VEC_INIT:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x16xf16 -// CHECK: %[[TID_X:.+]] = gpu.thread_id x -// CHECK: %[[TID_Y:.+]] = gpu.thread_id y -// CHECK: %[[TID_Z:.+]] = gpu.thread_id z -// CHECK: %[[SUBGROUP_OFFSET:.+]] = affine.apply #[[$MAP0]]()[%[[TID_X]], %[[TID_Y]], %[[TID_Z]]] -// CHECK: %[[ALLOC:.+]] = 
memref.alloc() : memref<16x32xf16, #gpu.address_space> -// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][0, %[[SUBGROUP_OFFSET]]] [16, 16] [1, 1] -// CHECK: %[[HALF_LANE_ID:.+]] = affine.apply #[[$MAP1]]()[%[[TID_X]]] -// CHECK-COUNT-8: vector.store %{{.+}}, %[[SUBVIEW]][%{{.+}}, %[[HALF_LANE_ID]]] -// CHECK-AFTER: gpu.barrier - -// CHECK: %[[LANE_OFFSET:.+]] = arith.addi %[[SUBGROUP_OFFSET]], %[[HALF_LANE_ID]] -// CHECK: %[[LOAD0:.+]] = vector.load %[[ALLOC]][%[[C0]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT0:.+]] = vector.insert_strided_slice %[[LOAD0]], %[[VEC_INIT]] {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD1:.+]] = vector.load %[[ALLOC]][%[[C1]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT1:.+]] = vector.insert_strided_slice %[[LOAD1]], %[[INSERT0]] {offsets = [0, 0, 1], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD2:.+]] = vector.load %[[ALLOC]][%[[C2]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[LOAD2]], %[[INSERT1]] {offsets = [0, 0, 2], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD3:.+]] = vector.load %[[ALLOC]][%[[C3]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT3:.+]] = vector.insert_strided_slice %[[LOAD3]], %[[INSERT2]] {offsets = [0, 0, 3], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD4:.+]] = vector.load %[[ALLOC]][%[[C4]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT4:.+]] = vector.insert_strided_slice %[[LOAD4]], %[[INSERT3]] {offsets = [0, 0, 4], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK-COUNT-11: %[[LOADN:.+]] = vector.load %[[ALLOC]] -// CHECK-AFTER: vector.insert_strided_slice %[[LOADN]] - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func {experimental = true} : !transform.any_op - transform.yield - } -} - -// ----- - -// This test is used to ensure that we are handling cases -// where the same arith.constant has multiple users with different layouts. - -// The main motivation is to ensure we can distribute attention when the tile -// sizes for M, K1, and N are the same, which means the init of the first -// contract and the IV's init use the same constant. 
- -#layoutA = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, LANEY, VECTORX], [2, 4, 8]>> -#layoutB = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>> - -builtin.module attributes { transform.with_named_sequence } { - func.func @resolve_constant_with_multiple_layout_uses(%A : vector<64x64xf16>, %B : vector<64x64xf16>) -> vector<64x64xf16> { - %a = iree_vector_ext.to_layout %A to layout(#layoutA) : vector<64x64xf16> - %b = iree_vector_ext.to_layout %B to layout(#layoutB) : vector<64x64xf16> - %zero = arith.constant dense<0.0> : vector<64x64xf16> - %add_0 = arith.addf %a, %zero : vector<64x64xf16> - %add_1 = arith.addf %b, %zero : vector<64x64xf16> - %layout_change = iree_vector_ext.to_layout %add_1 to layout(#layoutA) : vector<64x64xf16> - %out = arith.addf %layout_change, %add_0 : vector<64x64xf16> - func.return %out : vector<64x64xf16> - } -// CHECK-LABEL: func.func @resolve_constant_with_multiple_layout_uses -// CHECK-SAME: (%[[ARG0:.+]]: vector<64x64xf16>, %[[ARG0:.+]]: vector<64x64xf16>) -// CHECK: %[[V0:.+]] = arith.constant dense<0.000000e+00> : vector<2x2x8xf16> -// CHECK: %[[V1:.+]] = arith.constant dense<0.000000e+00> : vector<2x2x16xf16> -// CHECK: %[[ADD0:.+]] = arith.addf %{{.+}}, %[[V0]]{{.*}} : vector<2x2x8xf16> -// CHECK: %[[ADD1:.+]] = arith.addf %{{.+}}, %[[V1]]{{.*}} : vector<2x2x16xf16> -// CHECK: arith.addf %{{.+}}, %[[ADD0]]{{.*}} : vector<2x2x8xf16> - - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [2, 4, 4]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]> -#layout0 = #iree_vector_ext.layout<#row_layout, #col_layout> -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 4, 8]> -#layout1 = #iree_vector_ext.layout<#row_layout2, #col_layout> -#row_layout3 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [4, 2, 4]> -#layout2 = #iree_vector_ext.layout<#row_layout3, #col_layout> - -func.func @resolved_layout_conflict(%a : memref<32x16xf16>, %b : memref<32x16xf16>) { - // CHECK: func.func @resolved_layout_conflict(%[[MEM:.*]]: memref<32x16xf16>, %[[MEM1:.*]]: memref<32x16xf16> - // CHECK-DAG: %[[CST0:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - // CHECK-COUNT-8: vector.load %[[MEM]] - %vec = vector.transfer_read %a[%c0, %c0], %cst : memref<32x16xf16>, vector<32x16xf16> - %vecl = iree_vector_ext.to_layout %vec to layout(#layout1) : vector<32x16xf16> - // CHECK: %[[R0:.+]] = vector.insert_strided_slice {{.*}} {offsets = [0, 0, 7], strides = [1]} : vector<1xf16> into vector<1x1x8xf16> - // CHECK: %[[ADD:.*]] = arith.addf %[[R0]], %[[R0]] {{.*}} : vector<1x1x8xf16> - %vec2 = arith.addf %vecl, %vecl : vector<32x16xf16> - %vec2l = iree_vector_ext.to_layout %vec2 to layout(#layout0) : vector<32x16xf16> - // CHECK: %[[R1:.*]] = vector.extract_strided_slice %[[ADD]] {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x8xf16> to vector<1x1x4xf16> - // CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[ADD]] {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x8xf16> 
to vector<1x1x4xf16> - vector.transfer_write %vec2l, %b[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16> - // CHECK-COUNT-8: vector.store {{.*}}, vector<1xf16> - func.return -} - -func.func @unresolved_layout_conflict(%a : memref<32x16xf16>, %b : memref<32x16xf16>) { - // CHECK: func.func @unresolved_layout_conflict(%[[MEM:.*]]: memref<32x16xf16>, %[[MEM1:.*]]: memref<32x16xf16> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %vcst = arith.constant dense<0.0> : vector<32x16xf16> - // CHECK-COUNT-8: vector.load %[[MEM]] - %vec = vector.transfer_read %a[%c0, %c0], %cst : memref<32x16xf16>, vector<32x16xf16> - %vecl = iree_vector_ext.to_layout %vec to layout(#layout1) : vector<32x16xf16> - // CHECK: iree_vector_ext.to_layout {{.*}} - %vec2 = arith.addf %vecl, %vcst : vector<32x16xf16> - // CHECK-COUNT-16: vector.store {{.*}}, vector<1xf16> - %vec2l = iree_vector_ext.to_layout %vec2 to layout(#layout2) : vector<32x16xf16> - vector.transfer_write %vec2l, %b[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16> - func.return -} - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func {experimental = true} : !transform.any_op - transform.yield - } -} diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp index ac8ae7386f55..cc2649823f4e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp @@ -1120,14 +1120,10 @@ transform_dialect::TestGpuVectorDistribution::applyToOne( rewriter.create(target.getLoc(), gpu::Dimension::x); populateGPUDistributionPatterns(patterns); - populateGPUDistributionLayoutAttrPatterns(laneId, patterns); - populateGPUReductionDistributionPatterns(patterns); // For testing we use subgroup size = 64. populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, /*subgroupSize=*/64); populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns); - if (getExperimental()) - populateGPULayoutResolutionDistributionPatterns(patterns); if (failed(distributeVectorOps(target, patterns, options))) { return emitDefaultDefiniteFailure(target); } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir b/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir index 6533a09e6d5a..03f581ee7552 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir @@ -1,6 +1,15 @@ // RUN: iree-opt -iree-transform-dialect-interpreter --split-input-file %s --verify-diagnostics -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate the layout from transfer_read to everyone. 
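// Reading aid for the nested_layout attributes introduced in this file (an
// informal note, not extra test input): per dimension, the tile sizes are
// expected to multiply out to the vector size, i.e.
//   subgroup_tile * batch_tile * outer_tile * thread_tile * element_tile.
// For the layout above that is 1 * 1 * 1 * 1 * 16 = 16 in each dimension,
// matching the vector<16x16xf16> values it is attached to; only element_tile
// is non-trivial, which is how the old <[VECTORY], [16]>, <[VECTORX], [16]>
// layout is expressed in the new attribute.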
builtin.module attributes { transform.with_named_sequence } { @@ -8,14 +17,14 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %c = arith.mulf %rootl, %b : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %d = arith.addf %c, %a : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %e = arith.select %cond, %c, %d : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} func.return %e : vector<16x16xf16> } @@ -28,7 +37,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Enforce the layout from the transfer_write to everyone builtin.module attributes { transform.with_named_sequence } { @@ -36,11 +54,11 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %cst0 = arith.constant dense<0.0> : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %c = arith.mulf %cst0, %b : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %d = arith.addf %c, %a : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %dl = iree_vector_ext.to_layout %d to layout(#layout) : vector<16x16xf16> vector.transfer_write %dl, %arr[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> func.return %d : vector<16x16xf16> @@ -55,7 +73,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // First propagate the layout, and then enforce it up. 
builtin.module attributes { transform.with_named_sequence } { @@ -63,16 +90,16 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %c = arith.mulf %rootl, %b : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %d = arith.addf %c, %a : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %e = arith.divf %d, %root2 : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} func.return %e : vector<16x16xf16> } @@ -85,7 +112,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through reduction. 
builtin.module attributes { transform.with_named_sequence } { @@ -93,20 +129,20 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %cst0_1 = arith.constant dense<0.0> : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0], %cst_0 {in_bounds = [true]} : memref<16xf16>, vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %root_red = vector.multi_reduction, %rootl, %cst0_1 [0] : vector<16x16xf16> to vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %c = arith.mulf %root_red, %b : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %d = arith.addf %c, %a : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %e = arith.divf %d, %root2 : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} func.return %e : vector<16xf16> } @@ -119,7 +155,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through transpose and then reduction. 
builtin.module attributes { transform.with_named_sequence } { @@ -127,22 +172,22 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %cst0_1 = arith.constant dense<0.0> : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0], %cst_0 {in_bounds = [true]} : memref<16xf16>, vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root_transpose = vector.transpose %rootl, [1, 0] : vector<16x16xf16> to vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>, <[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [8, 16]}} %root_red = vector.multi_reduction, %root_transpose, %cst0_1 [0] : vector<16x16xf16> to vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %c = arith.mulf %root_red, %b : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %d = arith.addf %c, %a : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %e = arith.divf %d, %root2 : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} func.return %e : vector<16xf16> } @@ -155,9 +200,38 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layoutA = #iree_vector_ext.layout<<[VECTORX], [32]>, <[VECTORY], [64]>> -#layoutB = #iree_vector_ext.layout<<[VECTORX], [128]>, <[VECTORY], [64]>> -#layoutC = #iree_vector_ext.layout<<[VECTORY], [128]>, <[VECTORX], [32]>> +#layoutA = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [32, 64], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> + +#layoutB = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [128, 64], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> + +#layoutC = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [128, 32], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> @@ -171,7 +245,7 @@ builtin.module attributes { transform.with_named_sequence } { %c = iree_vector_ext.to_layout %C to layout(#layoutC) : vector<128x32xf32> // Check if the layout of %C was properly propagated to %D. 
- // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [128]>, <[ VECTORX], [32]>>}} + // expected-remark @below {{element_tile = [128, 32]}} %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], @@ -190,21 +264,30 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate the layout from transfer_read to everyone. builtin.module attributes { transform.with_named_sequence } { func.func @gather(%base: memref<16x16xf16>, %arr: memref<16x16xindex>) -> vector<16x16xf16> { %c0 = arith.constant 0 : index %mask = arith.constant dense : vector<16x16xi1> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %pass = arith.constant dense<0.000000e+00> : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %index = vector.transfer_read %arr[%c0, %c0], %c0 {in_bounds = [true, true]} : memref<16x16xindex>, vector<16x16xindex> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %index_dist = iree_vector_ext.to_layout %index to layout(#layout) : vector<16x16xindex> %c = vector.gather %base[%c0, %c0] [%index_dist], %mask, %pass : memref<16x16xf16>, vector<16x16xindex>, vector<16x16xi1>, vector<16x16xf16> into vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} func.return %c : vector<16x16xf16> } @@ -224,25 +307,44 @@ builtin.module attributes { transform.with_named_sequence } { // Useful proxy for ensuring that layout conversions on attention // happens where we intend it to happen. 
-#layoutA = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>> -#layoutB = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, LANEY, VECTORX], [2, 4, 8]>> +#layoutA = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [2, 2], + outer_tile = [1, 4], + thread_tile = [32, 2], + element_tile = [1, 4], + + subgroup_strides = [0, 0], + thread_strides = [2, 1] +> + +#layoutB = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [2, 2], + outer_tile = [1, 1], + thread_tile = [32, 4], + element_tile = [1, 8], + + subgroup_strides = [0, 0], + thread_strides = [4, 1] +> builtin.module attributes { transform.with_named_sequence } { func.func @resolve_select(%A : vector<64x64xf16>, %B : vector<64x64xf16>, %condition : i1) -> vector<64x64xf16> { %a = iree_vector_ext.to_layout %A to layout(#layoutA) : vector<64x64xf16> %b = iree_vector_ext.to_layout %B to layout(#layoutB) : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %offset_0 = arith.constant dense<2.0> : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %offset_1 = arith.constant dense<4.0> : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %sel = arith.select %condition, %offset_0, %offset_1 : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %add = arith.addf %a, %sel : vector<64x64xf16> %add_layout = iree_vector_ext.to_layout %add to layout(#layoutB) : vector<64x64xf16> // CHECK-COUNT-3: iree_vector_ext.to_layout - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, LANEY, VECTORX], [2, 4, 8]>>}} + // expected-remark @below {{element_tile = [1, 8]}} %add_1 = arith.addf %add_layout, %b : vector<64x64xf16> func.return %add_1 : vector<64x64xf16> } @@ -256,7 +358,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through scf.for builtin.module attributes { transform.with_named_sequence } { @@ -266,25 +377,24 @@ builtin.module attributes { transform.with_named_sequence } { %c1024 = arith.constant 1024 : index %cst_0 = arith.constant 0.0 : f16 %cst0_1 = arith.constant dense<0.0> : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} - + // expected-remark @above {{element_tile = [16]}} %out = scf.for %iv = %c0 to %c1024 step %c1 iter_args(%arg1 = %cst0_1) -> (vector<16xf16>) { - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ 
VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0], %cst_0 {in_bounds = [true]} : memref<16xf16>, vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root_transpose = vector.transpose %rootl, [1, 0] : vector<16x16xf16> to vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>, <[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [8, 16]}} %root_red = vector.multi_reduction, %root_transpose, %arg1 [0] : vector<16x16xf16> to vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %c = arith.mulf %root_red, %b : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %d = arith.addf %c, %a : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %e = arith.divf %d, %root2 : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} scf.yield %e : vector<16xf16> } @@ -565,7 +675,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through scf.for builtin.module attributes { transform.with_named_sequence } { @@ -578,7 +697,7 @@ builtin.module attributes { transform.with_named_sequence } { %out = scf.for %iv = %c0 to %c1024 step %c1 iter_args(%arg1 = %cst) -> (vector) { %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %init = vector.extractelement %arg1[] : vector %root_red = vector.multi_reduction, %rootl, %init [0, 1] : vector<16x16xf16> to f16 diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp index 92134cb9e1f1..d4062bbf703b 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp @@ -25,219 +25,6 @@ namespace mlir::iree_compiler::IREE::VectorExt { using VectorValue = TypedValue; -bool PerDimLayoutAttr::contains(const LayoutDimension 
&dim) { - for (LayoutDimensionAttr label : getLabels()) { - if (label.getValue() == dim) - return true; - } - return false; -} - -std::optional PerDimLayoutAttr::getShape(const LayoutDimension &dim) { - for (auto value : llvm::zip(getLabels(), getShapes())) { - if (dim == std::get<0>(value).getValue()) - return std::get<1>(value); - } - return std::nullopt; -} - -std::optional LayoutAttr::getShape(const LayoutDimension &dim) const { - for (PerDimLayoutAttr layout : getLayouts()) { - std::optional maybeShape = layout.getShape(dim); - if (maybeShape) - return maybeShape.value(); - } - return std::nullopt; -} - -// Get the SIMT Vector shape in the order specified by dims. If no dims are -// specified, then return an empty vector. -LogicalResult LayoutAttr::isValidLayout(ShapedType shapeTy, - Location loc) const { - ArrayRef shape = shapeTy.getShape(); - if (shape.size() != getRank()) { - return emitError(loc, "Rank of vector (") - << shape.size() << ") does not match rank of layout (" << getRank() - << ")."; - } - for (auto [idx, layout] : llvm::enumerate(getLayouts())) { - ArrayRef layoutShape = layout.getShapes(); - int64_t expectedShape = - std::reduce(layoutShape.begin(), layoutShape.end(), - static_cast(1), std::multiplies()); - if (expectedShape != shape[idx]) { - std::string shapeStr; - llvm::raw_string_ostream shapeOs(shapeStr); - llvm::interleaveComma(shape, shapeOs); - std::string layoutStr; - llvm::raw_string_ostream layoutOs(layoutStr); - printStripped(layoutOs); - return emitError(loc, "Vector shape: [") - << shapeStr << "] does not match the layout (" << layoutStr - << ") at dim " << idx - << ". Dimension expected by layout: " << expectedShape - << " actual: " << shape[idx]; - } - } - return success(); -} - -// Project out the layout for the specified dimensions -// resulting in the layout for a lower dimensional vector. -VectorLayoutInterface LayoutAttr::project(ArrayRef droppedDims) const { - assert(droppedDims.size() == getRank() && - "droppedDims size must match layout size"); - - ArrayRef layouts = getLayouts(); - SmallVector newLayouts; - for (auto pair : llvm::zip(droppedDims, layouts)) { - if (!std::get<0>(pair)) - newLayouts.push_back(std::get<1>(pair)); - } - return LayoutAttr::get(getContext(), newLayouts); -} - -// Permute the layout according to the provided permutation -// vector. The dimensionality of the layout remains the same. -VectorLayoutInterface LayoutAttr::permute(ArrayRef permutation) const { - assert(permutation.size() == getRank() && - "permutation size must match layout rank"); - - ArrayRef layouts = getLayouts(); - SmallVector newLayouts; - for (unsigned index : permutation) { - assert(index >= 0 && index < getRank()); - newLayouts.push_back(layouts[index]); - } - return LayoutAttr::get(getContext(), newLayouts); -} - -// This function returns the distributed shape of the SIMT -// vector and evaluates it in the following order: -// BATCHX, BATCHY, VECTORY, VECTORX -// The vector dimensions are combined into a single SIMT -// vector dimension. 
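// Illustration only: a standalone sketch (not IREE API; all names here are
// invented) of the ordering documented above and implemented below. Batch
// dims are emitted in BATCHX, BATCHY order, all VECTOR* dims fold into one
// trailing SIMT dim, and lane dims never appear in the per-thread shape.
#include <cstdint>
#include <string>
#include <vector>

struct DimEntry {
  std::string label;
  int64_t shape;
};
using PerDimLayout = std::vector<DimEntry>;

std::vector<int64_t> distributedShape(const std::vector<PerDimLayout> &layouts) {
  std::vector<int64_t> simtShape;
  int64_t vectorShape = 1;
  bool sawVector = false;
  for (const char *wanted : {"BATCHX", "BATCHY", "VECTORY", "VECTORX"}) {
    for (const PerDimLayout &dimLayout : layouts) {
      for (const DimEntry &entry : dimLayout) {
        if (entry.label != wanted)
          continue;
        if (entry.label.rfind("VECTOR", 0) == 0) {
          vectorShape *= entry.shape; // VECTOR* dims multiply together.
          sawVector = true;
        } else {
          simtShape.push_back(entry.shape); // BATCH* dims keep their own dim.
        }
      }
    }
  }
  if (sawVector)
    simtShape.push_back(vectorShape);
  return simtShape;
}

// Example: for <[BATCHX, LANEY, VECTORX], [1, 4, 8]>, <[BATCHY, LANEX], [1, 16]>
// (used by the tests earlier in this diff) this yields {1, 1, 8}, which is why
// those tests build vector<1x1x8xf16> accumulators.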
-SmallVector LayoutAttr::getDistributedShape() const { - SmallVector labels{ - LayoutDimension::BATCHX, LayoutDimension::BATCHY, - LayoutDimension::VECTORY, LayoutDimension::VECTORX}; - SmallVector simtVectorShape; - std::optional vectorShape; - for (LayoutDimension dim : labels) { - ArrayRef layouts = getLayouts(); - for (PerDimLayoutAttr layout : layouts) { - if (!layout.contains(dim)) - continue; - int64_t shape = layout.getShape(dim).value(); - if (isVectorDimension(dim)) { - vectorShape = shape * vectorShape.value_or(1); - continue; - } - simtVectorShape.push_back(shape); - } - } - if (vectorShape) - simtVectorShape.push_back(vectorShape.value()); - return simtVectorShape; -} - -PerDimLayoutAttr LayoutAttr::getDimLayout(int64_t dim) const { - assert(dim >= 0 && dim < getRank()); - return getLayouts()[dim]; -} - -std::optional LayoutAttr::getBatchDim(int64_t dim) { - assert(dim < getRank()); - PerDimLayoutAttr layout = getDimLayout(dim); - for (auto [name, shape] : - llvm::zip_equal(layout.getLabels(), layout.getShapes())) { - if (isBatchDimension(name.getValue())) - return shape; - } - return std::nullopt; -} - -std::optional LayoutAttr::getLaneDim(int64_t dim) { - assert(dim < getRank()); - PerDimLayoutAttr layout = getDimLayout(dim); - for (auto [name, shape] : - llvm::zip_equal(layout.getLabels(), layout.getShapes())) { - if (isLaneDimension(name.getValue())) - return shape; - } - return std::nullopt; -} - -std::optional LayoutAttr::getLane(int64_t dim) { - assert(dim < getRank()); - PerDimLayoutAttr layout = getDimLayout(dim); - for (auto [name, shape] : - llvm::zip_equal(layout.getLabels(), layout.getShapes())) { - if (isLaneDimension(name.getValue())) - return name.getValue(); - } - return std::nullopt; -} - -int64_t LayoutAttr::getRank() const { return getLayouts().size(); } - -std::tuple LayoutAttr::getLaneGrid() { - int64_t laneX = 1; - int64_t laneY = 1; - int64_t laneZ = 1; - for (PerDimLayoutAttr dimLayout : getLayouts()) { - // Note that valid layouts only include at most one instance of each - // dimension type, so this is simply doing assignment on the first instance - // of each lane index, not an accumulative product. - auto maybeXShape = dimLayout.getShape(LayoutDimension::LANEX); - laneX *= maybeXShape.value_or(1); - auto maybeYShape = dimLayout.getShape(LayoutDimension::LANEY); - laneY *= maybeYShape.value_or(1); - auto maybeZShape = dimLayout.getShape(LayoutDimension::LANEZ); - laneZ *= maybeZShape.value_or(1); - } - return std::make_tuple(laneX, laneY, laneZ); -} - -uint64_t LayoutAttr::getShuffleOffset(int64_t reductionDim) { - uint64_t offset = 0; - std::optional laneDim = getLane(reductionDim); - if (!laneDim) - return offset; - switch (laneDim.value()) { - case LayoutDimension::LANEX: - offset = 1; - break; - case LayoutDimension::LANEY: - offset = getShape(LayoutDimension::LANEX).value_or(0); - break; - case LayoutDimension::LANEZ: - offset = getShape(LayoutDimension::LANEX).value_or(0) * - getShape(LayoutDimension::LANEY).value_or(0); - break; - default: - assert(false && "Invalid dimension! 
Expected lane dimension"); - break; - } - return offset; -} - -bool LayoutAttr::hasLaneConflictWith(const LayoutAttr &other) { - SmallVector laneDims{ - LayoutDimension::LANEX, LayoutDimension::LANEY, LayoutDimension::LANEZ}; - for (LayoutDimension dim : laneDims) { - std::optional shape = getShape(dim); - std::optional otherShape = other.getShape(dim); - if ((shape && !otherShape) || (!shape && otherShape)) - return true; - if (shape && otherShape) { - if (shape.value() != otherShape.value()) - return true; - } - } - return false; -} - // Project the nested layout. This take a mask on the dimensions of the vector // associated with this layout and projects out those dimensions. This reduces // the rank of the layout in the process. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td index 913fb9f92dd3..c401e67a2dab 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td @@ -13,98 +13,6 @@ include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtBase.td" // Vector layout attributes //===---------------------------------------------------------------------===// -// Defines the batch dimensions for the original SIMD tensor. -// By convention, X is along rows and Y along columns. -def BATCHX : I32EnumAttrCase<"BATCHX", 0>; -def BATCHY : I32EnumAttrCase<"BATCHY", 1>; -// Defines the vector dimension. -def VECTORX : I32EnumAttrCase<"VECTORX", 2>; -def VECTORY : I32EnumAttrCase<"VECTORY", 3>; -def VECTORZ : I32EnumAttrCase<"VECTORZ", 4>; -// Defines the lane dimensions. -def LANEX : I32EnumAttrCase<"LANEX", 5>; -def LANEY : I32EnumAttrCase<"LANEY", 6>; -def LANEZ : I32EnumAttrCase<"LANEZ", 7>; - -def LayoutDimension : IREEVectorExt_I32EnumAttr<"LayoutDimension", - "Describes the dimension of the high-dimensional layout", [ - BATCHX, - BATCHY, - VECTORX, - VECTORY, - VECTORZ, - LANEX, - LANEY, - LANEZ, - ]>; - -def LayoutDimensionAttr : IREEVectorExt_EnumAttr; - -def PerDimLayoutAttr : IREEVectorExt_Attr<"PerDimLayout"> { - let mnemonic = "per_dim_layout"; - let summary = [{high-dimensional vector register layout for a given vector dimension}]; - let description = [{ - This attribute describes the per dimension register layout for a given vector - that could be prescribed by an operator such as matrix multiplication. - This is a way to explicitly represent the layout in the IR - when it is in the SIMD form prior to converting to the SIMT form so that - we can reason about layouts, propagating layouts and layout conflicts. - }]; - let parameters = (ins - ArrayRefParameter<"LayoutDimensionAttr", "labels for the high dimensional layout dims">:$labels, - ArrayRefParameter<"int64_t", "shapes for the high dimensional layout dims">:$shapes - ); - let assemblyFormat = "`<``[` $labels `]``,` `[` $shapes `]``>`"; - let genVerifyDecl = 0; - let extraClassDeclaration = [{ - std::optional getShape(const LayoutDimension &dim); - bool contains(const LayoutDimension &dim); - }]; -} - -def LayoutAttr : IREEVectorExt_Attr<"Layout", - [ DeclareAttrInterfaceMethods ]> { - let mnemonic = "layout"; - let summary = [{high-dimensional vector register layout for a given vector}]; - let description = [{ - This contains a complete specification of the layout for a given vector, - whereas the attribute above only specifies the per dimension layout. 
- }]; - let parameters = (ins - ArrayRefParameter<"PerDimLayoutAttr", "layout for each dimension of the vector">:$layouts - ); - let assemblyFormat = "`<`$layouts`>`"; - let genVerifyDecl = 0; - let extraClassDeclaration = [{ - // Get the shape for a given layout dimension. - std::optional getShape(const LayoutDimension &dim) const; - std::optional getBatchDim(int64_t dim); - // Get the lane dimension shape for a provided simd tensor dim. - std::optional getLaneDim(int64_t dim); - // Get the lane dimension for a provided simd tensor dim. - std::optional getLane(int64_t dim); - - // Returns the grid of lane ids. Assumes a valid layout. - ::std::tuple getLaneGrid(); - PerDimLayoutAttr getDimLayout(int64_t dim) const; - - // Given the reduction dim, computes the shuffle offset - // based on the shapes of the lane dimensions. The shuffle - // offset is used during the thread global reduction - // when emitting a gpu::ShuffleOp and follows - // the semantics of the offset operand defined there, - // which is that for lane k, the shuffle op returns the - // value from lane k ^ offset. - uint64_t getShuffleOffset(int64_t reductionDim); - - // Determines whether the other layout has a lane - // dimension that the current layout does not have OR whether - // the shape of the two layouts for a common lane dimension - // is not the same. - bool hasLaneConflictWith(const LayoutAttr &other); - }]; -} - def NestedLayoutAttr : IREEVectorExt_Attr<"NestedLayout", [ DeclareAttrInterfaceMethods ]> { let mnemonic = "nested_layout"; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp index ba32c2326cb2..8c5abb9211cf 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp @@ -23,10 +23,6 @@ namespace mlir::iree_compiler::IREE::VectorExt { struct IREEVectorExtDialectOpAsmInterface : public OpAsmDialectInterface { using OpAsmDialectInterface::OpAsmDialectInterface; AliasResult getAlias(Attribute attr, raw_ostream &os) const override { - if (llvm::isa(attr)) { - os << "layout"; - return AliasResult::OverridableAlias; - } if (llvm::isa(attr)) { os << "nested"; return AliasResult::OverridableAlias; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp index c4da2ae68a09..7801bc54a99a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp @@ -37,163 +37,6 @@ OpFoldResult ToSIMTOp::fold(FoldAdaptor) { return {}; } -void LayoutIterator::maybeFreezeAndConcatenate( - const LayoutIterator::State &frozenState) { - for (auto &[frozenDim, frozenIt] : frozenState.iterators) { - if (!state.contains(frozenDim)) { - frozenDimensions.insert(frozenDim); - state[frozenDim] = frozenIt; - state.ranges[frozenDim] = frozenState.ranges.lookup(frozenDim); - } - } -} - -void LayoutIterator::initialize(const PerDimLayoutAttr &attr, - DenseMap strides, - std::optional simdIndex) { - auto reversedLabels = llvm::reverse(attr.getLabels()); - auto reversedShapes = llvm::reverse(attr.getShapes()); - for (auto [nameAttr, shape] : llvm::zip(reversedLabels, reversedShapes)) { - LayoutDimension dim = nameAttr.getValue(); - if (isLaneDimension(dim)) - continue; - int64_t stride = strides.contains(dim) ? 
strides[dim] : 1; - state.ranges[dim] = DimensionalRange(0, shape, stride); - state.iterators[dim] = state.ranges[dim].begin(); - maxIterations *= shape / stride; - if (simdIndex) { - int64_t index = simdIndex.value(); - if (!state.simdToLayoutDim.contains(index)) - state.simdToLayoutDim[index] = {}; - state.simdToLayoutDim[index].insert(dim); - } - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr, - DenseMap strides) { - for (auto perDimAttr : llvm::enumerate(attr.getLayouts())) { - initialize(perDimAttr.value(), strides, perDimAttr.index()); - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr) { - DenseMap strides; - for (auto [idx, attr] : llvm::enumerate(attr.getLayouts())) { - initialize(attr, strides, idx); - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr, - DenseMap strides, - int64_t simtIndex) { - for (auto [idx, attr] : llvm::enumerate(attr.getLayouts())) { - if (idx != simtIndex) - continue; - initialize(attr, strides, idx); - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr, int64_t simtIndex) { - DenseMap strides; - for (auto [idx, attr] : llvm::enumerate(attr.getLayouts())) { - if (idx != simtIndex) - continue; - initialize(attr, strides, idx); - } -} - -LayoutIterator::LayoutIterator(PerDimLayoutAttr &attr, - DenseMap strides) { - initialize(attr, strides, std::nullopt); -} - -LayoutIterator &LayoutIterator::operator++() { - for (auto &[dim, it] : state.iterators) { - if (frozenDimensions.contains(dim)) - continue; - ++it; - if (it == state.ranges[dim].end()) { - it = state.ranges[dim].begin(); - continue; - } - break; - } - ++iterations; - return *this; -} - -/// The iterator is done when all the loops are complete. -bool LayoutIterator::iterationComplete() { return iterations == maxIterations; } - -void LayoutIterator::apply( - std::function callback) { - for (; !iterationComplete(); ++(*this)) { - callback(state); - } -} - -// Get the offset into the SIMT vector corresponding to the incoming iterator. -// The returned offsets will always be the same shape as the labels array. -// Groups vector dimensions together. Assumes last dimension is vector -// dimension. 
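// Worked example of the rule above, tied to the distribute_broadcast_vectory
// test earlier in this diff: for the result layout
// <[BATCHX, VECTORY], [1, 4]>, <[BATCHY, VECTORX], [1, 4]>, the VECTORY step
// is the VECTORX range (4), so an iterator at BATCHX=0, BATCHY=0, VECTORY=1,
// VECTORX=2 maps to the SIMT offset {0, 0, 1 * 4 + 2} = {0, 0, 6}; this is
// why that test inserts source element 1 at positions {1, 5, 9, 13} of the
// vector<1x1x16xf32> accumulator.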
-SmallVector LayoutIterator::State::computeSIMTIndex() const { - SmallVector offset; - std::optional vecOffset; - for (auto label : labels) { - for (auto [name, it] : iterators) { - if (name != label) - continue; - if (isBatchDimension(name)) { - offset.push_back(it.getPosition()); - continue; - } - if (isVectorDimension(name)) { - int64_t step{1}; - if (name == LayoutDimension::VECTORY) { - assert(ranges.contains(LayoutDimension::VECTORX) && - "Expected VectorX to be specified on layouts with VectorY."); - step = ranges.lookup(LayoutDimension::VECTORX).stop; - } - vecOffset = vecOffset.value_or(0) + it.getPosition() * step; - } - } - } - if (vecOffset) - offset.push_back(vecOffset.value()); - return offset; -} - -SmallVector -LayoutIterator::State::computeIteratorProjectedSIMTIndex() const { - SmallVector indices = computeSIMTIndex(); - SmallVector projectedIndices; - for (size_t i = 0, e = labels.size(); i != e; ++i) { - for (auto [name, it] : iterators) { - if (name == labels[i]) - projectedIndices.push_back(indices[i]); - } - } - return projectedIndices; -} - -void LayoutIterator::erase(LayoutDimension dim) { - if (state.contains(dim)) - state.erase(dim); -} - -LayoutIterator LayoutIterator::getBatchIterator() const { - LayoutIterator projectedIterator = *this; - for (auto [dim, it] : state.iterators) { - if (!isBatchDimension(dim)) { - DimensionalRange range = state.ranges.lookup(dim); - projectedIterator.maxIterations /= (range.stop / range.step); - projectedIterator.erase(dim); - } - } - return projectedIterator; -} - // clang-format off #define GET_OP_CLASSES #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp.inc" // IWYU pragma: keep diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h index 408c95a80548..22241e9c7681 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h @@ -19,121 +19,4 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -namespace mlir::iree_compiler::IREE::VectorExt { - -/// Dimensional Strided Iterator class used to represent -/// an iterator through a single dimension of the layout. -class DimensionalIterator { -public: - DimensionalIterator(int64_t position = 0, int64_t stride = 1) - : position(position), stride(stride) {} - int64_t operator*() const { return position; } - DimensionalIterator &operator++() { - position += stride; - return *this; - } - - bool operator==(const DimensionalIterator &other) const { - return position == other.position; - } - bool operator!=(const DimensionalIterator &other) const { - return !(*this == other); - } - bool operator<(const DimensionalIterator &other) const { - return position < other.position; - } - - int64_t getPosition() const { return position; } - -private: - int64_t position, stride; -}; - -/// Dimensional Range class used to represent the range of -/// a particular dimension of the layout. Can be iterated on -/// using a DimensionalIterator. 
-class DimensionalRange { -public: - DimensionalRange() {} - DimensionalRange(int64_t start, int64_t stop, int64_t step = 1) - : start(start), stop(stop), step(step) {} - DimensionalIterator begin() const { return DimensionalIterator(start, step); } - DimensionalIterator end() const { return DimensionalIterator(stop, step); } - - int64_t start, stop, step; -}; - -// Iterator class for LayoutAttrs and PerDimLayoutAttrs. -// Provides O(1) access to state for any given dimension. -// Also preserves insertion order. -// Layout iterators skip lane dimensions as these are not -// required during distribution. -class LayoutIterator { -public: - struct State { - SmallVector computeSIMTIndex() const; - SmallVector computeIteratorProjectedSIMTIndex() const; - bool contains(LayoutDimension dim) const { return iterators.contains(dim); } - void erase(LayoutDimension dim) { iterators.erase(dim); } - DimensionalIterator lookup(LayoutDimension dim) const { - return iterators.lookup(dim); - } - DimensionalIterator &operator[](LayoutDimension dim) { - return iterators[dim]; - } - void print() const { - for (const auto &[dim, it] : iterators) { - llvm::outs() << stringifyLayoutDimension(dim).str() + ":" + - std::to_string(*it) + ", "; - } - llvm::outs() << "\n"; - } - llvm::MapVector iterators; - DenseMap> simdToLayoutDim; - llvm::MapVector ranges; - SmallVector labels{ - LayoutDimension::BATCHX, LayoutDimension::BATCHY, - LayoutDimension::VECTORY, LayoutDimension::VECTORX}; - }; - void maybeFreezeAndConcatenate(const LayoutIterator::State &frozenState); - LayoutIterator(LayoutAttr &attr); - LayoutIterator(LayoutAttr &attr, int64_t simtIndex); - LayoutIterator(LayoutAttr &attr, DenseMap strides); - LayoutIterator(LayoutAttr &attr, DenseMap strides, - int64_t simtIndex); - LayoutIterator(PerDimLayoutAttr &attr, - DenseMap strides); - void apply(std::function); - LayoutIterator &operator++(); - State getState() const { return state; } - void erase(LayoutDimension dim); - LayoutIterator getBatchIterator() const; - bool iterationComplete(); - -private: - void initialize(const PerDimLayoutAttr &attr, - DenseMap strides, - std::optional simdIndex); - State state; - DenseSet frozenDimensions; - int64_t iterations{0}; - int64_t maxIterations{1}; -}; - -inline bool isBatchDimension(LayoutDimension dim) { - return (dim == LayoutDimension::BATCHX) || (dim == LayoutDimension::BATCHY); -} - -inline bool isLaneDimension(LayoutDimension dim) { - return (dim == LayoutDimension::LANEX) || (dim == LayoutDimension::LANEY) || - (dim == LayoutDimension::LANEZ); -} - -inline bool isVectorDimension(LayoutDimension dim) { - return (dim == LayoutDimension::VECTORX) || - (dim == LayoutDimension::VECTORY) || (dim == LayoutDimension::VECTORZ); -} - -} // namespace mlir::iree_compiler::IREE::VectorExt - #endif // IREE_DIALECTS_DIALECT_VECTOREXT_IR_VECTOREXTOPS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir index 86c7753fa23f..0b2d31176589 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir @@ -1,13 +1,20 @@ // RUN: iree-opt --split-input-file --verify-diagnostics %s -#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX, VECTORY], [1, 1, 1]> -#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [4, 2, 4]> -#layout1 = #iree_vector_ext.layout<#row_layout1, #col_layout1> +#layout1 = 
#iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [1, 1], + + subgroup_strides = [0, 0], + thread_strides = [0, 0]> + func.func @invalid_layout(%lhs: memref<32x32xf16>, %rhs: memref<32x32xf16>) -> vector<32x32xf16> { %cst_0 = arith.constant 0.0 : f16 %c0 = arith.constant 0 : index %result = vector.transfer_read %lhs[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf16>, vector<32x32xf16> - // expected-error @+1 {{Vector shape: [32, 32] does not match the layout (layout<<[ BATCHX, LANEX, VECTORY], [1, 1, 1]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 4]>>) at dim 0. Dimension expected by layout: 1 actual: 32}} + // expected-error @+1 {{Vector shape: [32, 32] does not match the layout (nested_layout) at dim 0. Dimension expected by layout: 1 actual: 32}} %2 = iree_vector_ext.to_layout %result to layout(#layout1) : vector<32x32xf16> return %2 : vector<32x32xf16> } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir index fc14c3b6bc92..4dfa22a06a0e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir @@ -1,22 +1,5 @@ // RUN: iree-opt --split-input-file %s | FileCheck %s -#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX, VECTORY], [2, 4, 4]> -#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [4, 2, 4]> -#layout2 = #iree_vector_ext.layout<#col_layout1, #row_layout1> -func.func @specify_layout(%lhs: memref<32x32xf16>) -> vector<32x32xf16> { - %cst_0 = arith.constant 0.0 : f16 - %c0 = arith.constant 0 : index - %result = vector.transfer_read %lhs[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf16>, vector<32x32xf16> - %2 = iree_vector_ext.to_layout %result to layout(#layout2) : vector<32x32xf16> - return %2 : vector<32x32xf16> -} - -// CHECK-DAG: #[[$LAYOUT0:.+]] = #iree_vector_ext.layout<<[ BATCHY, LANEY, VECTORX], [4, 2, 4]>, <[ BATCHX, LANEX, VECTORY], [2, 4, 4]>> -// CHECK-LABEL: func.func @specify_layout -// CHECK: iree_vector_ext.to_layout {{.*}} to layout(#[[$LAYOUT0]]) - -// ----- - func.func @specify_inline_layout(%lhs: memref<32x32xf16>) -> vector<32x32xf16> { %cst_0 = arith.constant 0.0 : f16 %c0 = arith.constant 0 : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp deleted file mode 100644 index 47950df28a6b..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include "iree/compiler/Codegen/LLVMGPU/Passes.h" -#include "iree/compiler/Codegen/Utils/VectorOpUtils.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" - -namespace mlir::iree_compiler { - -#define GEN_PASS_DEF_AMDGPUPREPAREFORCHAINEDMATMULPASS -#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc" - -using VectorValue = TypedValue; - -namespace { - -/// Let's assume that we only have vector.contract with the standard indexing -/// maps: -/// (m, n, k), A: (m, k), B: (k, n), C: (m, n). 
-/// We will represent this contract operation by a "@". -/// -/// Given a matmul: -/// -/// C = A @ B -/// -/// This pass decides when to convert this matmul to: -/// -/// A.T = transpose(A) -/// B.T = transpose(B) -/// C.T = B.T @ A.T -/// C = transpose(C.T) -/// -/// This is useful when the "@" instruction that the hardware lowers to -/// has a specific layout (see VectorLayoutInterface for more information) -/// but the further uses of C expects a transposed layout to the produced -/// layout. -/// -/// For example, for "@" lowering to AMDGPU MFMA instructions, the operands -/// have layout L and L.T and the result has the layout L.T . -/// So if you have a chain of matmuls: -/// -/// C (L.T) = A (L) @ B (L.T) -/// E (L.T) = C (L.T) @ D (L.T) -/// ^^^^^^^ -/// Expected layout by instruction is L -/// -/// To fix this, we can apply this transformation on the first matrix: -/// -/// C.T (L.T) = B.T (L) @ A (L.T) -/// C (L) = transpose C.T (L.T) -/// E (L.T) = C (L) @ D (L.T) -/// ^^^^^ -/// Layout matches the instruction! -/// -/// Note that the mathematical formula -/// C = A @ B --> C.T = B.T @ A.T -/// is only defined on standard "@" function, it may be a different -/// transformation for other indexing maps. -struct AMDGPUPrepareForChainedMatmulPass final - : impl::AMDGPUPrepareForChainedMatmulPassBase< - AMDGPUPrepareForChainedMatmulPass> { - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - VectorContractOpInfo getOpInfo(vector::ContractionOp contract) const { - auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( - contract.getIndexingMapsArray()); - assert(succeeded(maybeOpInfo) && - "contraction info for vector.contract should always be valid"); - return maybeOpInfo.value(); - } - - VectorValue swapDims(RewriterBase &rewriter, VectorValue val, int64_t dimA, - int64_t dimB) const { - ArrayRef shape = val.getType().getShape(); - SmallVector perm(shape.size()); - std::iota(perm.begin(), perm.end(), 0); - std::swap(perm[dimA], perm[dimB]); - return rewriter.create(val.getLoc(), val, perm); - } - - AffineMap swapDimsInMap(AffineMap map, int64_t dimA, int64_t dimB) const { - SmallVector results(map.getResults()); - std::swap(results[dimA], results[dimB]); - return AffineMap::get(map.getNumDims(), map.getNumSymbols(), results, - map.getContext()); - } - - /// Given a vector contract of the form - /// %output = vector.contract %lhs, %rhs, %acc - /// this function swaps the operands (%rhs, %lhs), - /// transposes the accumulator and output and updates - /// the indexing maps for the new contract op. - /// - /// Given a contract: - /// - /// result = vector.contract lhs, rhs, acc - /// - /// transform it to - /// - /// lhs.T = transpose(lhs) - /// rhs.T = transpose(rhs) - /// acc.T = transpose(acc) - /// result.T = vector.contract rhs.T, lhs.T, acc.T - /// result = transpose(result.T) - /// - /// This transformation holds for the "@" case we described above. For - /// other indexing maps, we need to take into account transposed which are - /// fused into the contract. `isOperandSwapInvariant` tells us when we can - /// simply swap the operands without transposing them. 
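As a sanity check of the algebra the deleted pass relies on, the rewrite described above is just the identity (A @ B)^T = B^T @ A^T applied around the accumulator-carrying contract. A minimal standalone sketch of that identity, assuming nothing beyond 2x2 row-major matrices (all names and sizes here are illustrative, not taken from the pass):

#include <array>
#include <cassert>

using Mat2 = std::array<std::array<float, 2>, 2>;

// Plain row-major 2x2 matrix product.
static Mat2 matmul(const Mat2 &a, const Mat2 &b) {
  Mat2 c{};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k)
        c[i][j] += a[i][k] * b[k][j];
  return c;
}

static Mat2 transpose(const Mat2 &m) {
  return {{{m[0][0], m[1][0]}, {m[0][1], m[1][1]}}};
}

int main() {
  Mat2 a = {{{1, 2}, {3, 4}}};
  Mat2 b = {{{5, 6}, {7, 8}}};
  // C computed directly, and via the swapped-and-transposed form.
  Mat2 c = matmul(a, b);
  Mat2 cT = matmul(transpose(b), transpose(a));
  assert(c == transpose(cT));
  return 0;
}

Transposing both operands and the result leaves the product unchanged, which is what lets the pass trade a layout mismatch on the result for transposes on the operands.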
- void swapOperandsAndTranspose(RewriterBase &rewriter, - vector::ContractionOp contractOp) const { - VectorContractOpInfo opInfo = getOpInfo(contractOp); - auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - auto [accM, accN] = opInfo.getResultMNIndex(); - VectorValue lhs = contractOp.getLhs(); - VectorValue rhs = contractOp.getRhs(); - VectorValue acc = cast(contractOp.getAcc()); - rewriter.setInsertionPoint(contractOp); - - SmallVector maps = contractOp.getIndexingMapsArray(); - AffineMap lhsMap = maps[0]; - AffineMap rhsMap = maps[1]; - AffineMap accMap = maps[2]; - - acc = swapDims(rewriter, acc, accN, accM); - accMap = swapDimsInMap(accMap, accN, accM); - - if (!isOperandSwapInvariant(contractOp)) { - lhs = swapDims(rewriter, lhs, lhsK, lhsM); - rhs = swapDims(rewriter, rhs, rhsK, rhsN); - lhsMap = swapDimsInMap(lhsMap, lhsK, lhsM); - rhsMap = swapDimsInMap(rhsMap, rhsK, rhsN); - } - - auto swappedOp = rewriter.create( - contractOp.getLoc(), rhs, lhs, acc, - rewriter.getAffineMapArrayAttr({rhsMap, lhsMap, accMap}), - contractOp.getIteratorTypesAttr()); - swappedOp->setDiscardableAttrs(contractOp->getDiscardableAttrDictionary()); - - acc = cast(swappedOp.getResult()); - acc = swapDims(rewriter, acc, accN, accM); - - rewriter.replaceOp(contractOp, acc); - } - - /// If one of the operands is transposed, while the other isn't, the - /// transformation boils down to an operand swap and result transpose. This - /// happens because transposing and swapping both operands, preserves the - /// structure of the contraction. For example: - /// - /// def matmul_transpose_b(A, B): - /// B.T = transpose(B) - /// C = A @ B.T - /// return C - /// - /// def matmul_transpose_b_swapped(A, B): - /// A.T = transpose(A) - /// C.T = B @ A.T - /// C = transpose(C.T) - /// return C - /// - /// matmul_transpose_b(B, A) = matmul_transpose_b_swapped(B, A).T - /// - /// For the sake of completeness, we also show that this does not hold - /// when no operands are transposed, or both operands are transposed: - /// - /// def matmul(A, B): - /// C = A @ B - /// return C - /// - /// def matmul_swapped(A, B): - /// A.T = transpose(A) - /// B.T = transpose(B) - /// C.T = B.T @ A.T - /// C = transpose(C.T) - bool isOperandSwapInvariant(vector::ContractionOp contractOp) const { - // Check if the innermost m, n, k dimensions are in the order: - // lhs: (m, k), rhs: (n, k) - VectorContractOpInfo opInfo = getOpInfo(contractOp); - auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - bool isLhsTransposed = lhsM > lhsK; - bool isRhsTransposed = rhsN < rhsK; - return isLhsTransposed != isRhsTransposed; - } - - /// Returns a vector.contract operation that this value was transitively - /// produced from. - /// - /// A chained matmul is one where the lhs of the candidate matrix - /// is a result of another matmul (a matmul lies in the backward slice of lhs - /// of the first matmul). - /// - /// TODO: This definition of a chained matmul is crude. We should actually be - /// checking if the layout of the result of the first matmul is transposed - /// to that expected by the second matmul. 
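The `isOperandSwapInvariant` check above reduces to comparing the positions of the M/K (resp. N/K) dims in the two operand maps: the swap is free exactly when one, and only one, operand is already transposed. A small self-contained sketch of that predicate, with plain integer positions standing in for what `VectorContractOpInfo` returns (illustrative only):

#include <cassert>

// lhsM/lhsK: result positions of the M and K dims in the lhs indexing map.
// rhsN/rhsK: result positions of the N and K dims in the rhs indexing map.
static bool isOperandSwapInvariant(int lhsM, int lhsK, int rhsN, int rhsK) {
  bool lhsTransposed = lhsM > lhsK; // lhs written as (k, m) rather than (m, k)
  bool rhsTransposed = rhsN < rhsK; // rhs written as (n, k) rather than (k, n)
  return lhsTransposed != rhsTransposed;
}

int main() {
  // matmul_transpose_b: lhs (m, k), rhs (n, k) -> exactly one operand transposed.
  assert(isOperandSwapInvariant(/*lhsM=*/0, /*lhsK=*/1, /*rhsN=*/0, /*rhsK=*/1));
  // plain matmul: lhs (m, k), rhs (k, n) -> neither operand transposed.
  assert(!isOperandSwapInvariant(/*lhsM=*/0, /*lhsK=*/1, /*rhsN=*/1, /*rhsK=*/0));
  return 0;
}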
- FailureOr - getTransitiveMatmulParent(vector::ContractionOp contractOp) const { - SetVector backwardSlice; - BackwardSliceOptions options; - options.inclusive = true; - getBackwardSlice(contractOp.getLhs(), &backwardSlice, options); - vector::ContractionOp result; - for (Operation *sliceOp : backwardSlice) { - auto chainParent = dyn_cast(sliceOp); - if (!chainParent) { - continue; - } - - // For now, we only support transpose invariant matmuls. This is because - // transposing the inputs may have a non-trivial cost which we need - // to think about. - // TODO: We should probably enable it always. Currently, this is - // only useful in Flash Attention, where the first matmul is generally - // a transpose. - if (!isOperandSwapInvariant(chainParent)) { - continue; - } - - // If we have multiple matmul parents, we fail. - if (result) { - return failure(); - } - - result = chainParent; - } - - if (result) { - return result; - } - - return failure(); - } - - void runOnOperation() override { - auto funcOp = getOperation(); - SmallVector matmulCandidates; - funcOp.walk([&](vector::ContractionOp contractOp) { - matmulCandidates.push_back(contractOp); - }); - - IRRewriter rewriter(funcOp.getContext()); - for (vector::ContractionOp candidate : matmulCandidates) { - FailureOr maybeChainedParent = - getTransitiveMatmulParent(candidate); - if (failed(maybeChainedParent)) { - continue; - } - auto chainParent = maybeChainedParent.value(); - swapOperandsAndTranspose(rewriter, chainParent); - - // TODO: We should be only transposing the second matrix if the - // result of the first matmul is used by the second matmul transitively. - swapOperandsAndTranspose(rewriter, candidate); - } - } -}; - -} // namespace -} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index 3159126442c8..73e039798deb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -85,7 +85,6 @@ iree_compiler_cc_library( iree_compiler_cc_library( name = "LLVMGPU", srcs = [ - "AMDGPUChainedMatmulPass.cpp", "ConvertToLLVM.cpp", "ConvertToNVVM.cpp", "ConvertToROCDL.cpp", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index fef9ca37e6aa..b33641bda92e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -70,7 +70,6 @@ iree_cc_library( "ROCDLKernelConfig.h" "ROCDLPasses.h" SRCS - "AMDGPUChainedMatmulPass.cpp" "ConvertToLLVM.cpp" "ConvertToNVVM.cpp" "ConvertToROCDL.cpp" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp index 466d7bd1bf80..1640656b71a8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp @@ -34,7 +34,6 @@ class ContractionVectorLayoutOptions : public VectorLayoutOptions { int64_t subgroupSize) : VectorLayoutOptions(root), patterns(root->getContext()) { populateGPUDistributionPatterns(patterns); - populateGPUDistributionLayoutAttrPatterns(laneId, patterns); populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize); populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td 
b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td index 06d9960f180f..80eb87c81964 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td @@ -13,34 +13,6 @@ include "mlir/Pass/PassBase.td" // LLVMGPU Passes (keep alphabetical) //------------------------------------------------------------------------------ -def AMDGPUPrepareForChainedMatmulPass : - InterfacePass<"iree-amdgpu-prepare-chained-matmul", "mlir::FunctionOpInterface"> { - let summary = "Pass to swap operands and transpose accumulator and result"; - let description = [{ - Given a chain of matmuls with some or no operations - in between, like - - d = matmul_transpose_b(a, b) + c - ... - e = matmul_transpose_b(d, f) + g - - this pattern transforms the above IR to - - c.t = transpose c - d = matmul_transpose_b(b, a) + c.t - d.t = transpose d - ... - g.t = transpose g - e = matmul_transpose_b(f, d.t) + g.t - e.t = transpose e - - On CDNA architectures, where the layouts of the RHS and result - are the same and transposed from the LHS layout, this type - of transformation can avoid trips to shared memory/shuffle instructions - on operators like Flash Attention. - }]; -} - // TODO: Bring the argument in line with the names used elsewhere. def ConvertToNVVMPass : Pass<"iree-convert-to-nvvm", "ModuleOp"> { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index a2c2187f3f14..b0022c452ab1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -1476,13 +1476,9 @@ transform_dialect::AMDGPUDistributeVectorsOp::applyToOne( rewriter.create(target.getLoc(), gpu::Dimension::x); populateGPUDistributionPatterns(patterns); - populateGPUDistributionLayoutAttrPatterns(laneId, patterns); - populateGPUReductionDistributionPatterns(patterns); // For testing we use subgroup size = 64. populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, /*subgroupSize=*/64); - populateAMDGPUDistributionPatterns(patterns); - populateGPULayoutResolutionDistributionPatterns(patterns); if (failed(distributeVectorOps(target, patterns, options))) { return emitDefaultSilenceableFailure(target); } @@ -1550,357 +1546,5 @@ transform_dialect::CreateMatmulMfmaTileSizesOp::apply( return DiagnosedSilenceableFailure::success(); } -//===----------------------------------------------------------------------===// -// SetContractionLayoutAttributes -//===----------------------------------------------------------------------===// - -/// This function creates a modified version of the MFMA layout that allows -/// for reading more elements from LDS. Specifically, the MFMA layout looks -/// something like this: -/// <<[ BATCHY, LANEX], [2, 16]>, <[ BATCHX, LANEY, VECTORX], [8, 4, 4]>> -/// Here VECTORX specifies how many elements can be read from LDS. -/// Now, in order to read more elements from LDS, we can modify this layout -/// while maintaining the overall shape to: -/// <<[ BATCHY, LANEX], [2, 16]>, <[ BATCHX, LANEY, VECTORX], [4, 4, 8]>> -/// This is what this function does. In situations where the batch dimension -/// is too small, or if we are not transferring 4 elements at a time, it -/// returns nullopt. 
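The read-layout rewrite described in the comment above keeps the per-dimension product constant while trading a batch factor for a wider contiguous read: [8, 4, 4] becomes [4, 4, 8]. A minimal sketch of that bookkeeping, using a hypothetical label enum in place of the real `LayoutDimension` attribute machinery:

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

enum class Label { Batch, Lane, VectorX };

// Halve the batch factor and double a 4-wide VECTORX read; bail out when the
// batch is already 1 or the read is not exactly 4 elements wide.
static std::optional<std::vector<int64_t>>
widenReadShape(const std::vector<Label> &labels, std::vector<int64_t> shapes) {
  for (size_t i = 0; i < labels.size(); ++i) {
    if (labels[i] == Label::Batch) {
      if (shapes[i] == 1)
        return std::nullopt;
      shapes[i] /= 2;
    } else if (labels[i] == Label::VectorX) {
      if (shapes[i] != 4)
        return std::nullopt;
      shapes[i] *= 2;
    }
  }
  return shapes;
}

int main() {
  auto widened =
      widenReadShape({Label::Batch, Label::Lane, Label::VectorX}, {8, 4, 4});
  assert(widened && *widened == std::vector<int64_t>({4, 4, 8}));
  return 0;
}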
-static std::optional -createReadLayout(MLIRContext *ctx, const VectorExt::LayoutAttr &layout) { - SmallVector perDimLayouts; - for (VectorExt::PerDimLayoutAttr perDimLayout : layout.getLayouts()) { - DenseSet labels; - for (VectorExt::LayoutDimensionAttr dim : perDimLayout.getLabels()) { - labels.insert(dim.getValue()); - } - if (!labels.contains(VectorExt::LayoutDimension::VECTORX)) { - perDimLayouts.push_back(perDimLayout); - continue; - } - SmallVector newShapes; - for (auto [label, shape] : - llvm::zip_equal(perDimLayout.getLabels(), perDimLayout.getShapes())) { - if (VectorExt::isBatchDimension(label.getValue())) { - if (shape == 1) - return std::nullopt; - newShapes.push_back(shape / 2); - continue; - } - if (label.getValue() == VectorExt::LayoutDimension::VECTORX) { - if (shape != 4) - return std::nullopt; - newShapes.push_back(shape * 2); - continue; - } - newShapes.push_back(shape); - } - perDimLayouts.push_back(VectorExt::PerDimLayoutAttr::get( - ctx, perDimLayout.getLabels(), newShapes)); - } - return VectorExt::LayoutAttr::get(ctx, perDimLayouts); -} - -// Struct containing concrete MMA shape, type, and layout information. -struct ConcreteMmaLayout { - GPU::OpaqueMmaLayout base; - VectorExt::PerDimLayoutAttr aMLayout; - VectorExt::PerDimLayoutAttr aKLayout; - VectorExt::PerDimLayoutAttr bKLayout; - VectorExt::PerDimLayoutAttr bNLayout; - VectorExt::PerDimLayoutAttr cMLayout; - VectorExt::PerDimLayoutAttr cNLayout; -}; - -static std::tuple -getPerDimLayoutAttrs(MLIRContext *context, TileSwizzle swizzle) { - // Step 1: obtain the swizzled tile shape, but keeping track of the source - // dimension indices. - struct SrcIndexAndSwizzleDim { - size_t srcIndex; - TileSwizzle::Dim dim; - }; - SmallVector swizzledShape; - for (auto [i, e] : llvm::enumerate(swizzle.expandShape)) { - for (TileSwizzle::Dim d : e) { - swizzledShape.push_back(SrcIndexAndSwizzleDim{i, d}); - } - } - applyPermutationToVector(swizzledShape, swizzle.permutation); - - // Step 2: collect the appropriate labels to use for the swizzled dims. - VectorExt::LayoutDimension internalLabels[] = { - VectorExt::LayoutDimension::VECTORZ, VectorExt::LayoutDimension::VECTORY, - VectorExt::LayoutDimension::VECTORX}; - VectorExt::LayoutDimension crossThreadLabels[] = { - VectorExt::LayoutDimension::LANEZ, VectorExt::LayoutDimension::LANEY, - VectorExt::LayoutDimension::LANEX}; - auto internalLabelIter = std::end(internalLabels); - auto crossThreadLabelIter = std::end(crossThreadLabels); - for (SrcIndexAndSwizzleDim d : swizzledShape) { - if (d.dim.kind == TileSwizzle::Dim::Kind::Internal) { - assert(internalLabelIter != std::begin(internalLabels)); - --internalLabelIter; - } else if (d.dim.kind == TileSwizzle::Dim::Kind::CrossThread) { - assert(crossThreadLabelIter != std::begin(crossThreadLabels)); - --crossThreadLabelIter; - } else { - assert(false && "unexpected dimension kind in intrinsic swizzle"); - } - } - - // Step 3: put together the result PerDimLayoutAttr'd for the two source dims. - SmallVector labels[2]; - SmallVector shape[2]; - for (SrcIndexAndSwizzleDim d : swizzledShape) { - shape[d.srcIndex].push_back(d.dim.size); - auto &labelIterRef = (d.dim.kind == TileSwizzle::Dim::Kind::Internal) - ? 
internalLabelIter - : crossThreadLabelIter; - labels[d.srcIndex].push_back(VectorExt::LayoutDimensionAttr::get( - context, static_cast(*labelIterRef++))); - } - return {VectorExt::PerDimLayoutAttr::get(context, labels[0], shape[0]), - VectorExt::PerDimLayoutAttr::get(context, labels[1], shape[1])}; -}; - -static ConcreteMmaLayout getConcreteMMALayout(MLIRContext *context, - GPU::MMAIntrinsic intrinsic) { - auto opaque = GPU::getOpaqueMMALayout(context, intrinsic); - ConcreteMmaLayout concreteLayout; - concreteLayout.base = opaque; - auto lhsSwizzle = getIntrinsicSwizzle(intrinsic, GPU::MMAFragment::Lhs); - auto rhsSwizzle = getIntrinsicSwizzle(intrinsic, GPU::MMAFragment::Rhs); - auto accSwizzle = getIntrinsicSwizzle(intrinsic, GPU::MMAFragment::Acc); - std::tie(concreteLayout.aMLayout, concreteLayout.aKLayout) = - getPerDimLayoutAttrs(context, lhsSwizzle); - std::tie(concreteLayout.bNLayout, concreteLayout.bKLayout) = - getPerDimLayoutAttrs(context, rhsSwizzle); - std::tie(concreteLayout.cMLayout, concreteLayout.cNLayout) = - getPerDimLayoutAttrs(context, accSwizzle); - return concreteLayout; -} - -static VectorExt::PerDimLayoutAttr -getBatchedPerDimLayoutAttr(VectorExt::LayoutDimensionAttr batchDim, - VectorExt::PerDimLayoutAttr baseLayout, - int64_t problemSize, int64_t fragmentDimSize) { - assert(problemSize % fragmentDimSize == 0 && - "invalid layout fragment for problem size"); - - SmallVector dimAttrs( - baseLayout.getLabels()); - dimAttrs.insert(dimAttrs.begin(), batchDim); - - SmallVector shapes(baseLayout.getShapes()); - shapes.insert(shapes.begin(), problemSize / fragmentDimSize); - auto layout = VectorExt::PerDimLayoutAttr::get(baseLayout.getContext(), - dimAttrs, shapes); - return layout; -} - -// Get the batched layout attributes for the given fragment layouts, indexing -// map, and problem shape. The canonical fragment map is used to compare against -// the problem map |indexingMap|. For example, for mma fragment B (RHS): -// -// indexingMap = affine_map<(d0, d1, d2) -> (d1, d2) # Transposed B -// fragmentMap = affine_map<(d0, d1, d2) -> (d2, d1) -// problemShape = [32, 64] -// fragmentSize = [16, 8] -// fragmentLayouts = [kLayout, nLayout] -// -// Gives batched layout -// -// Dim0 Layout = [BATCHX, nLayoutLabels], [8, nLayoutShape] -// Dim1 Layout = [BATCHY, kLayoutLabels], [2, kLayoutShape] -static VectorExt::LayoutAttr -getBatchedLayoutAttr(AffineMap indexingMap, AffineMap fragmentMap, - ArrayRef problemShape, - ArrayRef fragmentSize, - ArrayRef fragmentLayouts) { - // Current distribution to MFMA operations does not support batched - // contractions so that is reflected here. 
- assert(indexingMap.getNumResults() == 2 && - "invalid indexing map to non-batched simple contraction"); - - VectorExt::LayoutDimensionAttr batchX = VectorExt::LayoutDimensionAttr::get( - indexingMap.getContext(), VectorExt::LayoutDimension::BATCHX); - VectorExt::LayoutDimensionAttr batchY = VectorExt::LayoutDimensionAttr::get( - indexingMap.getContext(), VectorExt::LayoutDimension::BATCHY); - - SmallVector perDimAttrs; - for (auto [expr, batchType] : llvm::zip_equal( - indexingMap.getResults(), - SmallVector{batchX, batchY})) { - auto maybeResultPosition = fragmentMap.getResultPosition(expr); - assert(maybeResultPosition && "fragment map and problem map mismatch"); - int64_t idx = *maybeResultPosition; - perDimAttrs.push_back(getBatchedPerDimLayoutAttr( - batchType, fragmentLayouts[idx], problemShape[idx], fragmentSize[idx])); - } - - return VectorExt::LayoutAttr::get(indexingMap.getContext(), perDimAttrs); -} - -static FailureOr> -getContractionLayout(vector::ContractionOp contract, ConcreteMmaLayout layout) { - MLIRContext *context = contract.getContext(); - FailureOr maybeContractionDims = - linalg::inferContractionDims(contract.getIndexingMapsArray()); - if (failed(maybeContractionDims)) { - return failure(); - } - auto contractionDims = *maybeContractionDims; - // TODO: Relax this condition to strictly alignment requirements. - if (contractionDims.k.size() != 1 || contractionDims.m.size() != 1 || - contractionDims.n.size() != 1) { - return failure(); - } - // TODO: Support batched contractions. - if (contractionDims.batch.size() > 0) { - return failure(); - } - unsigned mDim = contractionDims.m[0]; - unsigned nDim = contractionDims.n[0]; - unsigned kDim = contractionDims.k[0]; - - SmallVector iterationBounds; - contract.getIterationBounds(iterationBounds); - - int64_t problemMSize = iterationBounds[mDim]; - int64_t problemNSize = iterationBounds[nDim]; - int64_t problemKSize = iterationBounds[kDim]; - - int64_t mSize = layout.base.mSize; - int64_t nSize = layout.base.nSize; - int64_t kSize = layout.base.kSize; - - // The problem size currently must be strictly aligned to the size of the mma. - // This is expected to succeed assuming the correct [masked] vector size was - // set at strategy configuration time (for this mma). 
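The batch sizes fed into `getBatchedPerDimLayoutAttr` above are simply `problemSize / fragmentDimSize`, and the alignment check that follows requires that division to be exact. A small worked sketch of the arithmetic; the concrete sizes mirror the 32x128 operand on a 16x16x16 intrinsic from the deleted tests further down, but are otherwise illustrative:

#include <cassert>
#include <cstdint>

// One batch per fragment tile along each problem dimension.
static int64_t numBatches(int64_t problemSize, int64_t fragmentDimSize) {
  assert(problemSize % fragmentDimSize == 0 && "problem must align to the mma");
  return problemSize / fragmentDimSize;
}

int main() {
  // A 32x128 lhs on a 16x16x16 intrinsic: BATCHX = 32/16 = 2, BATCHY = 128/16 = 8,
  // matching the [2, 16] / [8, 4, 4] per-dim shapes used in the deleted tests.
  assert(numBatches(32, 16) == 2);
  assert(numBatches(128, 16) == 8);
  return 0;
}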
- if (problemMSize % mSize != 0 || problemNSize % nSize || - problemKSize % kSize) { - return failure(); - } - - VectorExt::LayoutAttr aLayout = getBatchedLayoutAttr( - contract.getIndexingMapsArray()[0], - AffineMap::getMultiDimMapWithTargets(3, {mDim, kDim}, context), - {problemMSize, problemKSize}, {mSize, kSize}, - {layout.aMLayout, layout.aKLayout}); - VectorExt::LayoutAttr bLayout = getBatchedLayoutAttr( - contract.getIndexingMapsArray()[1], - AffineMap::getMultiDimMapWithTargets(3, {kDim, nDim}, context), - {problemKSize, problemNSize}, {kSize, nSize}, - {layout.bKLayout, layout.bNLayout}); - VectorExt::LayoutAttr cLayout = getBatchedLayoutAttr( - contract.getIndexingMapsArray()[2], - AffineMap::getMultiDimMapWithTargets(3, {mDim, nDim}, context), - {problemMSize, problemNSize}, {mSize, nSize}, - {layout.cMLayout, layout.cNLayout}); - - return std::make_tuple(aLayout, bLayout, cLayout); -} - -FailureOr> static getContractionLayout(GPU::MMAAttr mma, - vector::ContractionOp - contract) { - ConcreteMmaLayout layout = getConcreteMMALayout( - contract->getContext(), mma.getIntrinsic().getValue()); - return getContractionLayout(contract, layout); -} - -DiagnosedSilenceableFailure -transform_dialect::SetContractionLayoutAttributes::apply( - transform::TransformRewriter &rewriter, - transform::TransformResults &results, transform::TransformState &state) { - auto payloadList = state.getPayloadOps(getTarget()); - auto typeList = state.getParams(getMmaType()); - if (typeList.size() != 1) { - return emitDefiniteFailure() - << "invalid more than one attribute for contraction annotation"; - } - auto mmaType = llvm::dyn_cast(typeList.front()); - if (!mmaType) { - return emitDefiniteFailure() - << "invalid non-mma attribute for contraction annotation " - << typeList.front(); - } - - for (Operation *payload : payloadList) { - auto contract = llvm::dyn_cast(payload); - if (!contract) { - return emitDefiniteFailure() - << "invalid non-contraction annotation " << payload; - } - - auto maybeLayouts = getContractionLayout(mmaType, contract); - if (failed(maybeLayouts)) { - return emitDefiniteFailure() - << "invalid opaque mma layout for annotation " << mmaType; - } - - Location loc = contract.getLoc(); - auto [aLayout, bLayout, cLayout] = *maybeLayouts; - - // Set packed read layout for specified indices. - ArrayRef operandIndices = getReadLayoutIndices(); - if (!operandIndices.empty()) { - SmallVector operands; - SmallVector layouts; - for (int64_t index : operandIndices) { - operands.push_back(contract.getOperand(index)); - layouts.push_back(index == 0 ? aLayout : bLayout); - } - rewriter.setInsertionPoint(contract); - for (const auto &idxAndVals : - llvm::enumerate(llvm::zip_equal(operands, layouts))) { - int64_t i = idxAndVals.index(); - auto [operand, layoutInterface] = idxAndVals.value(); - VectorExt::LayoutAttr layout = - dyn_cast(layoutInterface); - std::optional maybeReadLayout = - createReadLayout(rewriter.getContext(), layout); - if (!maybeReadLayout) - continue; - VectorExt::LayoutAttr readLayout = maybeReadLayout.value(); - Operation *parentOp = operand.getDefiningOp(); - if (!parentOp || (parentOp->getNumResults() != 1)) - continue; - Value resolvedOperand = - rewriter.create(loc, operand, readLayout); - contract.setOperand(operandIndices[i], resolvedOperand); - } - } - - // Set layout anchors. 
- rewriter.setInsertionPoint(contract); - Value newLhs = - rewriter.create(loc, contract.getLhs(), aLayout); - Value newRhs = - rewriter.create(loc, contract.getRhs(), bLayout); - Value newAcc = - rewriter.create(loc, contract.getAcc(), cLayout); - contract.setOperand(0, newLhs); - contract.setOperand(1, newRhs); - contract.setOperand(2, newAcc); - - // Set intrinsic type. - contract->setAttr("iree.amdgpu.mma", mmaType); - } - - return DiagnosedSilenceableFailure::success(); -} - -void transform_dialect::SetContractionLayoutAttributes::getEffects( - SmallVectorImpl &effects) { - transform::onlyReadsHandle(getTargetMutable(), effects); - transform::onlyReadsHandle(getMmaTypeMutable(), effects); - transform::modifiesPayload(effects); -} - #define GET_OP_CLASSES #include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.cpp.inc" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td index ac3e7eef7513..69e766537c0b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td @@ -736,24 +736,4 @@ def CreateMatmulMfmaTileSizesOp : let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; } -def SetContractionLayoutAttributes : - Op, - DeclareOpInterfaceMethods]> { - let description = [{ - Infers and sets the layout of the target contraction op based on the given - MFMA attribute. The optional read_layout_indices attribute determines whether - to apply a modified version of the MFMA layout to the operands of - the contracts that enables loading a greater number of elements from LDS. - If empty, the read layout is not applied to any operand. 0 specifies - LHS and 1 RHS. - }]; - - let arguments = (ins TransformHandleTypeInterface:$target, - TransformParamTypeInterface:$mma_type, - DefaultValuedOptionalAttr:$read_layout_indices); - let assemblyFormat = "$target `,` $mma_type attr-dict `:` type($target) `,` type($mma_type)"; - let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; -} - #endif // IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp deleted file mode 100644 index 48de6cdaeef8..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h" -#include "iree/compiler/Codegen/Common/VectorLayoutAnalysis.h" -#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" -#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h" -#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h" -#include "iree/compiler/Codegen/Utils/VectorOpUtils.h" - -namespace mlir::iree_compiler { - -using namespace mlir::iree_compiler::IREE::VectorExt; -using VectorValue = TypedValue; - -namespace { - -struct DistributeContractions final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(vector::ContractionOp contractOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( - contractOp.getIndexingMapsArray()); - if (failed(maybeOpInfo)) { - return rewriter.notifyMatchFailure(contractOp, "invalid contraction"); - } - VectorContractOpInfo opInfo = maybeOpInfo.value(); - - VectorValue result = dyn_cast(contractOp.getResult()); - if (!result) { - return rewriter.notifyMatchFailure(contractOp, - "result should be of type vector"); - } - - LayoutAttr resultLayout = dyn_cast(signature[result]); - if (!resultLayout) { - return rewriter.notifyMatchFailure( - contractOp, "result layout should be of type LayoutAttr"); - } - - auto mmaAttr = - contractOp->getAttrOfType("iree.amdgpu.mma"); - if (!mmaAttr) { - return rewriter.notifyMatchFailure( - contractOp, "missing iree.amdgpu.mma intrinsic attribute"); - } - - constexpr int LHS = 0; - constexpr int RHS = 1; - constexpr int ACC = 2; - SmallVector operands; - SmallVector layouts; - for (Value operand : contractOp->getOperands()) { - if (auto vectorOperand = dyn_cast(operand)) { - auto layout = signature[vectorOperand]; - if (auto vectorLayout = dyn_cast(layout)) { - operands.push_back(vectorOperand); - layouts.push_back(vectorLayout); - } - } - } - - Type elementType = - llvm::cast(operands[ACC].getType()).getElementType(); - SmallVector vectorShape = resultLayout.getDistributedShape(); - auto vectorType = VectorType::get(vectorShape, elementType); - Location loc = contractOp.getLoc(); - Value vector = rewriter.create( - loc, vectorType, rewriter.getZeroAttr(vectorType)); - - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - - std::optional kBatch = layouts[LHS].getBatchDim(lhsK); - if (!kBatch) { - return failure(); - } - - auto contractFn = [&](const LayoutIterator::State &state) { - auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - SmallVector indices = state.computeIteratorProjectedSIMTIndex(); - Value dMatrix = rewriter.create( - loc, getDistributed(rewriter, operands[ACC], layouts[ACC]), indices); - for (int k = 0; k < kBatch; ++k) { - SmallVector lhsIndices(2); - SmallVector rhsIndices(2); - lhsIndices[lhsM] = indices[0]; - lhsIndices[lhsK] = k; - rhsIndices[rhsN] = indices[1]; - rhsIndices[rhsK] = k; - - Value aMatrix = rewriter.create( - loc, getDistributed(rewriter, operands[LHS], layouts[LHS]), - lhsIndices); - - Value bMatrix = rewriter.create( - loc, getDistributed(rewriter, operands[RHS], layouts[RHS]), - rhsIndices); - - dMatrix = mmaAttr - .buildMmaOperation(rewriter, loc, dMatrix.getType(), - aMatrix, bMatrix, dMatrix) - .value(); - } - vector = rewriter.create(loc, dMatrix, vector, indices); - return success(); - }; - - LayoutIterator 
iterator(resultLayout); - LayoutIterator batchIterator = iterator.getBatchIterator(); - batchIterator.apply(contractFn); - replaceOpWithDistributedValues(rewriter, contractOp, vector); - return success(); - } -}; -} // namespace - -void populateAMDGPUDistributionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} - -} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel index eeb97bf8c033..113c6d56598f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel @@ -17,7 +17,6 @@ package( iree_compiler_cc_library( name = "Utils", srcs = [ - "AMDGPUDistributionPatterns.cpp", "LLVMGPUUtils.cpp", "PrefetchSharedMemoryCopy.cpp", ], diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt index ccd7f4bef826..6b66e96ded1f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt @@ -16,7 +16,6 @@ iree_cc_library( HDRS "LLVMGPUUtils.h" SRCS - "AMDGPUDistributionPatterns.cpp" "LLVMGPUUtils.cpp" "PrefetchSharedMemoryCopy.cpp" DEPS diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h index ac4f79211d42..07d9ca5ed186 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h @@ -18,10 +18,6 @@ namespace mlir::iree_compiler { void createAsyncGroups(RewriterBase &rewriter, mlir::FunctionOpInterface funcOp, bool useMMASync); -/// Function to do layout analysis and distribution. -void doLayoutAnalysisAndDistribution(RewriterBase &rewriter, - mlir::FunctionOpInterface funcOp); - /// Function to reorder transposes and elementwise ops. void reorderTranspose(RewriterBase &rewriter, mlir::FunctionOpInterface funcOp); @@ -33,9 +29,6 @@ void reorderTranspose(RewriterBase &rewriter, mlir::FunctionOpInterface funcOp); /// from the previous alias group before starting a new one. void packSharedMemoryAlloc(mlir::FunctionOpInterface funcOp); -// Add patterns to distribute contractions to MFMA ops. -void populateAMDGPUDistributionPatterns(RewritePatternSet &patterns); - // Prefetches data written to shared memory for the next iteration. Returns the // new loop on success or failure when the `forOp` is not supported. 
FailureOr prefetchSharedMemoryCopy(RewriterBase &rewriter, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 0256a74f2ecd..ff000d6715ca 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -18,9 +18,6 @@ iree_lit_test_suite( name = "lit", srcs = enforce_glob( [ - "amdgpu_chained_matmul.mlir", - "amdgpu_contraction_distribution.mlir", - "amdgpu_set_anchor_layouts.mlir", "assign_constant_ordinals.mlir", "conv_pipeline_test_cuda.mlir", "convert_to_nvvm.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 635a49df1694..e46b413d20bf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -14,9 +14,6 @@ iree_lit_test_suite( NAME lit SRCS - "amdgpu_chained_matmul.mlir" - "amdgpu_contraction_distribution.mlir" - "amdgpu_set_anchor_layouts.mlir" "assign_constant_ordinals.mlir" "cast_address_space_function.mlir" "cast_type_to_fit_mma.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_chained_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_chained_matmul.mlir deleted file mode 100644 index f1d666579302..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_chained_matmul.mlir +++ /dev/null @@ -1,189 +0,0 @@ -// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-amdgpu-prepare-chained-matmul),canonicalize,cse)" %s | FileCheck %s - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - // CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> - // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> - // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)> - func.func @chained_matmul(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16>, - // CHECK: func.func @chained_matmul(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - // CHECK-SAME: %[[RHS2:.*]]: vector<8x16xf16>, %[[ACC2:.*]]: vector<32x8xf16> - %rhs2 : vector<8x16xf16>, %acc2 : vector<32x8xf16>) -> vector<32x8xf16> { - // CHECK: %[[TRANS_ACC:.*]] = vector.transpose %[[ACC]], [1, 0] : vector<32x16xf16> to vector<16x32xf16> - // CHECK: %[[TRANS_RES:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - // CHECK-SAME: %[[RHS]], %[[LHS]], %[[TRANS_ACC]] : vector<16x8xf16>, vector<32x8xf16> into vector<16x32xf16> - // CHECK: %[[RES:.*]] = vector.transpose %[[TRANS_RES]], [1, 0] : vector<16x32xf16> to vector<32x16xf16> - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - // CHECK: %[[EXP:.*]] = math.exp2 %[[RES]] : vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - // CHECK: %[[TRANS_ACC2:.*]] = vector.transpose %[[ACC2]], [1, 0] : vector<32x8xf16> to vector<8x32xf16> - // CHECK: %[[TRANS_RES2:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = 
#vector.kind} - // CHECK-SAME: %[[RHS2]], %[[EXP]], %[[TRANS_ACC2]] : vector<8x16xf16>, vector<32x16xf16> into vector<8x32xf16> - // CHECK: %[[RES2:.*]] = vector.transpose %[[TRANS_RES2]], [1, 0] : vector<8x32xf16> to vector<32x8xf16> - %result2 = vector.contract #trait0 %exp, %rhs2, %acc2 - : vector<32x16xf16>, vector<8x16xf16> into vector<32x8xf16> - func.return %result2 : vector<32x8xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - func.func @non_chained_matmul(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16> - // CHECK: func.func @non_chained_matmul(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - ) -> vector<32x16xf16> { - // CHECK-NOT: vector.transpose - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - func.return %exp : vector<32x16xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - func.func @chained_matmul_second_operand(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16>, - // CHECK: func.func @chained_matmul_second_operand(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - %lhs2 : vector<32x16xf16>, %acc2 : vector<32x32xf16>) -> vector<32x32xf16> { - // CHECK-NOT: vector.transpose - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - %result2 = vector.contract #trait0 %lhs2, %exp, %acc2 - : vector<32x16xf16>, vector<32x16xf16> into vector<32x32xf16> - func.return %result2 : vector<32x32xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#accesses1 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -#trait1 = { - indexing_maps = #accesses1, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - func.func @chained_matmul_mmt_mm(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16>, - // CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> - // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> - // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)> - // CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d2, d0)> - // CHECK: func.func @chained_matmul_mmt_mm(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - // CHECK-SAME: %[[RHS2:.*]]: vector<16x8xf16>, %[[ACC2:.*]]: vector<32x8xf16> - %rhs2 : vector<16x8xf16>, %acc2 : vector<32x8xf16>) -> vector<32x8xf16> { - // CHECK: %[[TRANS_ACC:.*]] = vector.transpose %[[ACC]], [1, 0] : vector<32x16xf16> to vector<16x32xf16> - // CHECK: %[[TRANS_RES:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", 
"parallel", "reduction"], kind = #vector.kind} - // CHECK-SAME: %[[RHS]], %[[LHS]], %[[TRANS_ACC]] : vector<16x8xf16>, vector<32x8xf16> into vector<16x32xf16> - // CHECK: %[[RES:.*]] = vector.transpose %[[TRANS_RES]], [1, 0] : vector<16x32xf16> to vector<32x16xf16> - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - // CHECK: %[[EXP:.*]] = math.exp2 %[[RES]] : vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - // CHECK: %[[TRANS_ACC2:.*]] = vector.transpose %[[ACC2]], [1, 0] : vector<32x8xf16> to vector<8x32xf16> - // CHECK: %[[TRANS_EXP:.*]] = vector.transpose %[[EXP]], [1, 0] : vector<32x16xf16> to vector<16x32xf16> - // CHECK: %[[TRANS_RHS2:.*]] = vector.transpose %[[RHS2]], [1, 0] : vector<16x8xf16> to vector<8x16xf16> - // CHECK: %[[TRANS_RES2:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP3]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - // CHECK-SAME: %[[TRANS_RHS2]], %[[TRANS_EXP]], %[[TRANS_ACC2]] : vector<8x16xf16>, vector<16x32xf16> into vector<8x32xf16> - // CHECK: %[[RES2:.*]] = vector.transpose %[[TRANS_RES2]], [1, 0] : vector<8x32xf16> to vector<32x8xf16> - %result2 = vector.contract #trait1 %exp, %rhs2, %acc2 - : vector<32x16xf16>, vector<16x8xf16> into vector<32x8xf16> - func.return %result2 : vector<32x8xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(b, m1, m2, n, k) -> (b, m2, m1, k)>, - affine_map<(b, m1, m2, n, k) -> (b, n, k)>, - affine_map<(b, m1, m2, n, k) -> (b, m2, m1, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"] -} - -builtin.module { - // CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> - // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)> - // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d2)> - func.func @chained_matmul(%lhs : vector<17x64x32x8xf16>, - %rhs : vector<17x16x8xf16>, - %acc : vector<17x64x32x16xf16>, - %rhs2 : vector<17x8x16xf16>, - %acc2 : vector<17x64x32x8xf16>) -> vector<17x64x32x8xf16> { - - // CHECK: vector.transpose - // CHECK-NOT: vector.transpose - // CHECK: vector.contract - // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]] - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<17x64x32x8xf16>, vector<17x16x8xf16> into vector<17x64x32x16xf16> - - // transpose from result will fold with transpose of the acc of the next - // contract - - // CHECK: vector.transpose - // CHECK: vector.transpose - // CHECK-NOT: vector.transpose - // CHECK: vector.contract - // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]] - %result2 = vector.contract #trait0 %result, %rhs2, %acc2 - : vector<17x64x32x16xf16>, vector<17x8x16xf16> into vector<17x64x32x8xf16> - // CHECK: vector.transpose - - func.return %result2 : vector<17x64x32x8xf16> - } -} diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir deleted file mode 100644 index cc0688ac332e..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir +++ /dev/null @@ -1,319 +0,0 @@ -// RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --cse %s | FileCheck %s - -// Refer to the distribution pattern documentation for what layoutA, layoutB, -// layoutC means and how these 
layouts are assigned based on the instruction -// type. - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map3 = affine_map<(d0, d1, d2) -> (d1, d0)> - -// A: vector<16x16>, layout = layoutA -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 16]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 4, 4]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<16x16>, layout = transpose(layoutB) = layoutA -// Since shapes are also same, we can use the same layout attribute, layout_a. - -// C: vector<16x16>, layout = layoutC -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 4, 4]> -#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]> -#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2> -builtin.module attributes { transform.with_named_sequence } { - func.func @distribute_mfma_16x16x16_mmt(%a : vector<16x16xf16>, %b : vector<16x16xf16>, %c : vector<16x16xf32>) -> vector<16x16xf32> { - %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - "__vector_layout_test_anchor_operand_0" = #layout_a, - "__vector_layout_test_anchor_operand_1" = #layout_a, - "__vector_layout_test_anchor_operand_2" = #layout_c, - "__vector_layout_test_anchor_result_0" = #layout_c - } - %a, %b, %c : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> - return %output : vector<16x16xf32> - } - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %layout16x16x16 = transform.param.constant #layout -> !transform.any_param - transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param - - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: distribute_mfma_16x16x16_mmt - -// CHECK-SAME: %[[ARG0:.+]]: vector<16x16xf16>, %[[ARG1:.+]]: vector<16x16xf16>, %[[ARG2:.+]]: vector<16x16xf32> -// CHECK-DAG: %[[C:.+]] = iree_vector_ext.to_simt %[[ARG2]] : vector<16x16xf32> -> vector<1x1x4xf32> -// CHECK-DAG: %[[CV:.+]] = vector.extract %[[C]][0, 0] : vector<4xf32> from vector<1x1x4xf32> -// CHECK-DAG: %[[A:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[AV:.+]] = vector.extract %[[A]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[B:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16x16xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[BV:.+]] = vector.extract %[[B]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[OUT:.+]] = amdgpu.mfma %[[AV]] * %[[BV]] + %[[CV]] {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - -// ----- - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map3 = affine_map<(d0, d1, d2) -> (d1, d0)> - -// A: vector<32x128>, layout = layoutA -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [2, 16]> -#col_layout = 
#iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 4, 4]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<64x128>, layout = transpose(layoutB) = layoutA -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [4, 16]> -#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 4, 4]> -#layout_b = #iree_vector_ext.layout<#row_layout2, #col_layout2> - -// C: vector<32x64>, layout = layoutC -#row_layout3 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [2, 4, 4]> -#col_layout3 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [4, 16]> -#layout_c = #iree_vector_ext.layout<#row_layout3, #col_layout3> -builtin.module attributes { transform.with_named_sequence } { - func.func @distribute_mfma_16x16x16_mmt_batch(%a : vector<32x128xf16>, %b : vector<64x128xf16>, %c : vector<32x64xf32>) -> vector<32x64xf32> { - %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - "__vector_layout_test_anchor_operand_0" = #layout_a, - "__vector_layout_test_anchor_operand_1" = #layout_b, - "__vector_layout_test_anchor_operand_2" = #layout_c, - "__vector_layout_test_anchor_result_0" = #layout_c - } - %a, %b, %c : vector<32x128xf16>, vector<64x128xf16> into vector<32x64xf32> - return %output : vector<32x64xf32> - } - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %layout16x16x16 = transform.param.constant #layout -> !transform.any_param - transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param - - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: distribute_mfma_16x16x16_mmt_batch - -// CHECK-COUNT-64: amdgpu.mfma {{.*}}, vector<4xf32> - -// ----- - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map3 = affine_map<(d0, d1, d2) -> (d0, d1)> - -// A: vector<32x8>, layout = layoutA -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 32]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 2, 4]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<8x32>, layout = layoutB -#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 2, 4]> -#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]> -#layout_b = #iree_vector_ext.layout<#row_layout1, #col_layout1> - -// C: vector<32x32>, layout = layoutC -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]> -#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]> -#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2> -builtin.module attributes { transform.with_named_sequence } { - func.func @distribute_mfma_32x32x8_mm(%a : vector<32x8xf16>, %b : vector<8x32xf16>, %c : vector<32x32xf32>) -> vector<32x32xf32> { - %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - "__vector_layout_test_anchor_operand_0" = 
#layout_a, - "__vector_layout_test_anchor_operand_1" = #layout_b, - "__vector_layout_test_anchor_operand_2" = #layout_c, - "__vector_layout_test_anchor_result_0" = #layout_c - } - %a, %b, %c : vector<32x8xf16>, vector<8x32xf16> into vector<32x32xf32> - return %output : vector<32x32xf32> - } - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %layout32x32x8 = transform.param.constant #layout -> !transform.any_param - transform.iree.set_contraction_layout_attributes %contract, %layout32x32x8 : !transform.any_op, !transform.any_param - - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: distribute_mfma_32x32x8_mm - -// CHECK-SAME: %[[ARG0:.+]]: vector<32x8xf16>, %[[ARG1:.+]]: vector<8x32xf16>, %[[ARG2:.+]]: vector<32x32xf32> -// CHECK-DAG: %[[C:.+]] = iree_vector_ext.to_simt %[[ARG2]] : vector<32x32xf32> -> vector<1x1x16xf32> -// CHECK-DAG: %[[CV:.+]] = vector.extract %[[C]][0, 0] : vector<16xf32> from vector<1x1x16xf32> -// CHECK-DAG: %[[A:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<32x8xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[AV:.+]] = vector.extract %[[A]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[B:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<8x32xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[BV:.+]] = vector.extract %[[B]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[OUT:.+]] = amdgpu.mfma %[[AV]] * %[[BV]] + %[[CV]] {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> - -// ----- - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d2, d0)> -#map2 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map3 = affine_map<(d0, d1, d2) -> (d0, d1)> - -// A: vector<8x64>, layout = transpose(layoutA) = layoutB -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 2, 4]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [2, 32]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<8x32>, layout = layoutB -// We can use the same layout attribute, layout_a, since the shapes are same. 
-#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 2, 4]>
-#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]>
-#layout_b = #iree_vector_ext.layout<#row_layout1, #col_layout1>
-
-// C: vector<64x32>, layout = layoutC
-#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>
-#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]>
-#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2>
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @distribute_mfma_32x32x8_mtm(%a : vector<8x64xf16>, %b : vector<8x32xf16>, %c : vector<64x32xf32>) -> vector<64x32xf32> {
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
-        kind = #vector.kind<add>,
-        "__vector_layout_test_anchor_operand_0" = #layout_a,
-        "__vector_layout_test_anchor_operand_1" = #layout_b,
-        "__vector_layout_test_anchor_operand_2" = #layout_c,
-        "__vector_layout_test_anchor_result_0" = #layout_c
-        }
-        %a, %b, %c : vector<8x64xf16>, vector<8x32xf16> into vector<64x32xf32>
-    return %output : vector<64x32xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout32x32x8 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout32x32x8 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: distribute_mfma_32x32x8_mtm
-
-// CHECK-DAG: %[[A1:.+]] = vector.extract %[[A:.+]][0, 0] : vector<4xf16> from vector<1x2x4xf16>
-// CHECK-DAG: %[[B1:.+]] = vector.extract %[[B:.+]][0, 0] : vector<4xf16> from vector<1x1x4xf16>
-// CHECK-DAG: %{{.*}} = amdgpu.mfma %[[A1]] * %[[B1]]
-// CHECK-DAG: %[[A2:.+]] = vector.extract %[[A]][0, 1] : vector<4xf16> from vector<1x2x4xf16>
-// CHECK-DAG: %{{.*}} = amdgpu.mfma %[[A2]] * %[[B1]]
-// CHECK-NOT: amdgpu.mfma
-
-// -----
-
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-// A: vector<16x16>, layout = layoutA
-#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 16]>
-#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 1, 16]>
-#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout>
-
-// B: vector<16x16>, layout = transpose(layoutB) = layoutA
-// Since shapes are also same, we can use the same layout attribute, layout_a.
-
-// C: vector<16x16>, layout = layoutC
-#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>
-#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]>
-#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2>
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @distribute_wmma_16x16x16_mmt(%a : vector<16x16xf16>, %b : vector<16x16xf16>, %c : vector<16x16xf32>) -> vector<16x16xf32> {
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
-        kind = #vector.kind<add>,
-        "__vector_layout_test_anchor_operand_0" = #layout_a,
-        "__vector_layout_test_anchor_operand_1" = #layout_a,
-        "__vector_layout_test_anchor_operand_2" = #layout_c,
-        "__vector_layout_test_anchor_result_0" = #layout_c
-        }
-        %a, %b, %c : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
-    return %output : vector<16x16xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: distribute_wmma_16x16x16_mmt
-
-// CHECK-SAME: %[[ARG0:.+]]: vector<16x16xf16>, %[[ARG1:.+]]: vector<16x16xf16>, %[[ARG2:.+]]: vector<16x16xf32>
-// CHECK-DAG: %[[C:.+]] = iree_vector_ext.to_simt %[[ARG2]] : vector<16x16xf32> -> vector<1x1x8xf32>
-// CHECK-DAG: %[[CV:.+]] = vector.extract %[[C]][0, 0] : vector<8xf32> from vector<1x1x8xf32>
-// CHECK-DAG: %[[A:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x16xf16>
-// CHECK-DAG: %[[AV:.+]] = vector.extract %[[A]][0, 0] : vector<16xf16> from vector<1x1x16xf16>
-// CHECK-DAG: %[[B:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16x16xf16> -> vector<1x1x16xf16>
-// CHECK-DAG: %[[BV:.+]] = vector.extract %[[B]][0, 0] : vector<16xf16> from vector<1x1x16xf16>
-// CHECK-DAG: %[[OUT:.+]] = amdgpu.wmma %[[AV]] * %[[BV]] + %[[CV]] : vector<16xf16>, vector<16xf16>, vector<8xf32>
-
-// -----
-
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
-
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-// A: vector<32x128>, layout = layoutA
-#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [2, 16]>
-#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 1, 16]>
-#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout>
-
-// B: vector<64x128>, layout = transpose(layoutB) = layoutA
-#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [4, 16]>
-#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 1, 16]>
-#layout_b = #iree_vector_ext.layout<#row_layout2, #col_layout2>
-
-// C: vector<32x64>, layout = layoutC
-#row_layout3 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [2, 8, 2, 1]>
-#col_layout3 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [4, 16]>
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @distribute_wmma_16x16x16_mmt_batch(%a : vector<32x128xf16>, %b : vector<64x128xf16>, %c : vector<32x64xf32>) -> vector<32x64xf32> {
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
-        kind = #vector.kind<add>,
-        "__vector_layout_test_anchor_operand_0" = #layout_a,
-        "__vector_layout_test_anchor_operand_1" = #layout_b,
-        "__vector_layout_test_anchor_operand_2" = #layout_c,
-        "__vector_layout_test_anchor_result_0" = #layout_c
-        }
-        %a, %b, %c : vector<32x128xf16>, vector<64x128xf16> into vector<32x64xf32>
-    return %output : vector<32x64xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: distribute_wmma_16x16x16_mmt_batch
-
-// CHECK-COUNT-64: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
deleted file mode 100644
index 9a2e0ad01fa1..000000000000
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
+++ /dev/null
@@ -1,95 +0,0 @@
-// RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --cse %s --verify-diagnostics
-
-// This tests that the compiler is setting the correct layout anchors for various vectorOps and shapes.
-// Currently only testing on contraction layoutV1, but can be expanded to others.
-
-#layout = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @anchor_mfma_16x16x16_mmt(%a : memref<16x16xf16>, %b : memref<16x16xf16>, %init : vector<16x16xf32>) -> vector<16x16xf32> {
-    // CHECK-LABEL: anchor_mfma_16x16x16_mmt
-    %c0 = arith.constant 0 : index
-    %cst_0 = arith.constant 0.0 : f16
-    %lhs = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 4, 4]>>}}
-    %rhs = vector.transfer_read %b[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 4, 4]>>}}
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %init : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[ BATCHY, LANEX], [1, 16]>>}}
-    return %output : vector<16x16xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-#layout = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @anchor_mfma_16x16x16_mmt_batch(%a : memref<32x128xf16>, %b : memref<64x128xf16>, %init : vector<32x64xf32>) -> vector<32x64xf32> {
-    // CHECK-LABEL: anchor_mfma_16x16x16_mmt_batch
-    %c0 = arith.constant 0 : index
-    %cst_0 = arith.constant 0.0 : f16
-    %lhs = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x128xf16>, vector<32x128xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [2, 16]>, <[ BATCHY, LANEY, VECTORX], [8, 4, 4]>>}}
-    %rhs = vector.transfer_read %b[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16>, vector<64x128xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [4, 16]>, <[ BATCHY, LANEY, VECTORX], [8, 4, 4]>>}}
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %init : vector<32x128xf16>, vector<64x128xf16> into vector<32x64xf32>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [2, 4, 4]>, <[ BATCHY, LANEX], [4, 16]>>}}
-    return %output : vector<32x64xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @anchor_wmma_16x16x16_mmt(%a : memref<16x16xf16>, %b : memref<16x16xf16>, %init : vector<16x16xf32>) -> vector<16x16xf32> {
-    // CHECK-LABEL: anchor_wmma_16x16x16_mmt
-    %c0 = arith.constant 0 : index
-    %cst_0 = arith.constant 0.0 : f16
-    %lhs = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, VECTORX], [1, 16]>>}}
-    %rhs = vector.transfer_read %b[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, VECTORX], [1, 16]>>}}
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %init : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, VECTORX, LANEY], [1, 8, 2]>, <[ BATCHY, LANEX], [1, 16]>>}}
-    return %output : vector<16x16xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
-    transform.yield
-  }
-}