diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp index 0ef6e64d2c26..d72ac17b0e9e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributionPatterns.cpp @@ -27,65 +27,6 @@ using VectorValue = TypedValue; namespace { -/// Given the state of the iterator, compute the indices of the original vector -/// that the current iterator state is iterating over. These indices are -/// parameterized by the thread grid. -static SmallVector computeSIMDIndex(const LayoutIterator::State &state, - LayoutAttr layout, Value laneId, - RewriterBase &rewriter) { - MLIRContext *ctx = layout.getContext(); - AffineExpr threadX, threadY, threadZ; - bindSymbols(ctx, threadX, threadY, threadZ); - - SmallVector simdIndex; - // Calculate the index for each dim separately. - for (PerDimLayoutAttr dimLayout : layout.getLayouts()) { - AffineExpr offset = getAffineConstantExpr(0, ctx); - AffineExpr stride = getAffineConstantExpr(1, ctx); - for (auto [label, shape] : llvm::reverse( - llvm::zip(dimLayout.getLabels(), dimLayout.getShapes()))) { - int64_t position = state.lookup(label.getValue()).getPosition(); - - switch (label.getValue()) { - case LayoutDimension::LANEX: - offset = offset + stride * threadX; - break; - case LayoutDimension::LANEY: - offset = offset + stride * threadY; - break; - case LayoutDimension::LANEZ: - offset = offset + stride * threadZ; - break; - default: - offset = offset + stride * getAffineConstantExpr(position, ctx); - break; - } - stride = stride * getAffineConstantExpr(shape, ctx); - } - - auto [laneDimX, laneDimY, laneDimZ] = layout.getLaneGrid(); - SmallVector laneGrid = { - rewriter.create(laneId.getLoc(), laneDimZ), - rewriter.create(laneId.getLoc(), laneDimY), - rewriter.create(laneId.getLoc(), laneDimX)}; - FailureOr> maybeReversedLaneGridVals = - affine::delinearizeIndex(rewriter, laneId.getLoc(), laneId, laneGrid); - assert(succeeded(maybeReversedLaneGridVals) && - "Failed to delinearize lane index"); - SmallVector laneGridVals = {(*maybeReversedLaneGridVals)[2], - (*maybeReversedLaneGridVals)[1], - (*maybeReversedLaneGridVals)[0]}; - - // Compute the index for the dim. - AffineMap indexMap = AffineMap::get(0, 3, offset); - Value index = rewriter.create( - rewriter.getUnknownLoc(), indexMap, laneGridVals); - simdIndex.push_back(index); - } - - return simdIndex; -} - struct DistributeConstants final : OpDistributionPattern { using OpDistributionPattern::OpDistributionPattern; @@ -192,338 +133,6 @@ getReducedPermutation(AffineMap permutationMap, return permutation; } -template -struct DistributeXferLayoutAttr : OpDistributionPattern { - static_assert(std::is_same::value || - std::is_same::value, - "expected vector::TransferReadOp or vector::TransferWriteOp"); - - DistributeXferLayoutAttr(MLIRContext *context, Value laneId, - PatternBenefit benefit = 1) - : OpDistributionPattern(context, benefit), laneId(laneId) {} - - VectorValue accessMemory(OpTy xferOp, VectorValue accumulator, - LayoutAttr vectorLayout, - PatternRewriter &rewriter) const { - // We need to take special consideration of the permutation map when - // lowering. When accessing memory, we use the memoryLayout, because that - // is how the data is accessed in memory. The data is stored in the vector - // according to vectorLayout. 
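[Note] The removed computeSIMDIndex walks each dimension's labels from fastest- to slowest-varying, accumulating `offset += position * stride` and substituting the delinearized lane id for the LANEX/LANEY/LANEZ labels. A minimal standalone sketch of that arithmetic (plain C++, illustrative names, not the IREE API):

```cpp
#include <cstdint>
#include <string>
#include <vector>

// One label of a per-dim layout, listed slowest-varying first.
struct DimLabel {
  std::string name;  // e.g. "BATCHX", "LANEX", "VECTORX"
  int64_t shape;     // extent of this label
  int64_t position;  // iterator position; for LANE* labels, the lane coordinate
};

// Accumulate offset += position * stride over the labels in reverse order,
// mirroring the per-dimension affine expression the removed pattern emitted.
int64_t simdIndexForDim(const std::vector<DimLabel> &labels) {
  int64_t offset = 0;
  int64_t stride = 1;
  for (auto it = labels.rbegin(); it != labels.rend(); ++it) {
    offset += it->position * stride;
    stride *= it->shape;
  }
  return offset;
}
```

The lane coordinates themselves came from delinearizing the flat lane id over the (Z, Y, X) lane grid, which is why the removed code reversed the delinearization results before applying the map.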
- llvm::SmallBitVector unusedDims; - SmallVector permutation = - getReducedPermutation(xferOp.getPermutationMap(), unusedDims); - LayoutAttr memoryLayout = - cast(vectorLayout.permute(permutation)); - - int loadWidth = getLoadStoreWidth(memoryLayout); - DenseMap steps; - steps[LayoutDimension::VECTORX] = loadWidth; - LayoutIterator iterator(vectorLayout, steps); - - iterator.apply([&](const LayoutIterator::State &state) { - SmallVector memoryIndices = getMemoryIndices( - state, memoryLayout, xferOp.getIndices(), unusedDims, rewriter); - SmallVector accIndices = state.computeSIMTIndex(); - accumulator = accessUnit(xferOp, memoryIndices, accIndices, accumulator, - vectorLayout, memoryLayout, rewriter); - }); - - return accumulator; - } - - SmallVector getMemoryIndices(const LayoutIterator::State &state, - LayoutAttr memoryLayout, - SmallVector indices, - llvm::SmallBitVector &projectedDims, - RewriterBase &rewriter) const { - SmallVector simdIndices = - computeSIMDIndex(state, memoryLayout, laneId, rewriter); - SmallVector memoryIndices(indices); - - // The memory layout has some projected leading dims that indices doesn't. - int currSimd = 0; - for (int i = 0, e = memoryIndices.size(); i < e; ++i) { - if (projectedDims[i]) { - continue; - } - - memoryIndices[i] = rewriter.create( - rewriter.getUnknownLoc(), memoryIndices[i], simdIndices[currSimd]); - ++currSimd; - } - - return memoryIndices; - } - - virtual VectorValue accessUnit(OpTy xferOp, SmallVector &memoryIndices, - SmallVector &accIndices, - VectorValue accumulator, - LayoutAttr vectorLayout, - LayoutAttr memoryLayout, - PatternRewriter &rewriter) const = 0; - - int getLoadStoreWidth(LayoutAttr layout) const { - PerDimLayoutAttr fastestChanging = layout.getLayouts().back(); - if (std::optional width = - fastestChanging.getShape(LayoutDimension::VECTORX)) { - return *width; - } - return 1; - } - - Value laneId; -}; - -struct DistributeTransferReadLayoutAttr final - : DistributeXferLayoutAttr { - using DistributeXferLayoutAttr::DistributeXferLayoutAttr; - - LogicalResult matchAndRewrite(vector::TransferReadOp readOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - LayoutAttr vectorLayout = - dyn_cast(signature[readOp.getResult()]); - if (!vectorLayout) { - return failure(); - } - - // TODO: Return failure if we need masking. 
- - Type elementType = readOp.getSource().getType().getElementType(); - auto vectorType = - VectorType::get(vectorLayout.getDistributedShape(), elementType); - Value zero = rewriter.create( - readOp.getLoc(), vectorType, rewriter.getZeroAttr(vectorType)); - VectorValue acc = cast(zero); - - VectorValue readVec = accessMemory(readOp, acc, vectorLayout, rewriter); - - replaceOpWithDistributedValues(rewriter, readOp, readVec); - return success(); - } - - VectorValue accessUnit(vector::TransferReadOp readOp, - SmallVector &memoryIndices, - SmallVector &accIndices, - VectorValue accumulator, LayoutAttr vectorLayout, - LayoutAttr memoryLayout, - PatternRewriter &rewriter) const override { - auto unitType = VectorType::get({getLoadStoreWidth(memoryLayout)}, - accumulator.getType().getElementType()); - VectorValue load = rewriter.create( - readOp.getLoc(), unitType, readOp.getSource(), memoryIndices); - return rewriter.create( - readOp.getLoc(), load, accumulator, accIndices, - SmallVector{1}); - } -}; - -struct DistributeTransferWriteLayoutAttr final - : DistributeXferLayoutAttr { - using DistributeXferLayoutAttr::DistributeXferLayoutAttr; - - LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - LayoutAttr vectorLayout = - dyn_cast(signature[writeOp.getVector()]); - if (!vectorLayout) { - return failure(); - } - - if (writeOp.getMask()) { - return failure(); - } - - accessMemory(writeOp, writeOp.getVector(), vectorLayout, rewriter); - - rewriter.eraseOp(writeOp); - return success(); - } - - VectorValue accessUnit(vector::TransferWriteOp writeOp, - SmallVector &memoryIndices, - SmallVector &accIndices, - VectorValue accumulator, LayoutAttr vectorLayout, - LayoutAttr memoryLayout, - PatternRewriter &rewriter) const override { - int width = getLoadStoreWidth(memoryLayout); - - SmallVector strides(accIndices.size(), 1); - SmallVector shapes(accIndices.size(), 1); - shapes[shapes.size() - 1] = width; - Value result = rewriter.create( - writeOp.getLoc(), getDistributed(rewriter, accumulator, vectorLayout), - accIndices, shapes, strides); - result = rewriter.create( - writeOp.getLoc(), result, - SmallVector(accIndices.size() - 1, 0)); - rewriter.create(writeOp.getLoc(), result, - writeOp.getSource(), memoryIndices); - - return accumulator; - } -}; - -struct DistributeReductions final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - DistributeReductions(MLIRContext *context, int64_t maxBitsPerShuffle) - : OpDistributionPattern(context), maxBitsPerShuffle(maxBitsPerShuffle) {} - - static constexpr int64_t kDefaultSubgroupSize = 32; - - // Do parallel reduction using butterfly shuffles. 
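[Note] As a rough illustration of the butterfly pattern that the removed doThreadGlobalReduction lowered to gpu.shuffle xor ops, here is a host-side simulation over an array of lane values (assuming an additive reduction and a power-of-two subgroup; this is not GPU code):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Simulates the XOR-shuffle reduction across a subgroup: at each step every
// lane combines with the lane whose id differs by `step`, so after
// log2(laneSize) steps all participating lanes hold the same reduced value.
void butterflyReduce(std::vector<float> &laneVals, uint64_t shuffleOffset,
                     int64_t laneSize) {
  // laneVals.size() is the subgroup size, assumed to be a power of two that
  // covers shuffleOffset * laneSize.
  for (uint64_t step = shuffleOffset; step < shuffleOffset * laneSize;
       step <<= 1) {
    std::vector<float> partner(laneVals.size());
    for (size_t lane = 0; lane < laneVals.size(); ++lane)
      partner[lane] = laneVals[lane ^ step];  // analogue of gpu.shuffle xor
    for (size_t lane = 0; lane < laneVals.size(); ++lane)
      laneVals[lane] += partner[lane];        // combining kind: add
  }
}
```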
- Value doThreadGlobalReduction(Value result, uint64_t shuffleOffset, - int64_t laneSize, - vector::CombiningKind combiningKind, - int64_t entriesPerVector, Value mEmpty, - OpBuilder &rewriter, Location loc) const { - auto funcOp = result.getDefiningOp()->getParentOfType(); - std::optional maybeSubgroupSize = getSubgroupSize(funcOp); - if (!maybeSubgroupSize) - funcOp->emitWarning("No subgroup size specified, using default value = " + - Twine(kDefaultSubgroupSize)); - int64_t subgroupSize = maybeSubgroupSize.value_or(kDefaultSubgroupSize); - - Value mask; - assert(llvm::isPowerOf2_64(laneSize)); - for (uint64_t i = shuffleOffset; i < shuffleOffset * laneSize; i <<= 1) { - Value packed = packVectorToSupportedWidth(loc, rewriter, result); - auto shuffleOp = rewriter.create( - loc, packed, i, subgroupSize, gpu::ShuffleMode::XOR); - Value unpacked = - unpackToVector(loc, rewriter, shuffleOp.getShuffleResult(), - cast(result.getType())); - result = makeArithReduction(rewriter, loc, combiningKind, unpacked, - result, nullptr, mask); - } - - // Reduce packed vector with initial value. - Value reducedValue = rewriter.create( - loc, result, SmallVector{0}); - for (int i = 1; i < entriesPerVector; i++) { - Value next = rewriter.create(loc, result, - SmallVector{i}); - reducedValue = makeArithReduction(rewriter, loc, combiningKind, - reducedValue, next, nullptr, mask); - } - result = makeArithReduction(rewriter, loc, combiningKind, reducedValue, - mEmpty, nullptr, mask); - return result; - } - - // This pattern distributes reductions as follows: - // First, the data local to a specific thread is reduced. - // Then, the data between threads is reduced by emitting appropriate - // shuffle instructions. - // Currently, only 16 and 32 bit types are supported. - // TODO: Add ability to reduce n parallel dims together. - LogicalResult matchAndRewrite(vector::MultiDimReductionOp reductionOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - ArrayRef reductionDims = reductionOp.getReductionDims(); - // TODO: Add support for reductions along multiple dimensions. - if (reductionDims.size() > 1) - return failure(); - - VectorValue resultVec = dyn_cast(reductionOp.getResult()); - // TODO: Support results that are not vectors. - if (!resultVec) - return failure(); - LayoutAttr resultLayout = dyn_cast(signature[resultVec]); - if (!resultLayout) - return failure(); - - VectorValue source = reductionOp.getSource(); - ShapedType sourceType = llvm::cast(source.getType()); - // TODO: Add support for (n != 2)-D tensors. - if (sourceType.getRank() != 2) - return failure(); - - LayoutAttr sourceLayout = dyn_cast(signature[source]); - if (!sourceLayout) - return failure(); - - VectorValue acc = dyn_cast(reductionOp.getAcc()); - ShapedType accType = llvm::cast(acc.getType()); - Type elementType = accType.getElementType(); - int bitWidth = elementType.getIntOrFloatBitWidth(); - // TODO: Support additional bitwidths. 
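[Note] The 16/32-bit restriction below follows from packing elements into a single shuffle word: entriesPerVector = maxBitsPerShuffle / bitWidth, so two f16 values (or one f32) ride each 32-bit shuffle. A minimal sketch of the packing step (plain C++ standing in for the vector.bitcast the pattern emitted):

```cpp
#include <cstdint>

// Pack two 16-bit lanes into one 32-bit shuffle word, the scalar analogue of
// bitcasting vector<2xf16> to vector<1xi32> before gpu.shuffle.
uint32_t packTwoHalves(uint16_t lo, uint16_t hi) {
  return (static_cast<uint32_t>(hi) << 16) | lo;
}

// With maxBitsPerShuffle = 32: f16 packs 2 entries per word, f32 packs 1.
int64_t entriesPerVector(int64_t maxBitsPerShuffle, int64_t bitWidth) {
  return maxBitsPerShuffle / bitWidth;
}
```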
- if ((bitWidth != 16) && (bitWidth != 32)) - return failure(); - - Location loc = reductionOp.getLoc(); - auto storeVectorType = - VectorType::get(resultLayout.getDistributedShape(), elementType); - Value storeVec = rewriter.create( - loc, storeVectorType, rewriter.getZeroAttr(storeVectorType)); - - int reductionDim = reductionDims[0]; - int parallelDim = reductionDim ^ 1; - if (!sourceLayout.getLane(reductionDim)) - return failure(); - uint64_t shuffleOffset = sourceLayout.getShuffleOffset(reductionDim); - int64_t laneSize = sourceLayout.getLaneDim(reductionDim).value(); - if (!llvm::isPowerOf2_64(laneSize)) - return failure(); - vector::CombiningKind combiningKind = reductionOp.getKind(); - - auto reduceFn = [&](const LayoutIterator::State &state) { - SmallVector parallelSimtIndices = state.computeSIMTIndex(); - Value mEmpty = rewriter.create( - loc, getDistributed(rewriter, acc, resultLayout), - parallelSimtIndices); - - // Store one or more elements in packed vector depending on type. - int64_t entriesPerVector = maxBitsPerShuffle / bitWidth; - Value packedVector = rewriter.create( - loc, rewriter.getZeroAttr( - VectorType::get({entriesPerVector}, elementType))); - - int64_t index{0}; - Value result, mask; - // Thread-local reduction. - auto reduceLocalFn = [&](const LayoutIterator::State &state) { - SmallVector indices = state.computeSIMTIndex(); - Value element = rewriter.create( - loc, getDistributed(rewriter, source, sourceLayout), indices); - packedVector = rewriter.create( - loc, element, packedVector, SmallVector{index}); - index = (index + 1) % entriesPerVector; - // Reduce packed vector when full. - if (index == 0) { - result = result - ? makeArithReduction(rewriter, loc, combiningKind, - result, packedVector, nullptr, mask) - : packedVector; - } - }; - - LayoutIterator reductionIterator(sourceLayout, reductionDim); - reductionIterator.maybeFreezeAndConcatenate(state); - reductionIterator.apply(reduceLocalFn); - - // Thread-global reduction. - result = doThreadGlobalReduction(result, shuffleOffset, laneSize, - combiningKind, entriesPerVector, mEmpty, - rewriter, loc); - storeVec = rewriter.create(loc, result, storeVec, - parallelSimtIndices); - }; - - LayoutIterator parallelIterator(sourceLayout, parallelDim); - parallelIterator.apply(reduceFn); - replaceOpWithDistributedValues(rewriter, reductionOp, storeVec); - - return success(); - } - -private: - int64_t maxBitsPerShuffle; -}; - struct DistributeScfFor final : OpDistributionPattern { using OpDistributionPattern::OpDistributionPattern; @@ -625,402 +234,6 @@ struct DistributeScfFor final : OpDistributionPattern { } }; -struct DistributeTransposeLayoutAttr final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(vector::TransposeOp transposeOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - VectorValue value = transposeOp.getVector(); - VectorLayoutInterface layout = dyn_cast(signature[value]); - if (!layout) { - return rewriter.notifyMatchFailure(transposeOp, - "layout must be LayoutAttr"); - } - - /// Transpose only changes the notion of where the data carried by each - /// thread comes from in the SIMD vector. The data carried by each thread is - /// still the same, just iterated in a new permuted order. This iteration - /// information is carried by the layout. So, we can simply distribute - /// transpose to a no-op. 
Example: - /// - /// input: vector<2x4xf16> - /// - /// 0 0 1 1 - /// 2 2 3 3 - /// - /// after transpose, - /// - /// transp: vector<4x2xf16> - /// - /// 0 2 - /// 0 2 - /// 1 3 - /// 1 3 - /// - /// As it can be seen, each thread is still carrying the same data and - /// distributes to vector<2xf16>. - /// - /// The only difference is where this vector<2xf16> comes from and that - /// before transpose, this vector<2xf16> was representing the fastest - /// changing dimension, but after distribution it's not. - replaceOpWithDistributedValues(rewriter, transposeOp, - getDistributed(rewriter, value, layout)); - return success(); - } -}; - -struct DistributeBroadcastLayoutAttr final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(vector::BroadcastOp broadcastOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - - VectorValue source = dyn_cast(broadcastOp.getSource()); - if (!source) { - // TODO: Add support for scalar broadcasting. - return failure(); - } - - VectorValue vector = broadcastOp.getVector(); - LayoutAttr layout = dyn_cast(signature[vector]); - if (!layout) { - return failure(); - } - - VectorLayoutInterface sourceLayout = signature[source]; - - // We currently only support 1-D to 2-D broadcasting. - if (source.getType().getRank() != 1 || vector.getType().getRank() != 2) { - return failure(); - } - - int broadcastedDim = 0; - int parallelDim = 1; - - Type elementType = - llvm::cast(vector.getType()).getElementType(); - auto vectorType = - VectorType::get(layout.getDistributedShape(), elementType); - Location loc = broadcastOp.getLoc(); - Value accumulator = rewriter.create( - loc, vectorType, rewriter.getZeroAttr(vectorType)); - - // Iterate over the parallel dimension.; - LayoutIterator parallelIterator(layout, parallelDim); - parallelIterator.apply([&](const LayoutIterator::State ¶llelState) { - // Extract the value from source. - SmallVector sourceIndices = parallelState.computeSIMTIndex(); - Value value = rewriter.create( - loc, getDistributed(rewriter, source, sourceLayout), sourceIndices); - - // Broadcast value over the broadcasted dimension. - LayoutIterator broadcastIterator(layout, broadcastedDim); - broadcastIterator.maybeFreezeAndConcatenate(parallelState); - broadcastIterator.apply([&](const LayoutIterator::State &broadcastState) { - SmallVector resultIndices = broadcastState.computeSIMTIndex(); - - accumulator = rewriter.create(loc, value, accumulator, - resultIndices); - }); - }); - - replaceOpWithDistributedValues(rewriter, broadcastOp, accumulator); - return success(); - } -}; - -/// This pattern implements a distribution pattern for layout conflict -/// resolutions where the resolution is a simple vector reshape. -/// In most cases, layout conflicts will need to be resolved with a -/// trip to shared memory or shuffle instructions and in those scenarios -/// this pattern will not work. -/// -/// Below we outline some scenarios where this pattern will be useful: -/// - Unary Operators which are permutation invariant -/// Example: -/// Say the data for a single row is distributed among 2 threads as -/// 0 0 0 0 1 1 1 1 -/// and we have a layout conflict that requires the data to be -/// distributed as -/// 0 0 1 1 0 0 1 1 -/// and we are interested in computing an elementwise operation like exp -/// or trying to do a reduction along the row, then since the operations -/// are permutation invariant, we can treat the resolution as a vector -/// reshape. 
-/// - Binary Operators which are permutation invariant -/// Example: -/// Using the same example as above, say we are trying to do a dot product -/// between two vectors that have the above layout. As long as both -/// operands are permuted the same way, we will end up with the correct -/// sequence of multiplications and additions. -/// -struct DistributeLayoutConflictResolutions final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - VectorValue reshapeVector(Location loc, RewriterBase &rewriter, - VectorValue src, LayoutAttr ¤tLayout, - LayoutAttr &targetLayout, Type elementType) const { - - SmallVector targetShape = targetLayout.getDistributedShape(); - SmallVector currentShape = currentLayout.getDistributedShape(); - - auto newVectorType = VectorType::get(targetShape, elementType); - auto constantOp = rewriter.create( - loc, newVectorType, rewriter.getZeroAttr(newVectorType)); - auto newVector = dyn_cast(constantOp.getResult()); - - int64_t innermostDim = targetShape.size() - 1; - int64_t step = - std::min(targetShape[innermostDim], currentShape[innermostDim]); - DenseMap steps; - LayoutDimension vecDim = LayoutDimension::VECTORX; - steps[vecDim] = step; - LayoutIterator srcIterator(currentLayout, steps); - LayoutIterator targetIterator(targetLayout, steps); - - for (; !srcIterator.iterationComplete() && - !targetIterator.iterationComplete(); - ++srcIterator, ++targetIterator) { - SmallVector srcOffset = - srcIterator.getState().computeSIMTIndex(); - SmallVector targetOffset = - targetIterator.getState().computeSIMTIndex(); - SmallVector sliceSize(srcOffset.size(), 1); - sliceSize[sliceSize.size() - 1] = step; - SmallVector sliceStride(srcOffset.size(), 1); - Value slice = rewriter.create( - loc, src, srcOffset, sliceSize, sliceStride); - newVector = rewriter.create( - loc, slice, newVector, targetOffset, sliceStride); - } - return newVector; - } - - LogicalResult matchAndRewrite(IREE::VectorExt::ToLayoutOp resolutionOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - auto vector = cast(resolutionOp.getInput()); - auto result = cast(resolutionOp.getOutput()); - LayoutAttr currentLayout = dyn_cast(signature[vector]); - if (!currentLayout) - return failure(); - LayoutAttr targetLayout = dyn_cast(signature[result]); - if (!targetLayout) - return failure(); - - if (currentLayout == targetLayout) { - return rewriter.notifyMatchFailure( - resolutionOp, "Layout conversion is not a conflict."); - } - - SmallVector currentVecShape = currentLayout.getDistributedShape(); - SmallVector targetVecShape = targetLayout.getDistributedShape(); - if (currentVecShape.size() != targetVecShape.size()) - return failure(); - - auto numElements = [](ArrayRef vector) { - return std::accumulate(vector.begin(), vector.end(), 1, - std::multiplies()); - }; - if (numElements(currentVecShape) != numElements(targetVecShape)) - return failure(); - - if (currentLayout.hasLaneConflictWith(targetLayout)) { - return failure(); - } - - Type elementType = - llvm::cast(result.getType()).getElementType(); - Value newVector = - reshapeVector(resolutionOp.getLoc(), rewriter, - getDistributed(rewriter, vector, targetLayout), - currentLayout, targetLayout, elementType); - replaceOpWithDistributedValues(rewriter, resolutionOp, newVector); - return success(); - } -}; - -/// Pattern that allows us to write to shared memory -/// and read back to register with correct layouts. -/// especially used when we don't have an optimized way -/// to resolve the conflict. 
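[Note] For contrast with the shared-memory fallback below, the reshape-based resolution above only applies when both layouts distribute to the same per-thread element count and do not disagree on lane ownership. A minimal sketch of that eligibility test, mirroring the checks in the pattern above (plain C++, not the IREE API):

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Returns true when a layout conflict can be resolved as a pure per-thread
// vector reshape; otherwise a roundtrip through shared memory is needed.
bool canResolveByReshape(const std::vector<int64_t> &currentShape,
                         const std::vector<int64_t> &targetShape,
                         bool hasLaneConflict) {
  auto numElements = [](const std::vector<int64_t> &shape) {
    return std::accumulate(shape.begin(), shape.end(), int64_t(1),
                           std::multiplies<int64_t>());
  };
  return currentShape.size() == targetShape.size() &&
         numElements(currentShape) == numElements(targetShape) &&
         !hasLaneConflict;
}
```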
-struct DistributeLayoutConflictToSharedMemory final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(IREE::VectorExt::ToLayoutOp resolutionOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - auto loc = resolutionOp.getLoc(); - auto vector = cast(resolutionOp.getInput()); - auto result = cast(resolutionOp.getOutput()); - LayoutAttr currentLayout = dyn_cast(signature[vector]); - if (!currentLayout) { - return rewriter.notifyMatchFailure(resolutionOp, - "Source layout must be LayoutAttr."); - } - LayoutAttr targetLayout = dyn_cast(signature[result]); - if (!targetLayout) { - return rewriter.notifyMatchFailure(resolutionOp, - "Target layout must be LayoutAttr."); - } - - if (currentLayout == targetLayout) { - return rewriter.notifyMatchFailure( - resolutionOp, "Layout conversion is not a conflict."); - } - - SmallVector currentVecShape = currentLayout.getDistributedShape(); - SmallVector targetVecShape = targetLayout.getDistributedShape(); - if (currentVecShape.size() != targetVecShape.size()) { - return rewriter.notifyMatchFailure( - resolutionOp, - "Target's and source's distributed rank needs to match."); - } - - auto numElements = [](ArrayRef vector) { - return std::accumulate(vector.begin(), vector.end(), 1, - std::multiplies()); - }; - - if (numElements(currentVecShape) == numElements(targetVecShape) && - !currentLayout.hasLaneConflictWith(targetLayout)) { - // If the conditions suffice, we can skip the trip to shared memory - // and just use the default/more efficient layout conflict resolution - // distribution. - return rewriter.notifyMatchFailure(resolutionOp, - "Failing because condition suffice to " - "use better conflict resolutions."); - } - - // Compute Subgroup and Workgroup related information and offsets. - auto funcOp = resolutionOp->getParentOfType(); - if (!funcOp) { - return rewriter.notifyMatchFailure( - resolutionOp, "Expects a parent of type funcOp S.T we can compute " - "subgroup and workgroup related information."); - } - std::optional> workgroupSize = - getWorkgroupSize(funcOp); - std::optional subgroupSize = getSubgroupSize(funcOp); - if (!workgroupSize.has_value() || !subgroupSize.has_value()) { - return rewriter.notifyMatchFailure( - resolutionOp, "Expects workgroup/subgroup information to be " - "available to resolve conflict."); - } - int64_t flatThreadSize = ShapedType::getNumElements(workgroupSize.value()); - if (flatThreadSize % subgroupSize.value() != 0) - return failure(); - int64_t numSubgroups = flatThreadSize / subgroupSize.value(); - - // Define shapes and types needed to be roundtripped to shared-memory. - // The allocated shared-memory will stack subgroup data - // on fastest dimension. Hence, shape will be: - // [dim0, dim1, ..., subgroupCount * dimN] - - auto resolutionType = - llvm::dyn_cast_or_null(resolutionOp.getResult().getType()); - if (!resolutionType) { - return rewriter.notifyMatchFailure( - resolutionOp, - "Expects resolutionOp result to be of type vectorType."); - } - if (!resolutionType.hasStaticShape()) { - return rewriter.notifyMatchFailure( - resolutionOp, "Expects resolutionOp result to have static shape."); - } - auto paddedShape = SmallVector(resolutionType.getShape()); - int64_t vectorRank = resolutionType.getRank(); - paddedShape[vectorRank - 1] *= numSubgroups; - - // Offset and indexing computation such that subgroups can - // write and read to shared memory correctly and without conflicts. 
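[Note] The offsets computed in the code that follows amount to linearizing the 3-D thread id and then giving each subgroup its own slice of the padded fastest dimension so subgroups never overwrite each other. A standalone sketch of the same arithmetic (helper names are illustrative, not the IREE API):

```cpp
#include <cstdint>

struct SubgroupSlot {
  int64_t flatThreadId;
  int64_t subgroupOffset;
};

// flatThreadId = tidX + wgX * tidY + wgX * wgY * tidZ, and each subgroup
// starts writing at (flatThreadId / subgroupSize) * innerDimSize, matching the
// two affine.apply ops the pattern emitted.
SubgroupSlot computeSlot(int64_t tidX, int64_t tidY, int64_t tidZ,
                         int64_t wgSizeX, int64_t wgSizeY,
                         int64_t subgroupSize, int64_t innerDimSize) {
  int64_t flatThreadId = tidX + wgSizeX * tidY + wgSizeX * wgSizeY * tidZ;
  int64_t subgroupOffset = (flatThreadId / subgroupSize) * innerDimSize;
  return {flatThreadId, subgroupOffset};
}
```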
- AffineExpr d0, d1, d2, s0; - bindDims(rewriter.getContext(), d0, d1, d2); - bindSymbols(rewriter.getContext(), s0); - auto indexType = rewriter.getIndexType(); - Value threadX = - rewriter.create(loc, indexType, gpu::Dimension::x); - Value threadY = - rewriter.create(loc, indexType, gpu::Dimension::y); - Value threadZ = - rewriter.create(loc, indexType, gpu::Dimension::z); - Value flatThreadId = affine::makeComposedAffineApply( - rewriter, loc, - (d0 + workgroupSize.value()[0] * d1 + - (workgroupSize.value()[0] * workgroupSize.value()[1]) * d2), - {threadX, threadY, threadZ}); - Value subgroupOffset = affine::makeComposedAffineApply( - rewriter, loc, - s0.floorDiv(subgroupSize.value()) * - resolutionType.getShape()[vectorRank - 1], - {flatThreadId}); - - // Create shared memory to store the intermediate from src layout. - auto workgroupMemoryAddressSpace = Attribute(gpu::AddressSpaceAttr::get( - rewriter.getContext(), gpu::AddressSpace::Workgroup)); - MemRefType allocType = - MemRefType::get(paddedShape, resolutionType.getElementType(), - AffineMap(), workgroupMemoryAddressSpace); - auto alloc = rewriter.create(loc, allocType); - - SmallVector offsets(vectorRank, rewriter.getIndexAttr(0)); - SmallVector strides(vectorRank, rewriter.getIndexAttr(1)); - SmallVector shapes = llvm::to_vector( - llvm::map_range(resolutionType.getShape(), [&](int64_t dim) { - return OpFoldResult(rewriter.getIndexAttr(dim)); - })); - offsets[vectorRank - 1] = subgroupOffset; - auto subview = rewriter.create(loc, alloc, offsets, - shapes, strides); - - // Creating write/trip to shared memory using src layout. - Value c0 = rewriter.create(loc, 0); - SmallVector indices(resolutionType.getRank(), c0); - SmallVector inBounds(vectorRank, true); - auto write = rewriter.create(loc, vector, subview, - indices, inBounds); - // Insert gpu.barrier - rewriter.create(write.getLoc()); - - // Creating read from shared memory using dst layout. - // Read with offset starting from the warpIdx * OG fastest dim. - indices[vectorRank - 1] = subgroupOffset; - auto read = rewriter.create(loc, resolutionType, - alloc, indices); - - // Set layouts signature for write. - // We need to set the layout on the srcVector/first operand. - auto unitAttr = UnitAttr::get(rewriter.getContext()); - auto writeAttrs = SmallVector(write->getNumOperands(), unitAttr); - writeAttrs[0] = - currentLayout; // 1st operand is src which requires currentLayout. - ArrayAttr writeOperandsAttr = - ArrayAttr::get(rewriter.getContext(), writeAttrs); - ArrayAttr writeResultsAttr = ArrayAttr::get(rewriter.getContext(), {}); - setSignatureForRedistribution(rewriter, write.getOperation(), - writeOperandsAttr, writeResultsAttr); - - // Set layouts signature for read. - // We only need to set the layout on output. 
- ArrayAttr readOperandsAttr = ArrayAttr::get( - rewriter.getContext(), - SmallVector(read->getNumOperands(), unitAttr)); - ArrayAttr readResultsAttr = - ArrayAttr::get(rewriter.getContext(), {targetLayout}); - setSignatureForRedistribution(rewriter, read.getOperation(), - readOperandsAttr, readResultsAttr); - - rewriter.replaceOp(resolutionOp, read.getResult()); - return success(); - } -}; - struct DistributeTrivialLayoutConversions final : OpDistributionPattern { using OpDistributionPattern::OpDistributionPattern; @@ -1102,11 +315,6 @@ struct DistributeGather final : OpDistributionPattern { } // namespace -void populateGPUReductionDistributionPatterns(RewritePatternSet &patterns, - int64_t maxBitsPerShuffle) { - patterns.add(patterns.getContext(), maxBitsPerShuffle); -} - void populateGPUDistributionPatterns(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); // Elementwise patterns. @@ -1116,20 +324,4 @@ void populateGPUDistributionPatterns(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); } -void populateGPUDistributionLayoutAttrPatterns(Value laneId, - RewritePatternSet &patterns) { - patterns - .add( - patterns.getContext(), laneId); - patterns.add( - patterns.getContext()); -} - -// TODO: Need a new op/analysis to determine when this pattern is safe to use. -void populateGPULayoutResolutionDistributionPatterns( - RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} - }; // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h index 87303844853f..9e81014a4087 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPatterns.h @@ -31,12 +31,6 @@ void populateDropSharedMemoryDeallocOpPatterns(RewritePatternSet &patterns); void populateGPUDistributionPatterns(RewritePatternSet &patterns); -void populateGPUDistributionLayoutAttrPatterns(Value laneId, - RewritePatternSet &patterns); - -void populateGPUReductionDistributionPatterns(RewritePatternSet &patterns, - int64_t maxBitsPerShuffle = 32); - void populateGPUDistributeNestedLayoutAttrPatterns( RewritePatternSet &patterns, Value threadId, int64_t subgroupSize, int64_t maxBitsPerShuffle = 32); @@ -46,9 +40,6 @@ void populateGPUDistributeNestedLayoutAttrPatterns( void populateGPUDistributeNestedLayoutContractAMDGPUPatterns( RewritePatternSet &patterns); -void populateGPULayoutResolutionDistributionPatterns( - RewritePatternSet &patterns); - } // namespace mlir::iree_compiler #endif // IREE_COMPILER_CODEGEN_COMMON_GPUPATTERNS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir index c392eb783581..a503664ecef4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir @@ -1,46 +1,5 @@ // RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --canonicalize --cse %s | FileCheck %s -#layout = #iree_vector_ext.layout<<[VECTORY, LANEY], [4, 4]>, <[VECTORX, LANEX], [4, 4]>> - -// CHECK-LABEL: @distribute_elementwise_f16 -func.func @distribute_elementwise_f16(%a: vector<16x16xf16>, %b: vector<16x16xf16>, %denom: vector<16x16xf16>) -> vector<16x16xi1> { - %c0 = arith.constant 0 : index - %cst_0 = arith.constant 0.0 : f16 - // CHECK: %[[ROOT:.*]] = 
arith.constant dense<0.000000e+00> : vector<16xf16> - %root = arith.constant dense<0.0> : vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> - // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<16xf16> - // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] {{.*}} : vector<16xf16> - %c = arith.mulf %rootl, %b : vector<16x16xf16> - // CHECK-DAG: %[[DENOM:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<16xf16> - // CHECK-DAG: %[[DIVD:.*]] = arith.divf %[[C]], %[[DENOM]] {{.*}} : vector<16xf16> - %divd = arith.divf %c, %denom : vector<16x16xf16> - // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<16xf16> - // CHECK-DAG: %[[D:.*]] = arith.addf %[[DIVD]], %[[A]] fastmath {{.*}} : vector<16xf16> - %d = arith.addf %divd, %a fastmath : vector<16x16xf16> - // CHECK-DAG: %[[R:.*]] = arith.cmpf ult, %[[D]], %[[ROOT]] {{.*}} : vector<16xf16> - %r = arith.cmpf ult, %d, %root : vector<16x16xf16> - // CHECK: iree_vector_ext.to_simd %[[R]] : vector<16xi1> -> vector<16x16xi1> - return %r : vector<16x16xi1> -} - -// CHECK-LABEL: @distribute_elementwise_i32 -func.func @distribute_elementwise_i32(%a: vector<16x16xi32>, %b: vector<16x16xi32>) -> vector<16x16xi32> { - %c0 = arith.constant 0 : index - %cst_0 = arith.constant 0 : i32 - // CHECK: %[[ROOT:.*]] = arith.constant dense<2> : vector<16xi32> - %root = arith.constant dense<2> : vector<16x16xi32> - %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xi32> - // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[C:.*]] = arith.muli %[[B]], %[[ROOT]] {{.*}} : vector<16xi32> - %c = arith.muli %rootl, %b : vector<16x16xi32> - // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<16xi32> - %d = arith.addi %c, %a : vector<16x16xi32> - // CHECK: iree_vector_ext.to_simd %[[D]] : vector<16xi32> -> vector<16x16xi32> - return %d : vector<16x16xi32> -} - #nested = #iree_vector_ext.nested_layout< subgroup_tile = [2, 1, 1], batch_tile = [8, 2, 4], @@ -69,27 +28,35 @@ func.func @distribute_elementwise_nested_layout_f16(%a: vector<128x128x128xf16>, return %d : vector<128x128x128xf16> } +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [1, 1], + thread_strides = [1, 1] +> + // CHECK-LABEL: @distribute_scf_for func.func @distribute_scf_for(%a: vector<16x16xi32>, %b: vector<16x16xi32>) -> vector<16x16xi32> { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index %cst_0 = arith.constant 0 : i32 - // CHECK: %[[ROOT:.*]] = arith.constant dense<0> : vector<16xi32> + // CHECK: %[[ROOT:.*]] = arith.constant dense<0> : vector<1x1x1x1x16x16xi32> %root = arith.constant dense<0> : vector<16x16xi32> %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xi32> - // CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<16xi32>) + // CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<1x1x1x1x16x16xi32>) %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %rootl) -> (vector<16x16xi32>) { - // These should be ideally folded if canonicalization was ever ran. - // Canonicalization currently breaks other tests. 
If canonicalization - // is ever ran, this should be updated. - // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector<16xi32> + // CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32> + // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector<1x1x1x1x16x16xi32> %c = arith.muli %arg0, %b : vector<16x16xi32> - // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<16xi32> - // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<16xi32> + // CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32> + // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<1x1x1x1x16x16xi32> %d = arith.addi %c, %a : vector<16x16xi32> - // CHECK: scf.yield %[[D]] : vector<16xi32> + // CHECK: scf.yield %[[D]] : vector<1x1x1x1x16x16xi32> scf.yield %d : vector<16x16xi32> } return %out : vector<16x16xi32> @@ -102,632 +69,3 @@ builtin.module attributes { transform.with_named_sequence } { transform.yield } } - -// ----- - -#layout_row_major = #iree_vector_ext.layout<<[BATCHX, LANEY], [2, 8]>, <[BATCHY, LANEX, VECTORX], [2, 1, 8]>> -#layout_col_major = #iree_vector_ext.layout<<[BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[BATCHY, LANEX], [2, 8]>> - -// CHECK-LABEL: @distribute_transfer_read_row_major -func.func @distribute_transfer_read_row_major(%alloc: memref<4x4xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0], %cst - {in_bounds = [false, false]} - : memref<4x4xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - // CHECK-COUNT-4: vector.load {{.*}}, vector<8xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_col_major -func.func @distribute_transfer_read_col_major(%alloc: memref<32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0], %cst - {in_bounds = [true, true]} - : memref<32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - // CHECK-COUNT-8: vector.load {{.*}}, vector<1xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_row_major_with_broadcast -func.func @distribute_transfer_read_row_major_with_broadcast(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - // CHECK-COUNT-4: vector.load {{.*}}, vector<8xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_col_major_with_broadcast -func.func @distribute_transfer_read_col_major_with_broadcast(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, 
d2, d3) -> (d2, d3)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - // CHECK-COUNT-8: vector.load {{.*}}, vector<1xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_row_major_transpose -func.func @distribute_transfer_read_row_major_transpose(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - // CHECK-COUNT-32: vector.load {{.*}}, vector<1xf16> - func.return %rootl : vector<16x16xf16> -} - -// CHECK-LABEL: @distribute_transfer_read_col_major_transpose -func.func @distribute_transfer_read_col_major_transpose(%a: index, %b: index, %alloc: memref<32x32x32x32xf16>) -> vector<16x16xf16> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %root = vector.transfer_read %alloc[%c0, %c0, %a, %b], %cst - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : memref<32x32x32x32xf16>, vector<16x16xf16> - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - // CHECK-COUNT-2: vector.load {{.*}}, vector<4xf16> - func.return %rootl : vector<16x16xf16> -} - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -#layout_row_major = #iree_vector_ext.layout<<[BATCHX, LANEY], [2, 8]>, <[BATCHY, LANEX, VECTORX], [2, 1, 8]>> -#layout_col_major = #iree_vector_ext.layout<<[BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[BATCHY, LANEX], [2, 8]>> - -// TODO: Use affine min tricks based on the grid size to elide the mod. -// Note that this IR is invalid if subgroup size != 8. 
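[Note] The affine maps checked in the next test encode that, under #layout_row_major with [BATCHX, LANEY] = [2, 8], lane L writes the rows (L mod 8) and (L mod 8) + 8, one per BATCHX slice. A tiny illustrative sketch of that mapping (not part of the test):

```cpp
#include <cstdint>
#include <utility>

// Rows of the 16x16 vector written by a given lane: batch 0 writes row
// (laneId mod 8), batch 1 writes the same row offset by 8, matching the
// `s0 mod 8` and `s0 mod 8 + 8` affine maps in the CHECK lines below.
std::pair<int64_t, int64_t> rowsWrittenByLane(int64_t laneId) {
  int64_t base = laneId % 8;
  return {base, base + 8};
}
```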
- -func.func @distribute_transfer_write_row_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0] - {in_bounds = [true, true]} - : vector<16x16xf16>, memref<64x64xf16> - func.return -} -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 mod 8)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)> - -// CHECK-LABEL: @distribute_transfer_write_row_major -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK-DAG: %[[LANEID:.+]] = gpu.thread_id x -// CHECK: %[[VEC_LANE_Y:.+]] = affine.apply #[[$MAP0]]()[%[[LANEID]]] -// CHECK: %[[DIST_SRC_VEC:.+]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xf16> -> vector<2x2x8xf16> -// CHECK: %[[BATCH_0_0:.+]] = vector.extract %[[DIST_SRC_VEC]][0, 0] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_0_0]], %{{.*}}[%[[VEC_LANE_Y]], %[[C0]]] : memref<64x64xf16>, vector<8xf16> - -// CHECK: %[[NEXT_VEC_LANE_Y:.+]] = affine.apply #[[$MAP1]]()[%[[LANEID]]] -// CHECK: %[[BATCH_1_0:.+]] = vector.extract %[[DIST_SRC_VEC]][1, 0] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_1_0]], %{{.*}}[%[[NEXT_VEC_LANE_Y]], %[[C0]]] : memref<64x64xf16>, vector<8xf16> - -// CHECK: %[[BATCH_0_1:.+]] = vector.extract %[[DIST_SRC_VEC]][0, 1] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_0_1]], %{{.*}}[%[[VEC_LANE_Y]], %[[C8]]] : memref<64x64xf16>, vector<8xf16> - -// CHECK: %[[BATCH_1_1:.+]] = vector.extract %[[DIST_SRC_VEC]][1, 1] : vector<8xf16> from vector<2x2x8xf16> -// CHECK: vector.store %[[BATCH_1_1]], %{{.*}}[%[[NEXT_VEC_LANE_Y]], %[[C8]]] : memref<64x64xf16>, vector<8xf16> - -func.func @distribute_transfer_write_col_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0] - {in_bounds = [true, true]} - : vector<16x16xf16>, memref<64x64xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_col_major -// CHECK-COUNT-8: vector.store {{.*}}, vector<1xf16> - -func.func @distribute_transfer_write_row_major_with_broadcast(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_row_major_with_broadcast -// CHECK-COUNT-4: vector.store {{.*}}, vector<8xf16> - -func.func @distribute_transfer_write_col_major_with_broadcast(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_col_major_with_broadcast -// CHECK-COUNT-8: vector.store {{.*}}, vector<1xf16> - -func.func 
@distribute_transfer_write_row_major_transpose(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_row_major_transpose -// CHECK-COUNT-32: vector.store {{.*}}, vector<1xf16> - -func.func @distribute_transfer_write_col_major_transpose(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: @distribute_transfer_write_col_major_transpose -// CHECK-COUNT-2: vector.store {{.*}}, vector<4xf16> - - -func.func @distribute_transfer_write_with_non_contiguous_broadcast(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { - %c0 = arith.constant 0 : index - %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16> - vector.transfer_write %rootl, %alloc[%c0, %a, %c0, %b] - {in_bounds = [true, true], - permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d3)>} - : vector<16x16xf16>, memref<32x32x32x32xf16> - func.return -} -// CHECK-LABEL: func.func @distribute_transfer_write_with_non_contiguous_broadcast -// CHECK-SAME: %[[ROOT:.+]]: vector<16x16xf16>, %[[A:.+]]: index, %[[B:.+]]: index, %[[ALLOC:.+]]: memref<32x32x32x32xf16>) -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK-COUNT-4: vector.store %{{.+}}, %[[ALLOC]][%[[C0]], {{.+}}, %[[C0]], %{{.+}}] : memref<32x32x32x32xf16>, vector<8xf16> - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 4, 4]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]> -#layout2d = #iree_vector_ext.layout<#row_layout, #col_layout> -#layout1d = #iree_vector_ext.layout<#col_layout> -#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}> -#translation_info = #iree_codegen.translation_info -module { - func.func @distribute_reduction_f16(%source: vector<16x16xf16>, %init: vector<16xf16>) -> vector<16xf16> - attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} { - %sourcel = iree_vector_ext.to_layout %source to layout(#layout2d) : vector<16x16xf16> - %result = vector.multi_reduction , %sourcel, %init [0] - : vector<16x16xf16> to vector<16xf16> - func.return %result : vector<16xf16> - } -} -// CHECK: func.func @distribute_reduction_f16(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf16>, %[[ARG1:[a-zA-Z0-9_]+]]: vector<16xf16>) -> vector<16xf16> -// CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32 -// 
CHECK-DAG: %[[C64_I32:.+]] = arith.constant 64 : i32 -// CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32 -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<2xf16> -// CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<0.000000e+00> : vector<1xf16> -// CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf16> -> vector<1xf16> -// CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f16 from vector<1xf16> -// CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x4xf16> -// CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f16 from vector<1x1x4xf16> -// CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f16 into vector<2xf16> -// CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f16 from vector<1x1x4xf16> -// CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [1] : f16 into vector<2xf16> -// CHECK: %[[D7:.+]] = vector.extract %[[D2]][0, 0, 2] : f16 from vector<1x1x4xf16> -// CHECK: %[[D8:.+]] = vector.insert %[[D7]], %[[D6]] [0] : f16 into vector<2xf16> -// CHECK: %[[D9:.+]] = vector.extract %[[D2]][0, 0, 3] : f16 from vector<1x1x4xf16> -// CHECK: %[[D10:.+]] = vector.insert %[[D9]], %[[D8]] [1] : f16 into vector<2xf16> -// CHECK: %[[D11:.+]] = arith.maximumf %[[D6]], %[[D10]] : vector<2xf16> -// CHECK: %[[D12:.+]] = vector.bitcast %[[D11]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D13:.+]] = vector.extract %[[D12]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D13]], %[[C16_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D14:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32> -// CHECK: %[[D15:.+]] = vector.bitcast %[[D14]] : vector<1xi32> to vector<2xf16> -// CHECK: %[[D16:.+]] = arith.maximumf %[[D15]], %[[D11]] : vector<2xf16> -// CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<2xf16> to vector<1xi32> -// CHECK: %[[D18:.+]] = vector.extract %[[D17]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT_1:.+]], %[[VALID_2:.+]] = gpu.shuffle xor %[[D18]], %[[C32_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D19:.+]] = vector.broadcast %[[SHUFFLERESULT_1]] : i32 to vector<1xi32> -// CHECK: %[[D20:.+]] = vector.bitcast %[[D19]] : vector<1xi32> to vector<2xf16> -// CHECK: %[[D21:.+]] = arith.maximumf %[[D20]], %[[D16]] : vector<2xf16> -// CHECK: %[[D22:.+]] = vector.extract %[[D21]][0] : f16 from vector<2xf16> -// CHECK: %[[D23:.+]] = vector.extract %[[D21]][1] : f16 from vector<2xf16> -// CHECK: %[[D24:.+]] = arith.maximumf %[[D22]], %[[D23]] : f16 -// CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f16 -// CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST_0]] [0] : f16 into vector<1xf16> -// CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf16> -> vector<16xf16> - -#executable_target_rocm_hsaco_fb2 = #hal.executable.target<"rocm", "rocm-hsaco-fb", {}> -module { - func.func @distribute_reduction_f32(%source: vector<16x16xf32>, %init: vector<16xf32>) -> vector<16xf32> - attributes {hal.executable.target = #executable_target_rocm_hsaco_fb, translation_info = #translation_info} { - %sourcel = iree_vector_ext.to_layout %source to layout(#layout2d) : vector<16x16xf32> - %result = vector.multi_reduction , %sourcel, %init [0] - : vector<16x16xf32> to vector<16xf32> - func.return %result : vector<16xf32> - } -} -// CHECK: func.func @distribute_reduction_f32(%[[ARG0:[a-zA-Z0-9_]+]]: vector<16x16xf32>, %[[ARG1:[a-zA-Z0-9_]+]]: vector<16xf32>) -> vector<16xf32> -// CHECK-DAG: %[[C32_I32:.+]] = arith.constant 32 : i32 -// CHECK-DAG: 
%[[C64_I32:.+]] = arith.constant 64 : i32 -// CHECK-DAG: %[[C16_I32:.+]] = arith.constant 16 : i32 -// CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1xf32> -// CHECK: %[[D0:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16xf32> -> vector<1xf32> -// CHECK: %[[D1:.+]] = vector.extract %[[D0]][0] : f32 from vector<1xf32> -// CHECK: %[[D2:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf32> -> vector<1x1x4xf32> -// CHECK: %[[D3:.+]] = vector.extract %[[D2]][0, 0, 0] : f32 from vector<1x1x4xf32> -// CHECK: %[[D4:.+]] = vector.insert %[[D3]], %[[CST]] [0] : f32 into vector<1xf32> -// CHECK: %[[D5:.+]] = vector.extract %[[D2]][0, 0, 1] : f32 from vector<1x1x4xf32> -// CHECK: %[[D6:.+]] = vector.insert %[[D5]], %[[D4]] [0] : f32 into vector<1xf32> -// CHECK: %[[D7:.+]] = arith.maximumf %[[D4]], %[[D6]] : vector<1xf32> -// CHECK: %[[D8:.+]] = vector.extract %[[D2]][0, 0, 2] : f32 from vector<1x1x4xf32> -// CHECK: %[[D9:.+]] = vector.insert %[[D8]], %[[D6]] [0] : f32 into vector<1xf32> -// CHECK: %[[D10:.+]] = arith.maximumf %[[D7]], %[[D9]] : vector<1xf32> -// CHECK: %[[D11:.+]] = vector.extract %[[D2]][0, 0, 3] : f32 from vector<1x1x4xf32> -// CHECK: %[[D12:.+]] = vector.insert %[[D11]], %[[D9]] [0] : f32 into vector<1xf32> -// CHECK: %[[D13:.+]] = arith.maximumf %[[D10]], %[[D12]] : vector<1xf32> -// CHECK: %[[D14:.+]] = vector.bitcast %[[D13]] : vector<1xf32> to vector<1xi32> -// CHECK: %[[D15:.+]] = vector.extract %[[D14]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT:.+]], %[[VALID:.+]] = gpu.shuffle xor %[[D15]], %[[C16_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D16:.+]] = vector.broadcast %[[SHUFFLERESULT]] : i32 to vector<1xi32> -// CHECK: %[[D17:.+]] = vector.bitcast %[[D16]] : vector<1xi32> to vector<1xf32> -// CHECK: %[[D18:.+]] = arith.maximumf %[[D17]], %[[D13]] : vector<1xf32> -// CHECK: %[[D19:.+]] = vector.bitcast %[[D18]] : vector<1xf32> to vector<1xi32> -// CHECK: %[[D20:.+]] = vector.extract %[[D19]][0] : i32 from vector<1xi32> -// CHECK: %[[SHUFFLERESULT_0:.+]], %[[VALID_1:.+]] = gpu.shuffle xor %[[D20]], %[[C32_I32]], %[[C64_I32]] : i32 -// CHECK: %[[D21:.+]] = vector.broadcast %[[SHUFFLERESULT_0]] : i32 to vector<1xi32> -// CHECK: %[[D22:.+]] = vector.bitcast %[[D21]] : vector<1xi32> to vector<1xf32> -// CHECK: %[[D23:.+]] = arith.maximumf %[[D22]], %[[D18]] : vector<1xf32> -// CHECK: %[[D24:.+]] = vector.extract %[[D23]][0] : f32 from vector<1xf32> -// CHECK: %[[D25:.+]] = arith.maximumf %[[D24]], %[[D1]] : f32 -// CHECK: %[[D26:.+]] = vector.insert %[[D25]], %[[CST]] [0] : f32 into vector<1xf32> -// CHECK: %[[D27:.+]] = iree_vector_ext.to_simd %[[D26]] : vector<1xf32> -> vector<16xf32> - -#transpose_test_layout = #iree_vector_ext.layout<<[LANEY], [32]>, <[LANEX, VECTORX], [4, 4]>> -func.func @distribute_transpose(%mem: memref<32x32xf16>, %mem1: memref<32x32xf16>) -> vector<32x16xf16> { - // CHECK: func.func @distribute_transpose(%[[MEM:.*]]: memref<32x32xf16>, %[[MEM1:.*]]: memref<32x32xf16> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - // CHECK-COUNT-1: vector.load %[[MEM]] - // CHECK-COUNT-4: vector.load %[[MEM1]] - %a = vector.transfer_read %mem[%c0, %c0], %cst : memref<32x32xf16>, vector<32x16xf16> - %b = vector.transfer_read %mem1[%c0, %c0], %cst : memref<32x32xf16>, vector<16x32xf16> - // CHECK-NOT: vector.transpose - %b_t = vector.transpose %b, [1, 0] : vector<16x32xf16> to vector<32x16xf16> - // CHECK: %[[ADD:.*]] = arith.addf %{{.*}}, %{{.*}} : vector<4xf16> - %c = arith.addf %a, %b_t : 
vector<32x16xf16> - %cl = iree_vector_ext.to_layout %c to layout(#transpose_test_layout) : vector<32x16xf16> - // CHECK: iree_vector_ext.to_simd %[[ADD]] : vector<4xf16> -> vector<32x16xf16> - func.return %cl : vector<32x16xf16> -} - -#row_broadcast_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [2, 16]> -#col_broadcast_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [2, 4, 4]> -#layout_broadcast_1d = #iree_vector_ext.layout<#row_broadcast_layout> -#layout_broadcast_2d = #iree_vector_ext.layout<#row_broadcast_layout, #col_broadcast_layout> -#layout_broadcast_1d_t = #iree_vector_ext.layout<#col_broadcast_layout> -#layout_broadcast_2d_t = #iree_vector_ext.layout<#col_broadcast_layout, #row_broadcast_layout> - -func.func @distribute_broadcast_row_col(%source: vector<32xf32>) -> vector<32x32xf32> { - %result = vector.broadcast %source : vector<32xf32> to vector<32x32xf32> - %resultl = iree_vector_ext.to_layout %result to layout(#layout_broadcast_2d) : vector<32x32xf32> - // CHECK-DAG: %[[S00:.*]] = vector.extract %[[SOURCE:.*]][0, 0] - // CHECK-DAG: vector.insert %[[S00]], %{{.*}} [0, 0, 0] - // CHECK-DAG: vector.insert %[[S00]], %{{.*}} [1, 0, 0] - // CHECK-DAG: %[[S01:.*]] = vector.extract %[[ACC:.*]][0, 1] - // CHECK-DAG: vector.insert %[[S01]], %{{.*}} [0, 0, 1] - // CHECK-DAG: vector.insert %[[S01]], %{{.*}} [1, 0, 1] - // CHECK-DAG: %[[S02:.*]] = vector.extract %[[ACC:.*]][0, 2] - // CHECK-DAG: vector.insert %[[S02]], %{{.*}} [0, 0, 2] - // CHECK-DAG: vector.insert %[[S02]], %{{.*}} [1, 0, 2] - // CHECK-DAG: %[[S03:.*]] = vector.extract %[[ACC:.*]][0, 3] - // CHECK-DAG: vector.insert %[[S03]], %{{.*}} [0, 0, 3] - // CHECK-DAG: vector.insert %[[S03]], %{{.*}} [1, 0, 3] - - // CHECK-DAG: %[[S10:.*]] = vector.extract %[[SOURCE]][1, 0] - // CHECK-DAG: vector.insert %[[S10]], %{{.*}} [0, 1, 0] - // CHECK-DAG: vector.insert %[[S10]], %{{.*}} [1, 1, 0] - // CHECK-DAG: %[[S11:.*]] = vector.extract %[[ACC:.*]][1, 1] - // CHECK-DAG: vector.insert %[[S11]], %{{.*}} [0, 1, 1] - // CHECK-DAG: vector.insert %[[S11]], %{{.*}} [1, 1, 1] - // CHECK-DAG: %[[S12:.*]] = vector.extract %[[ACC:.*]][1, 2] - // CHECK-DAG: vector.insert %[[S12]], %{{.*}} [0, 1, 2] - // CHECK-DAG: vector.insert %[[S12]], %{{.*}} [1, 1, 2] - // CHECK-DAG: %[[S13:.*]] = vector.extract %[[ACC:.*]][1, 3] - // CHECK-DAG: vector.insert %[[S13]], %{{.*}} [0, 1, 3] - // CHECK-DAG: vector.insert %[[S13]], %{{.*}} [1, 1, 3] - func.return %resultl : vector<32x32xf32> -} - -func.func @distribute_broadcast_col_row(%source: vector<32xf32>) -> vector<32x32xf32> { - %result = vector.broadcast %source : vector<32xf32> to vector<32x32xf32> - %resultl = iree_vector_ext.to_layout %result to layout(#layout_broadcast_2d_t) : vector<32x32xf32> - // CHECK-DAG: %[[S0:.*]] = vector.extract %[[SOURCE:.*]][0] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 0] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 1] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 2] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 0, 3] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 0] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 1] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 2] - // CHECK-DAG: vector.insert %[[S0]], %{{.*}} [0, 1, 3] - - // CHECK-DAG: %[[S1:.*]] = vector.extract %[[SOURCE:.*]][1] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 0, 0] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 0, 1] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 0, 2] - // CHECK-DAG: vector.insert %[[S1]], 
%{{.*}} [1, 0, 3] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 0] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 1] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 2] - // CHECK-DAG: vector.insert %[[S1]], %{{.*}} [1, 1, 3] - func.return %resultl : vector<32x32xf32> -} - -#layout_broadcast_vectory_1d = #iree_vector_ext.layout< - <[BATCHY, VECTORX], [1, 4]> -> - -#layout_broadcast_vectory_2d = #iree_vector_ext.layout< - <[BATCHX, VECTORY], [1, 4]>, - <[BATCHY, VECTORX], [1, 4]> -> - -// This test case checks if we distribute correct when we have vectorx frozen -// and we iterate on vectory. -// This previously caused a bug, since calculating SIMT index for broadcast -// needs to know the range of vectorx. -func.func @distribute_broadcast_vectory(%source: vector<4xf32>) -> vector<4x4xf32> { - %result = vector.broadcast %source : vector<4xf32> to vector<4x4xf32> - %resultl = iree_vector_ext.to_layout %result to layout(#layout_broadcast_vectory_2d) : vector<4x4xf32> - // CHECK-DAG: %[[S00:.*]] = vector.extract %[[SOURCE:.*]][0, 0] : f32 from vector<1x4xf32> - // CHECK-DAG: %[[S01:.*]] = vector.extract %[[SOURCE:.*]][0, 1] : f32 from vector<1x4xf32> - // CHECK-DAG: %[[S02:.*]] = vector.extract %[[SOURCE:.*]][0, 2] : f32 from vector<1x4xf32> - // CHECK-DAG: %[[S02:.*]] = vector.extract %[[SOURCE:.*]][0, 3] : f32 from vector<1x4xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 0] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 4] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 8] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S00:.*]] %{{.*}} [0, 0, 12] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 1] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 5] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 9] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S01:.*]] %{{.*}} [0, 0, 13] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 2] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 6] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 10] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S02:.*]] %{{.*}} [0, 0, 14] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 3] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 7] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 11] : f32 into vector<1x1x16xf32> - // CHECK-DAG: vector.insert %[[S03:.*]] %{{.*}} [0, 0, 15] : f32 into vector<1x1x16xf32> - func.return %resultl : vector<4x4xf32> -} - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -// This test case checks that chained WMMA contraction is distributable. -// Let C0 = matmul(A0, B0), and OUT = matmul(A1, C0). 
- -// In this case, since the C-layout and the RHS-layout of WMMA have a lane conflict -// and a different number of elements per lane/thread, we expect the compiler to emit -// code that writes the data from C0 back to shared memory before loading it again -// in the RHS-layout from shared memory into registers. - -// We assume in this test that the IR has already been distributed at the subgroup level. - -#layoutA = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 1, 16]>> -#layoutB = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 1, 16]>> -#layoutC = #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>, <[ BATCHY, LANEX], [1, 16]>> - -#layoutA2 = #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 1, 16]>> -#layoutB2 = #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [1, 1, 16]>, <[ BATCHY, LANEX], [1, 16]>> -#layoutC2 = #iree_vector_ext.layout<<[ BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>, <[ BATCHY, LANEX], [1, 16]>> - -// CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + (s0 floordiv 32) * 16)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 16)> -// CHECK-LABEL: func.func @resolve_wmma_layout_conflict_with_shared_memory -func.func @resolve_wmma_layout_conflict_with_shared_memory(%15 : vector<16x16xf16>, - %14 : vector<16x16xf16>, - %16 : vector<16x16xf32>, - %35 : vector<16x16xf16>, - %33 : vector<16x16xf32>) - -> vector<16x16xf32> - attributes {translation_info = #iree_codegen.translation_info} { - - %A = iree_vector_ext.to_layout %15 to layout(#layoutA) : vector<16x16xf16> - %B = iree_vector_ext.to_layout %14 to layout(#layoutB) : vector<16x16xf16> - %C = iree_vector_ext.to_layout %16 to layout(#layoutC) : vector<16x16xf32> - - %M1 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout} - %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> - - %TM1 = arith.truncf %M1 : vector<16x16xf32> to vector<16x16xf16> - - %A2 = iree_vector_ext.to_layout %35 to layout(#layoutA2) : vector<16x16xf16> - %B2 = iree_vector_ext.to_layout %TM1 to layout(#layoutB2) : vector<16x16xf16> - %C2 = iree_vector_ext.to_layout %33 to layout(#layoutC2) : vector<16x16xf32> - - %M2 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - iree.amdgpu.mma = #iree_gpu.mma_layout} - %A2, %B2, %C2 : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> - - func.return %M2 : vector<16x16xf32> -} -// CHECK-NOT: iree_vector_ext.layout_conflict_resolution -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index - -// CHECK: %[[VEC_INIT:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x16xf16 -// CHECK: %[[TID_X:.+]] = gpu.thread_id x -// CHECK: %[[TID_Y:.+]] = gpu.thread_id y -// CHECK: %[[TID_Z:.+]] = gpu.thread_id z -// CHECK: %[[SUBGROUP_OFFSET:.+]] = affine.apply #[[$MAP0]]()[%[[TID_X]], %[[TID_Y]], %[[TID_Z]]] -// CHECK: %[[ALLOC:.+]] = 
memref.alloc() : memref<16x32xf16, #gpu.address_space> -// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][0, %[[SUBGROUP_OFFSET]]] [16, 16] [1, 1] -// CHECK: %[[HALF_LANE_ID:.+]] = affine.apply #[[$MAP1]]()[%[[TID_X]]] -// CHECK-COUNT-8: vector.store %{{.+}}, %[[SUBVIEW]][%{{.+}}, %[[HALF_LANE_ID]]] -// CHECK-AFTER: gpu.barrier - -// CHECK: %[[LANE_OFFSET:.+]] = arith.addi %[[SUBGROUP_OFFSET]], %[[HALF_LANE_ID]] -// CHECK: %[[LOAD0:.+]] = vector.load %[[ALLOC]][%[[C0]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT0:.+]] = vector.insert_strided_slice %[[LOAD0]], %[[VEC_INIT]] {offsets = [0, 0, 0], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD1:.+]] = vector.load %[[ALLOC]][%[[C1]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT1:.+]] = vector.insert_strided_slice %[[LOAD1]], %[[INSERT0]] {offsets = [0, 0, 1], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD2:.+]] = vector.load %[[ALLOC]][%[[C2]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[LOAD2]], %[[INSERT1]] {offsets = [0, 0, 2], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD3:.+]] = vector.load %[[ALLOC]][%[[C3]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT3:.+]] = vector.insert_strided_slice %[[LOAD3]], %[[INSERT2]] {offsets = [0, 0, 3], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK: %[[LOAD4:.+]] = vector.load %[[ALLOC]][%[[C4]], %[[LANE_OFFSET]]] -// CHECK: %[[INSERT4:.+]] = vector.insert_strided_slice %[[LOAD4]], %[[INSERT3]] {offsets = [0, 0, 4], strides = [1]} : vector<1xf16> into vector<1x1x16xf16> -// CHECK-COUNT-11: %[[LOADN:.+]] = vector.load %[[ALLOC]] -// CHECK-AFTER: vector.insert_strided_slice %[[LOADN]] - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func {experimental = true} : !transform.any_op - transform.yield - } -} - -// ----- - -// This test is used to ensure that we are handling cases -// where the same arith.constant has multiple users with different layouts. - -// The main motivation is to ensure we can distribute attention when the tile -// sizes for M, K1, and N are the same, which means the init of the first -// contract and the IV's init use the same constant. 
- -#layoutA = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, LANEY, VECTORX], [2, 4, 8]>> -#layoutB = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>> - -builtin.module attributes { transform.with_named_sequence } { - func.func @resolve_constant_with_multiple_layout_uses(%A : vector<64x64xf16>, %B : vector<64x64xf16>) -> vector<64x64xf16> { - %a = iree_vector_ext.to_layout %A to layout(#layoutA) : vector<64x64xf16> - %b = iree_vector_ext.to_layout %B to layout(#layoutB) : vector<64x64xf16> - %zero = arith.constant dense<0.0> : vector<64x64xf16> - %add_0 = arith.addf %a, %zero : vector<64x64xf16> - %add_1 = arith.addf %b, %zero : vector<64x64xf16> - %layout_change = iree_vector_ext.to_layout %add_1 to layout(#layoutA) : vector<64x64xf16> - %out = arith.addf %layout_change, %add_0 : vector<64x64xf16> - func.return %out : vector<64x64xf16> - } -// CHECK-LABEL: func.func @resolve_constant_with_multiple_layout_uses -// CHECK-SAME: (%[[ARG0:.+]]: vector<64x64xf16>, %[[ARG0:.+]]: vector<64x64xf16>) -// CHECK: %[[V0:.+]] = arith.constant dense<0.000000e+00> : vector<2x2x8xf16> -// CHECK: %[[V1:.+]] = arith.constant dense<0.000000e+00> : vector<2x2x16xf16> -// CHECK: %[[ADD0:.+]] = arith.addf %{{.+}}, %[[V0]]{{.*}} : vector<2x2x8xf16> -// CHECK: %[[ADD1:.+]] = arith.addf %{{.+}}, %[[V1]]{{.*}} : vector<2x2x16xf16> -// CHECK: arith.addf %{{.+}}, %[[ADD0]]{{.*}} : vector<2x2x8xf16> - - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op - transform.yield - } -} - -// ----- - -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [2, 4, 4]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]> -#layout0 = #iree_vector_ext.layout<#row_layout, #col_layout> -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 4, 8]> -#layout1 = #iree_vector_ext.layout<#row_layout2, #col_layout> -#row_layout3 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [4, 2, 4]> -#layout2 = #iree_vector_ext.layout<#row_layout3, #col_layout> - -func.func @resolved_layout_conflict(%a : memref<32x16xf16>, %b : memref<32x16xf16>) { - // CHECK: func.func @resolved_layout_conflict(%[[MEM:.*]]: memref<32x16xf16>, %[[MEM1:.*]]: memref<32x16xf16> - // CHECK-DAG: %[[CST0:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x8xf16> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - // CHECK-COUNT-8: vector.load %[[MEM]] - %vec = vector.transfer_read %a[%c0, %c0], %cst : memref<32x16xf16>, vector<32x16xf16> - %vecl = iree_vector_ext.to_layout %vec to layout(#layout1) : vector<32x16xf16> - // CHECK: %[[R0:.+]] = vector.insert_strided_slice {{.*}} {offsets = [0, 0, 7], strides = [1]} : vector<1xf16> into vector<1x1x8xf16> - // CHECK: %[[ADD:.*]] = arith.addf %[[R0]], %[[R0]] {{.*}} : vector<1x1x8xf16> - %vec2 = arith.addf %vecl, %vecl : vector<32x16xf16> - %vec2l = iree_vector_ext.to_layout %vec2 to layout(#layout0) : vector<32x16xf16> - // CHECK: %[[R1:.*]] = vector.extract_strided_slice %[[ADD]] {offsets = [0, 0, 0], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x8xf16> to vector<1x1x4xf16> - // CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[ADD]] {offsets = [0, 0, 4], sizes = [1, 1, 4], strides = [1, 1, 1]} : vector<1x1x8xf16> 
to vector<1x1x4xf16> - vector.transfer_write %vec2l, %b[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16> - // CHECK-COUNT-8: vector.store {{.*}}, vector<1xf16> - func.return -} - -func.func @unresolved_layout_conflict(%a : memref<32x16xf16>, %b : memref<32x16xf16>) { - // CHECK: func.func @unresolved_layout_conflict(%[[MEM:.*]]: memref<32x16xf16>, %[[MEM1:.*]]: memref<32x16xf16> - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f16 - %vcst = arith.constant dense<0.0> : vector<32x16xf16> - // CHECK-COUNT-8: vector.load %[[MEM]] - %vec = vector.transfer_read %a[%c0, %c0], %cst : memref<32x16xf16>, vector<32x16xf16> - %vecl = iree_vector_ext.to_layout %vec to layout(#layout1) : vector<32x16xf16> - // CHECK: iree_vector_ext.to_layout {{.*}} - %vec2 = arith.addf %vecl, %vcst : vector<32x16xf16> - // CHECK-COUNT-16: vector.store {{.*}}, vector<1xf16> - %vec2l = iree_vector_ext.to_layout %vec2 to layout(#layout2) : vector<32x16xf16> - vector.transfer_write %vec2l, %b[%c0, %c0] {in_bounds = [true, true]} : vector<32x16xf16>, memref<32x16xf16> - func.return -} - -builtin.module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.test_gpu_vector_distribution %top_level_func {experimental = true} : !transform.any_op - transform.yield - } -} diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp index ac8ae7386f55..cc2649823f4e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp @@ -1120,14 +1120,10 @@ transform_dialect::TestGpuVectorDistribution::applyToOne( rewriter.create(target.getLoc(), gpu::Dimension::x); populateGPUDistributionPatterns(patterns); - populateGPUDistributionLayoutAttrPatterns(laneId, patterns); - populateGPUReductionDistributionPatterns(patterns); // For testing we use subgroup size = 64. populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, /*subgroupSize=*/64); populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns); - if (getExperimental()) - populateGPULayoutResolutionDistributionPatterns(patterns); if (failed(distributeVectorOps(target, patterns, options))) { return emitDefaultDefiniteFailure(target); } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir b/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir index 6533a09e6d5a..03f581ee7552 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir @@ -1,6 +1,15 @@ // RUN: iree-opt -iree-transform-dialect-interpreter --split-input-file %s --verify-diagnostics -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate the layout from transfer_read to everyone. 
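// Reading aid for the nested_layout attributes introduced in this file (an
// informal note, not extra test input): per dimension, the tile sizes are
// expected to multiply out to the vector size, i.e.
//   subgroup_tile * batch_tile * outer_tile * thread_tile * element_tile.
// For the layout above that is 1 * 1 * 1 * 1 * 16 = 16 in each dimension,
// matching the vector<16x16xf16> values it is attached to; only element_tile
// is non-trivial, which is how the old <[VECTORY], [16]>, <[VECTORX], [16]>
// layout is expressed in the new attribute.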
builtin.module attributes { transform.with_named_sequence } { @@ -8,14 +17,14 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %c = arith.mulf %rootl, %b : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %d = arith.addf %c, %a : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %e = arith.select %cond, %c, %d : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} func.return %e : vector<16x16xf16> } @@ -28,7 +37,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Enforce the layout from the transfer_write to everyone builtin.module attributes { transform.with_named_sequence } { @@ -36,11 +54,11 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %cst0 = arith.constant dense<0.0> : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %c = arith.mulf %cst0, %b : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %d = arith.addf %c, %a : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %dl = iree_vector_ext.to_layout %d to layout(#layout) : vector<16x16xf16> vector.transfer_write %dl, %arr[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16> func.return %d : vector<16x16xf16> @@ -55,7 +73,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // First propagate the layout, and then enforce it up. 
builtin.module attributes { transform.with_named_sequence } { @@ -63,16 +90,16 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %c = arith.mulf %rootl, %b : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %d = arith.addf %c, %a : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %e = arith.divf %d, %root2 : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} func.return %e : vector<16x16xf16> } @@ -85,7 +112,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through reduction. 
builtin.module attributes { transform.with_named_sequence } { @@ -93,20 +129,20 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %cst0_1 = arith.constant dense<0.0> : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0], %cst_0 {in_bounds = [true]} : memref<16xf16>, vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %root_red = vector.multi_reduction, %rootl, %cst0_1 [0] : vector<16x16xf16> to vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %c = arith.mulf %root_red, %b : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %d = arith.addf %c, %a : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} %e = arith.divf %d, %root2 : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [8]}} func.return %e : vector<16xf16> } @@ -119,7 +155,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through transpose and then reduction. 
builtin.module attributes { transform.with_named_sequence } { @@ -127,22 +172,22 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst_0 = arith.constant 0.0 : f16 %cst0_1 = arith.constant dense<0.0> : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0], %cst_0 {in_bounds = [true]} : memref<16xf16>, vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root_transpose = vector.transpose %rootl, [1, 0] : vector<16x16xf16> to vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>, <[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [8, 16]}} %root_red = vector.multi_reduction, %root_transpose, %cst0_1 [0] : vector<16x16xf16> to vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %c = arith.mulf %root_red, %b : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %d = arith.addf %c, %a : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %e = arith.divf %d, %root2 : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} func.return %e : vector<16xf16> } @@ -155,9 +200,38 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layoutA = #iree_vector_ext.layout<<[VECTORX], [32]>, <[VECTORY], [64]>> -#layoutB = #iree_vector_ext.layout<<[VECTORX], [128]>, <[VECTORY], [64]>> -#layoutC = #iree_vector_ext.layout<<[VECTORY], [128]>, <[VECTORX], [32]>> +#layoutA = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [32, 64], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> + +#layoutB = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [128, 64], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> + +#layoutC = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [128, 32], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> @@ -171,7 +245,7 @@ builtin.module attributes { transform.with_named_sequence } { %c = iree_vector_ext.to_layout %C to layout(#layoutC) : vector<128x32xf32> // Check if the layout of %C was properly propagated to %D. 
- // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [128]>, <[ VECTORX], [32]>>}} + // expected-remark @below {{element_tile = [128, 32]}} %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], @@ -190,21 +264,30 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[VECTORX], [16]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 16], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate the layout from transfer_read to everyone. builtin.module attributes { transform.with_named_sequence } { func.func @gather(%base: memref<16x16xf16>, %arr: memref<16x16xindex>) -> vector<16x16xf16> { %c0 = arith.constant 0 : index %mask = arith.constant dense : vector<16x16xi1> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %pass = arith.constant dense<0.000000e+00> : vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %index = vector.transfer_read %arr[%c0, %c0], %c0 {in_bounds = [true, true]} : memref<16x16xindex>, vector<16x16xindex> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} %index_dist = iree_vector_ext.to_layout %index to layout(#layout) : vector<16x16xindex> %c = vector.gather %base[%c0, %c0] [%index_dist], %mask, %pass : memref<16x16xf16>, vector<16x16xindex>, vector<16x16xi1>, vector<16x16xf16> into vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ VECTORX], [16]>>}} + // expected-remark @above {{element_tile = [16, 16]}} func.return %c : vector<16x16xf16> } @@ -224,25 +307,44 @@ builtin.module attributes { transform.with_named_sequence } { // Useful proxy for ensuring that layout conversions on attention // happens where we intend it to happen. 
-#layoutA = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>> -#layoutB = #iree_vector_ext.layout<<[BATCHY, LANEX], [2, 32]>, <[BATCHX, LANEY, VECTORX], [2, 4, 8]>> +#layoutA = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [2, 2], + outer_tile = [1, 4], + thread_tile = [32, 2], + element_tile = [1, 4], + + subgroup_strides = [0, 0], + thread_strides = [2, 1] +> + +#layoutB = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [2, 2], + outer_tile = [1, 1], + thread_tile = [32, 4], + element_tile = [1, 8], + + subgroup_strides = [0, 0], + thread_strides = [4, 1] +> builtin.module attributes { transform.with_named_sequence } { func.func @resolve_select(%A : vector<64x64xf16>, %B : vector<64x64xf16>, %condition : i1) -> vector<64x64xf16> { %a = iree_vector_ext.to_layout %A to layout(#layoutA) : vector<64x64xf16> %b = iree_vector_ext.to_layout %B to layout(#layoutB) : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %offset_0 = arith.constant dense<2.0> : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %offset_1 = arith.constant dense<4.0> : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %sel = arith.select %condition, %offset_0, %offset_1 : vector<64x64xf16> - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>>}} + // expected-remark @below {{element_tile = [1, 4]}} %add = arith.addf %a, %sel : vector<64x64xf16> %add_layout = iree_vector_ext.to_layout %add to layout(#layoutB) : vector<64x64xf16> // CHECK-COUNT-3: iree_vector_ext.to_layout - // expected-remark @below {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, LANEX], [2, 32]>, <[ BATCHX, LANEY, VECTORX], [2, 4, 8]>>}} + // expected-remark @below {{element_tile = [1, 8]}} %add_1 = arith.addf %add_layout, %b : vector<64x64xf16> func.return %add_1 : vector<64x64xf16> } @@ -256,7 +358,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through scf.for builtin.module attributes { transform.with_named_sequence } { @@ -266,25 +377,24 @@ builtin.module attributes { transform.with_named_sequence } { %c1024 = arith.constant 1024 : index %cst_0 = arith.constant 0.0 : f16 %cst0_1 = arith.constant dense<0.0> : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} - + // expected-remark @above {{element_tile = [16]}} %out = scf.for %iv = %c0 to %c1024 step %c1 iter_args(%arg1 = %cst0_1) -> (vector<16xf16>) { - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ 
VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %root2 = vector.transfer_read %arr2[%c0], %cst_0 {in_bounds = [true]} : memref<16xf16>, vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %root_transpose = vector.transpose %rootl, [1, 0] : vector<16x16xf16> to vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHY, VECTORX], [2, 8]>, <[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [8, 16]}} %root_red = vector.multi_reduction, %root_transpose, %arg1 [0] : vector<16x16xf16> to vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %c = arith.mulf %root_red, %b : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %d = arith.addf %c, %a : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} %e = arith.divf %d, %root2 : vector<16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>>}} + // expected-remark @above {{element_tile = [16]}} scf.yield %e : vector<16xf16> } @@ -565,7 +675,16 @@ builtin.module attributes { transform.with_named_sequence } { // ----- -#layout = #iree_vector_ext.layout<<[VECTORY], [16]>, <[BATCHY, VECTORX], [2, 8]>> +#layout = #iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 2], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [16, 8], + + subgroup_strides = [0, 0], + thread_strides = [0, 0] +> // Propagate and enforce through scf.for builtin.module attributes { transform.with_named_sequence } { @@ -578,7 +697,7 @@ builtin.module attributes { transform.with_named_sequence } { %out = scf.for %iv = %c0 to %c1024 step %c1 iter_args(%arg1 = %cst) -> (vector) { %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> - // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ VECTORY], [16]>, <[ BATCHY, VECTORX], [2, 8]>>}} + // expected-remark @above {{element_tile = [16, 8]}} %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x16xf16> %init = vector.extractelement %arg1[] : vector %root_red = vector.multi_reduction, %rootl, %init [0, 1] : vector<16x16xf16> to f16 diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp index 92134cb9e1f1..d4062bbf703b 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp @@ -25,219 +25,6 @@ namespace mlir::iree_compiler::IREE::VectorExt { using VectorValue = TypedValue; -bool PerDimLayoutAttr::contains(const LayoutDimension 
&dim) { - for (LayoutDimensionAttr label : getLabels()) { - if (label.getValue() == dim) - return true; - } - return false; -} - -std::optional PerDimLayoutAttr::getShape(const LayoutDimension &dim) { - for (auto value : llvm::zip(getLabels(), getShapes())) { - if (dim == std::get<0>(value).getValue()) - return std::get<1>(value); - } - return std::nullopt; -} - -std::optional LayoutAttr::getShape(const LayoutDimension &dim) const { - for (PerDimLayoutAttr layout : getLayouts()) { - std::optional maybeShape = layout.getShape(dim); - if (maybeShape) - return maybeShape.value(); - } - return std::nullopt; -} - -// Get the SIMT Vector shape in the order specified by dims. If no dims are -// specified, then return an empty vector. -LogicalResult LayoutAttr::isValidLayout(ShapedType shapeTy, - Location loc) const { - ArrayRef shape = shapeTy.getShape(); - if (shape.size() != getRank()) { - return emitError(loc, "Rank of vector (") - << shape.size() << ") does not match rank of layout (" << getRank() - << ")."; - } - for (auto [idx, layout] : llvm::enumerate(getLayouts())) { - ArrayRef layoutShape = layout.getShapes(); - int64_t expectedShape = - std::reduce(layoutShape.begin(), layoutShape.end(), - static_cast(1), std::multiplies()); - if (expectedShape != shape[idx]) { - std::string shapeStr; - llvm::raw_string_ostream shapeOs(shapeStr); - llvm::interleaveComma(shape, shapeOs); - std::string layoutStr; - llvm::raw_string_ostream layoutOs(layoutStr); - printStripped(layoutOs); - return emitError(loc, "Vector shape: [") - << shapeStr << "] does not match the layout (" << layoutStr - << ") at dim " << idx - << ". Dimension expected by layout: " << expectedShape - << " actual: " << shape[idx]; - } - } - return success(); -} - -// Project out the layout for the specified dimensions -// resulting in the layout for a lower dimensional vector. -VectorLayoutInterface LayoutAttr::project(ArrayRef droppedDims) const { - assert(droppedDims.size() == getRank() && - "droppedDims size must match layout size"); - - ArrayRef layouts = getLayouts(); - SmallVector newLayouts; - for (auto pair : llvm::zip(droppedDims, layouts)) { - if (!std::get<0>(pair)) - newLayouts.push_back(std::get<1>(pair)); - } - return LayoutAttr::get(getContext(), newLayouts); -} - -// Permute the layout according to the provided permutation -// vector. The dimensionality of the layout remains the same. -VectorLayoutInterface LayoutAttr::permute(ArrayRef permutation) const { - assert(permutation.size() == getRank() && - "permutation size must match layout rank"); - - ArrayRef layouts = getLayouts(); - SmallVector newLayouts; - for (unsigned index : permutation) { - assert(index >= 0 && index < getRank()); - newLayouts.push_back(layouts[index]); - } - return LayoutAttr::get(getContext(), newLayouts); -} - -// This function returns the distributed shape of the SIMT -// vector and evaluates it in the following order: -// BATCHX, BATCHY, VECTORY, VECTORX -// The vector dimensions are combined into a single SIMT -// vector dimension. 
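// Illustration only: a standalone sketch (not IREE API; all names here are
// invented) of the ordering documented above and implemented below. Batch
// dims are emitted in BATCHX, BATCHY order, all VECTOR* dims fold into one
// trailing SIMT dim, and lane dims never appear in the per-thread shape.
#include <cstdint>
#include <string>
#include <vector>

struct DimEntry {
  std::string label;
  int64_t shape;
};
using PerDimLayout = std::vector<DimEntry>;

std::vector<int64_t> distributedShape(const std::vector<PerDimLayout> &layouts) {
  std::vector<int64_t> simtShape;
  int64_t vectorShape = 1;
  bool sawVector = false;
  for (const char *wanted : {"BATCHX", "BATCHY", "VECTORY", "VECTORX"}) {
    for (const PerDimLayout &dimLayout : layouts) {
      for (const DimEntry &entry : dimLayout) {
        if (entry.label != wanted)
          continue;
        if (entry.label.rfind("VECTOR", 0) == 0) {
          vectorShape *= entry.shape; // VECTOR* dims multiply together.
          sawVector = true;
        } else {
          simtShape.push_back(entry.shape); // BATCH* dims keep their own dim.
        }
      }
    }
  }
  if (sawVector)
    simtShape.push_back(vectorShape);
  return simtShape;
}

// Example: for <[BATCHX, LANEY, VECTORX], [1, 4, 8]>, <[BATCHY, LANEX], [1, 16]>
// (used by the tests earlier in this diff) this yields {1, 1, 8}, which is why
// those tests build vector<1x1x8xf16> accumulators.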
-SmallVector LayoutAttr::getDistributedShape() const { - SmallVector labels{ - LayoutDimension::BATCHX, LayoutDimension::BATCHY, - LayoutDimension::VECTORY, LayoutDimension::VECTORX}; - SmallVector simtVectorShape; - std::optional vectorShape; - for (LayoutDimension dim : labels) { - ArrayRef layouts = getLayouts(); - for (PerDimLayoutAttr layout : layouts) { - if (!layout.contains(dim)) - continue; - int64_t shape = layout.getShape(dim).value(); - if (isVectorDimension(dim)) { - vectorShape = shape * vectorShape.value_or(1); - continue; - } - simtVectorShape.push_back(shape); - } - } - if (vectorShape) - simtVectorShape.push_back(vectorShape.value()); - return simtVectorShape; -} - -PerDimLayoutAttr LayoutAttr::getDimLayout(int64_t dim) const { - assert(dim >= 0 && dim < getRank()); - return getLayouts()[dim]; -} - -std::optional LayoutAttr::getBatchDim(int64_t dim) { - assert(dim < getRank()); - PerDimLayoutAttr layout = getDimLayout(dim); - for (auto [name, shape] : - llvm::zip_equal(layout.getLabels(), layout.getShapes())) { - if (isBatchDimension(name.getValue())) - return shape; - } - return std::nullopt; -} - -std::optional LayoutAttr::getLaneDim(int64_t dim) { - assert(dim < getRank()); - PerDimLayoutAttr layout = getDimLayout(dim); - for (auto [name, shape] : - llvm::zip_equal(layout.getLabels(), layout.getShapes())) { - if (isLaneDimension(name.getValue())) - return shape; - } - return std::nullopt; -} - -std::optional LayoutAttr::getLane(int64_t dim) { - assert(dim < getRank()); - PerDimLayoutAttr layout = getDimLayout(dim); - for (auto [name, shape] : - llvm::zip_equal(layout.getLabels(), layout.getShapes())) { - if (isLaneDimension(name.getValue())) - return name.getValue(); - } - return std::nullopt; -} - -int64_t LayoutAttr::getRank() const { return getLayouts().size(); } - -std::tuple LayoutAttr::getLaneGrid() { - int64_t laneX = 1; - int64_t laneY = 1; - int64_t laneZ = 1; - for (PerDimLayoutAttr dimLayout : getLayouts()) { - // Note that valid layouts only include at most one instance of each - // dimension type, so this is simply doing assignment on the first instance - // of each lane index, not an accumulative product. - auto maybeXShape = dimLayout.getShape(LayoutDimension::LANEX); - laneX *= maybeXShape.value_or(1); - auto maybeYShape = dimLayout.getShape(LayoutDimension::LANEY); - laneY *= maybeYShape.value_or(1); - auto maybeZShape = dimLayout.getShape(LayoutDimension::LANEZ); - laneZ *= maybeZShape.value_or(1); - } - return std::make_tuple(laneX, laneY, laneZ); -} - -uint64_t LayoutAttr::getShuffleOffset(int64_t reductionDim) { - uint64_t offset = 0; - std::optional laneDim = getLane(reductionDim); - if (!laneDim) - return offset; - switch (laneDim.value()) { - case LayoutDimension::LANEX: - offset = 1; - break; - case LayoutDimension::LANEY: - offset = getShape(LayoutDimension::LANEX).value_or(0); - break; - case LayoutDimension::LANEZ: - offset = getShape(LayoutDimension::LANEX).value_or(0) * - getShape(LayoutDimension::LANEY).value_or(0); - break; - default: - assert(false && "Invalid dimension! 
Expected lane dimension"); - break; - } - return offset; -} - -bool LayoutAttr::hasLaneConflictWith(const LayoutAttr &other) { - SmallVector laneDims{ - LayoutDimension::LANEX, LayoutDimension::LANEY, LayoutDimension::LANEZ}; - for (LayoutDimension dim : laneDims) { - std::optional shape = getShape(dim); - std::optional otherShape = other.getShape(dim); - if ((shape && !otherShape) || (!shape && otherShape)) - return true; - if (shape && otherShape) { - if (shape.value() != otherShape.value()) - return true; - } - } - return false; -} - // Project the nested layout. This take a mask on the dimensions of the vector // associated with this layout and projects out those dimensions. This reduces // the rank of the layout in the process. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td index 913fb9f92dd3..c401e67a2dab 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td @@ -13,98 +13,6 @@ include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtBase.td" // Vector layout attributes //===---------------------------------------------------------------------===// -// Defines the batch dimensions for the original SIMD tensor. -// By convention, X is along rows and Y along columns. -def BATCHX : I32EnumAttrCase<"BATCHX", 0>; -def BATCHY : I32EnumAttrCase<"BATCHY", 1>; -// Defines the vector dimension. -def VECTORX : I32EnumAttrCase<"VECTORX", 2>; -def VECTORY : I32EnumAttrCase<"VECTORY", 3>; -def VECTORZ : I32EnumAttrCase<"VECTORZ", 4>; -// Defines the lane dimensions. -def LANEX : I32EnumAttrCase<"LANEX", 5>; -def LANEY : I32EnumAttrCase<"LANEY", 6>; -def LANEZ : I32EnumAttrCase<"LANEZ", 7>; - -def LayoutDimension : IREEVectorExt_I32EnumAttr<"LayoutDimension", - "Describes the dimension of the high-dimensional layout", [ - BATCHX, - BATCHY, - VECTORX, - VECTORY, - VECTORZ, - LANEX, - LANEY, - LANEZ, - ]>; - -def LayoutDimensionAttr : IREEVectorExt_EnumAttr; - -def PerDimLayoutAttr : IREEVectorExt_Attr<"PerDimLayout"> { - let mnemonic = "per_dim_layout"; - let summary = [{high-dimensional vector register layout for a given vector dimension}]; - let description = [{ - This attribute describes the per dimension register layout for a given vector - that could be prescribed by an operator such as matrix multiplication. - This is a way to explicitly represent the layout in the IR - when it is in the SIMD form prior to converting to the SIMT form so that - we can reason about layouts, propagating layouts and layout conflicts. - }]; - let parameters = (ins - ArrayRefParameter<"LayoutDimensionAttr", "labels for the high dimensional layout dims">:$labels, - ArrayRefParameter<"int64_t", "shapes for the high dimensional layout dims">:$shapes - ); - let assemblyFormat = "`<``[` $labels `]``,` `[` $shapes `]``>`"; - let genVerifyDecl = 0; - let extraClassDeclaration = [{ - std::optional getShape(const LayoutDimension &dim); - bool contains(const LayoutDimension &dim); - }]; -} - -def LayoutAttr : IREEVectorExt_Attr<"Layout", - [ DeclareAttrInterfaceMethods ]> { - let mnemonic = "layout"; - let summary = [{high-dimensional vector register layout for a given vector}]; - let description = [{ - This contains a complete specification of the layout for a given vector, - whereas the attribute above only specifies the per dimension layout. 
- }]; - let parameters = (ins - ArrayRefParameter<"PerDimLayoutAttr", "layout for each dimension of the vector">:$layouts - ); - let assemblyFormat = "`<`$layouts`>`"; - let genVerifyDecl = 0; - let extraClassDeclaration = [{ - // Get the shape for a given layout dimension. - std::optional getShape(const LayoutDimension &dim) const; - std::optional getBatchDim(int64_t dim); - // Get the lane dimension shape for a provided simd tensor dim. - std::optional getLaneDim(int64_t dim); - // Get the lane dimension for a provided simd tensor dim. - std::optional getLane(int64_t dim); - - // Returns the grid of lane ids. Assumes a valid layout. - ::std::tuple getLaneGrid(); - PerDimLayoutAttr getDimLayout(int64_t dim) const; - - // Given the reduction dim, computes the shuffle offset - // based on the shapes of the lane dimensions. The shuffle - // offset is used during the thread global reduction - // when emitting a gpu::ShuffleOp and follows - // the semantics of the offset operand defined there, - // which is that for lane k, the shuffle op returns the - // value from lane k ^ offset. - uint64_t getShuffleOffset(int64_t reductionDim); - - // Determines whether the other layout has a lane - // dimension that the current layout does not have OR whether - // the shape of the two layouts for a common lane dimension - // is not the same. - bool hasLaneConflictWith(const LayoutAttr &other); - }]; -} - def NestedLayoutAttr : IREEVectorExt_Attr<"NestedLayout", [ DeclareAttrInterfaceMethods ]> { let mnemonic = "nested_layout"; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp index ba32c2326cb2..8c5abb9211cf 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.cpp @@ -23,10 +23,6 @@ namespace mlir::iree_compiler::IREE::VectorExt { struct IREEVectorExtDialectOpAsmInterface : public OpAsmDialectInterface { using OpAsmDialectInterface::OpAsmDialectInterface; AliasResult getAlias(Attribute attr, raw_ostream &os) const override { - if (llvm::isa(attr)) { - os << "layout"; - return AliasResult::OverridableAlias; - } if (llvm::isa(attr)) { os << "nested"; return AliasResult::OverridableAlias; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp index c4da2ae68a09..7801bc54a99a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp @@ -37,163 +37,6 @@ OpFoldResult ToSIMTOp::fold(FoldAdaptor) { return {}; } -void LayoutIterator::maybeFreezeAndConcatenate( - const LayoutIterator::State &frozenState) { - for (auto &[frozenDim, frozenIt] : frozenState.iterators) { - if (!state.contains(frozenDim)) { - frozenDimensions.insert(frozenDim); - state[frozenDim] = frozenIt; - state.ranges[frozenDim] = frozenState.ranges.lookup(frozenDim); - } - } -} - -void LayoutIterator::initialize(const PerDimLayoutAttr &attr, - DenseMap strides, - std::optional simdIndex) { - auto reversedLabels = llvm::reverse(attr.getLabels()); - auto reversedShapes = llvm::reverse(attr.getShapes()); - for (auto [nameAttr, shape] : llvm::zip(reversedLabels, reversedShapes)) { - LayoutDimension dim = nameAttr.getValue(); - if (isLaneDimension(dim)) - continue; - int64_t stride = strides.contains(dim) ? 
strides[dim] : 1; - state.ranges[dim] = DimensionalRange(0, shape, stride); - state.iterators[dim] = state.ranges[dim].begin(); - maxIterations *= shape / stride; - if (simdIndex) { - int64_t index = simdIndex.value(); - if (!state.simdToLayoutDim.contains(index)) - state.simdToLayoutDim[index] = {}; - state.simdToLayoutDim[index].insert(dim); - } - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr, - DenseMap strides) { - for (auto perDimAttr : llvm::enumerate(attr.getLayouts())) { - initialize(perDimAttr.value(), strides, perDimAttr.index()); - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr) { - DenseMap strides; - for (auto [idx, attr] : llvm::enumerate(attr.getLayouts())) { - initialize(attr, strides, idx); - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr, - DenseMap strides, - int64_t simtIndex) { - for (auto [idx, attr] : llvm::enumerate(attr.getLayouts())) { - if (idx != simtIndex) - continue; - initialize(attr, strides, idx); - } -} - -LayoutIterator::LayoutIterator(LayoutAttr &attr, int64_t simtIndex) { - DenseMap strides; - for (auto [idx, attr] : llvm::enumerate(attr.getLayouts())) { - if (idx != simtIndex) - continue; - initialize(attr, strides, idx); - } -} - -LayoutIterator::LayoutIterator(PerDimLayoutAttr &attr, - DenseMap strides) { - initialize(attr, strides, std::nullopt); -} - -LayoutIterator &LayoutIterator::operator++() { - for (auto &[dim, it] : state.iterators) { - if (frozenDimensions.contains(dim)) - continue; - ++it; - if (it == state.ranges[dim].end()) { - it = state.ranges[dim].begin(); - continue; - } - break; - } - ++iterations; - return *this; -} - -/// The iterator is done when all the loops are complete. -bool LayoutIterator::iterationComplete() { return iterations == maxIterations; } - -void LayoutIterator::apply( - std::function callback) { - for (; !iterationComplete(); ++(*this)) { - callback(state); - } -} - -// Get the offset into the SIMT vector corresponding to the incoming iterator. -// The returned offsets will always be the same shape as the labels array. -// Groups vector dimensions together. Assumes last dimension is vector -// dimension. 
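// Worked example of the rule above, tied to the distribute_broadcast_vectory
// test earlier in this diff: for the result layout
// <[BATCHX, VECTORY], [1, 4]>, <[BATCHY, VECTORX], [1, 4]>, the VECTORY step
// is the VECTORX range (4), so an iterator at BATCHX=0, BATCHY=0, VECTORY=1,
// VECTORX=2 maps to the SIMT offset {0, 0, 1 * 4 + 2} = {0, 0, 6}; this is
// why that test inserts source element 1 at positions {1, 5, 9, 13} of the
// vector<1x1x16xf32> accumulator.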
-SmallVector LayoutIterator::State::computeSIMTIndex() const { - SmallVector offset; - std::optional vecOffset; - for (auto label : labels) { - for (auto [name, it] : iterators) { - if (name != label) - continue; - if (isBatchDimension(name)) { - offset.push_back(it.getPosition()); - continue; - } - if (isVectorDimension(name)) { - int64_t step{1}; - if (name == LayoutDimension::VECTORY) { - assert(ranges.contains(LayoutDimension::VECTORX) && - "Expected VectorX to be specified on layouts with VectorY."); - step = ranges.lookup(LayoutDimension::VECTORX).stop; - } - vecOffset = vecOffset.value_or(0) + it.getPosition() * step; - } - } - } - if (vecOffset) - offset.push_back(vecOffset.value()); - return offset; -} - -SmallVector -LayoutIterator::State::computeIteratorProjectedSIMTIndex() const { - SmallVector indices = computeSIMTIndex(); - SmallVector projectedIndices; - for (size_t i = 0, e = labels.size(); i != e; ++i) { - for (auto [name, it] : iterators) { - if (name == labels[i]) - projectedIndices.push_back(indices[i]); - } - } - return projectedIndices; -} - -void LayoutIterator::erase(LayoutDimension dim) { - if (state.contains(dim)) - state.erase(dim); -} - -LayoutIterator LayoutIterator::getBatchIterator() const { - LayoutIterator projectedIterator = *this; - for (auto [dim, it] : state.iterators) { - if (!isBatchDimension(dim)) { - DimensionalRange range = state.ranges.lookup(dim); - projectedIterator.maxIterations /= (range.stop / range.step); - projectedIterator.erase(dim); - } - } - return projectedIterator; -} - // clang-format off #define GET_OP_CLASSES #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.cpp.inc" // IWYU pragma: keep diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h index 408c95a80548..22241e9c7681 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h @@ -19,121 +19,4 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -namespace mlir::iree_compiler::IREE::VectorExt { - -/// Dimensional Strided Iterator class used to represent -/// an iterator through a single dimension of the layout. -class DimensionalIterator { -public: - DimensionalIterator(int64_t position = 0, int64_t stride = 1) - : position(position), stride(stride) {} - int64_t operator*() const { return position; } - DimensionalIterator &operator++() { - position += stride; - return *this; - } - - bool operator==(const DimensionalIterator &other) const { - return position == other.position; - } - bool operator!=(const DimensionalIterator &other) const { - return !(*this == other); - } - bool operator<(const DimensionalIterator &other) const { - return position < other.position; - } - - int64_t getPosition() const { return position; } - -private: - int64_t position, stride; -}; - -/// Dimensional Range class used to represent the range of -/// a particular dimension of the layout. Can be iterated on -/// using a DimensionalIterator. 
-class DimensionalRange { -public: - DimensionalRange() {} - DimensionalRange(int64_t start, int64_t stop, int64_t step = 1) - : start(start), stop(stop), step(step) {} - DimensionalIterator begin() const { return DimensionalIterator(start, step); } - DimensionalIterator end() const { return DimensionalIterator(stop, step); } - - int64_t start, stop, step; -}; - -// Iterator class for LayoutAttrs and PerDimLayoutAttrs. -// Provides O(1) access to state for any given dimension. -// Also preserves insertion order. -// Layout iterators skip lane dimensions as these are not -// required during distribution. -class LayoutIterator { -public: - struct State { - SmallVector computeSIMTIndex() const; - SmallVector computeIteratorProjectedSIMTIndex() const; - bool contains(LayoutDimension dim) const { return iterators.contains(dim); } - void erase(LayoutDimension dim) { iterators.erase(dim); } - DimensionalIterator lookup(LayoutDimension dim) const { - return iterators.lookup(dim); - } - DimensionalIterator &operator[](LayoutDimension dim) { - return iterators[dim]; - } - void print() const { - for (const auto &[dim, it] : iterators) { - llvm::outs() << stringifyLayoutDimension(dim).str() + ":" + - std::to_string(*it) + ", "; - } - llvm::outs() << "\n"; - } - llvm::MapVector iterators; - DenseMap> simdToLayoutDim; - llvm::MapVector ranges; - SmallVector labels{ - LayoutDimension::BATCHX, LayoutDimension::BATCHY, - LayoutDimension::VECTORY, LayoutDimension::VECTORX}; - }; - void maybeFreezeAndConcatenate(const LayoutIterator::State &frozenState); - LayoutIterator(LayoutAttr &attr); - LayoutIterator(LayoutAttr &attr, int64_t simtIndex); - LayoutIterator(LayoutAttr &attr, DenseMap strides); - LayoutIterator(LayoutAttr &attr, DenseMap strides, - int64_t simtIndex); - LayoutIterator(PerDimLayoutAttr &attr, - DenseMap strides); - void apply(std::function); - LayoutIterator &operator++(); - State getState() const { return state; } - void erase(LayoutDimension dim); - LayoutIterator getBatchIterator() const; - bool iterationComplete(); - -private: - void initialize(const PerDimLayoutAttr &attr, - DenseMap strides, - std::optional simdIndex); - State state; - DenseSet frozenDimensions; - int64_t iterations{0}; - int64_t maxIterations{1}; -}; - -inline bool isBatchDimension(LayoutDimension dim) { - return (dim == LayoutDimension::BATCHX) || (dim == LayoutDimension::BATCHY); -} - -inline bool isLaneDimension(LayoutDimension dim) { - return (dim == LayoutDimension::LANEX) || (dim == LayoutDimension::LANEY) || - (dim == LayoutDimension::LANEZ); -} - -inline bool isVectorDimension(LayoutDimension dim) { - return (dim == LayoutDimension::VECTORX) || - (dim == LayoutDimension::VECTORY) || (dim == LayoutDimension::VECTORZ); -} - -} // namespace mlir::iree_compiler::IREE::VectorExt - #endif // IREE_DIALECTS_DIALECT_VECTOREXT_IR_VECTOREXTOPS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir index 86c7753fa23f..0b2d31176589 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/invalid.mlir @@ -1,13 +1,20 @@ // RUN: iree-opt --split-input-file --verify-diagnostics %s -#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX, VECTORY], [1, 1, 1]> -#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [4, 2, 4]> -#layout1 = #iree_vector_ext.layout<#row_layout1, #col_layout1> +#layout1 = 
#iree_vector_ext.nested_layout< + subgroup_tile = [1, 1], + batch_tile = [1, 1], + outer_tile = [1, 1], + thread_tile = [1, 1], + element_tile = [1, 1], + + subgroup_strides = [0, 0], + thread_strides = [0, 0]> + func.func @invalid_layout(%lhs: memref<32x32xf16>, %rhs: memref<32x32xf16>) -> vector<32x32xf16> { %cst_0 = arith.constant 0.0 : f16 %c0 = arith.constant 0 : index %result = vector.transfer_read %lhs[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf16>, vector<32x32xf16> - // expected-error @+1 {{Vector shape: [32, 32] does not match the layout (layout<<[ BATCHX, LANEX, VECTORY], [1, 1, 1]>, <[ BATCHY, LANEY, VECTORX], [4, 2, 4]>>) at dim 0. Dimension expected by layout: 1 actual: 32}} + // expected-error @+1 {{Vector shape: [32, 32] does not match the layout (nested_layout) at dim 0. Dimension expected by layout: 1 actual: 32}} %2 = iree_vector_ext.to_layout %result to layout(#layout1) : vector<32x32xf16> return %2 : vector<32x32xf16> } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir index fc14c3b6bc92..4dfa22a06a0e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/test/roundtrip.mlir @@ -1,22 +1,5 @@ // RUN: iree-opt --split-input-file %s | FileCheck %s -#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX, VECTORY], [2, 4, 4]> -#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [4, 2, 4]> -#layout2 = #iree_vector_ext.layout<#col_layout1, #row_layout1> -func.func @specify_layout(%lhs: memref<32x32xf16>) -> vector<32x32xf16> { - %cst_0 = arith.constant 0.0 : f16 - %c0 = arith.constant 0 : index - %result = vector.transfer_read %lhs[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf16>, vector<32x32xf16> - %2 = iree_vector_ext.to_layout %result to layout(#layout2) : vector<32x32xf16> - return %2 : vector<32x32xf16> -} - -// CHECK-DAG: #[[$LAYOUT0:.+]] = #iree_vector_ext.layout<<[ BATCHY, LANEY, VECTORX], [4, 2, 4]>, <[ BATCHX, LANEX, VECTORY], [2, 4, 4]>> -// CHECK-LABEL: func.func @specify_layout -// CHECK: iree_vector_ext.to_layout {{.*}} to layout(#[[$LAYOUT0]]) - -// ----- - func.func @specify_inline_layout(%lhs: memref<32x32xf16>) -> vector<32x32xf16> { %cst_0 = arith.constant 0.0 : f16 %c0 = arith.constant 0 : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp deleted file mode 100644 index 47950df28a6b..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include - -#include "iree/compiler/Codegen/LLVMGPU/Passes.h" -#include "iree/compiler/Codegen/Utils/VectorOpUtils.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" - -namespace mlir::iree_compiler { - -#define GEN_PASS_DEF_AMDGPUPREPAREFORCHAINEDMATMULPASS -#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc" - -using VectorValue = TypedValue; - -namespace { - -/// Let's assume that we only have vector.contract with the standard indexing -/// maps: -/// (m, n, k), A: (m, k), B: (k, n), C: (m, n). 
-/// We will represent this contract operation by a "@". -/// -/// Given a matmul: -/// -/// C = A @ B -/// -/// This pass decides when to convert this matmul to: -/// -/// A.T = transpose(A) -/// B.T = transpose(B) -/// C.T = B.T @ A.T -/// C = transpose(C.T) -/// -/// This is useful when the "@" instruction that the hardware lowers to -/// has a specific layout (see VectorLayoutInterface for more information) -/// but the further uses of C expects a transposed layout to the produced -/// layout. -/// -/// For example, for "@" lowering to AMDGPU MFMA instructions, the operands -/// have layout L and L.T and the result has the layout L.T . -/// So if you have a chain of matmuls: -/// -/// C (L.T) = A (L) @ B (L.T) -/// E (L.T) = C (L.T) @ D (L.T) -/// ^^^^^^^ -/// Expected layout by instruction is L -/// -/// To fix this, we can apply this transformation on the first matrix: -/// -/// C.T (L.T) = B.T (L) @ A (L.T) -/// C (L) = transpose C.T (L.T) -/// E (L.T) = C (L) @ D (L.T) -/// ^^^^^ -/// Layout matches the instruction! -/// -/// Note that the mathematical formula -/// C = A @ B --> C.T = B.T @ A.T -/// is only defined on standard "@" function, it may be a different -/// transformation for other indexing maps. -struct AMDGPUPrepareForChainedMatmulPass final - : impl::AMDGPUPrepareForChainedMatmulPassBase< - AMDGPUPrepareForChainedMatmulPass> { - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - VectorContractOpInfo getOpInfo(vector::ContractionOp contract) const { - auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( - contract.getIndexingMapsArray()); - assert(succeeded(maybeOpInfo) && - "contraction info for vector.contract should always be valid"); - return maybeOpInfo.value(); - } - - VectorValue swapDims(RewriterBase &rewriter, VectorValue val, int64_t dimA, - int64_t dimB) const { - ArrayRef shape = val.getType().getShape(); - SmallVector perm(shape.size()); - std::iota(perm.begin(), perm.end(), 0); - std::swap(perm[dimA], perm[dimB]); - return rewriter.create(val.getLoc(), val, perm); - } - - AffineMap swapDimsInMap(AffineMap map, int64_t dimA, int64_t dimB) const { - SmallVector results(map.getResults()); - std::swap(results[dimA], results[dimB]); - return AffineMap::get(map.getNumDims(), map.getNumSymbols(), results, - map.getContext()); - } - - /// Given a vector contract of the form - /// %output = vector.contract %lhs, %rhs, %acc - /// this function swaps the operands (%rhs, %lhs), - /// transposes the accumulator and output and updates - /// the indexing maps for the new contract op. - /// - /// Given a contract: - /// - /// result = vector.contract lhs, rhs, acc - /// - /// transform it to - /// - /// lhs.T = transpose(lhs) - /// rhs.T = transpose(rhs) - /// acc.T = transpose(acc) - /// result.T = vector.contract rhs.T, lhs.T, acc.T - /// result = transpose(result.T) - /// - /// This transformation holds for the "@" case we described above. For - /// other indexing maps, we need to take into account transposed which are - /// fused into the contract. `isOperandSwapInvariant` tells us when we can - /// simply swap the operands without transposing them. 
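As a sanity check of the algebra the deleted pass relies on, the rewrite described above is just the identity (A @ B)^T = B^T @ A^T applied around the accumulator-carrying contract. A minimal standalone sketch of that identity, assuming nothing beyond 2x2 row-major matrices (all names and sizes here are illustrative, not taken from the pass):

#include <array>
#include <cassert>

using Mat2 = std::array<std::array<float, 2>, 2>;

// Plain row-major 2x2 matrix product.
static Mat2 matmul(const Mat2 &a, const Mat2 &b) {
  Mat2 c{};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 2; ++k)
        c[i][j] += a[i][k] * b[k][j];
  return c;
}

static Mat2 transpose(const Mat2 &m) {
  return {{{m[0][0], m[1][0]}, {m[0][1], m[1][1]}}};
}

int main() {
  Mat2 a = {{{1, 2}, {3, 4}}};
  Mat2 b = {{{5, 6}, {7, 8}}};
  // C computed directly, and via the swapped-and-transposed form.
  Mat2 c = matmul(a, b);
  Mat2 cT = matmul(transpose(b), transpose(a));
  assert(c == transpose(cT));
  return 0;
}

Transposing both operands and the result leaves the product unchanged, which is what lets the pass trade a layout mismatch on the result for transposes on the operands.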
- void swapOperandsAndTranspose(RewriterBase &rewriter, - vector::ContractionOp contractOp) const { - VectorContractOpInfo opInfo = getOpInfo(contractOp); - auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - auto [accM, accN] = opInfo.getResultMNIndex(); - VectorValue lhs = contractOp.getLhs(); - VectorValue rhs = contractOp.getRhs(); - VectorValue acc = cast(contractOp.getAcc()); - rewriter.setInsertionPoint(contractOp); - - SmallVector maps = contractOp.getIndexingMapsArray(); - AffineMap lhsMap = maps[0]; - AffineMap rhsMap = maps[1]; - AffineMap accMap = maps[2]; - - acc = swapDims(rewriter, acc, accN, accM); - accMap = swapDimsInMap(accMap, accN, accM); - - if (!isOperandSwapInvariant(contractOp)) { - lhs = swapDims(rewriter, lhs, lhsK, lhsM); - rhs = swapDims(rewriter, rhs, rhsK, rhsN); - lhsMap = swapDimsInMap(lhsMap, lhsK, lhsM); - rhsMap = swapDimsInMap(rhsMap, rhsK, rhsN); - } - - auto swappedOp = rewriter.create( - contractOp.getLoc(), rhs, lhs, acc, - rewriter.getAffineMapArrayAttr({rhsMap, lhsMap, accMap}), - contractOp.getIteratorTypesAttr()); - swappedOp->setDiscardableAttrs(contractOp->getDiscardableAttrDictionary()); - - acc = cast(swappedOp.getResult()); - acc = swapDims(rewriter, acc, accN, accM); - - rewriter.replaceOp(contractOp, acc); - } - - /// If one of the operands is transposed, while the other isn't, the - /// transformation boils down to an operand swap and result transpose. This - /// happens because transposing and swapping both operands, preserves the - /// structure of the contraction. For example: - /// - /// def matmul_transpose_b(A, B): - /// B.T = transpose(B) - /// C = A @ B.T - /// return C - /// - /// def matmul_transpose_b_swapped(A, B): - /// A.T = transpose(A) - /// C.T = B @ A.T - /// C = transpose(C.T) - /// return C - /// - /// matmul_transpose_b(B, A) = matmul_transpose_b_swapped(B, A).T - /// - /// For the sake of completeness, we also show that this does not hold - /// when no operands are transposed, or both operands are transposed: - /// - /// def matmul(A, B): - /// C = A @ B - /// return C - /// - /// def matmul_swapped(A, B): - /// A.T = transpose(A) - /// B.T = transpose(B) - /// C.T = B.T @ A.T - /// C = transpose(C.T) - bool isOperandSwapInvariant(vector::ContractionOp contractOp) const { - // Check if the innermost m, n, k dimensions are in the order: - // lhs: (m, k), rhs: (n, k) - VectorContractOpInfo opInfo = getOpInfo(contractOp); - auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - bool isLhsTransposed = lhsM > lhsK; - bool isRhsTransposed = rhsN < rhsK; - return isLhsTransposed != isRhsTransposed; - } - - /// Returns a vector.contract operation that this value was transitively - /// produced from. - /// - /// A chained matmul is one where the lhs of the candidate matrix - /// is a result of another matmul (a matmul lies in the backward slice of lhs - /// of the first matmul). - /// - /// TODO: This definition of a chained matmul is crude. We should actually be - /// checking if the layout of the result of the first matmul is transposed - /// to that expected by the second matmul. 
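The `isOperandSwapInvariant` check above reduces to comparing the positions of the M/K (resp. N/K) dims in the two operand maps: the swap is free exactly when one, and only one, operand is already transposed. A small self-contained sketch of that predicate, with plain integer positions standing in for what `VectorContractOpInfo` returns (illustrative only):

#include <cassert>

// lhsM/lhsK: result positions of the M and K dims in the lhs indexing map.
// rhsN/rhsK: result positions of the N and K dims in the rhs indexing map.
static bool isOperandSwapInvariant(int lhsM, int lhsK, int rhsN, int rhsK) {
  bool lhsTransposed = lhsM > lhsK; // lhs written as (k, m) rather than (m, k)
  bool rhsTransposed = rhsN < rhsK; // rhs written as (n, k) rather than (k, n)
  return lhsTransposed != rhsTransposed;
}

int main() {
  // matmul_transpose_b: lhs (m, k), rhs (n, k) -> exactly one operand transposed.
  assert(isOperandSwapInvariant(/*lhsM=*/0, /*lhsK=*/1, /*rhsN=*/0, /*rhsK=*/1));
  // plain matmul: lhs (m, k), rhs (k, n) -> neither operand transposed.
  assert(!isOperandSwapInvariant(/*lhsM=*/0, /*lhsK=*/1, /*rhsN=*/1, /*rhsK=*/0));
  return 0;
}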
- FailureOr - getTransitiveMatmulParent(vector::ContractionOp contractOp) const { - SetVector backwardSlice; - BackwardSliceOptions options; - options.inclusive = true; - getBackwardSlice(contractOp.getLhs(), &backwardSlice, options); - vector::ContractionOp result; - for (Operation *sliceOp : backwardSlice) { - auto chainParent = dyn_cast(sliceOp); - if (!chainParent) { - continue; - } - - // For now, we only support transpose invariant matmuls. This is because - // transposing the inputs may have a non-trivial cost which we need - // to think about. - // TODO: We should probably enable it always. Currently, this is - // only useful in Flash Attention, where the first matmul is generally - // a transpose. - if (!isOperandSwapInvariant(chainParent)) { - continue; - } - - // If we have multiple matmul parents, we fail. - if (result) { - return failure(); - } - - result = chainParent; - } - - if (result) { - return result; - } - - return failure(); - } - - void runOnOperation() override { - auto funcOp = getOperation(); - SmallVector matmulCandidates; - funcOp.walk([&](vector::ContractionOp contractOp) { - matmulCandidates.push_back(contractOp); - }); - - IRRewriter rewriter(funcOp.getContext()); - for (vector::ContractionOp candidate : matmulCandidates) { - FailureOr maybeChainedParent = - getTransitiveMatmulParent(candidate); - if (failed(maybeChainedParent)) { - continue; - } - auto chainParent = maybeChainedParent.value(); - swapOperandsAndTranspose(rewriter, chainParent); - - // TODO: We should be only transposing the second matrix if the - // result of the first matmul is used by the second matmul transitively. - swapOperandsAndTranspose(rewriter, candidate); - } - } -}; - -} // namespace -} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel index 3159126442c8..73e039798deb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel @@ -85,7 +85,6 @@ iree_compiler_cc_library( iree_compiler_cc_library( name = "LLVMGPU", srcs = [ - "AMDGPUChainedMatmulPass.cpp", "ConvertToLLVM.cpp", "ConvertToNVVM.cpp", "ConvertToROCDL.cpp", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt index fef9ca37e6aa..b33641bda92e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt @@ -70,7 +70,6 @@ iree_cc_library( "ROCDLKernelConfig.h" "ROCDLPasses.h" SRCS - "AMDGPUChainedMatmulPass.cpp" "ConvertToLLVM.cpp" "ConvertToNVVM.cpp" "ConvertToROCDL.cpp" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp index 466d7bd1bf80..1640656b71a8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp @@ -34,7 +34,6 @@ class ContractionVectorLayoutOptions : public VectorLayoutOptions { int64_t subgroupSize) : VectorLayoutOptions(root), patterns(root->getContext()) { populateGPUDistributionPatterns(patterns); - populateGPUDistributionLayoutAttrPatterns(laneId, patterns); populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize); populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td 
b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td index 06d9960f180f..80eb87c81964 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td @@ -13,34 +13,6 @@ include "mlir/Pass/PassBase.td" // LLVMGPU Passes (keep alphabetical) //------------------------------------------------------------------------------ -def AMDGPUPrepareForChainedMatmulPass : - InterfacePass<"iree-amdgpu-prepare-chained-matmul", "mlir::FunctionOpInterface"> { - let summary = "Pass to swap operands and transpose accumulator and result"; - let description = [{ - Given a chain of matmuls with some or no operations - in between, like - - d = matmul_transpose_b(a, b) + c - ... - e = matmul_transpose_b(d, f) + g - - this pattern transforms the above IR to - - c.t = transpose c - d = matmul_transpose_b(b, a) + c.t - d.t = transpose d - ... - g.t = transpose g - e = matmul_transpose_b(f, d.t) + g.t - e.t = transpose e - - On CDNA architectures, where the layouts of the RHS and result - are the same and transposed from the LHS layout, this type - of transformation can avoid trips to shared memory/shuffle instructions - on operators like Flash Attention. - }]; -} - // TODO: Bring the argument in line with the names used elsewhere. def ConvertToNVVMPass : Pass<"iree-convert-to-nvvm", "ModuleOp"> { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index a2c2187f3f14..b0022c452ab1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -1476,13 +1476,9 @@ transform_dialect::AMDGPUDistributeVectorsOp::applyToOne( rewriter.create(target.getLoc(), gpu::Dimension::x); populateGPUDistributionPatterns(patterns); - populateGPUDistributionLayoutAttrPatterns(laneId, patterns); - populateGPUReductionDistributionPatterns(patterns); // For testing we use subgroup size = 64. populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, /*subgroupSize=*/64); - populateAMDGPUDistributionPatterns(patterns); - populateGPULayoutResolutionDistributionPatterns(patterns); if (failed(distributeVectorOps(target, patterns, options))) { return emitDefaultSilenceableFailure(target); } @@ -1550,357 +1546,5 @@ transform_dialect::CreateMatmulMfmaTileSizesOp::apply( return DiagnosedSilenceableFailure::success(); } -//===----------------------------------------------------------------------===// -// SetContractionLayoutAttributes -//===----------------------------------------------------------------------===// - -/// This function creates a modified version of the MFMA layout that allows -/// for reading more elements from LDS. Specifically, the MFMA layout looks -/// something like this: -/// <<[ BATCHY, LANEX], [2, 16]>, <[ BATCHX, LANEY, VECTORX], [8, 4, 4]>> -/// Here VECTORX specifies how many elements can be read from LDS. -/// Now, in order to read more elements from LDS, we can modify this layout -/// while maintaining the overall shape to: -/// <<[ BATCHY, LANEX], [2, 16]>, <[ BATCHX, LANEY, VECTORX], [4, 4, 8]>> -/// This is what this function does. In situations where the batch dimension -/// is too small, or if we are not transferring 4 elements at a time, it -/// returns nullopt. 
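The read-layout rewrite described in the comment above keeps the per-dimension product constant while trading a batch factor for a wider contiguous read: [8, 4, 4] becomes [4, 4, 8]. A minimal sketch of that bookkeeping, using a hypothetical label enum in place of the real `LayoutDimension` attribute machinery:

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

enum class Label { Batch, Lane, VectorX };

// Halve the batch factor and double a 4-wide VECTORX read; bail out when the
// batch is already 1 or the read is not exactly 4 elements wide.
static std::optional<std::vector<int64_t>>
widenReadShape(const std::vector<Label> &labels, std::vector<int64_t> shapes) {
  for (size_t i = 0; i < labels.size(); ++i) {
    if (labels[i] == Label::Batch) {
      if (shapes[i] == 1)
        return std::nullopt;
      shapes[i] /= 2;
    } else if (labels[i] == Label::VectorX) {
      if (shapes[i] != 4)
        return std::nullopt;
      shapes[i] *= 2;
    }
  }
  return shapes;
}

int main() {
  auto widened =
      widenReadShape({Label::Batch, Label::Lane, Label::VectorX}, {8, 4, 4});
  assert(widened && *widened == std::vector<int64_t>({4, 4, 8}));
  return 0;
}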
-static std::optional -createReadLayout(MLIRContext *ctx, const VectorExt::LayoutAttr &layout) { - SmallVector perDimLayouts; - for (VectorExt::PerDimLayoutAttr perDimLayout : layout.getLayouts()) { - DenseSet labels; - for (VectorExt::LayoutDimensionAttr dim : perDimLayout.getLabels()) { - labels.insert(dim.getValue()); - } - if (!labels.contains(VectorExt::LayoutDimension::VECTORX)) { - perDimLayouts.push_back(perDimLayout); - continue; - } - SmallVector newShapes; - for (auto [label, shape] : - llvm::zip_equal(perDimLayout.getLabels(), perDimLayout.getShapes())) { - if (VectorExt::isBatchDimension(label.getValue())) { - if (shape == 1) - return std::nullopt; - newShapes.push_back(shape / 2); - continue; - } - if (label.getValue() == VectorExt::LayoutDimension::VECTORX) { - if (shape != 4) - return std::nullopt; - newShapes.push_back(shape * 2); - continue; - } - newShapes.push_back(shape); - } - perDimLayouts.push_back(VectorExt::PerDimLayoutAttr::get( - ctx, perDimLayout.getLabels(), newShapes)); - } - return VectorExt::LayoutAttr::get(ctx, perDimLayouts); -} - -// Struct containing concrete MMA shape, type, and layout information. -struct ConcreteMmaLayout { - GPU::OpaqueMmaLayout base; - VectorExt::PerDimLayoutAttr aMLayout; - VectorExt::PerDimLayoutAttr aKLayout; - VectorExt::PerDimLayoutAttr bKLayout; - VectorExt::PerDimLayoutAttr bNLayout; - VectorExt::PerDimLayoutAttr cMLayout; - VectorExt::PerDimLayoutAttr cNLayout; -}; - -static std::tuple -getPerDimLayoutAttrs(MLIRContext *context, TileSwizzle swizzle) { - // Step 1: obtain the swizzled tile shape, but keeping track of the source - // dimension indices. - struct SrcIndexAndSwizzleDim { - size_t srcIndex; - TileSwizzle::Dim dim; - }; - SmallVector swizzledShape; - for (auto [i, e] : llvm::enumerate(swizzle.expandShape)) { - for (TileSwizzle::Dim d : e) { - swizzledShape.push_back(SrcIndexAndSwizzleDim{i, d}); - } - } - applyPermutationToVector(swizzledShape, swizzle.permutation); - - // Step 2: collect the appropriate labels to use for the swizzled dims. - VectorExt::LayoutDimension internalLabels[] = { - VectorExt::LayoutDimension::VECTORZ, VectorExt::LayoutDimension::VECTORY, - VectorExt::LayoutDimension::VECTORX}; - VectorExt::LayoutDimension crossThreadLabels[] = { - VectorExt::LayoutDimension::LANEZ, VectorExt::LayoutDimension::LANEY, - VectorExt::LayoutDimension::LANEX}; - auto internalLabelIter = std::end(internalLabels); - auto crossThreadLabelIter = std::end(crossThreadLabels); - for (SrcIndexAndSwizzleDim d : swizzledShape) { - if (d.dim.kind == TileSwizzle::Dim::Kind::Internal) { - assert(internalLabelIter != std::begin(internalLabels)); - --internalLabelIter; - } else if (d.dim.kind == TileSwizzle::Dim::Kind::CrossThread) { - assert(crossThreadLabelIter != std::begin(crossThreadLabels)); - --crossThreadLabelIter; - } else { - assert(false && "unexpected dimension kind in intrinsic swizzle"); - } - } - - // Step 3: put together the result PerDimLayoutAttr'd for the two source dims. - SmallVector labels[2]; - SmallVector shape[2]; - for (SrcIndexAndSwizzleDim d : swizzledShape) { - shape[d.srcIndex].push_back(d.dim.size); - auto &labelIterRef = (d.dim.kind == TileSwizzle::Dim::Kind::Internal) - ? 
internalLabelIter - : crossThreadLabelIter; - labels[d.srcIndex].push_back(VectorExt::LayoutDimensionAttr::get( - context, static_cast(*labelIterRef++))); - } - return {VectorExt::PerDimLayoutAttr::get(context, labels[0], shape[0]), - VectorExt::PerDimLayoutAttr::get(context, labels[1], shape[1])}; -}; - -static ConcreteMmaLayout getConcreteMMALayout(MLIRContext *context, - GPU::MMAIntrinsic intrinsic) { - auto opaque = GPU::getOpaqueMMALayout(context, intrinsic); - ConcreteMmaLayout concreteLayout; - concreteLayout.base = opaque; - auto lhsSwizzle = getIntrinsicSwizzle(intrinsic, GPU::MMAFragment::Lhs); - auto rhsSwizzle = getIntrinsicSwizzle(intrinsic, GPU::MMAFragment::Rhs); - auto accSwizzle = getIntrinsicSwizzle(intrinsic, GPU::MMAFragment::Acc); - std::tie(concreteLayout.aMLayout, concreteLayout.aKLayout) = - getPerDimLayoutAttrs(context, lhsSwizzle); - std::tie(concreteLayout.bNLayout, concreteLayout.bKLayout) = - getPerDimLayoutAttrs(context, rhsSwizzle); - std::tie(concreteLayout.cMLayout, concreteLayout.cNLayout) = - getPerDimLayoutAttrs(context, accSwizzle); - return concreteLayout; -} - -static VectorExt::PerDimLayoutAttr -getBatchedPerDimLayoutAttr(VectorExt::LayoutDimensionAttr batchDim, - VectorExt::PerDimLayoutAttr baseLayout, - int64_t problemSize, int64_t fragmentDimSize) { - assert(problemSize % fragmentDimSize == 0 && - "invalid layout fragment for problem size"); - - SmallVector dimAttrs( - baseLayout.getLabels()); - dimAttrs.insert(dimAttrs.begin(), batchDim); - - SmallVector shapes(baseLayout.getShapes()); - shapes.insert(shapes.begin(), problemSize / fragmentDimSize); - auto layout = VectorExt::PerDimLayoutAttr::get(baseLayout.getContext(), - dimAttrs, shapes); - return layout; -} - -// Get the batched layout attributes for the given fragment layouts, indexing -// map, and problem shape. The canonical fragment map is used to compare against -// the problem map |indexingMap|. For example, for mma fragment B (RHS): -// -// indexingMap = affine_map<(d0, d1, d2) -> (d1, d2) # Transposed B -// fragmentMap = affine_map<(d0, d1, d2) -> (d2, d1) -// problemShape = [32, 64] -// fragmentSize = [16, 8] -// fragmentLayouts = [kLayout, nLayout] -// -// Gives batched layout -// -// Dim0 Layout = [BATCHX, nLayoutLabels], [8, nLayoutShape] -// Dim1 Layout = [BATCHY, kLayoutLabels], [2, kLayoutShape] -static VectorExt::LayoutAttr -getBatchedLayoutAttr(AffineMap indexingMap, AffineMap fragmentMap, - ArrayRef problemShape, - ArrayRef fragmentSize, - ArrayRef fragmentLayouts) { - // Current distribution to MFMA operations does not support batched - // contractions so that is reflected here. 
- assert(indexingMap.getNumResults() == 2 && - "invalid indexing map to non-batched simple contraction"); - - VectorExt::LayoutDimensionAttr batchX = VectorExt::LayoutDimensionAttr::get( - indexingMap.getContext(), VectorExt::LayoutDimension::BATCHX); - VectorExt::LayoutDimensionAttr batchY = VectorExt::LayoutDimensionAttr::get( - indexingMap.getContext(), VectorExt::LayoutDimension::BATCHY); - - SmallVector perDimAttrs; - for (auto [expr, batchType] : llvm::zip_equal( - indexingMap.getResults(), - SmallVector{batchX, batchY})) { - auto maybeResultPosition = fragmentMap.getResultPosition(expr); - assert(maybeResultPosition && "fragment map and problem map mismatch"); - int64_t idx = *maybeResultPosition; - perDimAttrs.push_back(getBatchedPerDimLayoutAttr( - batchType, fragmentLayouts[idx], problemShape[idx], fragmentSize[idx])); - } - - return VectorExt::LayoutAttr::get(indexingMap.getContext(), perDimAttrs); -} - -static FailureOr> -getContractionLayout(vector::ContractionOp contract, ConcreteMmaLayout layout) { - MLIRContext *context = contract.getContext(); - FailureOr maybeContractionDims = - linalg::inferContractionDims(contract.getIndexingMapsArray()); - if (failed(maybeContractionDims)) { - return failure(); - } - auto contractionDims = *maybeContractionDims; - // TODO: Relax this condition to strictly alignment requirements. - if (contractionDims.k.size() != 1 || contractionDims.m.size() != 1 || - contractionDims.n.size() != 1) { - return failure(); - } - // TODO: Support batched contractions. - if (contractionDims.batch.size() > 0) { - return failure(); - } - unsigned mDim = contractionDims.m[0]; - unsigned nDim = contractionDims.n[0]; - unsigned kDim = contractionDims.k[0]; - - SmallVector iterationBounds; - contract.getIterationBounds(iterationBounds); - - int64_t problemMSize = iterationBounds[mDim]; - int64_t problemNSize = iterationBounds[nDim]; - int64_t problemKSize = iterationBounds[kDim]; - - int64_t mSize = layout.base.mSize; - int64_t nSize = layout.base.nSize; - int64_t kSize = layout.base.kSize; - - // The problem size currently must be strictly aligned to the size of the mma. - // This is expected to succeed assuming the correct [masked] vector size was - // set at strategy configuration time (for this mma). 
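The batch sizes fed into `getBatchedPerDimLayoutAttr` above are simply `problemSize / fragmentDimSize`, and the alignment check that follows requires that division to be exact. A small worked sketch of the arithmetic; the concrete sizes mirror the 32x128 operand on a 16x16x16 intrinsic from the deleted tests further down, but are otherwise illustrative:

#include <cassert>
#include <cstdint>

// One batch per fragment tile along each problem dimension.
static int64_t numBatches(int64_t problemSize, int64_t fragmentDimSize) {
  assert(problemSize % fragmentDimSize == 0 && "problem must align to the mma");
  return problemSize / fragmentDimSize;
}

int main() {
  // A 32x128 lhs on a 16x16x16 intrinsic: BATCHX = 32/16 = 2, BATCHY = 128/16 = 8,
  // matching the [2, 16] / [8, 4, 4] per-dim shapes used in the deleted tests.
  assert(numBatches(32, 16) == 2);
  assert(numBatches(128, 16) == 8);
  return 0;
}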
- if (problemMSize % mSize != 0 || problemNSize % nSize || - problemKSize % kSize) { - return failure(); - } - - VectorExt::LayoutAttr aLayout = getBatchedLayoutAttr( - contract.getIndexingMapsArray()[0], - AffineMap::getMultiDimMapWithTargets(3, {mDim, kDim}, context), - {problemMSize, problemKSize}, {mSize, kSize}, - {layout.aMLayout, layout.aKLayout}); - VectorExt::LayoutAttr bLayout = getBatchedLayoutAttr( - contract.getIndexingMapsArray()[1], - AffineMap::getMultiDimMapWithTargets(3, {kDim, nDim}, context), - {problemKSize, problemNSize}, {kSize, nSize}, - {layout.bKLayout, layout.bNLayout}); - VectorExt::LayoutAttr cLayout = getBatchedLayoutAttr( - contract.getIndexingMapsArray()[2], - AffineMap::getMultiDimMapWithTargets(3, {mDim, nDim}, context), - {problemMSize, problemNSize}, {mSize, nSize}, - {layout.cMLayout, layout.cNLayout}); - - return std::make_tuple(aLayout, bLayout, cLayout); -} - -FailureOr> static getContractionLayout(GPU::MMAAttr mma, - vector::ContractionOp - contract) { - ConcreteMmaLayout layout = getConcreteMMALayout( - contract->getContext(), mma.getIntrinsic().getValue()); - return getContractionLayout(contract, layout); -} - -DiagnosedSilenceableFailure -transform_dialect::SetContractionLayoutAttributes::apply( - transform::TransformRewriter &rewriter, - transform::TransformResults &results, transform::TransformState &state) { - auto payloadList = state.getPayloadOps(getTarget()); - auto typeList = state.getParams(getMmaType()); - if (typeList.size() != 1) { - return emitDefiniteFailure() - << "invalid more than one attribute for contraction annotation"; - } - auto mmaType = llvm::dyn_cast(typeList.front()); - if (!mmaType) { - return emitDefiniteFailure() - << "invalid non-mma attribute for contraction annotation " - << typeList.front(); - } - - for (Operation *payload : payloadList) { - auto contract = llvm::dyn_cast(payload); - if (!contract) { - return emitDefiniteFailure() - << "invalid non-contraction annotation " << payload; - } - - auto maybeLayouts = getContractionLayout(mmaType, contract); - if (failed(maybeLayouts)) { - return emitDefiniteFailure() - << "invalid opaque mma layout for annotation " << mmaType; - } - - Location loc = contract.getLoc(); - auto [aLayout, bLayout, cLayout] = *maybeLayouts; - - // Set packed read layout for specified indices. - ArrayRef operandIndices = getReadLayoutIndices(); - if (!operandIndices.empty()) { - SmallVector operands; - SmallVector layouts; - for (int64_t index : operandIndices) { - operands.push_back(contract.getOperand(index)); - layouts.push_back(index == 0 ? aLayout : bLayout); - } - rewriter.setInsertionPoint(contract); - for (const auto &idxAndVals : - llvm::enumerate(llvm::zip_equal(operands, layouts))) { - int64_t i = idxAndVals.index(); - auto [operand, layoutInterface] = idxAndVals.value(); - VectorExt::LayoutAttr layout = - dyn_cast(layoutInterface); - std::optional maybeReadLayout = - createReadLayout(rewriter.getContext(), layout); - if (!maybeReadLayout) - continue; - VectorExt::LayoutAttr readLayout = maybeReadLayout.value(); - Operation *parentOp = operand.getDefiningOp(); - if (!parentOp || (parentOp->getNumResults() != 1)) - continue; - Value resolvedOperand = - rewriter.create(loc, operand, readLayout); - contract.setOperand(operandIndices[i], resolvedOperand); - } - } - - // Set layout anchors. 
- rewriter.setInsertionPoint(contract); - Value newLhs = - rewriter.create(loc, contract.getLhs(), aLayout); - Value newRhs = - rewriter.create(loc, contract.getRhs(), bLayout); - Value newAcc = - rewriter.create(loc, contract.getAcc(), cLayout); - contract.setOperand(0, newLhs); - contract.setOperand(1, newRhs); - contract.setOperand(2, newAcc); - - // Set intrinsic type. - contract->setAttr("iree.amdgpu.mma", mmaType); - } - - return DiagnosedSilenceableFailure::success(); -} - -void transform_dialect::SetContractionLayoutAttributes::getEffects( - SmallVectorImpl &effects) { - transform::onlyReadsHandle(getTargetMutable(), effects); - transform::onlyReadsHandle(getMmaTypeMutable(), effects); - transform::modifiesPayload(effects); -} - #define GET_OP_CLASSES #include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.cpp.inc" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td index ac3e7eef7513..69e766537c0b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td @@ -736,24 +736,4 @@ def CreateMatmulMfmaTileSizesOp : let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; } -def SetContractionLayoutAttributes : - Op, - DeclareOpInterfaceMethods]> { - let description = [{ - Infers and sets the layout of the target contraction op based on the given - MFMA attribute. The optional read_layout_indices attribute determines whether - to apply a modified version of the MFMA layout to the operands of - the contracts that enables loading a greater number of elements from LDS. - If empty, the read layout is not applied to any operand. 0 specifies - LHS and 1 RHS. - }]; - - let arguments = (ins TransformHandleTypeInterface:$target, - TransformParamTypeInterface:$mma_type, - DefaultValuedOptionalAttr:$read_layout_indices); - let assemblyFormat = "$target `,` $mma_type attr-dict `:` type($target) `,` type($mma_type)"; - let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; -} - #endif // IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp deleted file mode 100644 index 48de6cdaeef8..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/AMDGPUDistributionPatterns.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h" -#include "iree/compiler/Codegen/Common/VectorLayoutAnalysis.h" -#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" -#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h" -#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h" -#include "iree/compiler/Codegen/Utils/VectorOpUtils.h" - -namespace mlir::iree_compiler { - -using namespace mlir::iree_compiler::IREE::VectorExt; -using VectorValue = TypedValue; - -namespace { - -struct DistributeContractions final - : OpDistributionPattern { - using OpDistributionPattern::OpDistributionPattern; - - LogicalResult matchAndRewrite(vector::ContractionOp contractOp, - DistributionSignature &signature, - PatternRewriter &rewriter) const override { - auto maybeOpInfo = VectorContractOpInfo::inferFromIndexingMaps( - contractOp.getIndexingMapsArray()); - if (failed(maybeOpInfo)) { - return rewriter.notifyMatchFailure(contractOp, "invalid contraction"); - } - VectorContractOpInfo opInfo = maybeOpInfo.value(); - - VectorValue result = dyn_cast(contractOp.getResult()); - if (!result) { - return rewriter.notifyMatchFailure(contractOp, - "result should be of type vector"); - } - - LayoutAttr resultLayout = dyn_cast(signature[result]); - if (!resultLayout) { - return rewriter.notifyMatchFailure( - contractOp, "result layout should be of type LayoutAttr"); - } - - auto mmaAttr = - contractOp->getAttrOfType("iree.amdgpu.mma"); - if (!mmaAttr) { - return rewriter.notifyMatchFailure( - contractOp, "missing iree.amdgpu.mma intrinsic attribute"); - } - - constexpr int LHS = 0; - constexpr int RHS = 1; - constexpr int ACC = 2; - SmallVector operands; - SmallVector layouts; - for (Value operand : contractOp->getOperands()) { - if (auto vectorOperand = dyn_cast(operand)) { - auto layout = signature[vectorOperand]; - if (auto vectorLayout = dyn_cast(layout)) { - operands.push_back(vectorOperand); - layouts.push_back(vectorLayout); - } - } - } - - Type elementType = - llvm::cast(operands[ACC].getType()).getElementType(); - SmallVector vectorShape = resultLayout.getDistributedShape(); - auto vectorType = VectorType::get(vectorShape, elementType); - Location loc = contractOp.getLoc(); - Value vector = rewriter.create( - loc, vectorType, rewriter.getZeroAttr(vectorType)); - - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - - std::optional kBatch = layouts[LHS].getBatchDim(lhsK); - if (!kBatch) { - return failure(); - } - - auto contractFn = [&](const LayoutIterator::State &state) { - auto [lhsM, rhsN] = opInfo.getOperandMNIndex(); - auto [lhsK, rhsK] = opInfo.getOperandKIndex(); - SmallVector indices = state.computeIteratorProjectedSIMTIndex(); - Value dMatrix = rewriter.create( - loc, getDistributed(rewriter, operands[ACC], layouts[ACC]), indices); - for (int k = 0; k < kBatch; ++k) { - SmallVector lhsIndices(2); - SmallVector rhsIndices(2); - lhsIndices[lhsM] = indices[0]; - lhsIndices[lhsK] = k; - rhsIndices[rhsN] = indices[1]; - rhsIndices[rhsK] = k; - - Value aMatrix = rewriter.create( - loc, getDistributed(rewriter, operands[LHS], layouts[LHS]), - lhsIndices); - - Value bMatrix = rewriter.create( - loc, getDistributed(rewriter, operands[RHS], layouts[RHS]), - rhsIndices); - - dMatrix = mmaAttr - .buildMmaOperation(rewriter, loc, dMatrix.getType(), - aMatrix, bMatrix, dMatrix) - .value(); - } - vector = rewriter.create(loc, dMatrix, vector, indices); - return success(); - }; - - LayoutIterator 
iterator(resultLayout); - LayoutIterator batchIterator = iterator.getBatchIterator(); - batchIterator.apply(contractFn); - replaceOpWithDistributedValues(rewriter, contractOp, vector); - return success(); - } -}; -} // namespace - -void populateAMDGPUDistributionPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} - -} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel index eeb97bf8c033..113c6d56598f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/BUILD.bazel @@ -17,7 +17,6 @@ package( iree_compiler_cc_library( name = "Utils", srcs = [ - "AMDGPUDistributionPatterns.cpp", "LLVMGPUUtils.cpp", "PrefetchSharedMemoryCopy.cpp", ], diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt index ccd7f4bef826..6b66e96ded1f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/CMakeLists.txt @@ -16,7 +16,6 @@ iree_cc_library( HDRS "LLVMGPUUtils.h" SRCS - "AMDGPUDistributionPatterns.cpp" "LLVMGPUUtils.cpp" "PrefetchSharedMemoryCopy.cpp" DEPS diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h index ac4f79211d42..07d9ca5ed186 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h @@ -18,10 +18,6 @@ namespace mlir::iree_compiler { void createAsyncGroups(RewriterBase &rewriter, mlir::FunctionOpInterface funcOp, bool useMMASync); -/// Function to do layout analysis and distribution. -void doLayoutAnalysisAndDistribution(RewriterBase &rewriter, - mlir::FunctionOpInterface funcOp); - /// Function to reorder transposes and elementwise ops. void reorderTranspose(RewriterBase &rewriter, mlir::FunctionOpInterface funcOp); @@ -33,9 +29,6 @@ void reorderTranspose(RewriterBase &rewriter, mlir::FunctionOpInterface funcOp); /// from the previous alias group before starting a new one. void packSharedMemoryAlloc(mlir::FunctionOpInterface funcOp); -// Add patterns to distribute contractions to MFMA ops. -void populateAMDGPUDistributionPatterns(RewritePatternSet &patterns); - // Prefetches data written to shared memory for the next iteration. Returns the // new loop on success or failure when the `forOp` is not supported. 
FailureOr prefetchSharedMemoryCopy(RewriterBase &rewriter, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel index 0256a74f2ecd..ff000d6715ca 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel @@ -18,9 +18,6 @@ iree_lit_test_suite( name = "lit", srcs = enforce_glob( [ - "amdgpu_chained_matmul.mlir", - "amdgpu_contraction_distribution.mlir", - "amdgpu_set_anchor_layouts.mlir", "assign_constant_ordinals.mlir", "conv_pipeline_test_cuda.mlir", "convert_to_nvvm.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt index 635a49df1694..e46b413d20bf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt @@ -14,9 +14,6 @@ iree_lit_test_suite( NAME lit SRCS - "amdgpu_chained_matmul.mlir" - "amdgpu_contraction_distribution.mlir" - "amdgpu_set_anchor_layouts.mlir" "assign_constant_ordinals.mlir" "cast_address_space_function.mlir" "cast_type_to_fit_mma.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_chained_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_chained_matmul.mlir deleted file mode 100644 index f1d666579302..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_chained_matmul.mlir +++ /dev/null @@ -1,189 +0,0 @@ -// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-amdgpu-prepare-chained-matmul),canonicalize,cse)" %s | FileCheck %s - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - // CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> - // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> - // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)> - func.func @chained_matmul(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16>, - // CHECK: func.func @chained_matmul(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - // CHECK-SAME: %[[RHS2:.*]]: vector<8x16xf16>, %[[ACC2:.*]]: vector<32x8xf16> - %rhs2 : vector<8x16xf16>, %acc2 : vector<32x8xf16>) -> vector<32x8xf16> { - // CHECK: %[[TRANS_ACC:.*]] = vector.transpose %[[ACC]], [1, 0] : vector<32x16xf16> to vector<16x32xf16> - // CHECK: %[[TRANS_RES:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - // CHECK-SAME: %[[RHS]], %[[LHS]], %[[TRANS_ACC]] : vector<16x8xf16>, vector<32x8xf16> into vector<16x32xf16> - // CHECK: %[[RES:.*]] = vector.transpose %[[TRANS_RES]], [1, 0] : vector<16x32xf16> to vector<32x16xf16> - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - // CHECK: %[[EXP:.*]] = math.exp2 %[[RES]] : vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - // CHECK: %[[TRANS_ACC2:.*]] = vector.transpose %[[ACC2]], [1, 0] : vector<32x8xf16> to vector<8x32xf16> - // CHECK: %[[TRANS_RES2:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = 
#vector.kind} - // CHECK-SAME: %[[RHS2]], %[[EXP]], %[[TRANS_ACC2]] : vector<8x16xf16>, vector<32x16xf16> into vector<8x32xf16> - // CHECK: %[[RES2:.*]] = vector.transpose %[[TRANS_RES2]], [1, 0] : vector<8x32xf16> to vector<32x8xf16> - %result2 = vector.contract #trait0 %exp, %rhs2, %acc2 - : vector<32x16xf16>, vector<8x16xf16> into vector<32x8xf16> - func.return %result2 : vector<32x8xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - func.func @non_chained_matmul(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16> - // CHECK: func.func @non_chained_matmul(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - ) -> vector<32x16xf16> { - // CHECK-NOT: vector.transpose - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - func.return %exp : vector<32x16xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - func.func @chained_matmul_second_operand(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16>, - // CHECK: func.func @chained_matmul_second_operand(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - %lhs2 : vector<32x16xf16>, %acc2 : vector<32x32xf16>) -> vector<32x32xf16> { - // CHECK-NOT: vector.transpose - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - %result2 = vector.contract #trait0 %lhs2, %exp, %acc2 - : vector<32x16xf16>, vector<32x16xf16> into vector<32x32xf16> - func.return %result2 : vector<32x32xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] - -#accesses1 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (m, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "reduction"] -} - -#trait1 = { - indexing_maps = #accesses1, - iterator_types = ["parallel", "parallel", "reduction"] -} - -builtin.module { - func.func @chained_matmul_mmt_mm(%lhs : vector<32x8xf16>, %rhs : vector<16x8xf16>, %acc : vector<32x16xf16>, - // CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2) -> (d1, d2)> - // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> - // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d1, d0)> - // CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d2, d0)> - // CHECK: func.func @chained_matmul_mmt_mm(%[[LHS:.*]]: vector<32x8xf16>, %[[RHS:.*]]: vector<16x8xf16>, %[[ACC:.*]]: vector<32x16xf16> - // CHECK-SAME: %[[RHS2:.*]]: vector<16x8xf16>, %[[ACC2:.*]]: vector<32x8xf16> - %rhs2 : vector<16x8xf16>, %acc2 : vector<32x8xf16>) -> vector<32x8xf16> { - // CHECK: %[[TRANS_ACC:.*]] = vector.transpose %[[ACC]], [1, 0] : vector<32x16xf16> to vector<16x32xf16> - // CHECK: %[[TRANS_RES:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", 
"parallel", "reduction"], kind = #vector.kind} - // CHECK-SAME: %[[RHS]], %[[LHS]], %[[TRANS_ACC]] : vector<16x8xf16>, vector<32x8xf16> into vector<16x32xf16> - // CHECK: %[[RES:.*]] = vector.transpose %[[TRANS_RES]], [1, 0] : vector<16x32xf16> to vector<32x16xf16> - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<32x8xf16>, vector<16x8xf16> into vector<32x16xf16> - // CHECK: %[[EXP:.*]] = math.exp2 %[[RES]] : vector<32x16xf16> - %exp = math.exp2 %result : vector<32x16xf16> - // CHECK: %[[TRANS_ACC2:.*]] = vector.transpose %[[ACC2]], [1, 0] : vector<32x8xf16> to vector<8x32xf16> - // CHECK: %[[TRANS_EXP:.*]] = vector.transpose %[[EXP]], [1, 0] : vector<32x16xf16> to vector<16x32xf16> - // CHECK: %[[TRANS_RHS2:.*]] = vector.transpose %[[RHS2]], [1, 0] : vector<16x8xf16> to vector<8x16xf16> - // CHECK: %[[TRANS_RES2:.*]] = vector.contract {indexing_maps = [#[[MAP]], #[[MAP3]], #[[MAP2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} - // CHECK-SAME: %[[TRANS_RHS2]], %[[TRANS_EXP]], %[[TRANS_ACC2]] : vector<8x16xf16>, vector<16x32xf16> into vector<8x32xf16> - // CHECK: %[[RES2:.*]] = vector.transpose %[[TRANS_RES2]], [1, 0] : vector<8x32xf16> to vector<32x8xf16> - %result2 = vector.contract #trait1 %exp, %rhs2, %acc2 - : vector<32x16xf16>, vector<16x8xf16> into vector<32x8xf16> - func.return %result2 : vector<32x8xf16> - } -} - -// ----- - -#accesses0 = [ - affine_map<(b, m1, m2, n, k) -> (b, m2, m1, k)>, - affine_map<(b, m1, m2, n, k) -> (b, n, k)>, - affine_map<(b, m1, m2, n, k) -> (b, m2, m1, n)> -] - -#trait0 = { - indexing_maps = #accesses0, - iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"] -} - -builtin.module { - // CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> - // CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)> - // CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d2)> - func.func @chained_matmul(%lhs : vector<17x64x32x8xf16>, - %rhs : vector<17x16x8xf16>, - %acc : vector<17x64x32x16xf16>, - %rhs2 : vector<17x8x16xf16>, - %acc2 : vector<17x64x32x8xf16>) -> vector<17x64x32x8xf16> { - - // CHECK: vector.transpose - // CHECK-NOT: vector.transpose - // CHECK: vector.contract - // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]] - %result = vector.contract #trait0 %lhs, %rhs, %acc - : vector<17x64x32x8xf16>, vector<17x16x8xf16> into vector<17x64x32x16xf16> - - // transpose from result will fold with transpose of the acc of the next - // contract - - // CHECK: vector.transpose - // CHECK: vector.transpose - // CHECK-NOT: vector.transpose - // CHECK: vector.contract - // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP1]], #[[MAP2]]] - %result2 = vector.contract #trait0 %result, %rhs2, %acc2 - : vector<17x64x32x16xf16>, vector<17x8x16xf16> into vector<17x64x32x8xf16> - // CHECK: vector.transpose - - func.return %result2 : vector<17x64x32x8xf16> - } -} diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir deleted file mode 100644 index cc0688ac332e..000000000000 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir +++ /dev/null @@ -1,319 +0,0 @@ -// RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --cse %s | FileCheck %s - -// Refer to the distribution pattern documentation for what layoutA, layoutB, -// layoutC means and how these 
layouts are assigned based on the instruction -// type. - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map3 = affine_map<(d0, d1, d2) -> (d1, d0)> - -// A: vector<16x16>, layout = layoutA -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 16]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 4, 4]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<16x16>, layout = transpose(layoutB) = layoutA -// Since shapes are also same, we can use the same layout attribute, layout_a. - -// C: vector<16x16>, layout = layoutC -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 4, 4]> -#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]> -#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2> -builtin.module attributes { transform.with_named_sequence } { - func.func @distribute_mfma_16x16x16_mmt(%a : vector<16x16xf16>, %b : vector<16x16xf16>, %c : vector<16x16xf32>) -> vector<16x16xf32> { - %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - "__vector_layout_test_anchor_operand_0" = #layout_a, - "__vector_layout_test_anchor_operand_1" = #layout_a, - "__vector_layout_test_anchor_operand_2" = #layout_c, - "__vector_layout_test_anchor_result_0" = #layout_c - } - %a, %b, %c : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32> - return %output : vector<16x16xf32> - } - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %layout16x16x16 = transform.param.constant #layout -> !transform.any_param - transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param - - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: distribute_mfma_16x16x16_mmt - -// CHECK-SAME: %[[ARG0:.+]]: vector<16x16xf16>, %[[ARG1:.+]]: vector<16x16xf16>, %[[ARG2:.+]]: vector<16x16xf32> -// CHECK-DAG: %[[C:.+]] = iree_vector_ext.to_simt %[[ARG2]] : vector<16x16xf32> -> vector<1x1x4xf32> -// CHECK-DAG: %[[CV:.+]] = vector.extract %[[C]][0, 0] : vector<4xf32> from vector<1x1x4xf32> -// CHECK-DAG: %[[A:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[AV:.+]] = vector.extract %[[A]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[B:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16x16xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[BV:.+]] = vector.extract %[[B]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[OUT:.+]] = amdgpu.mfma %[[AV]] * %[[BV]] + %[[CV]] {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32> - -// ----- - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map3 = affine_map<(d0, d1, d2) -> (d1, d0)> - -// A: vector<32x128>, layout = layoutA -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [2, 16]> -#col_layout = 
#iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 4, 4]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<64x128>, layout = transpose(layoutB) = layoutA -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [4, 16]> -#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 4, 4]> -#layout_b = #iree_vector_ext.layout<#row_layout2, #col_layout2> - -// C: vector<32x64>, layout = layoutC -#row_layout3 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [2, 4, 4]> -#col_layout3 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [4, 16]> -#layout_c = #iree_vector_ext.layout<#row_layout3, #col_layout3> -builtin.module attributes { transform.with_named_sequence } { - func.func @distribute_mfma_16x16x16_mmt_batch(%a : vector<32x128xf16>, %b : vector<64x128xf16>, %c : vector<32x64xf32>) -> vector<32x64xf32> { - %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - "__vector_layout_test_anchor_operand_0" = #layout_a, - "__vector_layout_test_anchor_operand_1" = #layout_b, - "__vector_layout_test_anchor_operand_2" = #layout_c, - "__vector_layout_test_anchor_result_0" = #layout_c - } - %a, %b, %c : vector<32x128xf16>, vector<64x128xf16> into vector<32x64xf32> - return %output : vector<32x64xf32> - } - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %layout16x16x16 = transform.param.constant #layout -> !transform.any_param - transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param - - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: distribute_mfma_16x16x16_mmt_batch - -// CHECK-COUNT-64: amdgpu.mfma {{.*}}, vector<4xf32> - -// ----- - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map2 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map3 = affine_map<(d0, d1, d2) -> (d0, d1)> - -// A: vector<32x8>, layout = layoutA -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 32]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 2, 4]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<8x32>, layout = layoutB -#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 2, 4]> -#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]> -#layout_b = #iree_vector_ext.layout<#row_layout1, #col_layout1> - -// C: vector<32x32>, layout = layoutC -#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [1, 4, 2, 4]> -#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]> -#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2> -builtin.module attributes { transform.with_named_sequence } { - func.func @distribute_mfma_32x32x8_mm(%a : vector<32x8xf16>, %b : vector<8x32xf16>, %c : vector<32x32xf32>) -> vector<32x32xf32> { - %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], - kind = #vector.kind, - "__vector_layout_test_anchor_operand_0" = 
#layout_a, - "__vector_layout_test_anchor_operand_1" = #layout_b, - "__vector_layout_test_anchor_operand_2" = #layout_c, - "__vector_layout_test_anchor_result_0" = #layout_c - } - %a, %b, %c : vector<32x8xf16>, vector<8x32xf16> into vector<32x32xf32> - return %output : vector<32x32xf32> - } - transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) { - %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %layout32x32x8 = transform.param.constant #layout -> !transform.any_param - transform.iree.set_contraction_layout_attributes %contract, %layout32x32x8 : !transform.any_op, !transform.any_param - - %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: distribute_mfma_32x32x8_mm - -// CHECK-SAME: %[[ARG0:.+]]: vector<32x8xf16>, %[[ARG1:.+]]: vector<8x32xf16>, %[[ARG2:.+]]: vector<32x32xf32> -// CHECK-DAG: %[[C:.+]] = iree_vector_ext.to_simt %[[ARG2]] : vector<32x32xf32> -> vector<1x1x16xf32> -// CHECK-DAG: %[[CV:.+]] = vector.extract %[[C]][0, 0] : vector<16xf32> from vector<1x1x16xf32> -// CHECK-DAG: %[[A:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<32x8xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[AV:.+]] = vector.extract %[[A]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[B:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<8x32xf16> -> vector<1x1x4xf16> -// CHECK-DAG: %[[BV:.+]] = vector.extract %[[B]][0, 0] : vector<4xf16> from vector<1x1x4xf16> -// CHECK-DAG: %[[OUT:.+]] = amdgpu.mfma %[[AV]] * %[[BV]] + %[[CV]] {blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32> - -// ----- - -#layout = #iree_gpu.mma_layout - -#map1 = affine_map<(d0, d1, d2) -> (d2, d0)> -#map2 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map3 = affine_map<(d0, d1, d2) -> (d0, d1)> - -// A: vector<8x64>, layout = transpose(layoutA) = layoutB -#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 2, 4]> -#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [2, 32]> -#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout> - -// B: vector<8x32>, layout = layoutB -// We can use the same layout attribute, layout_a, since the shapes are same. 
-#row_layout1 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEY, VECTORX], [1, 2, 4]>
-#col_layout1 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]>
-#layout_b = #iree_vector_ext.layout<#row_layout1, #col_layout1>
-
-// C: vector<64x32>, layout = layoutC
-#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [2, 4, 2, 4]>
-#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 32]>
-#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2>
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @distribute_mfma_32x32x8_mtm(%a : vector<8x64xf16>, %b : vector<8x32xf16>, %c : vector<64x32xf32>) -> vector<64x32xf32> {
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
-        kind = #vector.kind<add>,
-        "__vector_layout_test_anchor_operand_0" = #layout_a,
-        "__vector_layout_test_anchor_operand_1" = #layout_b,
-        "__vector_layout_test_anchor_operand_2" = #layout_c,
-        "__vector_layout_test_anchor_result_0" = #layout_c
-        }
-        %a, %b, %c : vector<8x64xf16>, vector<8x32xf16> into vector<64x32xf32>
-    return %output : vector<64x32xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout32x32x8 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout32x32x8 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: distribute_mfma_32x32x8_mtm
-
-// CHECK-DAG: %[[A1:.+]] = vector.extract %[[A:.+]][0, 0] : vector<4xf16> from vector<1x2x4xf16>
-// CHECK-DAG: %[[B1:.+]] = vector.extract %[[B:.+]][0, 0] : vector<4xf16> from vector<1x1x4xf16>
-// CHECK-DAG: %{{.*}} = amdgpu.mfma %[[A1]] * %[[B1]]
-// CHECK-DAG: %[[A2:.+]] = vector.extract %[[A]][0, 1] : vector<4xf16> from vector<1x2x4xf16>
-// CHECK-DAG: %{{.*}} = amdgpu.mfma %[[A2]] * %[[B1]]
-// CHECK-NOT: amdgpu.mfma
-
-// -----
-
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-// A: vector<16x16>, layout = layoutA
-#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 16]>
-#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 1, 16]>
-#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout>
-
-// B: vector<16x16>, layout = transpose(layoutB) = layoutA
-// Since shapes are also same, we can use the same layout attribute, layout_a.
-
-// C: vector<16x16>, layout = layoutC
-#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [1, 8, 2, 1]>
-#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [1, 16]>
-#layout_c = #iree_vector_ext.layout<#row_layout2, #col_layout2>
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @distribute_wmma_16x16x16_mmt(%a : vector<16x16xf16>, %b : vector<16x16xf16>, %c : vector<16x16xf32>) -> vector<16x16xf32> {
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
-        kind = #vector.kind<add>,
-        "__vector_layout_test_anchor_operand_0" = #layout_a,
-        "__vector_layout_test_anchor_operand_1" = #layout_a,
-        "__vector_layout_test_anchor_operand_2" = #layout_c,
-        "__vector_layout_test_anchor_result_0" = #layout_c
-        }
-        %a, %b, %c : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
-    return %output : vector<16x16xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: distribute_wmma_16x16x16_mmt
-
-// CHECK-SAME: %[[ARG0:.+]]: vector<16x16xf16>, %[[ARG1:.+]]: vector<16x16xf16>, %[[ARG2:.+]]: vector<16x16xf32>
-// CHECK-DAG: %[[C:.+]] = iree_vector_ext.to_simt %[[ARG2]] : vector<16x16xf32> -> vector<1x1x8xf32>
-// CHECK-DAG: %[[CV:.+]] = vector.extract %[[C]][0, 0] : vector<8xf32> from vector<1x1x8xf32>
-// CHECK-DAG: %[[A:.+]] = iree_vector_ext.to_simt %[[ARG0]] : vector<16x16xf16> -> vector<1x1x16xf16>
-// CHECK-DAG: %[[AV:.+]] = vector.extract %[[A]][0, 0] : vector<16xf16> from vector<1x1x16xf16>
-// CHECK-DAG: %[[B:.+]] = iree_vector_ext.to_simt %[[ARG1]] : vector<16x16xf16> -> vector<1x1x16xf16>
-// CHECK-DAG: %[[BV:.+]] = vector.extract %[[B]][0, 0] : vector<16xf16> from vector<1x1x16xf16>
-// CHECK-DAG: %[[OUT:.+]] = amdgpu.wmma %[[AV]] * %[[BV]] + %[[CV]] : vector<16xf16>, vector<16xf16>, vector<8xf32>
-
-// -----
-
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
-
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-// A: vector<32x128>, layout = layoutA
-#row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [2, 16]>
-#col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 1, 16]>
-#layout_a = #iree_vector_ext.layout<#row_layout, #col_layout>
-
-// B: vector<64x128>, layout = transpose(layoutB) = layoutA
-#row_layout2 = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [4, 16]>
-#col_layout2 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [8, 1, 16]>
-#layout_b = #iree_vector_ext.layout<#row_layout2, #col_layout2>
-
-// C: vector<32x64>, layout = layoutC
-#row_layout3 = #iree_vector_ext.per_dim_layout<[BATCHX, VECTORY, LANEY, VECTORX], [2, 8, 2, 1]>
-#col_layout3 = #iree_vector_ext.per_dim_layout<[BATCHY, LANEX], [4, 16]>
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @distribute_wmma_16x16x16_mmt_batch(%a : vector<32x128xf16>, %b : vector<64x128xf16>, %c : vector<32x64xf32>) -> vector<32x64xf32> {
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"],
-        kind = #vector.kind<add>,
-        "__vector_layout_test_anchor_operand_0" = #layout_a,
-        "__vector_layout_test_anchor_operand_1" = #layout_b,
-        "__vector_layout_test_anchor_operand_2" = #layout_c,
-        "__vector_layout_test_anchor_result_0" = #layout_c
-        }
-        %a, %b, %c : vector<32x128xf16>, vector<64x128xf16> into vector<32x64xf32>
-    return %output : vector<32x64xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: distribute_wmma_16x16x16_mmt_batch
-
-// CHECK-COUNT-64: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
deleted file mode 100644
index 9a2e0ad01fa1..000000000000
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_set_anchor_layouts.mlir
+++ /dev/null
@@ -1,95 +0,0 @@
-// RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --cse %s --verify-diagnostics
-
-// This tests that the compiler is setting the correct layout anchors for various vectorOps and shapes.
-// Currently only testing on contraction layoutV1, but can be expanded to others.
-
-#layout = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @anchor_mfma_16x16x16_mmt(%a : memref<16x16xf16>, %b : memref<16x16xf16>, %init : vector<16x16xf32>) -> vector<16x16xf32> {
-    // CHECK-LABEL: anchor_mfma_16x16x16_mmt
-    %c0 = arith.constant 0 : index
-    %cst_0 = arith.constant 0.0 : f16
-    %lhs = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 4, 4]>>}}
-    %rhs = vector.transfer_read %b[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, LANEY, VECTORX], [1, 4, 4]>>}}
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %init : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [1, 4, 4]>, <[ BATCHY, LANEX], [1, 16]>>}}
-    return %output : vector<16x16xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-#layout = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @anchor_mfma_16x16x16_mmt_batch(%a : memref<32x128xf16>, %b : memref<64x128xf16>, %init : vector<32x64xf32>) -> vector<32x64xf32> {
-    // CHECK-LABEL: anchor_mfma_16x16x16_mmt_batch
-    %c0 = arith.constant 0 : index
-    %cst_0 = arith.constant 0.0 : f16
-    %lhs = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x128xf16>, vector<32x128xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [2, 16]>, <[ BATCHY, LANEY, VECTORX], [8, 4, 4]>>}}
-    %rhs = vector.transfer_read %b[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<64x128xf16>, vector<64x128xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [4, 16]>, <[ BATCHY, LANEY, VECTORX], [8, 4, 4]>>}}
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %init : vector<32x128xf16>, vector<64x128xf16> into vector<32x64xf32>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEY, VECTORX], [2, 4, 4]>, <[ BATCHY, LANEX], [4, 16]>>}}
-    return %output : vector<32x64xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-#layout = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F32>
-#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
-
-builtin.module attributes { transform.with_named_sequence } {
-  func.func @anchor_wmma_16x16x16_mmt(%a : memref<16x16xf16>, %b : memref<16x16xf16>, %init : vector<16x16xf32>) -> vector<16x16xf32> {
-    // CHECK-LABEL: anchor_wmma_16x16x16_mmt
-    %c0 = arith.constant 0 : index
-    %cst_0 = arith.constant 0.0 : f16
-    %lhs = vector.transfer_read %a[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, VECTORX], [1, 16]>>}}
-    %rhs = vector.transfer_read %b[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, LANEX], [1, 16]>, <[ BATCHY, VECTORX], [1, 16]>>}}
-    %output = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs, %rhs, %init : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf32>
-    // expected-remark @above {{layout of result #0 is #iree_vector_ext.layout<<[ BATCHX, VECTORX, LANEY], [1, 8, 2]>, <[ BATCHY, LANEX], [1, 16]>>}}
-    return %output : vector<16x16xf32>
-  }
-  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
-    %contract = transform.structured.match ops{["vector.contract"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %layout16x16x16 = transform.param.constant #layout -> !transform.any_param
-    transform.iree.set_contraction_layout_attributes %contract, %layout16x16x16 : !transform.any_op, !transform.any_param
-
-    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
-    transform.yield
-  }
-}