diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel index 88b43b52e4fb..5d56e7dbdf7f 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/BUILD.bazel @@ -58,7 +58,6 @@ iree_compiler_cc_library( "InjectDispatchTracing.cpp", "InjectTensorTracing.cpp", "InsertDispatchDebugTargets.cpp", - "InterchangeTransposeGenericOps.cpp", "MaterializeDefaultWorkgroupCountRegion.cpp", "OutlineConstants.cpp", "OutlineDispatchExterns.cpp", @@ -69,6 +68,7 @@ iree_compiler_cc_library( "SplitReduction.cpp", "TensorPadToTensorInsertSlice.cpp", "TopLevelSCFToCFG.cpp", + "TransposeGenericOps.cpp", "VerifyInputLegality.cpp", ], hdrs = [ diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt index 95a30c6ab146..2eeaa89bc9f3 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/CMakeLists.txt @@ -58,7 +58,6 @@ iree_cc_library( "InjectDispatchTracing.cpp" "InjectTensorTracing.cpp" "InsertDispatchDebugTargets.cpp" - "InterchangeTransposeGenericOps.cpp" "MaterializeDefaultWorkgroupCountRegion.cpp" "OutlineConstants.cpp" "OutlineDispatchExterns.cpp" @@ -69,6 +68,7 @@ iree_cc_library( "SplitReduction.cpp" "TensorPadToTensorInsertSlice.cpp" "TopLevelSCFToCFG.cpp" + "TransposeGenericOps.cpp" "VerifyInputLegality.cpp" DEPS ::PassesIncGen diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertRegionToWorkgroups.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertRegionToWorkgroups.cpp index c1bd272c161f..fcd5bf3f1d2b 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertRegionToWorkgroups.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/ConvertRegionToWorkgroups.cpp @@ -18,9 +18,6 @@ namespace mlir::iree_compiler::IREE::Flow { -#define GEN_PASS_DEF_CONVERTREGIONTOWORKGROUPSPASS -#include "iree/compiler/Dialect/Flow/Transforms/Passes.h.inc" - namespace { /// Compute the dynamic dims of the given value and add them to the vector. 
@@ -256,26 +253,4 @@ rewriteFlowDispatchRegionToFlowDispatchWorkgroups(
   return workgroupsOp;
 }
 
-namespace {
-struct ConvertRegionToWorkgroupsPass
-    : public IREE::Flow::impl::ConvertRegionToWorkgroupsPassBase<
-          ConvertRegionToWorkgroupsPass> {
-  void runOnOperation() override {
-    SmallVector<IREE::Flow::DispatchRegionOp> ops;
-    getOperation()->walk(
-        [&](IREE::Flow::DispatchRegionOp op) { ops.push_back(op); });
-
-    IRRewriter rewriter(getOperation()->getContext());
-    for (IREE::Flow::DispatchRegionOp regionOp : ops) {
-      if (failed(rewriteFlowDispatchRegionToFlowDispatchWorkgroups(regionOp,
-                                                                   rewriter))) {
-        signalPassFailure();
-        return;
-      }
-    }
-  }
-};
-
-} // namespace
-
 } // namespace mlir::iree_compiler::IREE::Flow
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionPreprocessing.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionPreprocessing.cpp
index 4aa3fc12f432..49d485920b27 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionPreprocessing.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FusionPreprocessing.cpp
@@ -36,39 +36,6 @@ namespace mlir::iree_compiler::IREE::Flow {
 
 namespace {
 
-//===----------------------------------------------------------------------===//
-// GenericOpInterchangePattern
-//===----------------------------------------------------------------------===//
-
-struct GenericOpInterchangePattern
-    : public OpRewritePattern<linalg::GenericOp> {
-  using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(linalg::GenericOp genericOp,
-                                PatternRewriter &rewriter) const override {
-    SmallVector<unsigned> interchange;
-    bool needInterchange = false;
-    unsigned numParallelLoop = genericOp.getNumParallelLoops();
-    if (numParallelLoop == 0)
-      return failure();
-    for (auto iter : llvm::enumerate(genericOp.getIteratorTypesArray())) {
-      if (linalg::isParallelIterator(iter.value())) {
-        interchange.push_back(iter.index());
-        if (iter.index() >= numParallelLoop)
-          needInterchange = true;
-      }
-    }
-    // If all the parallel loops are outter loops skip the pattern.
-    if (!needInterchange)
-      return failure();
-    for (auto iter : llvm::enumerate(genericOp.getIteratorTypesArray())) {
-      if (linalg::isReductionIterator(iter.value())) {
-        interchange.push_back(iter.index());
-      }
-    }
-    return interchangeGenericOp(rewriter, genericOp, interchange);
-  }
-};
-
 //===----------------------------------------------------------------------===//
 // ElementwiseOpInterchangePattern
 //===----------------------------------------------------------------------===//
@@ -235,8 +202,7 @@ struct FusionPreprocessingPass
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     patterns.add<ElementwiseOpInterchangePattern,
-                 GenericOpInterchangePattern,
-                 FoldSuccessiveTensorInsertSliceOps, GatherFusionPattern>(
+                 FoldSuccessiveTensorInsertSliceOps, GatherFusionPattern>(
         &getContext());
 
     // Fold away `tensor.dim` operations that can be resolved in terms of its
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
index 9ab790e4e614..1693f494b282 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.cpp
@@ -185,7 +185,35 @@ void addDispatchRegionCreationPreprocessingPasses(OpPassManager &passManager) {
       // producer-consumer fusion.
      .addPass(IREE::Flow::createSinkReshapesPass)
      .addPass(IREE::Flow::createCanonicalizerPass)
-      .addPass(mlir::createCSEPass);
+      .addPass(mlir::createCSEPass)
+
+      // 5. After all the reshape propagations, fuse elementwise operations
+      //    even if the producer has multiple uses.
+      .addPass(IREE::Flow::createFuseMultiUseElementwiseProducerPass)
+
+      // 6. Some more "post elementwise fusion passes".
+      //    a. Detensorize.
+      //       TODO: This is probably not in the right place.
+      .addPredicatedPass(clDetensoring,
+                         [&]() { return mlir::createLinalgDetensorizePass(); })
+      .addPass(IREE::Flow::createCanonicalizerPass)
+      .addPass(mlir::createCSEPass)
+
+      //    b. For ops with multiple reduction dimensions, collapse the
+      //       reduction dimensions.
+      //       TODO: This pass is only needed until all backends can handle
+      //       multiple reduction dimensions.
+      .addPredicatedPass(clCollapseReductionDims,
+                         IREE::Flow::createCollapseReductionDimensionsPass)
+
+      //    c. Split reduction operations into a parallel part and a
+      //       reduction part.
+      .addPass(IREE::Flow::createSplitReductionPass)
+
+      //    d. Transpose generic ops to
+      //       - help with dispatch region formation.
+      //       - move reduction iterators to be innermost.
+      .addPass(IREE::Flow::createTransposeGenericOpsPass);
 }
 
 // Pipeline to first create `flow.dispatch.region` ops and then lower to
@@ -207,7 +235,7 @@ static void addDispatchRegionCreationPasses(OpPassManager &passManager) {
       // Create dispatches for scalar operations as roots
       .addPass(IREE::Flow::createFormScalarDispatchesPass)
       // Create `flow.dispatch.region` centered around a root and fuse with
-      // producers
+      // producers and consumers.
      .addPass([&]() {
        return IREE::Flow::createFormDispatchRegionsPass(
            FormDispatchRegionsPassOptions{
@@ -256,25 +284,6 @@ void addDispatchRegionCreationPasses(OpPassManager &passManager,
       .addPass(mlir::createCSEPass);
 
   addDispatchRegionCreationPreprocessingPasses(passManager);
-
-  FunctionLikeNest(passManager)
-      .addPass(IREE::Flow::createFuseMultiUseElementwiseProducerPass)
-      .addPredicatedPass(clDetensoring,
-                         [&]() { return mlir::createLinalgDetensorizePass(); })
-      .addPass(IREE::Flow::createCanonicalizerPass)
-      .addPass(mlir::createCSEPass)
-      .addPredicatedPass(clCollapseReductionDims,
-                         IREE::Flow::createCollapseReductionDimensionsPass)
-      // Split reduction operations into parallel and reduction.
-      .addPass(IREE::Flow::createSplitReductionPass)
-      // SplitReductionPass may create reduction dimension that are not the last
-      // dimension.
-      .addPass(IREE::Flow::createFusionPreprocessingPass)
-      // Normalize the input indexing map to make the input indexing map
-      // identity. This helps fusing named linalg op with a generic op with
-      // transpose.
-      .addPass(IREE::Flow::createInterchangeTransposeGenericOpsPass);
-
   addDispatchRegionCreationPasses(passManager);
 }
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
index b95bc7988c01..ebfc5c0e708a 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/Passes.td
@@ -9,10 +9,18 @@
 
 include "mlir/Pass/PassBase.td"
 
-def AnnotateDispatchesPass :
-    Pass<"iree-flow-annotate-dispatches", "mlir::ModuleOp"> {
-  let summary = "Annotates executable dispatches based on their contents.";
-}
+// File organization:
+// Related passes are grouped under one banner //===....===//. For example,
+// the dispatch region creation preprocessing passes and the dispatch region
+// formation passes are two such groups. Add any new pass to the relevant
+// group and keep the passes within a group alphabetical.
+ +//===---------------------------------------------------------------------===// +// Dispatch region creation preprocessing passes : +// Passes that transform the program before forming dispatches, like +// - Elementwise operation fusion +// - Reshape propagation passes +//===---------------------------------------------------------------------===// def BubbleUpExpandShapesPass : Pass<"iree-flow-bubble-up-expand-shapes"> { @@ -22,109 +30,135 @@ def BubbleUpExpandShapesPass : ]; } -def CanonicalizerPass : - Pass<"iree-flow-canonicalize", ""> { - let summary = "Flow specific canonicalization pass"; - let options = [ - Option<"testConvergence", "test-convergence", "bool", - /*default=*/"false", "Fails if the patterns fail to converge"> +def CollapseReductionDimensionsPass : + Pass<"iree-flow-collapse-reduction-dimensions", ""> { + let summary = "Collapse reduction dimensions when possible."; + let dependentDialects = [ + "mlir::linalg::LinalgDialect", ]; } -def CaptureDynamicDimsPass : - Pass<"iree-flow-capture-dynamic-dims", ""> { - let summary = "Captures dynamic shape dimensions required by dispatch operands/results and control flow operations."; +def ElementwiseOpFusionPass : + Pass<"iree-flow-elementwise-op-fusion", ""> { + let summary = "Fuse elementwise operations."; + let options = [ + Option<"fuseMultiReduction", "fuse-multi-reduction", "bool", + /*default=*/"true", "Fuse ops that have multiple reduction iterators"> + ]; let dependentDialects = [ - "IREE::Flow::FlowDialect", + "mlir::affine::AffineDialect", ]; } -def CleanupTensorShapesPass : - Pass<"iree-flow-cleanup-tensor-shapes", ""> { - let summary = "Cleans up any remaining tensor shape metadata after lowering."; -} - -def CloneProducersIntoDispatchRegionsPass : - InterfacePass<"iree-flow-clone-producers-into-dispatch-regions", "mlir::FunctionOpInterface"> { - let summary = "Clone producers into dispatch regions to be isolated above."; +def FoldUnitExtentDimsPass : + Pass<"iree-flow-fold-unit-extent-dims", "mlir::ModuleOp"> { + let summary = "Fold unit extent dimension of operations."; let description = [{ - Pass to clone into dispatch regions producers of values used in the dispatch - regions but defined in the above. This prepares the dispatch regions for - converting to dispatch workgroups with explicit captures. + Imports upstream patterns to fold unit extent dims but with IREE control. }]; + let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::arith::ArithDialect", + "mlir::linalg::LinalgDialect", + "mlir::tensor::TensorDialect", + ]; } -def CollapseReductionDimensionsPass : - Pass<"iree-flow-collapse-reduction-dimensions", ""> { - let summary = "Collapse reduction dimensions when possible."; +def FuseMultiUseElementwiseProducerPass : + InterfacePass<"iree-flow-fuse-multi-use-elementwise-producer", + "mlir::FunctionOpInterface"> { + let summary = "Fuse elementwise linalg operations on tensors when producers have multiple uses."; + let options = [ + Option<"numIterations", "num-iterations", "unsigned", + /*default=*/"2", "Number of iterations to fuse multiuse ops"> + ]; let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::arith::ArithDialect", "mlir::linalg::LinalgDialect", + "mlir::math::MathDialect", ]; } -def ConvertMeshToFlowPass : - Pass<"iree-convert-mesh-to-flow", "mlir::ModuleOp"> { - let summary = "Convert Mesh dialect operations to flow."; - let description = [{ - Each mesh corresponds to a default !flow.channel with the same group name. 
- ``` - mesh.mesh @mesh_1(shape = 2x3) - ``` - ``` - %channel = flow.channel.default "mesh_1" : !flow.channel - ``` - If there is onl one mesh in the program than the name is omitted and the - ``` - %channel = flow.channel.default : !flow.channel - ``` +def FusionPreprocessingPass : + Pass<"iree-flow-fusion-preprocessing", ""> { + let summary = "Run useful preprocessing patterns that help with fusion."; + let dependentDialects = [ + "mlir::affine::AffineDialect", + ]; +} - Each (mesh, mesh_axes) pair partitions and orders the devices into disjoint - groups, each corresponding to a !flow.channel to perform a collective - operation. - For example - ``` - mesh.mesh @mesh(shape = 2x3x4x5) - ... - %1 = mesh.all_reduce on @mesh mesh_axes = [2, 0] : tensor<10x20xf32> - ``` - For more information see - [Mesh dialect](https://mlir.llvm.org/docs/Dialects/Mesh/#device-groups). +def SinkReshapesPass : + Pass<"iree-flow-sink-reshapes", ""> { + let summary = "Sink reshapes to allow for compute op -> consumer fusion."; + let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::arith::ArithDialect", + ]; +} - The mesh partition and device ordering determines the values for the - `color` and `key` in the corresponding `flow.channel.split` operation used - to create the channel. - For more information on the meaning of `color` and `key` see - [MPI_Comm_split](https://www.mpi-forum.org/docs/mpi-4.1/mpi41-report/node188.htm#Node188) - in the MPI standard. +def SplitReductionPass : + Pass<"iree-flow-split-reduction-ops", ""> { + let summary = "Split reduction dimension to increase parallelism."; + let dependentDialects = [ + "mlir::linalg::LinalgDialect", + ]; +} - Each !flow.channel is wrapped in an IREE `util.global` and its construction - is done only once with `util.initializer`. - }]; +def TensorPadToTensorInsertSlicePass : + Pass<"iree-flow-tensor-pad-to-tensor-insert-slice", ""> { + let summary = "Convert tensor.pad into linalg.fill + tensor.insert_slice."; + let options = [ + Option<"skipSingleLinalgOpUses", "skip-one-linalg-use-case", "bool", + /*default=*/"false", + "Skip the op that has only one use which is used" + "by a Linalg op">, + ]; let dependentDialects = [ - "mlir::affine::AffineDialect", + "mlir::arith::ArithDialect", "mlir::linalg::LinalgDialect", - "mlir::mesh::MeshDialect", - "mlir::tensor::TensorDialect", - "IREE::Flow::FlowDialect", + "mlir::math::MathDialect", + "mlir::memref::MemRefDialect", ]; } -def ConvertRegionToWorkgroupsPass : - Pass<"iree-flow-convert-region-to-workgroups", ""> { - let summary = "Convert dispatch.region ops to dispatch.workgroups ops."; +def TransposeGenericOpsPass : + Pass<"iree-flow-transpose-generic-ops", ""> { + let summary = "Transpose generic op loops."; let dependentDialects = [ - "mlir::tensor::TensorDialect", - "IREE::Flow::FlowDialect", + "mlir::linalg::LinalgDialect", ]; } -def ConvertToFlowPass : - Pass<"iree-flow-convert-to-flow", ""> { - let summary = "Convert operations to flow. Currently just a test pass."; +//===---------------------------------------------------------------------===// +// Dispatch region creation passes. +//===---------------------------------------------------------------------===// + +def CloneProducersIntoDispatchRegionsPass : + InterfacePass<"iree-flow-clone-producers-into-dispatch-regions", "mlir::FunctionOpInterface"> { + let summary = "Clone producers into dispatch regions to be isolated above."; let description = [{ - Convert operations to `flow` ops. 
This pass is currently only used for - testing, since the conversion to Flow ops happens within dispatch region - formation. + Pass to clone into dispatch regions producers of values used in the dispatch + regions but defined in the above. This prepares the dispatch regions for + converting to dispatch workgroups with explicit captures. + }]; +} + +def CollapseDimensionsPass : + InterfacePass<"iree-flow-collapse-dimensions", "mlir::FunctionOpInterface"> { + let summary = "Collapse dimensions of Linalg Ops on tensor ops."; + let description = [{ + Collapse dimensions of Linalg Ops on tensor ops inside dispatch.region ops + and hoist the reshaping operations out of the dispatch. + }]; +} + +def ConvertDispatchRegionsToWorkgroupsPass : + InterfacePass<"iree-flow-convert-dispatch-regions-to-workgroups", "mlir::FunctionOpInterface"> { + let summary = "Convert dispatch regions to dispatch workgroups."; + let description = [{ + Pass to convert dispatch regions to dispatch workgroups. This pass is + intended to be used after dispatch regions have been formed. }]; let dependentDialects = [ "mlir::affine::AffineDialect", @@ -135,22 +169,50 @@ def ConvertToFlowPass : ]; } -def DeduplicateExecutablesPass : - Pass<"iree-flow-deduplicate-executables", "mlir::ModuleOp"> { - let summary = "Deduplicates executables that are identical."; +def ConvertTensorToFlowPass : + InterfacePass<"iree-flow-convert-tensor-to-flow", "mlir::FunctionOpInterface"> { + let summary = "Convert tensor operations to flow"; + let description = [{ + Pass to convert tensor operations to flow.tensor.* operations. + }]; + let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::arith::ArithDialect", + "mlir::linalg::LinalgDialect", + "mlir::tensor::TensorDialect", + "IREE::Flow::FlowDialect", + ]; } -def FoldUnitExtentDimsPass : - Pass<"iree-flow-fold-unit-extent-dims", "mlir::ModuleOp"> { - let summary = "Fold unit extent dimension of operations."; +def DispatchWithTransformDialectPass : Pass<"iree-flow-dispatch-with-transform-dialect"> { + let summary = "Dispatch Linalg operations on tensors by using the transform dialect interpreter."; let description = [{ - Imports upstream patterns to fold unit extent dims but with IREE control. + Pass to perform dispatch of Linalg on tensor ops by using the transform + dialect. Dispatch regions are created as specified by the transform module + that is parsed from `transformSpecPath`. + + TODO: Drop this pass in favor of the one upstream. The one upstream requires + separate loading of the module and thus isn't suited for single-use + transform scripts. 
}]; let dependentDialects = [ "mlir::affine::AffineDialect", "mlir::arith::ArithDialect", "mlir::linalg::LinalgDialect", + "mlir::pdl::PDLDialect", + "mlir::pdl_interp::PDLInterpDialect", + "mlir::scf::SCFDialect", "mlir::tensor::TensorDialect", + "mlir::transform::TransformDialect", + "IREE::Flow::FlowDialect", + "IREE::LinalgExt::IREELinalgExtDialect", + ]; + let options = [ + Option<"disableExpensiveChecks", "disable-expensive-checks", "bool", + "false", + "Disable expensive checks in the interpreter for a faster run.">, + Option<"transformSpecPath", "transform-spec-path", "std::string", + /*default=*/"", "File path to the transform spec to use.">, ]; } @@ -180,39 +242,20 @@ def FormDispatchRegionsPass : ]; } -def ConvertDispatchRegionsToWorkgroupsPass : - InterfacePass<"iree-flow-convert-dispatch-regions-to-workgroups", "mlir::FunctionOpInterface"> { - let summary = "Convert dispatch regions to dispatch workgroups."; - let description = [{ - Pass to convert dispatch regions to dispatch workgroups. This pass is - intended to be used after dispatch regions have been formed. - }]; +def FormScalarDispatchesPass : + InterfacePass<"iree-flow-form-scalar-dispatches", "mlir::FunctionOpInterface"> { + let summary = "Form Dispatch Regions for scalar computations."; let dependentDialects = [ "mlir::affine::AffineDialect", "mlir::linalg::LinalgDialect", - "mlir::scf::SCFDialect", "mlir::tensor::TensorDialect", "IREE::Flow::FlowDialect", ]; } -def ConvertTensorToFlowPass : - InterfacePass<"iree-flow-convert-tensor-to-flow", "mlir::FunctionOpInterface"> { - let summary = "Convert tensor operations to flow"; - let description = [{ - Pass to convert tensor operations to flow.tensor.* operations. - }]; - let dependentDialects = [ - "mlir::affine::AffineDialect", - "mlir::arith::ArithDialect", - "mlir::linalg::LinalgDialect", - "mlir::tensor::TensorDialect", - "IREE::Flow::FlowDialect", - ]; -} - def MaterializeDefaultWorkgroupCountRegionPass: - InterfacePass<"iree-flow-materialize-default-workgroup-count-region", "mlir::FunctionOpInterface"> { + InterfacePass<"iree-flow-materialize-default-workgroup-count-region", + "mlir::FunctionOpInterface"> { let summary = "Canonicalize dispatch workgroups ops."; let description = [{ Apply dispatch workgroups canonicalization patterns. 
@@ -226,58 +269,106 @@ def MaterializeDefaultWorkgroupCountRegionPass:
   ];
 }
 
-def FormScalarDispatchesPass :
-    InterfacePass<"iree-flow-form-scalar-dispatches", "mlir::FunctionOpInterface"> {
-  let summary = "Form Dispatch Regions for scalar computations.";
+//===---------------------------------------------------------------------===//
+// General Flow passes
+//===---------------------------------------------------------------------===//
+
+def AnnotateDispatchesPass :
+    Pass<"iree-flow-annotate-dispatches", "mlir::ModuleOp"> {
+  let summary = "Annotates executable dispatches based on their contents.";
+}
+
+def CanonicalizerPass :
+    Pass<"iree-flow-canonicalize", ""> {
+  let summary = "Flow specific canonicalization pass";
+  let options = [
+    Option<"testConvergence", "test-convergence", "bool",
+           /*default=*/"false", "Fails if the patterns fail to converge">
+  ];
+}
+
+def CaptureDynamicDimsPass :
+    Pass<"iree-flow-capture-dynamic-dims", ""> {
+  let summary = "Captures dynamic shape dimensions required by dispatch operands/results and control flow operations.";
   let dependentDialects = [
-    "mlir::affine::AffineDialect",
-    "mlir::linalg::LinalgDialect",
-    "mlir::tensor::TensorDialect",
     "IREE::Flow::FlowDialect",
   ];
 }
 
-def CollapseDimensionsPass :
-    InterfacePass<"iree-flow-collapse-dimensions", "mlir::FunctionOpInterface"> {
-  let summary = "Collapse dimensions of Linalg Ops on tensor ops.";
+def CleanupTensorShapesPass :
+    Pass<"iree-flow-cleanup-tensor-shapes", ""> {
+  let summary = "Cleans up any remaining tensor shape metadata after lowering.";
+}
+
+def ConvertMeshToFlowPass :
+    Pass<"iree-convert-mesh-to-flow", "mlir::ModuleOp"> {
+  let summary = "Convert Mesh dialect operations to flow.";
   let description = [{
-    Collapse dimensions of Linalg Ops on tensor ops inside dispatch.region ops
-    and hoist the reshaping operations out of the dispatch.
+    Each mesh corresponds to a default !flow.channel with the same group name.
+    ```
+    mesh.mesh @mesh_1(shape = 2x3)
+    ```
+    ```
+    %channel = flow.channel.default "mesh_1" : !flow.channel
+    ```
+    If there is only one mesh in the program, the name is omitted:
+    ```
+    %channel = flow.channel.default : !flow.channel
+    ```
+
+    Each (mesh, mesh_axes) pair partitions and orders the devices into disjoint
+    groups, each corresponding to a !flow.channel to perform a collective
+    operation.
+    For example
+    ```
+    mesh.mesh @mesh(shape = 2x3x4x5)
+    ...
+    %1 = mesh.all_reduce on @mesh mesh_axes = [2, 0] : tensor<10x20xf32>
+    ```
+    For more information see
+    [Mesh dialect](https://mlir.llvm.org/docs/Dialects/Mesh/#device-groups).
+
+    The mesh partition and device ordering determine the values for the
+    `color` and `key` in the corresponding `flow.channel.split` operation used
+    to create the channel.
+    For more information on the meaning of `color` and `key` see
+    [MPI_Comm_split](https://www.mpi-forum.org/docs/mpi-4.1/mpi41-report/node188.htm#Node188)
+    in the MPI standard.
+
+    Each !flow.channel is wrapped in an IREE `util.global` and its construction
+    is done only once with `util.initializer`.
}]; + let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::linalg::LinalgDialect", + "mlir::mesh::MeshDialect", + "mlir::tensor::TensorDialect", + "IREE::Flow::FlowDialect", + ]; } -def DispatchWithTransformDialectPass : Pass<"iree-flow-dispatch-with-transform-dialect"> { - let summary = "Dispatch Linalg operations on tensors by using the transform dialect interpreter."; +def ConvertToFlowPass : + Pass<"iree-flow-convert-to-flow", ""> { + let summary = "Convert operations to flow. Currently just a test pass."; let description = [{ - Pass to perform dispatch of Linalg on tensor ops by using the transform - dialect. Dispatch regions are created as specified by the transform module - that is parsed from `transformSpecPath`. - - TODO: Drop this pass in favor of the one upstream. The one upstream requires - separate loading of the module and thus isn't suited for single-use - transform scripts. + Convert operations to `flow` ops. This pass is currently only used for + testing, since the conversion to Flow ops happens within dispatch region + formation. }]; let dependentDialects = [ "mlir::affine::AffineDialect", - "mlir::arith::ArithDialect", "mlir::linalg::LinalgDialect", - "mlir::pdl::PDLDialect", - "mlir::pdl_interp::PDLInterpDialect", "mlir::scf::SCFDialect", "mlir::tensor::TensorDialect", - "mlir::transform::TransformDialect", "IREE::Flow::FlowDialect", - "IREE::LinalgExt::IREELinalgExtDialect", - ]; - let options = [ - Option<"disableExpensiveChecks", "disable-expensive-checks", "bool", - "false", - "Disable expensive checks in the interpreter for a faster run.">, - Option<"transformSpecPath", "transform-spec-path", "std::string", - /*default=*/"", "File path to the transform spec to use.">, ]; } +def DeduplicateExecutablesPass : + Pass<"iree-flow-deduplicate-executables", "mlir::ModuleOp"> { + let summary = "Deduplicates executables that are identical."; +} + def DumpDispatchGraphPass : Pass<"iree-flow-dump-dispatch-graph-pass"> { let summary = "Dump visualization of dispatches within the program."; let options = [ @@ -300,19 +391,6 @@ def DumpDispatchGraphPass : Pass<"iree-flow-dump-dispatch-graph-pass"> { ]; } -def ElementwiseOpFusionPass : - Pass<"iree-flow-elementwise-op-fusion", ""> { - let summary = "Fuse elementwise operations."; - let options = [ - Option<"fuseMultiReduction", "fuse-multi-reduction", "bool", - /*default=*/"true", "Fuse ops that have multiple reduction iterators"> - ]; - let dependentDialects = [ - "mlir::affine::AffineDialect", - ]; -} - - def ExportBenchmarkFuncsPass : Pass<"iree-flow-export-benchmark-funcs-pass", "mlir::ModuleOp"> { let summary = "Exports benchmark functions."; @@ -327,28 +405,6 @@ def ExportBenchmarkFuncsPass : ]; } -def FuseMultiUseElementwiseProducerPass : - InterfacePass<"iree-flow-fuse-multi-use-elementwise-producer", "mlir::FunctionOpInterface"> { - let summary = "Fuse elementwise linalg operations on tensors when producers have multiple uses."; - let options = [ - Option<"numIterations", "num-iterations", "unsigned", - /*default=*/"2", "Number of iterations to fuse multiuse ops"> - ]; - let dependentDialects = [ - "mlir::affine::AffineDialect", - "mlir::arith::ArithDialect", - "mlir::linalg::LinalgDialect", - "mlir::math::MathDialect", - ]; -} - -def FusionPreprocessingPass : - Pass<"iree-flow-fusion-preprocessing", ""> { - let summary = "Run useful preprocessing patterns that help with fusion."; - let dependentDialects = [ - "mlir::affine::AffineDialect", - ]; -} def InitializeEmptyTensorsPass : 
Pass<"iree-flow-initialize-empty-tensors", ""> { @@ -418,14 +474,6 @@ def InsertDebugTargetAtOrdinalPass : ]; } -def InterchangeTransposeGenericOpsPass : - Pass<"iree-flow-interchange-transpose-generic-ops", ""> { - let summary = "Interchange transpose generic op loops to make the input indeximg map indentity."; - let dependentDialects = [ - "mlir::linalg::LinalgDialect", - ]; -} - def OutlineConstantsPass : Pass<"iree-flow-outline-constants", "mlir::ModuleOp"> { let summary = "Outlines tensor constants into util.globals at the module level."; @@ -458,40 +506,6 @@ def OutlineDispatchRegionsPass : ]; } -def SinkReshapesPass : - Pass<"iree-flow-sink-reshapes", ""> { - let summary = "Sink reshapes to allow for compute op -> consumer fusion."; - let dependentDialects = [ - "mlir::affine::AffineDialect", - "mlir::arith::ArithDialect", - ]; -} - -def SplitReductionPass : - Pass<"iree-flow-split-reduction-ops", ""> { - let summary = "Split reduction dimension to increase parallelism."; - let dependentDialects = [ - "mlir::linalg::LinalgDialect", - ]; -} - -def TensorPadToTensorInsertSlicePass : - Pass<"iree-flow-tensor-pad-to-tensor-insert-slice", ""> { - let summary = "Convert tensor.pad into linalg.fill + tensor.insert_slice."; - let options = [ - Option<"skipSingleLinalgOpUses", "skip-one-linalg-use-case", "bool", - /*default=*/"false", - "Skip the op that has only one use which is used" - "by a Linalg op">, - ]; - let dependentDialects = [ - "mlir::arith::ArithDialect", - "mlir::linalg::LinalgDialect", - "mlir::math::MathDialect", - "mlir::memref::MemRefDialect", - ]; -} - def TopLevelSCFToCFGPass : InterfacePass<"iree-top-level-scf-to-cfg", "mlir::FunctionOpInterface"> { let summary = "Converts non-nested SCF constructs to CFG (not traversing into opaque operations)."; diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/InterchangeTransposeGenericOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/TransposeGenericOps.cpp similarity index 61% rename from compiler/src/iree/compiler/Dialect/Flow/Transforms/InterchangeTransposeGenericOps.cpp rename to compiler/src/iree/compiler/Dialect/Flow/Transforms/TransposeGenericOps.cpp index 7a03bc81510e..ea053244abd5 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/InterchangeTransposeGenericOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/TransposeGenericOps.cpp @@ -18,11 +18,50 @@ namespace mlir::iree_compiler::IREE::Flow { -#define GEN_PASS_DEF_INTERCHANGETRANSPOSEGENERICOPSPASS +#define GEN_PASS_DEF_TRANSPOSEGENERICOPSPASS #include "iree/compiler/Dialect/Flow/Transforms/Passes.h.inc" namespace { +//===----------------------------------------------------------------------===// +// MakeReductionInnermostPattern +//===----------------------------------------------------------------------===// + +/// For generic ops that are reduction, make the reduction the innermost +/// dimension. 
+struct MakeReductionInnermostPattern
+    : public OpRewritePattern<linalg::GenericOp> {
+  using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(linalg::GenericOp genericOp,
+                                PatternRewriter &rewriter) const override {
+    SmallVector<unsigned> interchange;
+    bool needInterchange = false;
+    unsigned numParallelLoop = genericOp.getNumParallelLoops();
+    if (numParallelLoop == 0)
+      return failure();
+    for (auto iter : llvm::enumerate(genericOp.getIteratorTypesArray())) {
+      if (linalg::isParallelIterator(iter.value())) {
+        interchange.push_back(iter.index());
+        if (iter.index() >= numParallelLoop)
+          needInterchange = true;
+      }
+    }
+    // If all the parallel loops are outer loops skip the pattern.
+    if (!needInterchange)
+      return failure();
+    for (auto iter : llvm::enumerate(genericOp.getIteratorTypesArray())) {
+      if (linalg::isReductionIterator(iter.value())) {
+        interchange.push_back(iter.index());
+      }
+    }
+    return interchangeGenericOp(rewriter, genericOp, interchange);
+  }
+};
+
+/// For elementwise ops that consume values produced by named ops (or reduction
+/// ops), the dispatch region fusion logic requires the indexing maps to be
+/// identity (or projections that are not transposing). This pattern
+/// fixes up elementwise operations for which that is not the case.
 struct TransposeGenericOpPattern : public OpRewritePattern<linalg::GenericOp> {
   using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(linalg::GenericOp genericOp,
@@ -73,12 +112,13 @@ struct TransposeGenericOpPattern : public OpRewritePattern<linalg::GenericOp> {
   }
 };
 
-struct InterchangeTransposeGenericOpsPass
-    : public IREE::Flow::impl::InterchangeTransposeGenericOpsPassBase<
-          InterchangeTransposeGenericOpsPass> {
+struct TransposeGenericOpsPass
+    : public IREE::Flow::impl::TransposeGenericOpsPassBase<
+          TransposeGenericOpsPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
-    patterns.add<TransposeGenericOpPattern>(&getContext());
+    patterns.add<MakeReductionInnermostPattern, TransposeGenericOpPattern>(
+        &getContext());
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
                                             std::move(patterns)))) {
       return signalPassFailure();
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
index 09e8511c7845..dfd39ae0fb98 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/BUILD.bazel
@@ -43,7 +43,6 @@ iree_lit_test_suite(
             "inject_dispatch_tracing.mlir",
             "inject_tensor_tracing.mlir",
             "insert_dispatch_debug_targets.mlir",
-            "interchange_transpose_generic_ops.mlir",
             "outline_constants.mlir",
             "outline_dispatch_externs.mlir",
             "outline_dispatch_regions.mlir",
@@ -55,6 +54,7 @@ iree_lit_test_suite(
             "tensor_pad_to_tensor_insert_slice.mlir",
             "top_level_scf_to_cfg.mlir",
             "transform_dispatch_region_formation.mlir",
+            "transpose_generic_ops.mlir",
             "verify_input_ir.mlir",
         ],
         include = ["*.mlir"],
diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt
index 9324301f5f95..71f8083b0a31 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/CMakeLists.txt
@@ -41,7 +41,6 @@ iree_lit_test_suite(
     "inject_dispatch_tracing.mlir"
     "inject_tensor_tracing.mlir"
     "insert_dispatch_debug_targets.mlir"
-    "interchange_transpose_generic_ops.mlir"
    "outline_constants.mlir"
    "outline_dispatch_externs.mlir"
    "outline_dispatch_regions.mlir"
@@ -53,6 +52,7 @@
iree_lit_test_suite( "tensor_pad_to_tensor_insert_slice.mlir" "top_level_scf_to_cfg.mlir" "transform_dispatch_region_formation.mlir" + "transpose_generic_ops.mlir" "verify_input_ir.mlir" TOOLS FileCheck diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir index 1f94c2efc42f..3caa5b0061c4 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/convert_region_to_workgroups.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt %s --iree-flow-convert-region-to-workgroups --iree-flow-canonicalize -cse -split-input-file | FileCheck %s +// RUN: iree-opt %s --pass-pipeline="builtin.module(util.func(iree-flow-convert-dispatch-regions-to-workgroups, iree-flow-canonicalize, cse))" -split-input-file | FileCheck %s // CHECK-LABEL: util.func public @foo( // CHECK: %[[argA:.*]]: tensor, %[[argB:.*]]: tensor<5x10xf32>, %[[argC:.*]]: tensor<10x11xf32> diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir index 4a9b2d828645..a091b9011d44 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_fusion_with_transpose.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-interchange-transpose-generic-ops,iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-convert-dispatch-regions-to-workgroups, iree-flow-canonicalize, cse))" --mlir-print-local-scope %s | FileCheck %s +// RUN: iree-opt --split-input-file --verify-diagnostics --pass-pipeline="builtin.module(util.func(iree-flow-transpose-generic-ops,iree-flow-form-dispatch-regions{aggressive-fusion=true}, iree-flow-convert-dispatch-regions-to-workgroups, iree-flow-canonicalize, cse))" --mlir-print-local-scope %s | FileCheck %s util.func @fuse_conv(%arg0 : tensor<2x130x130x16xf32>, %arg1 : tensor<3x3x16x320xf32>) -> tensor<2x320x128x128xf32> { %empty = tensor.empty() : tensor<2x128x128x320xf32> diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_preprocessing.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_preprocessing.mlir index 04713f0fc901..b1865bc9c803 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_preprocessing.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/fusion_preprocessing.mlir @@ -1,29 +1,5 @@ // RUN: iree-opt --iree-flow-fusion-preprocessing --split-input-file %s | FileCheck %s -// CHECK: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> -// CHECK: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)> -// CHECK: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d2, d0, d1)> -// CHECK: util.func public @interchange -// CHECK: linalg.generic {indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]} -util.func public @interchange(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { - %0 = linalg.generic {indexing_maps = [ - affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, - affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, 
- affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>], - iterator_types = ["reduction", "parallel", "parallel", "parallel"]} - ins(%arg0, %arg1 : tensor, tensor) - outs(%arg2 : tensor) { - ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors - %m = arith.mulf %arg3, %arg4 : f32 - %a = arith.addf %arg5, %m : f32 - linalg.yield %a : f32 - } -> tensor - util.return %0 : tensor -} - -// ----- - util.func public @fold_insert_slices(%source : tensor, %dest0 : tensor, %dest1 : tensor, %val: f32, %o1 : index, %o2 : index, %o3 : index, %o4 : index, diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transpose_generic_ops.mlir similarity index 71% rename from compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir rename to compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transpose_generic_ops.mlir index 3a809eebccf1..e857cdb9b595 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/interchange_transpose_generic_ops.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/transpose_generic_ops.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --split-input-file --verify-diagnostics --iree-flow-interchange-transpose-generic-ops --iree-flow-canonicalize -cse --mlir-print-local-scope %s | FileCheck %s +// RUN: iree-opt --split-input-file --verify-diagnostics --iree-flow-transpose-generic-ops --iree-flow-canonicalize -cse --mlir-print-local-scope %s | FileCheck %s util.func @supported_conv(%arg0 : tensor<2x130x130x16xf16>, %arg1 : tensor<3x3x16x320xf16>) -> tensor<2x320x128x128xf16> { %empty = tensor.empty() : tensor<2x128x128x320xf32> @@ -55,3 +55,27 @@ util.func @generalize_to_any_linalg_op(%arg0 : tensor, %arg1 : tenso // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, // CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d3, d0, d1, d2)>] // CHECK: return %[[RESULT]] + +// ----- + +// CHECK: util.func public @interchange +// CHECK: linalg.generic {indexing_maps = [ +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>, +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d3, d0, d1)> +// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d2, d0, d1)> +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]} +util.func public @interchange(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { + %0 = linalg.generic {indexing_maps = [ + affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>], + iterator_types = ["reduction", "parallel", "parallel", "parallel"]} + ins(%arg0, %arg1 : tensor, tensor) + outs(%arg2 : tensor) { + ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors + %m = arith.mulf %arg3, %arg4 : f32 + %a = arith.addf %arg5, %m : f32 + linalg.yield %a : f32 + } -> tensor + util.return %0 : tensor +}
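
Reviewer note (not part of the patch): as a quick cross-check of the new `MakeReductionInnermostPattern`, the sketch below re-implements the interchange permutation the pattern builds in plain C++ with no MLIR/IREE dependencies (the enum and function names here are illustrative only): parallel loops keep their relative order and every reduction loop is pushed to the end. On the `@interchange` test above, the iterator list `["reduction", "parallel", "parallel", "parallel"]` yields the permutation `[1, 2, 3, 0]`, which matches the CHECK lines expecting `iterator_types = ["parallel", "parallel", "parallel", "reduction"]`.

```cpp
// Standalone illustration of the loop permutation MakeReductionInnermostPattern
// computes: keep parallel loops first (in their original relative order) and
// move every reduction loop to the end.
#include <cstdio>
#include <vector>

enum class Iterator { Parallel, Reduction };

// Returns the interchange permutation, or an empty vector when no interchange
// is needed (i.e. all parallel loops already precede all reduction loops).
std::vector<unsigned> makeReductionInnermost(const std::vector<Iterator> &iters) {
  unsigned numParallel = 0;
  for (Iterator it : iters)
    if (it == Iterator::Parallel)
      ++numParallel;
  if (numParallel == 0)
    return {};

  std::vector<unsigned> interchange;
  bool needInterchange = false;
  for (unsigned i = 0; i < iters.size(); ++i) {
    if (iters[i] == Iterator::Parallel) {
      interchange.push_back(i);
      // A parallel loop at position >= numParallel implies a reduction loop
      // currently sits before it, so an interchange is required.
      if (i >= numParallel)
        needInterchange = true;
    }
  }
  if (!needInterchange)
    return {};
  for (unsigned i = 0; i < iters.size(); ++i)
    if (iters[i] == Iterator::Reduction)
      interchange.push_back(i);
  return interchange;
}

int main() {
  // Mirrors the @interchange test above: ["reduction", "parallel", "parallel",
  // "parallel"] produces the permutation [1, 2, 3, 0], i.e. the reduction loop
  // becomes the innermost loop.
  std::vector<Iterator> iters = {Iterator::Reduction, Iterator::Parallel,
                                 Iterator::Parallel, Iterator::Parallel};
  for (unsigned idx : makeReductionInnermost(iters))
    std::printf("%u ", idx);
  std::printf("\n"); // prints: 1 2 3 0
  return 0;
}
```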