[Codegen][GPU] Sink out shared memory and barriers in vector distribution (iree-org#15496)
This handles shared memory allocations as well as barriers in vector
distribution patterns by simply sinking/hoisting them out of warp
execute regions where appropriate. Because upstream takes a non-specific
view of the synchronization primitives used by these patterns, we handle
this in IREE.
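
For illustration, a minimal before/after sketch of the two rewrites (not taken from the commit; %laneid, %src, %c0, and %cst are assumed to be defined earlier, and the shapes are illustrative):

// Before: the workgroup alloc and trailing barrier sit inside the
// single-lane region.
vector.warp_execute_on_lane_0(%laneid)[32] {
  %alloc = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
  %v = vector.transfer_read %src[%c0], %cst {in_bounds = [true]}
      : memref<128xf32>, vector<32xf32>
  vector.transfer_write %v, %alloc[%c0] {in_bounds = [true]}
      : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
  gpu.barrier
}

// After: the alloc, already scoped to the whole workgroup, is hoisted
// above the warp op, and the barrier is sunk below it so that every
// lane participates in the synchronization.
%alloc = memref.alloc() : memref<32xf32, #gpu.address_space<workgroup>>
vector.warp_execute_on_lane_0(%laneid)[32] {
  %v = vector.transfer_read %src[%c0], %cst {in_bounds = [true]}
      : memref<128xf32>, vector<32xf32>
  vector.transfer_write %v, %alloc[%c0] {in_bounds = [true]}
      : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
}
gpu.barrier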
qedawkins authored Nov 9, 2023
1 parent 1eb5713 commit bc98b9a
Showing 2 changed files with 72 additions and 0 deletions.
@@ -95,6 +95,13 @@ moveScalarAndBindingUniformCode(vector::WarpExecuteOnLane0Op warpOp) {
      return true;
    if (isUniformLoad(op))
      return true;
    // Shared memory is already scoped to the workgroup and can safely be
    // hoisted out of the warp op.
    if (auto allocOp = dyn_cast<memref::AllocOp>(op)) {
      if (hasSharedMemoryAddressSpace(allocOp.getType())) {
        return true;
      }
    }

    return false;
  };
@@ -146,6 +153,26 @@ class InsertElementToBroadcast final
  }
};

/// Pattern to sink `gpu.barrier` ops out of a `warp_execute_on_lane_0` op.
class WarpOpBarrier : public OpRewritePattern<vector::WarpExecuteOnLane0Op> {
public:
  using OpRewritePattern<vector::WarpExecuteOnLane0Op>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    auto yield = cast<vector::YieldOp>(
        warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
    Operation *lastNode = yield->getPrevNode();
    auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
    if (!barrierOp)
      return failure();

    rewriter.setInsertionPointAfter(warpOp);
    (void)rewriter.create<gpu::BarrierOp>(barrierOp.getLoc());
    rewriter.eraseOp(barrierOp);
    return success();
  }
};

static Value simpleWarpShuffleFunction(Location loc, OpBuilder &builder,
                                       Value val, Value srcIdx,
                                       int64_t warpSz) {
@@ -251,6 +278,7 @@ class VectorReductionToGPUPass
    vector::populateDistributeReduction(patterns, groupReductionFn);
    vector::populateDistributeTransferWriteOpPatterns(patterns,
                                                      distributionFn);
    patterns.add<WarpOpBarrier>(patterns.getContext(), 3);
    (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
  }

@@ -213,3 +213,47 @@ hal.executable private @reduce_storage_buffer_offset {
// CHECK: vector.reduction
// CHECK-COUNT-5: gpu.shuffle
// CHECK: scf.yield

// -----

#executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
  #hal.descriptor_set.layout<0, bindings = [
    #hal.descriptor_set.binding<0, storage_buffer>,
    #hal.descriptor_set.binding<1, storage_buffer>
  ]>
]>
hal.executable private @shared_memory_copy {
  hal.executable.variant @cuda target(#executable_target_cuda_nvptx_fb) {
    hal.executable.export @shared_memory_copy layout(#pipeline_layout) attributes {
      workgroup_size = [32 : index, 1 : index, 1 : index]
    }
    builtin.module {
      func.func @shared_memory_copy() {
        %c0 = arith.constant 0 : index
        %cst = arith.constant dense<0.000000e+00> : vector<1xf32>
        %cst_0 = arith.constant 0.000000e+00 : f32
        %c32 = arith.constant 32 : index
        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32>
        %workgroup_id_x = hal.interface.workgroup.id[0] : index
        %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
        %2 = vector.transfer_read %0[%workgroup_id_x, %c0], %cst_0 {in_bounds = [true]} : memref<128x32xf32>, vector<32xf32>
        vector.transfer_write %2, %alloc[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, #gpu.address_space<workgroup>>
        gpu.barrier
        %3 = vector.transfer_read %alloc[%c0], %cst_0 {in_bounds = [true]} : memref<32xf32, #gpu.address_space<workgroup>>, vector<32xf32>
        vector.transfer_write %3, %1[%workgroup_id_x, %c0] {in_bounds = [true]} : vector<32xf32>, memref<128x32xf32>
        return
      }
    }
  }
}

// CHECK-LABEL: func.func @shared_memory_copy() {
// CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>>
// CHECK: vector.transfer_read {{.*}} : memref<128x32xf32>, vector<1xf32>
// CHECK: vector.transfer_write {{.*}} %[[ALLOC]]{{.*}} : vector<1xf32>, memref<32xf32, #gpu.address_space<workgroup>>
// CHECK: gpu.barrier
// CHECK: vector.transfer_read %[[ALLOC]]{{.*}} : memref<32xf32, #gpu.address_space<workgroup>>, vector<1xf32>
// CHECK: vector.transfer_write {{.*}} : vector<1xf32>, memref<128x32xf32>
// CHECK: return
