[XLA:GPU] Use f32->bfloat conversion instructions on sm_80+
We tried this before with an intrinsic, but that breaks vectorization. Relying
on native LLVM types doesn't break it, while delivering the same code
improvements. The downside is that LLVM now knows the value is a bfloat instead
of an i16 and will optimize based on that. While making this change I had to
patch a bunch of holes in the NVPTX LLVM backend; there may be more.
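
For reference, the change boils down to an fptrunc through LLVM's native bfloat type followed by a bitcast, since XLA carries bf16 values in i16 registers. Below is a minimal standalone sketch of that pattern with a plain llvm::IRBuilder; the function name is illustrative and not part of this commit.

#include "llvm/IR/IRBuilder.h"

// Lower an f32 value to the i16 bit pattern of its bf16 truncation.
llvm::Value* EmitF32ToBF16Bits(llvm::IRBuilder<>& b, llvm::Value* f32_value) {
  // fptrunc float -> bfloat; on sm_80+ the NVPTX backend can select
  // cvt.rn.bf16.f32 for this instead of emulating the rounding.
  llvm::Value* bf16 = b.CreateFPTrunc(f32_value, b.getBFloatTy());
  // Reinterpret the bfloat bits as i16, the storage type XLA uses for bf16.
  return b.CreateBitCast(bf16, b.getInt16Ty());
}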

Depends on llvm/llvm-project#74827

PiperOrigin-RevId: 590118269
d0k authored and tensorflower-gardener committed Dec 12, 2023
1 parent d820ba9 commit a28a99a
Showing 3 changed files with 33 additions and 0 deletions.
12 changes: 12 additions & 0 deletions third_party/xla/xla/service/gpu/elemental_ir_emitter.cc
@@ -353,6 +353,18 @@ llvm::Value* GpuElementalIrEmitter::EmitThreadId() {
  return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block);
}

StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitF32ToBF16(
    llvm::Value* f32_value) {
  // sm_80 and newer have an instruction to convert f32 into bf16.
  if (ir_emitter_context_.cuda_compute_capability().IsAtLeast(
          se::CudaComputeCapability::AMPERE)) {
    return BitCast(
        FPTrunc(BitCast(f32_value, b()->getFloatTy()), b()->getBFloatTy()),
        b()->getInt16Ty());
  }
  return ElementalIrEmitter::EmitF32ToBF16(f32_value);
}

StatusOr<std::vector<llvm::Value*>> GpuElementalIrEmitter::EmitThreadLocalCall(
    const HloComputation& callee, absl::Span<llvm::Value* const> parameters,
    absl::string_view, bool /*is_reducer*/) {
2 changes: 2 additions & 0 deletions third_party/xla/xla/service/gpu/elemental_ir_emitter.h
@@ -96,6 +96,8 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {

  llvm::Value* EmitThreadId() override;

  StatusOr<llvm::Value*> EmitF32ToBF16(llvm::Value* f32_value) override;

  bool fast_min_max() override {
    return ir_emitter_context_.debug_options().xla_gpu_enable_fast_min_max();
  }
19 changes: 19 additions & 0 deletions third_party/xla/xla/service/gpu/tests/single_instruction.hlo
@@ -1,5 +1,6 @@
// RUN: hlo_to_llvm_ir --ptx %s | FileCheck %s
// RUN: hlo_to_llvm_ir --ptx %s --sm=80 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SM80
// RUN: hlo_to_llvm_ir --ptx %s --sm=90 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SM90

// CHECK-DAG: sqrt.approx.f32

@@ -80,3 +81,21 @@ ENTRY main {
  b = f32[] parameter(1)
  ROOT wrapped_b = f32[] fusion(f32[] a, f32[] b), kind=kLoop, calls=fused_computation
}

// -----

// CHECK-SM80: cvt.rn.f32.s16
// CHECK-SM80: cvt.rn.bf16.f32
// CHECK-SM90: cvt.rn.bf16.s16

HloModule Test, is_scheduled=true

fused_computation {
  param_0 = s16[] parameter(0)
  ROOT b.1 = bf16[] convert(s16[] param_0)
}

ENTRY main {
  a = s16[] parameter(0)
  ROOT wrapped_b = bf16[] fusion(s16[] a), kind=kLoop, calls=fused_computation
}
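
A note on the per-SM check prefixes above: with this change, sm_80 lowers the s16-to-bf16 convert in two steps, an integer-to-float conversion (cvt.rn.f32.s16) followed by the native f32-to-bf16 truncation (cvt.rn.bf16.f32), while on sm_90 the test expects a single direct conversion (cvt.rn.bf16.s16).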
