From 7747ffcde2f22c179155e07819148bbff8d04231 Mon Sep 17 00:00:00 2001
From: Matt Keeter
Date: Sat, 26 Oct 2024 09:54:01 -0700
Subject: [PATCH] Optimized floating-point packing (#170)

For small integer `f32` values, this saves 4 bytes per immediate by not
loading the lower 2 bytes.  This is probably irrelevant to performance,
since we don't have that many immediates, but it makes the assembly
slightly nicer to read.
---
 fidget/src/jit/aarch64/float_slice.rs | 22 +++++++++++++++++-----
 fidget/src/jit/aarch64/grad_slice.rs  | 22 +++++++++++++++++-----
 fidget/src/jit/aarch64/interval.rs    | 24 ++++++++++++++++++------
 fidget/src/jit/aarch64/point.rs       | 22 +++++++++++++++++-----
 4 files changed, 69 insertions(+), 21 deletions(-)

diff --git a/fidget/src/jit/aarch64/float_slice.rs b/fidget/src/jit/aarch64/float_slice.rs
index d7af2670..01dbd322 100644
--- a/fidget/src/jit/aarch64/float_slice.rs
+++ b/fidget/src/jit/aarch64/float_slice.rs
@@ -402,11 +402,23 @@ impl Assembler for FloatSliceAssembler {
     /// Loads an immediate into register V4, using W9 as an intermediary
     fn load_imm(&mut self, imm: f32) -> u8 {
         let imm_u32 = imm.to_bits();
-        dynasm!(self.0.ops
-            ; movz w9, imm_u32 >> 16, lsl 16
-            ; movk w9, imm_u32
-            ; dup V(IMM_REG as u32).s4, w9
-        );
+        if imm_u32 & 0xFFFF == 0 {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 >> 16, lsl 16
+                ; dup V(IMM_REG as u32).s4, w9
+            );
+        } else if imm_u32 & 0xFFFF_0000 == 0 {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 & 0xFFFF
+                ; dup V(IMM_REG as u32).s4, w9
+            );
+        } else {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 >> 16, lsl 16
+                ; movk w9, imm_u32 & 0xFFFF
+                ; dup V(IMM_REG as u32).s4, w9
+            );
+        }
         IMM_REG.wrapping_sub(OFFSET)
     }

diff --git a/fidget/src/jit/aarch64/grad_slice.rs b/fidget/src/jit/aarch64/grad_slice.rs
index c09e240e..09a87086 100644
--- a/fidget/src/jit/aarch64/grad_slice.rs
+++ b/fidget/src/jit/aarch64/grad_slice.rs
@@ -488,11 +488,23 @@ impl Assembler for GradSliceAssembler {
     /// Loads an immediate into register S4, using W9 as an intermediary
     fn load_imm(&mut self, imm: f32) -> u8 {
         let imm_u32 = imm.to_bits();
-        dynasm!(self.0.ops
-            ; movz w9, imm_u32 >> 16, lsl 16
-            ; movk w9, imm_u32
-            ; fmov S(IMM_REG as u32), w9
-        );
+        if imm_u32 & 0xFFFF == 0 {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 >> 16, lsl 16
+                ; fmov S(IMM_REG as u32), w9
+            );
+        } else if imm_u32 & 0xFFFF_0000 == 0 {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 & 0xFFFF
+                ; fmov S(IMM_REG as u32), w9
+            );
+        } else {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 >> 16, lsl 16
+                ; movk w9, imm_u32 & 0xFFFF
+                ; fmov S(IMM_REG as u32), w9
+            );
+        }
         IMM_REG.wrapping_sub(OFFSET)
     }

diff --git a/fidget/src/jit/aarch64/interval.rs b/fidget/src/jit/aarch64/interval.rs
index d4b97ec1..6f4c0a17 100644
--- a/fidget/src/jit/aarch64/interval.rs
+++ b/fidget/src/jit/aarch64/interval.rs
@@ -729,14 +729,26 @@ impl Assembler for IntervalAssembler {
         );
     }

-    /// Loads an immediate into register S4, using W9 as an intermediary
+    /// Loads an immediate into register S4, using W15 as an intermediary
     fn load_imm(&mut self, imm: f32) -> u8 {
         let imm_u32 = imm.to_bits();
-        dynasm!(self.0.ops
-            ; movz w15, imm_u32 >> 16, lsl 16
-            ; movk w15, imm_u32
-            ; dup V(IMM_REG as u32).s2, w15
-        );
+        if imm_u32 & 0xFFFF == 0 {
+            dynasm!(self.0.ops
+                ; movz w15, imm_u32 >> 16, lsl 16
+                ; dup V(IMM_REG as u32).s2, w15
+            );
+        } else if imm_u32 & 0xFFFF_0000 == 0 {
+            dynasm!(self.0.ops
+                ; movz w15, imm_u32 & 0xFFFF
+                ; dup V(IMM_REG as u32).s2, w15
+            );
+        } else {
+            dynasm!(self.0.ops
+                ; movz w15, imm_u32 >> 16, lsl 16
+                ; movk w15, imm_u32 & 0xFFFF
+                ; dup V(IMM_REG as u32).s2, w15
+            );
+        }
         IMM_REG.wrapping_sub(OFFSET)
     }

diff --git a/fidget/src/jit/aarch64/point.rs b/fidget/src/jit/aarch64/point.rs
index db6a281e..949d520c 100644
--- a/fidget/src/jit/aarch64/point.rs
+++ b/fidget/src/jit/aarch64/point.rs
@@ -434,11 +434,23 @@ impl Assembler for PointAssembler {
     /// Loads an immediate into register S4, using W9 as an intermediary
    fn load_imm(&mut self, imm: f32) -> u8 {
         let imm_u32 = imm.to_bits();
-        dynasm!(self.0.ops
-            ; movz w9, imm_u32 >> 16, lsl 16
-            ; movk w9, imm_u32
-            ; fmov S(IMM_REG as u32), w9
-        );
+        if imm_u32 & 0xFFFF == 0 {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 >> 16, lsl 16
+                ; fmov S(IMM_REG as u32), w9
+            );
+        } else if imm_u32 & 0xFFFF_0000 == 0 {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 & 0xFFFF
+                ; fmov S(IMM_REG as u32), w9
+            );
+        } else {
+            dynasm!(self.0.ops
+                ; movz w9, imm_u32 >> 16, lsl 16
+                ; movk w9, imm_u32 & 0xFFFF
+                ; fmov S(IMM_REG as u32), w9
+            );
+        }
         IMM_REG.wrapping_sub(OFFSET)
     }
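
Note (reviewer sketch, not part of the patch): the three-way split above
mirrors how AArch64 builds a 32-bit immediate from 16-bit halves.  `movz`
writes one half and zeroes the other; `movk` then patches in the remaining
half.  A whole 4-byte instruction is therefore saved whenever either half of
the bit pattern is zero, which is exactly the case for small integer `f32`
values like 1.0 (`0x3F80_0000`).  The dynasm-free Rust sketch below models
that dispatch; `Insn` and `pack_imm` are hypothetical names for illustration
only, not APIs from fidget or dynasm-rs.

    /// Hypothetical stand-in for the two AArch64 move-immediate forms.
    #[derive(Debug)]
    enum Insn {
        /// `movz`: write a 16-bit chunk at `shift`, zeroing the other bits
        Movz { imm16: u16, shift: u8 },
        /// `movk`: overwrite the low 16 bits, keeping the rest
        Movk { imm16: u16 },
    }

    /// Mirrors the dispatch in `load_imm`: one instruction when either
    /// half of the bit pattern is zero, two instructions otherwise.
    fn pack_imm(imm: f32) -> Vec<Insn> {
        let bits = imm.to_bits();
        if bits & 0xFFFF == 0 {
            // Low half is zero: a MOVZ of the high half is enough
            vec![Insn::Movz { imm16: (bits >> 16) as u16, shift: 16 }]
        } else if bits & 0xFFFF_0000 == 0 {
            // High half is zero: a MOVZ of the low half is enough
            vec![Insn::Movz { imm16: bits as u16, shift: 0 }]
        } else {
            // General case: MOVZ the high half, then MOVK the low half
            vec![
                Insn::Movz { imm16: (bits >> 16) as u16, shift: 16 },
                Insn::Movk { imm16: bits as u16 },
            ]
        }
    }

    fn main() {
        // 1.0f32 is 0x3F80_0000: low half is zero, so one instruction
        assert_eq!(pack_imm(1.0).len(), 1);
        // 0.1f32 is 0x3DCC_CCCD: both halves are nonzero, so two
        assert_eq!(pack_imm(0.1).len(), 2);
        println!("1.0 -> {:?}", pack_imm(1.0));
        println!("0.1 -> {:?}", pack_imm(0.1));
    }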