Skip to content

Commit

Permalink
Optimized floating-point packing (#170)
Browse files Browse the repository at this point in the history
For small integer `f32` values, whose lower 16 bits are all zero, this saves
4 bytes (one instruction) per immediate by skipping the `movk` that would
otherwise load the lower 2 bytes.

This is probably irrelevant to performance, since we don't have that
many immediates, but it makes the assembly slightly nicer to read.
  • Loading branch information
mkeeter authored Oct 26, 2024
1 parent 85db55b commit 7747ffc
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 21 deletions.
22 changes: 17 additions & 5 deletions fidget/src/jit/aarch64/float_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -402,11 +402,23 @@ impl Assembler for FloatSliceAssembler {
/// Loads an immediate into register V4, using W9 as an intermediary
///
/// The raw bit pattern of `imm` is assembled in `w9` and then copied into
/// all four 32-bit lanes of `V(IMM_REG)` with `dup`.  The return value is
/// `IMM_REG.wrapping_sub(OFFSET)`.
fn load_imm(&mut self, imm: f32) -> u8 {
let imm_u32 = imm.to_bits();
// NOTE(review): this listing is a diff rendered without +/- markers.
// Judging by the "17 additions & 5 deletions" summary, the unconditional
// sequence directly below is the pre-commit code that was removed, and the
// `if` / `else if` / `else` chain after it is its replacement.
// The old sequence always emitted both `movz` and `movk`, and passed the
// full 32-bit value as the `movk` operand (the replacement masks it).
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; movk w9, imm_u32
; dup V(IMM_REG as u32).s4, w9
);
if imm_u32 & 0xFFFF == 0 {
// Lower half is zero: a single `movz` of the upper 16 bits (shifted
// into place) already produces the full value, so `movk` is skipped.
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; dup V(IMM_REG as u32).s4, w9
);
} else if imm_u32 & 0xFFFF_0000 == 0 {
// Upper half is zero: a single unshifted `movz` of the lower 16 bits
// is enough, since `movz` zeroes the remaining bits of the register.
dynasm!(self.0.ops
; movz w9, imm_u32 & 0xFFFF
; dup V(IMM_REG as u32).s4, w9
);
} else {
// General case: `movz` sets the upper 16 bits, then `movk` inserts the
// lower 16 bits while keeping the bits already written.
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; movk w9, imm_u32 & 0xFFFF
; dup V(IMM_REG as u32).s4, w9
);
}
IMM_REG.wrapping_sub(OFFSET)
}

Expand Down
22 changes: 17 additions & 5 deletions fidget/src/jit/aarch64/grad_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -488,11 +488,23 @@ impl Assembler for GradSliceAssembler {
/// Loads an immediate into register S4, using W9 as an intermediary
///
/// The raw bit pattern of `imm` is assembled in `w9` and then moved into
/// the scalar register `S(IMM_REG)` with `fmov`.  The return value is
/// `IMM_REG.wrapping_sub(OFFSET)`.
fn load_imm(&mut self, imm: f32) -> u8 {
let imm_u32 = imm.to_bits();
// NOTE(review): this listing is a diff rendered without +/- markers.
// Judging by the "17 additions & 5 deletions" summary, the unconditional
// sequence directly below is the pre-commit code that was removed, and the
// `if` / `else if` / `else` chain after it is its replacement.
// The old sequence always emitted both `movz` and `movk`, and passed the
// full 32-bit value as the `movk` operand (the replacement masks it).
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; movk w9, imm_u32
; fmov S(IMM_REG as u32), w9
);
if imm_u32 & 0xFFFF == 0 {
// Lower half is zero: a single `movz` of the upper 16 bits (shifted
// into place) already produces the full value, so `movk` is skipped.
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; fmov S(IMM_REG as u32), w9
);
} else if imm_u32 & 0xFFFF_0000 == 0 {
// Upper half is zero: a single unshifted `movz` of the lower 16 bits
// is enough, since `movz` zeroes the remaining bits of the register.
dynasm!(self.0.ops
; movz w9, imm_u32 & 0xFFFF
; fmov S(IMM_REG as u32), w9
);
} else {
// General case: `movz` sets the upper 16 bits, then `movk` inserts the
// lower 16 bits while keeping the bits already written.
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; movk w9, imm_u32 & 0xFFFF
; fmov S(IMM_REG as u32), w9
);
}
IMM_REG.wrapping_sub(OFFSET)
}

Expand Down
24 changes: 18 additions & 6 deletions fidget/src/jit/aarch64/interval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -729,14 +729,26 @@ impl Assembler for IntervalAssembler {
);
}

// NOTE(review): this listing is a diff rendered without +/- markers, so the
// two doc lines below are the before/after of the same comment.  Only the W15
// wording matches the instructions shown, which build the value in `w15` and
// `dup` it into both 32-bit lanes (`.s2`) of `V(IMM_REG)`.
/// Loads an immediate into register S4, using W9 as an intermediary
/// Loads an immediate into register S4, using W15 as an intermediary
///
/// The return value is `IMM_REG.wrapping_sub(OFFSET)`.
fn load_imm(&mut self, imm: f32) -> u8 {
let imm_u32 = imm.to_bits();
// NOTE(review): judging by the "18 additions & 6 deletions" summary, the
// unconditional sequence directly below is the pre-commit code that was
// removed, and the `if` / `else if` / `else` chain after it is its
// replacement.  The old sequence always emitted both `movz` and `movk`, and
// passed the full 32-bit value as the `movk` operand (the replacement masks
// it).
dynasm!(self.0.ops
; movz w15, imm_u32 >> 16, lsl 16
; movk w15, imm_u32
; dup V(IMM_REG as u32).s2, w15
);
if imm_u32 & 0xFFFF == 0 {
// Lower half is zero: a single `movz` of the upper 16 bits (shifted
// into place) already produces the full value, so `movk` is skipped.
dynasm!(self.0.ops
; movz w15, imm_u32 >> 16, lsl 16
; dup V(IMM_REG as u32).s2, w15
);
} else if imm_u32 & 0xFFFF_0000 == 0 {
// Upper half is zero: a single unshifted `movz` of the lower 16 bits
// is enough, since `movz` zeroes the remaining bits of the register.
dynasm!(self.0.ops
; movz w15, imm_u32 & 0xFFFF
; dup V(IMM_REG as u32).s2, w15
);
} else {
// General case: `movz` sets the upper 16 bits, then `movk` inserts the
// lower 16 bits while keeping the bits already written.
dynasm!(self.0.ops
; movz w15, imm_u32 >> 16, lsl 16
; movk w15, imm_u32 & 0xFFFF
; dup V(IMM_REG as u32).s2, w15
);
}
IMM_REG.wrapping_sub(OFFSET)
}

Expand Down
22 changes: 17 additions & 5 deletions fidget/src/jit/aarch64/point.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,11 +434,23 @@ impl Assembler for PointAssembler {
/// Loads an immediate into register S4, using W9 as an intermediary
///
/// The raw bit pattern of `imm` is assembled in `w9` and then moved into
/// the scalar register `S(IMM_REG)` with `fmov`.  The return value is
/// `IMM_REG.wrapping_sub(OFFSET)`.
fn load_imm(&mut self, imm: f32) -> u8 {
let imm_u32 = imm.to_bits();
// NOTE(review): this listing is a diff rendered without +/- markers.
// Judging by the "17 additions & 5 deletions" summary, the unconditional
// sequence directly below is the pre-commit code that was removed, and the
// `if` / `else if` / `else` chain after it is its replacement.
// The old sequence always emitted both `movz` and `movk`, and passed the
// full 32-bit value as the `movk` operand (the replacement masks it).
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; movk w9, imm_u32
; fmov S(IMM_REG as u32), w9
);
if imm_u32 & 0xFFFF == 0 {
// Lower half is zero: a single `movz` of the upper 16 bits (shifted
// into place) already produces the full value, so `movk` is skipped.
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; fmov S(IMM_REG as u32), w9
);
} else if imm_u32 & 0xFFFF_0000 == 0 {
// Upper half is zero: a single unshifted `movz` of the lower 16 bits
// is enough, since `movz` zeroes the remaining bits of the register.
dynasm!(self.0.ops
; movz w9, imm_u32 & 0xFFFF
; fmov S(IMM_REG as u32), w9
);
} else {
// General case: `movz` sets the upper 16 bits, then `movk` inserts the
// lower 16 bits while keeping the bits already written.
dynasm!(self.0.ops
; movz w9, imm_u32 >> 16, lsl 16
; movk w9, imm_u32 & 0xFFFF
; fmov S(IMM_REG as u32), w9
);
}
IMM_REG.wrapping_sub(OFFSET)
}

Expand Down

0 comments on commit 7747ffc

Please sign in to comment.