From 88a57038900de44e8831f1261b86939090857157 Mon Sep 17 00:00:00 2001
From: FreezyLemon
Date: Sun, 4 Feb 2024 06:47:44 +0100
Subject: [PATCH] Precalculate float consts for RecursiveGaussian at build time (#17)

---
 Cargo.toml           |   6 +-
 build.rs             | 141 ++++++++++++++++
 src/blur.rs          | 376 ------------------------------------------
 src/blur/gaussian.rs | 186 +++++++++++++++++++++
 src/blur/mod.rs      |  60 +++++++
 src/lib.rs           |   2 +-
 6 files changed, 391 insertions(+), 380 deletions(-)
 create mode 100644 build.rs
 delete mode 100644 src/blur.rs
 create mode 100644 src/blur/gaussian.rs
 create mode 100644 src/blur/mod.rs

diff --git a/Cargo.toml b/Cargo.toml
index 6962336..2e3a25e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,14 +14,14 @@ license = "BSD-2-Clause"
 default = ["rayon"]
 
 [dependencies]
-aligned = "0.4.1"
-nalgebra = "0.32.2"
 num-traits = "0.2.15"
 rayon = { version = "1.5.3", optional = true }
 thiserror = "1.0.56"
-wide = "0.7.5"
 yuvxyb = "0.3.0"
 
+[build-dependencies]
+nalgebra = "0.32.2"
+
 [dev-dependencies]
 criterion = "0.4.0"
 image = "0.24.4"
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..9d8214e
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,141 @@
+use std::env;
+use std::f64::consts::PI;
+use std::fs::File;
+use std::io;
+use std::io::Write;
+use std::path::Path;
+
+use nalgebra::{Matrix3, Matrix3x1};
+
+fn main() {
+    let out_dir = &env::var("OUT_DIR").expect("can read OUT_DIR");
+
+    init_recursive_gaussian(out_dir).expect("can init recursive gaussian");
+}
+
+fn write_const_f32<W: Write>(w: &mut W, name: &str, val: f32) -> io::Result<()> {
+    writeln!(w, "pub const {name}: f32 = {val}_f32;")
+}
+
+fn write_const_usize<W: Write>(w: &mut W, name: &str, val: usize) -> io::Result<()> {
+    writeln!(w, "pub const {name}: usize = {val}_usize;")
+}
+
+fn init_recursive_gaussian(out_path: &str) -> io::Result<()> {
+    const SIGMA: f64 = 1.5f64;
+
+    // (57), "N"
+    let radius = 3.2795f64.mul_add(SIGMA, 0.2546).round();
+
+    // Table I, first row
+    let pi_div_2r = PI / (2.0f64 * radius);
+    let omega = [pi_div_2r, 3.0f64 * pi_div_2r, 5.0f64 * pi_div_2r];
+
+    // (37), k={1,3,5}
+    let p_1 = 1.0f64 / (0.5 * omega[0]).tan();
+    let p_3 = -1.0f64 / (0.5 * omega[1]).tan();
+    let p_5 = 1.0f64 / (0.5 * omega[2]).tan();
+
+    // (44), k={1,3,5}
+    let r_1 = p_1 * p_1 / omega[0].sin();
+    let r_3 = -p_3 * p_3 / omega[1].sin();
+    let r_5 = p_5 * p_5 / omega[2].sin();
+
+    // (50), k={1,3,5}
+    let neg_half_sigma2 = -0.5f64 * SIGMA * SIGMA;
+    let recip_radius = 1.0f64 / radius;
+    let mut rho = [0.0f64; 3];
+    for i in 0..3 {
+        rho[i] = (neg_half_sigma2 * omega[i] * omega[i]).exp() * recip_radius;
+    }
+
+    // second part of (52), k1,k2 = 1,3; 3,5; 5,1
+    let d_13 = p_1.mul_add(r_3, -r_1 * p_3);
+    let d_35 = p_3.mul_add(r_5, -r_3 * p_5);
+    let d_51 = p_5.mul_add(r_1, -r_5 * p_1);
+
+    // (52), k=5
+    let recip_d13 = 1.0f64 / d_13;
+    let zeta_15 = d_35 * recip_d13;
+    let zeta_35 = d_51 * recip_d13;
+
+    // (56)
+    let a = Matrix3::from_row_slice(&[p_1, p_3, p_5, r_1, r_3, r_5, zeta_15, zeta_35, 1.0f64])
+        .try_inverse()
+        .expect("Has inverse");
+    // (55)
+    let gamma = Matrix3x1::from_column_slice(&[
+        1.0f64,
+        radius.mul_add(radius, -SIGMA * SIGMA),
+        zeta_15.mul_add(rho[0], zeta_35 * rho[1]) + rho[2],
+    ]);
+    // (53)
+    let beta = a * gamma;
+
+    // Sanity check: correctly solved for beta (IIR filter weights are normalized)
+    // (39)
+    let sum = beta[2].mul_add(p_5, beta[0].mul_add(p_1, beta[1] * p_3));
+    assert!((sum - 1.0).abs() < 1E-12f64);
+
+    let mut n2 = [0f64; 3];
+    let mut d1 = [0f64; 3];
+    let mut mul_prev = [0f32; 3 * 4];
+    let mut mul_prev2 = [0f32; 3 * 4];
+    let mut mul_in = [0f32; 3 * 4];
+    for i in 0..3 {
+        // (33)
+        n2[i] = -beta[i] * (omega[i] * (radius + 1.0)).cos();
+        d1[i] = -2.0f64 * omega[i].cos();
+
+        let d_2 = d1[i] * d1[i];
+
+        // Obtained by expanding (35) for four consecutive outputs via
+        // sympy: n, d, p, pp = symbols('n d p pp')
+        //        i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
+        //        o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
+        //        o0 = n*i0 - d*p - pp
+        //        o1 = n*i1 - d*o0 - p
+        //        o2 = n*i2 - d*o1 - o0
+        //        o3 = n*i3 - d*o2 - o1
+        // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
+        mul_prev[4 * i] = -d1[i] as f32;
+        mul_prev[4 * i + 1] = (d_2 - 1.0f64) as f32;
+        mul_prev[4 * i + 2] = (-d_2).mul_add(d1[i], 2.0f64 * d1[i]) as f32;
+        mul_prev[4 * i + 3] = d_2.mul_add(d_2, 3.0f64.mul_add(-d_2, 1.0f64)) as f32;
+        mul_prev2[4 * i] = -1.0f32;
+        mul_prev2[4 * i + 1] = d1[i] as f32;
+        mul_prev2[4 * i + 2] = (-d_2 + 1.0f64) as f32;
+        mul_prev2[4 * i + 3] = d_2.mul_add(d1[i], -2.0f64 * d1[i]) as f32;
+        mul_in[4 * i] = n2[i] as f32;
+        mul_in[4 * i + 1] = (-d1[i] * n2[i]) as f32;
+        mul_in[4 * i + 2] = d_2.mul_add(n2[i], -n2[i]) as f32;
+        mul_in[4 * i + 3] = (-d_2 * d1[i]).mul_add(n2[i], 2.0f64 * d1[i] * n2[i]) as f32;
+    }
+
+    let file_path = Path::new(out_path).join("recursive_gaussian.rs");
+    let mut out_file = File::create(file_path)?;
+
+    write_const_usize(&mut out_file, "RADIUS", radius as usize)?;
+
+    write_const_f32(&mut out_file, "VERT_MUL_IN_1", n2[0] as f32)?;
+    write_const_f32(&mut out_file, "VERT_MUL_IN_3", n2[1] as f32)?;
+    write_const_f32(&mut out_file, "VERT_MUL_IN_5", n2[2] as f32)?;
+
+    write_const_f32(&mut out_file, "VERT_MUL_PREV_1", d1[0] as f32)?;
+    write_const_f32(&mut out_file, "VERT_MUL_PREV_3", d1[1] as f32)?;
+    write_const_f32(&mut out_file, "VERT_MUL_PREV_5", d1[2] as f32)?;
+
+    write_const_f32(&mut out_file, "MUL_IN_1", mul_in[0])?;
+    write_const_f32(&mut out_file, "MUL_IN_3", mul_in[4])?;
+    write_const_f32(&mut out_file, "MUL_IN_5", mul_in[8])?;
+
+    write_const_f32(&mut out_file, "MUL_PREV_1", mul_prev[0])?;
+    write_const_f32(&mut out_file, "MUL_PREV_3", mul_prev[4])?;
+    write_const_f32(&mut out_file, "MUL_PREV_5", mul_prev[8])?;
+
+    write_const_f32(&mut out_file, "MUL_PREV2_1", mul_prev2[0])?;
+    write_const_f32(&mut out_file, "MUL_PREV2_3", mul_prev2[4])?;
+    write_const_f32(&mut out_file, "MUL_PREV2_5", mul_prev2[8])?;
+
+    Ok(())
+}
diff --git a/src/blur.rs b/src/blur.rs
deleted file mode 100644
index bd327f4..0000000
--- a/src/blur.rs
+++ /dev/null
@@ -1,376 +0,0 @@
-use std::f64::consts::PI;
-
-use aligned::{Aligned, A16};
-use nalgebra::base::{Matrix3, Matrix3x1};
-
-/// Structure handling image blur.
-///
-/// This struct contains the necessary buffers and the kernel used for blurring
-/// (currently a recursive approximation of the Gaussian filter).
-///
-/// Note that the width and height of the image passed to [blur][Self::blur] needs to exactly
-/// match the width and height of this instance. If you reduce the image size (e.g. via
-/// downscaling), [`shrink_to`][Self::shrink_to] can be used to resize the internal buffers.
-pub struct Blur {
-    kernel: RecursiveGaussian,
-    temp: Vec<f32>,
-    width: usize,
-    height: usize,
-}
-
-impl Blur {
-    /// Create a new [Blur] for images of the given width and height.
-    /// This pre-allocates the necessary buffers.
-    #[must_use]
-    pub fn new(width: usize, height: usize) -> Self {
-        Blur {
-            kernel: RecursiveGaussian::new(),
-            temp: vec![0.0f32; width * height],
-            width,
-            height,
-        }
-    }
-
-    /// Truncates the internal buffers to fit images of the given width and height.
-    ///
-    /// This will [truncate][Vec::truncate] the internal buffers
-    /// without affecting the allocated memory.
-    pub fn shrink_to(&mut self, width: usize, height: usize) {
-        self.temp.truncate(width * height);
-        self.width = width;
-        self.height = height;
-    }
-
-    /// Blur the given image.
-    pub fn blur(&mut self, img: &[Vec<f32>; 3]) -> [Vec<f32>; 3] {
-        [
-            self.blur_plane(&img[0]),
-            self.blur_plane(&img[1]),
-            self.blur_plane(&img[2]),
-        ]
-    }
-
-    fn blur_plane(&mut self, plane: &[f32]) -> Vec<f32> {
-        let mut out = vec![0f32; self.width * self.height];
-        self.kernel
-            .horizontal_pass(plane, &mut self.temp, self.width);
-        self.kernel
-            .vertical_pass_chunked::<128, 32>(&self.temp, &mut out, self.width, self.height);
-        out
-    }
-}
-
-/// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
-/// Cosine Functions" by Charalampidis [2016].
-struct RecursiveGaussian {
-    radius: usize,
-    /// For k={1,3,5} in that order.
-    vert_mul_in: [f32; 3],
-    vert_mul_prev: [f32; 3],
-    /// We unroll horizontal passes 4x - one output per lane. These are each
-    /// lane's multiplier for the previous output (relative to the first of
-    /// the four outputs). Indexing: 4 * 0..2 (for {1,3,5}) + 0..3 for the
-    /// lane index.
-    mul_prev: Aligned<A16, [f32; 3 * 4]>,
-    /// Ditto for the second to last output.
-    mul_prev2: Aligned<A16, [f32; 3 * 4]>,
-    /// We multiply a vector of inputs 0..3 by a vector shifted from this array.
-    /// in=0 uses all 4 (nonzero) terms; for in=3, the lower three lanes are 0.
-    mul_in: Aligned<A16, [f32; 3 * 4]>,
-}
-
-impl RecursiveGaussian {
-    pub fn new() -> Self {
-        const SIGMA: f64 = 1.5f64;
-
-        // (57), "N"
-        let radius = 3.2795f64.mul_add(SIGMA, 0.2546).round();
-
-        // Table I, first row
-        let pi_div_2r = PI / (2.0f64 * radius);
-        let omega = [pi_div_2r, 3.0f64 * pi_div_2r, 5.0f64 * pi_div_2r];
-
-        // (37), k={1,3,5}
-        let p_1 = 1.0f64 / (0.5 * omega[0]).tan();
-        let p_3 = -1.0f64 / (0.5 * omega[1]).tan();
-        let p_5 = 1.0f64 / (0.5 * omega[2]).tan();
-
-        // (44), k={1,3,5}
-        let r_1 = p_1 * p_1 / omega[0].sin();
-        let r_3 = -p_3 * p_3 / omega[1].sin();
-        let r_5 = p_5 * p_5 / omega[2].sin();
-
-        // (50), k={1,3,5}
-        let neg_half_sigma2 = -0.5f64 * SIGMA * SIGMA;
-        let recip_radius = 1.0f64 / radius;
-        let mut rho = [0.0f64; 3];
-        for i in 0..3 {
-            rho[i] = (neg_half_sigma2 * omega[i] * omega[i]).exp() * recip_radius;
-        }
-
-        // second part of (52), k1,k2 = 1,3; 3,5; 5,1
-        let d_13 = p_1.mul_add(r_3, -r_1 * p_3);
-        let d_35 = p_3.mul_add(r_5, -r_3 * p_5);
-        let d_51 = p_5.mul_add(r_1, -r_5 * p_1);
-
-        // (52), k=5
-        let recip_d13 = 1.0f64 / d_13;
-        let zeta_15 = d_35 * recip_d13;
-        let zeta_35 = d_51 * recip_d13;
-
-        // (56)
-        let a = Matrix3::from_row_slice(&[p_1, p_3, p_5, r_1, r_3, r_5, zeta_15, zeta_35, 1.0f64])
-            .try_inverse()
-            .expect("Has inverse");
-        // (55)
-        let gamma = Matrix3x1::from_column_slice(&[
-            1.0f64,
-            radius.mul_add(radius, -SIGMA * SIGMA),
-            zeta_15.mul_add(rho[0], zeta_35 * rho[1]) + rho[2],
-        ]);
-        // (53)
-        let beta = a * gamma;
-
-        // Sanity check: correctly solved for beta (IIR filter weights are normalized)
-        // (39)
-        let sum = beta[2].mul_add(p_5, beta[0].mul_add(p_1, beta[1] * p_3));
-        assert!((sum - 1.0).abs() < 1E-12f64);
-
-        let mut n2 = [0f64; 3];
-        let mut d1 = [0f64; 3];
-        let mut mul_prev = [0f32; 3 * 4];
-        let mut mul_prev2 = [0f32; 3 * 4];
-        let mut mul_in = [0f32; 3 * 4];
-        for i in 0..3 {
-            // (33)
-            n2[i] = -beta[i] * (omega[i] * (radius + 1.0)).cos();
-            d1[i] = -2.0f64 * omega[i].cos();
-
-            let d_2 = d1[i] * d1[i];
-
-            // Obtained by expanding (35) for four consecutive outputs via
-            // sympy: n, d, p, pp = symbols('n d p pp')
-            //        i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
-            //        o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
-            //        o0 = n*i0 - d*p - pp
-            //        o1 = n*i1 - d*o0 - p
-            //        o2 = n*i2 - d*o1 - o0
-            //        o3 = n*i3 - d*o2 - o1
-            // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
-            mul_prev[4 * i] = -d1[i] as f32;
-            mul_prev[4 * i + 1] = (d_2 - 1.0f64) as f32;
-            mul_prev[4 * i + 2] = (-d_2).mul_add(d1[i], 2.0f64 * d1[i]) as f32;
-            mul_prev[4 * i + 3] = d_2.mul_add(d_2, 3.0f64.mul_add(-d_2, 1.0f64)) as f32;
-            mul_prev2[4 * i] = -1.0f32;
-            mul_prev2[4 * i + 1] = d1[i] as f32;
-            mul_prev2[4 * i + 2] = (-d_2 + 1.0f64) as f32;
-            mul_prev2[4 * i + 3] = d_2.mul_add(d1[i], -2.0f64 * d1[i]) as f32;
-            mul_in[4 * i] = n2[i] as f32;
-            mul_in[4 * i + 1] = (-d1[i] * n2[i]) as f32;
-            mul_in[4 * i + 2] = d_2.mul_add(n2[i], -n2[i]) as f32;
-            mul_in[4 * i + 3] = (-d_2 * d1[i]).mul_add(n2[i], 2.0f64 * d1[i] * n2[i]) as f32;
-        }
-
-        Self {
-            radius: radius as usize,
-            vert_mul_in: n2.map(|f| f as f32),
-            vert_mul_prev: d1.map(|f| f as f32),
-            mul_prev: Aligned(mul_prev),
-            mul_prev2: Aligned(mul_prev2),
-            mul_in: Aligned(mul_in),
-        }
-    }
-
-    #[cfg(feature = "rayon")]
-    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
-        use rayon::iter::{IndexedParallelIterator, ParallelIterator};
-        use rayon::prelude::ParallelSliceMut;
-        use rayon::slice::ParallelSlice;
-
-        assert_eq!(input.len(), output.len());
-
-        input
-            .par_chunks_exact(width)
-            .zip(output.par_chunks_exact_mut(width))
-            .for_each(|(input, output)| self.horizontal_row(input, output, width));
-    }
-
-    #[cfg(not(feature = "rayon"))]
-    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
-        assert_eq!(input.len(), output.len());
-
-        for (input, output) in input
-            .chunks_exact(width)
-            .zip(output.chunks_exact_mut(width))
-        {
-            self.horizontal_row(input, output, width);
-        }
-    }
-
-    fn horizontal_row(&self, input: &[f32], output: &mut [f32], width: usize) {
-        let big_n = self.radius as isize;
-        let mul_in_1 = self.mul_in[0];
-        let mul_in_3 = self.mul_in[4];
-        let mul_in_5 = self.mul_in[8];
-        let mul_prev_1 = self.mul_prev[0];
-        let mul_prev_3 = self.mul_prev[4];
-        let mul_prev_5 = self.mul_prev[8];
-        let mul_prev2_1 = self.mul_prev2[0];
-        let mul_prev2_3 = self.mul_prev2[4];
-        let mul_prev2_5 = self.mul_prev2[8];
-        let mut prev_1 = 0f32;
-        let mut prev_3 = 0f32;
-        let mut prev_5 = 0f32;
-        let mut prev2_1 = 0f32;
-        let mut prev2_3 = 0f32;
-        let mut prev2_5 = 0f32;
-
-        let mut n = (-big_n) + 1;
-        while n < width as isize {
-            let left = n - big_n - 1;
-            let right = n + big_n - 1;
-            let left_val = if left >= 0 {
-                // SAFETY: `left` can never be bigger than `width`
-                unsafe { *input.get_unchecked(left as usize) }
-            } else {
-                0f32
-            };
-            let right_val = if right < width as isize {
-                // SAFETY: this branch ensures that `right` is not bigger than `width`
-                unsafe { *input.get_unchecked(right as usize) }
-            } else {
-                0f32
-            };
-            let sum = left_val + right_val;
-
-            let mut out_1 = sum * mul_in_1;
-            let mut out_3 = sum * mul_in_3;
-            let mut out_5 = sum * mul_in_5;
-
-            out_1 = mul_prev2_1.mul_add(prev2_1, out_1);
-            out_3 = mul_prev2_3.mul_add(prev2_3, out_3);
-            out_5 = mul_prev2_5.mul_add(prev2_5, out_5);
-            prev2_1 = prev_1;
-            prev2_3 = prev_3;
-            prev2_5 = prev_5;
-
-            out_1 = mul_prev_1.mul_add(prev_1, out_1);
-            out_3 = mul_prev_3.mul_add(prev_3, out_3);
-            out_5 = mul_prev_5.mul_add(prev_5, out_5);
-            prev_1 = out_1;
-            prev_3 = out_3;
-            prev_5 = out_5;
-
-            if n >= 0 {
-                // SAFETY: We know that this chunk of output is of size `width`,
-                // which `n` cannot be larger than.
-                unsafe {
-                    *output.get_unchecked_mut(n as usize) = out_1 + out_3 + out_5;
-                }
-            }
-
-            n += 1;
-        }
-    }
-
-    pub fn vertical_pass_chunked<const J: usize, const K: usize>(
-        &self,
-        input: &[f32],
-        output: &mut [f32],
-        width: usize,
-        height: usize,
-    ) {
-        assert!(J > K);
-        assert!(K > 0);
-
-        assert_eq!(input.len(), output.len());
-
-        let mut x = 0;
-        while x + J <= width {
-            self.vertical_pass::<J>(&input[x..], &mut output[x..], width, height);
-            x += J;
-        }
-
-        while x + K <= width {
-            self.vertical_pass::<K>(&input[x..], &mut output[x..], width, height);
-            x += K;
-        }
-
-        while x < width {
-            self.vertical_pass::<1>(&input[x..], &mut output[x..], width, height);
-            x += 1;
-        }
-    }
-
-    // Apply 1D vertical scan on COLUMNS elements at a time
-    pub fn vertical_pass<const COLUMNS: usize>(
-        &self,
-        input: &[f32],
-        output: &mut [f32],
-        width: usize,
-        height: usize,
-    ) {
-        assert_eq!(input.len(), output.len());
-
-        let big_n = self.radius as isize;
-
-        let zeroes = vec![0f32; COLUMNS];
-        let mut prev = vec![0f32; 3 * COLUMNS];
-        let mut prev2 = vec![0f32; 3 * COLUMNS];
-        let mut out = vec![0f32; 3 * COLUMNS];
-
-        let mut n = (-big_n) + 1;
-        while n < height as isize {
-            let top = n - big_n - 1;
-            let bottom = n + big_n - 1;
-            let top_row = if top >= 0 {
-                &input[top as usize * width..][..COLUMNS]
-            } else {
-                &zeroes
-            };
-
-            let bottom_row = if bottom < height as isize {
-                &input[bottom as usize * width..][..COLUMNS]
-            } else {
-                &zeroes
-            };
-
-            for i in 0..COLUMNS {
-                let sum = top_row[i] + bottom_row[i];
-
-                let i1 = i;
-                let i3 = i1 + COLUMNS;
-                let i5 = i3 + COLUMNS;
-
-                let mp1 = self.vert_mul_prev[0];
-                let mp3 = self.vert_mul_prev[1];
-                let mp5 = self.vert_mul_prev[2];
-
-                let out1 = prev[i1].mul_add(mp1, prev2[i1]);
-                let out3 = prev[i3].mul_add(mp3, prev2[i3]);
-                let out5 = prev[i5].mul_add(mp5, prev2[i5]);
-
-                let mi1 = self.vert_mul_in[0];
-                let mi3 = self.vert_mul_in[1];
-                let mi5 = self.vert_mul_in[2];
-
-                let out1 = sum.mul_add(mi1, -out1);
-                let out3 = sum.mul_add(mi3, -out3);
-                let out5 = sum.mul_add(mi5, -out5);
-
-                out[i1] = out1;
-                out[i3] = out3;
-                out[i5] = out5;
-
-                if n >= 0 {
-                    output[n as usize * width + i] = out1 + out3 + out5;
-                }
-            }
-
-            prev2.copy_from_slice(&prev);
-            prev.copy_from_slice(&out);
-
-            n += 1;
-        }
-    }
-}
diff --git a/src/blur/gaussian.rs b/src/blur/gaussian.rs
new file mode 100644
index 0000000..bd99067
--- /dev/null
+++ b/src/blur/gaussian.rs
@@ -0,0 +1,186 @@
+mod consts {
+    #![allow(clippy::unreadable_literal)]
+    include!(concat!(env!("OUT_DIR"), "/recursive_gaussian.rs"));
+}
+
+/// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
+/// Cosine Functions" by Charalampidis [2016].
+pub struct RecursiveGaussian;
+
+impl RecursiveGaussian {
+    #[cfg(feature = "rayon")]
+    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
+        use rayon::iter::{IndexedParallelIterator, ParallelIterator};
+        use rayon::prelude::ParallelSliceMut;
+        use rayon::slice::ParallelSlice;
+
+        assert_eq!(input.len(), output.len());
+
+        input
+            .par_chunks_exact(width)
+            .zip(output.par_chunks_exact_mut(width))
+            .for_each(|(input, output)| self.horizontal_row(input, output, width));
+    }
+
+    #[cfg(not(feature = "rayon"))]
+    pub fn horizontal_pass(&self, input: &[f32], output: &mut [f32], width: usize) {
+        assert_eq!(input.len(), output.len());
+
+        for (input, output) in input
+            .chunks_exact(width)
+            .zip(output.chunks_exact_mut(width))
+        {
+            self.horizontal_row(input, output, width);
+        }
+    }
+
+    fn horizontal_row(&self, input: &[f32], output: &mut [f32], width: usize) {
+        let big_n = consts::RADIUS as isize;
+        let mut prev_1 = 0f32;
+        let mut prev_3 = 0f32;
+        let mut prev_5 = 0f32;
+        let mut prev2_1 = 0f32;
+        let mut prev2_3 = 0f32;
+        let mut prev2_5 = 0f32;
+
+        let mut n = (-big_n) + 1;
+        while n < width as isize {
+            let left = n - big_n - 1;
+            let right = n + big_n - 1;
+            let left_val = if left >= 0 {
+                // SAFETY: `left` can never be bigger than `width`
+                unsafe { *input.get_unchecked(left as usize) }
+            } else {
+                0f32
+            };
+            let right_val = if right < width as isize {
+                // SAFETY: this branch ensures that `right` is not bigger than `width`
+                unsafe { *input.get_unchecked(right as usize) }
+            } else {
+                0f32
+            };
+            let sum = left_val + right_val;
+
+            let mut out_1 = sum * consts::MUL_IN_1;
+            let mut out_3 = sum * consts::MUL_IN_3;
+            let mut out_5 = sum * consts::MUL_IN_5;
+
+            out_1 = consts::MUL_PREV2_1.mul_add(prev2_1, out_1);
+            out_3 = consts::MUL_PREV2_3.mul_add(prev2_3, out_3);
+            out_5 = consts::MUL_PREV2_5.mul_add(prev2_5, out_5);
+            prev2_1 = prev_1;
+            prev2_3 = prev_3;
+            prev2_5 = prev_5;
+
+            out_1 = consts::MUL_PREV_1.mul_add(prev_1, out_1);
+            out_3 = consts::MUL_PREV_3.mul_add(prev_3, out_3);
+            out_5 = consts::MUL_PREV_5.mul_add(prev_5, out_5);
+            prev_1 = out_1;
+            prev_3 = out_3;
+            prev_5 = out_5;
+
+            if n >= 0 {
+                // SAFETY: We know that this chunk of output is of size `width`,
+                // which `n` cannot be larger than.
+                unsafe {
+                    *output.get_unchecked_mut(n as usize) = out_1 + out_3 + out_5;
+                }
+            }
+
+            n += 1;
+        }
+    }
+
+    pub fn vertical_pass_chunked<const J: usize, const K: usize>(
+        &self,
+        input: &[f32],
+        output: &mut [f32],
+        width: usize,
+        height: usize,
+    ) {
+        assert!(J > K);
+        assert!(K > 0);
+
+        assert_eq!(input.len(), output.len());
+
+        let mut x = 0;
+        while x + J <= width {
+            self.vertical_pass::<J>(&input[x..], &mut output[x..], width, height);
+            x += J;
+        }
+
+        while x + K <= width {
+            self.vertical_pass::<K>(&input[x..], &mut output[x..], width, height);
+            x += K;
+        }
+
+        while x < width {
+            self.vertical_pass::<1>(&input[x..], &mut output[x..], width, height);
+            x += 1;
+        }
+    }
+
+    // Apply 1D vertical scan on COLUMNS elements at a time
+    pub fn vertical_pass<const COLUMNS: usize>(
+        &self,
+        input: &[f32],
+        output: &mut [f32],
+        width: usize,
+        height: usize,
+    ) {
+        assert_eq!(input.len(), output.len());
+
+        let big_n = consts::RADIUS as isize;
+
+        let zeroes = vec![0f32; COLUMNS];
+        let mut prev = vec![0f32; 3 * COLUMNS];
+        let mut prev2 = vec![0f32; 3 * COLUMNS];
+        let mut out = vec![0f32; 3 * COLUMNS];
+
+        let mut n = (-big_n) + 1;
+        while n < height as isize {
+            let top = n - big_n - 1;
+            let bottom = n + big_n - 1;
+            let top_row = if top >= 0 {
+                &input[top as usize * width..][..COLUMNS]
+            } else {
+                &zeroes
+            };
+
+            let bottom_row = if bottom < height as isize {
+                &input[bottom as usize * width..][..COLUMNS]
+            } else {
+                &zeroes
+            };
+
+            for i in 0..COLUMNS {
+                let sum = top_row[i] + bottom_row[i];
+
+                let i1 = i;
+                let i3 = i1 + COLUMNS;
+                let i5 = i3 + COLUMNS;
+
+                let out1 = prev[i1].mul_add(consts::VERT_MUL_PREV_1, prev2[i1]);
+                let out3 = prev[i3].mul_add(consts::VERT_MUL_PREV_3, prev2[i3]);
+                let out5 = prev[i5].mul_add(consts::VERT_MUL_PREV_5, prev2[i5]);
+
+                let out1 = sum.mul_add(consts::VERT_MUL_IN_1, -out1);
+                let out3 = sum.mul_add(consts::VERT_MUL_IN_3, -out3);
+                let out5 = sum.mul_add(consts::VERT_MUL_IN_5, -out5);
+
+                out[i1] = out1;
+                out[i3] = out3;
+                out[i5] = out5;
+
+                if n >= 0 {
+                    output[n as usize * width + i] = out1 + out3 + out5;
+                }
+            }
+
+            prev2.copy_from_slice(&prev);
+            prev.copy_from_slice(&out);
+
+            n += 1;
+        }
+    }
+}
diff --git a/src/blur/mod.rs b/src/blur/mod.rs
new file mode 100644
index 0000000..cadca76
--- /dev/null
+++ b/src/blur/mod.rs
@@ -0,0 +1,60 @@
+mod gaussian;
+
+use gaussian::RecursiveGaussian;
+
+/// Structure handling image blur.
+///
+/// This struct contains the necessary buffers and the kernel used for blurring
+/// (currently a recursive approximation of the Gaussian filter).
+///
+/// Note that the width and height of the image passed to [blur][Self::blur] need to exactly
+/// match the width and height of this instance. If you reduce the image size (e.g. via
+/// downscaling), [`shrink_to`][Self::shrink_to] can be used to resize the internal buffers.
+pub struct Blur {
+    kernel: RecursiveGaussian,
+    temp: Vec<f32>,
+    width: usize,
+    height: usize,
+}
+
+impl Blur {
+    /// Create a new [Blur] for images of the given width and height.
+    /// This pre-allocates the necessary buffers.
+    #[must_use]
+    pub fn new(width: usize, height: usize) -> Self {
+        Blur {
+            kernel: RecursiveGaussian,
+            temp: vec![0.0f32; width * height],
+            width,
+            height,
+        }
+    }
+
+    /// Truncates the internal buffers to fit images of the given width and height.
+    ///
+    /// This will [truncate][Vec::truncate] the internal buffers
+    /// without affecting the allocated memory.
+    pub fn shrink_to(&mut self, width: usize, height: usize) {
+        self.temp.truncate(width * height);
+        self.width = width;
+        self.height = height;
+    }
+
+    /// Blur the given image.
+    pub fn blur(&mut self, img: &[Vec<f32>; 3]) -> [Vec<f32>; 3] {
+        [
+            self.blur_plane(&img[0]),
+            self.blur_plane(&img[1]),
+            self.blur_plane(&img[2]),
+        ]
+    }
+
+    fn blur_plane(&mut self, plane: &[f32]) -> Vec<f32> {
+        let mut out = vec![0f32; self.width * self.height];
+        self.kernel
+            .horizontal_pass(plane, &mut self.temp, self.width);
+        self.kernel
+            .vertical_pass_chunked::<128, 32>(&self.temp, &mut out, self.width, self.height);
+        out
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index d290c3f..e723046 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,7 @@
 #![allow(clippy::similar_names)]
 #![allow(clippy::struct_excessive_bools)]
 #![allow(clippy::use_self)]
+#![allow(clippy::unused_self)]
 #![warn(clippy::clone_on_ref_ptr)]
 #![warn(clippy::create_dir)]
 #![warn(clippy::dbg_macro)]
@@ -24,7 +25,6 @@
 #![warn(clippy::lossy_float_literal)]
 #![warn(clippy::map_err_ignore)]
 #![warn(clippy::mem_forget)]
-#![warn(clippy::mod_module_files)]
 #![warn(clippy::multiple_inherent_impl)]
 #![warn(clippy::pattern_type_mismatch)]
 #![warn(clippy::rc_buffer)]
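
Note (not part of the patch): with SIGMA = 1.5, equation (57) gives radius =
round(3.2795 * 1.5 + 0.2546) = round(5.17385) = 5, so build.rs writes RADIUS
plus fifteen f32 constants into $OUT_DIR/recursive_gaussian.rs, which
src/blur/gaussian.rs pulls in through include!. A minimal sketch of the shape
of the generated file, following the write_const_usize/write_const_f32 format
strings above; the f32 values shown are illustrative placeholders, not the
actual computed constants:

    pub const RADIUS: usize = 5_usize;
    pub const VERT_MUL_IN_1: f32 = 0.1_f32;    // placeholder value
    pub const VERT_MUL_PREV_1: f32 = -1.9_f32; // placeholder value
    // ...VERT_MUL_IN_3/_5, VERT_MUL_PREV_3/_5, MUL_IN_*, MUL_PREV_* and
    // MUL_PREV2_* follow in the same format.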