Update raven filter memory estimator

The updated estimator reflects the raven filter implementation as of the following commit in `httomolibgpu`: https://github.com/DiamondLightSource/httomolibgpu/tree/66ce152a91abd474ed64f7a11b2bf59e03bb5b51
DiamondLightSource · Dec 18, 2024 · 45992e6 · 45992e6
1 parent 1b57bdb
commit 45992e6
Showing 1 changed file with 23 additions and 14 deletions.
diff --git a/..._backends/methods_database/packages/backends/httomolibgpu/supporting_funcs/prep/stripe.py b/..._backends/methods_database/packages/backends/httomolibgpu/supporting_funcs/prep/stripe.py
@@ -24,7 +24,7 @@
 from typing import Tuple
 import numpy as np
 
-from httomo_backends.cufft import CufftType, cufft_estimate_2d
+from httomo_backends.cufft import CufftType, cufft_estimate_1d
 
 
 __all__ = [
@@ -80,42 +80,51 @@ def _calc_memory_bytes_raven_filter(
     pad_x = kwargs["pad_x"]
     pad_y = kwargs["pad_y"]
 
+    # Unpadded input
     input_size = np.prod(non_slice_dims_shape) * dtype.itemsize
-    output_size = np.prod(non_slice_dims_shape) * dtype.itemsize
 
     # Padded input
     padded_non_slice_dims_shape = (
         non_slice_dims_shape[0] + 2 * pad_y,
         non_slice_dims_shape[1] + 2 * pad_x,
     )
-
     in_slice_size_pad = (
         (padded_non_slice_dims_shape[0])
         * (padded_non_slice_dims_shape[1])
         * dtype.itemsize
     )
-    out_slice_size_pad = in_slice_size_pad
 
+    # Conversion of padded input data to `complex64` (implicitly done by `fft2()` function)
     complex_slice_fft_data = in_slice_size_pad / dtype.itemsize * np.complex64().nbytes
-    complex_slice_fft_data_shifted = complex_slice_fft_data
-    data_out_ifft_complex = complex_slice_fft_data
 
-    # Plan size for 2D FFT
-    fftplan_slice = cufft_estimate_2d(
-        nx=padded_non_slice_dims_shape[1],
-        ny=padded_non_slice_dims_shape[0],
+    # 2D FFT becomes two 1D FFTs (possibly due to applying 2D FFT to non-adjacent axes 0 and
+    # 2), so a plan for a 1D FFT is needed rather than a plan for a 2D FFT
+    fft_1d_plan = cufft_estimate_1d(
+        nx=padded_non_slice_dims_shape[0],
         fft_type=CufftType.CUFFT_C2C,
+        batch=non_slice_dims_shape[1],
     )
 
+    # Copy from applying fftshift to FFT result
+    complex_slice_fft_data_shifted = complex_slice_fft_data
+
+    # Two copies of `complex64` data come from 2D IFFT becoming a loop over two 1D IFFTs, and
+    # applying 1D IFFT to non-adjacent axes 0 and 2 causes data to not be C contiguous (thus,
+    # needing to be copied to get a version of the data which is C contiguous)
+    #
+    # NOTE: The same copies are generated by the 2D FFT becoming two 1D FFTs, but the order of
+    # allocations and deallocations of those copies is such that they don't contribute to peak
+    # GPU memory usage. Thus, they aren't accounted for in the estimated memory, unlike the
+    # copies generated by the 2D IFFT becoming two 1D IFFTs
+    ifft_complex64_copies = 2 * complex_slice_fft_data
+
     tot_memory_bytes = int(
         input_size
-        + output_size
         + in_slice_size_pad
-        + out_slice_size_pad
         + complex_slice_fft_data
+        + 2 * fft_1d_plan
         + complex_slice_fft_data_shifted
-        + data_out_ifft_complex
-        + 2.1 * fftplan_slice
+        + ifft_complex64_copies
     )
 
     return (tot_memory_bytes, 0)