From e68d1bb5cf4fbdd315c47163fa620d185feb9b6a Mon Sep 17 00:00:00 2001 From: Serhii Yablokov Date: Mon, 11 Mar 2024 03:13:56 +0100 Subject: [PATCH] Refactor simd classes (match closer to std::simd proposal) --- CMakeLists.txt | 10 +- internal/Atmosphere.cpp | 528 +-- internal/Atmosphere.h | 24 +- internal/BVHSplit.cpp | 42 +- internal/BVHSplit.h | 10 +- internal/Convolution.h | 360 +- internal/Core.cpp | 56 +- internal/CoreDX.h | 2 +- internal/CoreRef.cpp | 1248 +++--- internal/CoreRef.h | 223 +- internal/CoreSIMD.h | 3391 ++++++++--------- internal/CoreVK.h | 2 +- internal/RastState.h | 4 +- internal/RendererAVX.cpp | 88 +- internal/RendererAVX2.cpp | 88 +- internal/RendererAVX512.cpp | 88 +- internal/RendererCPU.h | 76 +- internal/RendererNEON.cpp | 88 +- internal/RendererSSE2.cpp | 87 +- internal/RendererSSE41.cpp | 88 +- internal/SceneCPU.cpp | 94 +- internal/SceneCommon.cpp | 58 +- internal/SceneGPU.h | 106 +- internal/TextureUtils.cpp | 50 +- internal/simd/{simd_vec.h => simd.h} | 421 +- internal/simd/simd_avx.h | 1406 +++++++ internal/simd/simd_avx512.h | 1193 ++++++ .../simd/{simd_vec_neon.h => simd_neon.h} | 648 ++-- internal/simd/simd_sse.h | 1090 ++++++ internal/simd/simd_vec_avx.h | 1309 ------- internal/simd/simd_vec_avx512.h | 1110 ------ internal/simd/simd_vec_sse.h | 1052 ----- tests/test_simd.cpp | 6 +- tests/test_simd.ipp | 380 +- tests/test_simd_avx.cpp | 2 +- tests/test_simd_avx2.cpp | 2 +- tests/test_simd_avx512.cpp | 2 +- tests/test_simd_sse41.cpp | 2 +- 38 files changed, 7834 insertions(+), 7600 deletions(-) rename internal/simd/{simd_vec.h => simd.h} (56%) create mode 100644 internal/simd/simd_avx.h create mode 100644 internal/simd/simd_avx512.h rename internal/simd/{simd_vec_neon.h => simd_neon.h} (54%) create mode 100644 internal/simd/simd_sse.h delete mode 100644 internal/simd/simd_vec_avx.h delete mode 100644 internal/simd/simd_vec_avx512.h delete mode 100644 internal/simd/simd_vec_sse.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 
ed87b8293..382533e97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,11 +204,11 @@ set(SOURCE_FILES Bitmask.h set(SIMD_FILES internal/simd/aligned_allocator.h internal/simd/detect.h internal/simd/detect.cpp - internal/simd/simd_vec.h - internal/simd/simd_vec_sse.h - internal/simd/simd_vec_avx.h - internal/simd/simd_vec_avx512.h - internal/simd/simd_vec_neon.h) + internal/simd/simd.h + internal/simd/simd_sse.h + internal/simd/simd_avx.h + internal/simd/simd_avx512.h + internal/simd/simd_neon.h) set(VK_SOURCE_FILES internal/Vk/AccStructureVK.h internal/Vk/AccStructureVK.cpp diff --git a/internal/Atmosphere.cpp b/internal/Atmosphere.cpp index 2d0d5232b..4df25d637 100644 --- a/internal/Atmosphere.cpp +++ b/internal/Atmosphere.cpp @@ -32,47 +32,47 @@ force_inline float smoothstep(float edge0, float edge1, float x) { } // Math -Ref::simd_fvec2 SphereIntersection(Ref::simd_fvec4 ray_start, const Ref::simd_fvec4 &ray_dir, - const Ref::simd_fvec4 &sphere_center, const float sphere_radius) { +Ref::fvec2 SphereIntersection(Ref::fvec4 ray_start, const Ref::fvec4 &ray_dir, + const Ref::fvec4 &sphere_center, const float sphere_radius) { ray_start -= sphere_center; const float a = dot(ray_dir, ray_dir); const float b = 2.0f * dot(ray_start, ray_dir); const float c = dot(ray_start, ray_start) - (sphere_radius * sphere_radius); float d = b * b - 4 * a * c; if (d < 0) { - return Ref::simd_fvec2{-1}; + return Ref::fvec2{-1}; } else { d = sqrt(d); - return Ref::simd_fvec2{-b - d, -b + d} / (2 * a); + return Ref::fvec2{-b - d, -b + d} / (2 * a); } } -Ref::simd_fvec2 PlanetIntersection(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir) { - const Ref::simd_fvec4 planet_center = Ref::simd_fvec4(0, -params.planet_radius, 0, 0); +Ref::fvec2 PlanetIntersection(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir) { + const Ref::fvec4 planet_center = Ref::fvec4(0, -params.planet_radius, 0, 0); return 
SphereIntersection(ray_start, ray_dir, planet_center, params.planet_radius); } -Ref::simd_fvec2 AtmosphereIntersection(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir) { - const Ref::simd_fvec4 planet_center = Ref::simd_fvec4(0, -params.planet_radius, 0, 0); +Ref::fvec2 AtmosphereIntersection(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir) { + const Ref::fvec4 planet_center = Ref::fvec4(0, -params.planet_radius, 0, 0); return SphereIntersection(ray_start, ray_dir, planet_center, params.planet_radius + params.atmosphere_height); } -Ref::simd_fvec4 CloudsIntersection(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir) { - const Ref::simd_fvec4 planet_center = Ref::simd_fvec4(0, -params.planet_radius, 0, 0); - const Ref::simd_fvec2 beg = +Ref::fvec4 CloudsIntersection(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir) { + const Ref::fvec4 planet_center = Ref::fvec4(0, -params.planet_radius, 0, 0); + const Ref::fvec2 beg = SphereIntersection(ray_start, ray_dir, planet_center, params.planet_radius + params.clouds_height_beg); - const Ref::simd_fvec2 end = + const Ref::fvec2 end = SphereIntersection(ray_start, ray_dir, planet_center, params.planet_radius + params.clouds_height_end); return {beg.get<0>(), beg.get<1>(), end.get<0>(), end.get<1>()}; } -Ref::simd_fvec2 MoonIntersection(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir) { - const Ref::simd_fvec4 planet_center = - Ref::simd_fvec4{params.moon_dir, Ref::simd_mem_aligned} * params.moon_distance; +Ref::fvec2 MoonIntersection(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir) { + const Ref::fvec4 planet_center = + Ref::fvec4{params.moon_dir, Ref::vector_aligned} * params.moon_distance; return SphereIntersection(ray_start, ray_dir, planet_center, 
params.moon_radius); } @@ -86,7 +86,7 @@ float PhaseMie(const float costh, float g = 0.85f) { } const float WrenningePhaseScale = 0.9f; -const Ref::simd_fvec2 WrenningePhaseParameters = Ref::simd_fvec2(-0.2f, 0.8f); +const Ref::fvec2 WrenningePhaseParameters = Ref::fvec2(-0.2f, 0.8f); force_inline float HenyeyGreenstein(const float mu, const float inG) { return (1.0f - inG * inG) / (powf(1.0f + inG * inG - 2.0f * inG * mu, 1.5f) * 4.0f * PI); @@ -97,8 +97,8 @@ force_inline float CloudPhaseFunction(const float mu) { HenyeyGreenstein(mu, WrenningePhaseParameters.get<1>()), 0.7f); } -Ref::simd_fvec4 PhaseWrenninge(float mu) { - Ref::simd_fvec4 phase = 0.0f; +Ref::fvec4 PhaseWrenninge(float mu) { + Ref::fvec4 phase = 0.0f; // Wrenninge multiscatter approximation phase.set<0>(CloudPhaseFunction(mu)); phase.set<1>(CloudPhaseFunction(mu * WrenningePhaseScale)); @@ -108,28 +108,28 @@ Ref::simd_fvec4 PhaseWrenninge(float mu) { // dl is the density sampled along the light ray for the given sample position. // dC is the low lod sample of density at the given sample position. 
-float GetLightEnergy(const float dl, const float dC, const Ref::simd_fvec4 &phase_probability) { +float GetLightEnergy(const float dl, const float dC, const Ref::fvec4 &phase_probability) { // Wrenninge multi scatter approximation - const auto exp_scale = Ref::simd_fvec4(0.8f, 0.1f, 0.002f, 0.0f); - const auto total_scale = Ref::simd_fvec4(2.0f, 0.8f, 0.4f, 0.0f); - const Ref::simd_fvec4 intensity_curve = exp(-dl * exp_scale); + const auto exp_scale = Ref::fvec4(0.8f, 0.1f, 0.002f, 0.0f); + const auto total_scale = Ref::fvec4(2.0f, 0.8f, 0.4f, 0.0f); + const Ref::fvec4 intensity_curve = exp(-dl * exp_scale); return dot(total_scale * phase_probability, intensity_curve); } // Atmosphere -float AtmosphereHeight(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &position_ws, - Ref::simd_fvec4 &up_vector) { - const Ref::simd_fvec4 planet_center = Ref::simd_fvec4(0, -params.planet_radius, 0, 0); +float AtmosphereHeight(const atmosphere_params_t ¶ms, const Ref::fvec4 &position_ws, + Ref::fvec4 &up_vector) { + const Ref::fvec4 planet_center = Ref::fvec4(0, -params.planet_radius, 0, 0); up_vector = (position_ws - planet_center); const float height = length(up_vector); up_vector /= height; return height - params.planet_radius; } -force_inline Ref::simd_fvec4 AtmosphereDensity(const atmosphere_params_t ¶ms, const float h) { +force_inline Ref::fvec4 AtmosphereDensity(const atmosphere_params_t ¶ms, const float h) { #if 1 // expf bug workaround (fp exception on unused simd lanes) - const Ref::simd_fvec4 density_rayleigh = exp(Ref::simd_fvec4{-fmaxf(0.0f, h / params.rayleigh_height)}); - const Ref::simd_fvec4 density_mie = exp(Ref::simd_fvec4{-fmaxf(0.0f, h / params.mie_height)}); + const Ref::fvec4 density_rayleigh = exp(Ref::fvec4{-fmaxf(0.0f, h / params.rayleigh_height)}); + const Ref::fvec4 density_mie = exp(Ref::fvec4{-fmaxf(0.0f, h / params.mie_height)}); #else const float density_rayleigh = expf(-fmaxf(0.0f, h / params.rayleigh_height)); const float density_mie = 
expf(-fmaxf(0.0f, h / params.mie_height)); @@ -137,42 +137,42 @@ force_inline Ref::simd_fvec4 AtmosphereDensity(const atmosphere_params_t ¶ms const float density_ozone = fmaxf(0.0f, 1.0f - fabsf(h - params.ozone_height_center) / params.ozone_half_width); return params.atmosphere_density * - Ref::simd_fvec4{density_rayleigh.get<0>(), density_mie.get<0>(), density_ozone, 0.0f}; + Ref::fvec4{density_rayleigh.get<0>(), density_mie.get<0>(), density_ozone, 0.0f}; } struct atmosphere_medium_t { - Ref::simd_fvec4 scattering; - Ref::simd_fvec4 absorption; - Ref::simd_fvec4 extinction; + Ref::fvec4 scattering; + Ref::fvec4 absorption; + Ref::fvec4 extinction; - Ref::simd_fvec4 scattering_mie; - Ref::simd_fvec4 absorption_mie; - Ref::simd_fvec4 extinction_mie; + Ref::fvec4 scattering_mie; + Ref::fvec4 absorption_mie; + Ref::fvec4 extinction_mie; - Ref::simd_fvec4 scattering_ray; - Ref::simd_fvec4 absorption_ray; - Ref::simd_fvec4 extinction_ray; + Ref::fvec4 scattering_ray; + Ref::fvec4 absorption_ray; + Ref::fvec4 extinction_ray; - Ref::simd_fvec4 scattering_ozo; - Ref::simd_fvec4 absorption_ozo; - Ref::simd_fvec4 extinction_ozo; + Ref::fvec4 scattering_ozo; + Ref::fvec4 absorption_ozo; + Ref::fvec4 extinction_ozo; }; force_inline atmosphere_medium_t SampleAtmosphereMedium(const atmosphere_params_t ¶ms, const float h) { - const Ref::simd_fvec4 local_density = AtmosphereDensity(params, h); + const Ref::fvec4 local_density = AtmosphereDensity(params, h); atmosphere_medium_t s; - s.scattering_mie = local_density.get<1>() * Ref::simd_fvec4{params.mie_scattering, Ref::simd_mem_aligned}; - s.absorption_mie = local_density.get<1>() * Ref::simd_fvec4{params.mie_absorption, Ref::simd_mem_aligned}; - s.extinction_mie = local_density.get<1>() * Ref::simd_fvec4{params.mie_extinction, Ref::simd_mem_aligned}; + s.scattering_mie = local_density.get<1>() * Ref::fvec4{params.mie_scattering, Ref::vector_aligned}; + s.absorption_mie = local_density.get<1>() * Ref::fvec4{params.mie_absorption, 
Ref::vector_aligned}; + s.extinction_mie = local_density.get<1>() * Ref::fvec4{params.mie_extinction, Ref::vector_aligned}; - s.scattering_ray = local_density.get<0>() * Ref::simd_fvec4{params.rayleigh_scattering, Ref::simd_mem_aligned}; + s.scattering_ray = local_density.get<0>() * Ref::fvec4{params.rayleigh_scattering, Ref::vector_aligned}; s.absorption_ray = 0.0f; s.extinction_ray = s.scattering_ray + s.absorption_ray; s.scattering_ozo = 0.0; - s.absorption_ozo = local_density.get<2>() * Ref::simd_fvec4{params.ozone_absorbtion, Ref::simd_mem_aligned}; + s.absorption_ozo = local_density.get<2>() * Ref::fvec4{params.ozone_absorbtion, Ref::vector_aligned}; s.extinction_ozo = s.scattering_ozo + s.absorption_ozo; s.scattering = s.scattering_mie + s.scattering_ray + s.scattering_ozo; @@ -183,72 +183,72 @@ force_inline atmosphere_medium_t SampleAtmosphereMedium(const atmosphere_params_ return s; } -Ref::simd_fvec4 SampleTransmittanceLUT(Span lut, Ref::simd_fvec2 uv) { - uv = uv * Ref::simd_fvec2(TRANSMITTANCE_LUT_W, TRANSMITTANCE_LUT_H); - auto iuv0 = Ref::simd_ivec2(uv); - iuv0 = clamp(iuv0, Ref::simd_ivec2{0, 0}, Ref::simd_ivec2{TRANSMITTANCE_LUT_W - 1, TRANSMITTANCE_LUT_H - 1}); - const Ref::simd_ivec2 iuv1 = min(iuv0 + 1, Ref::simd_ivec2{TRANSMITTANCE_LUT_W - 1, TRANSMITTANCE_LUT_H - 1}); +Ref::fvec4 SampleTransmittanceLUT(Span lut, Ref::fvec2 uv) { + uv = uv * Ref::fvec2(TRANSMITTANCE_LUT_W, TRANSMITTANCE_LUT_H); + auto iuv0 = Ref::ivec2(uv); + iuv0 = clamp(iuv0, Ref::ivec2{0, 0}, Ref::ivec2{TRANSMITTANCE_LUT_W - 1, TRANSMITTANCE_LUT_H - 1}); + const Ref::ivec2 iuv1 = min(iuv0 + 1, Ref::ivec2{TRANSMITTANCE_LUT_W - 1, TRANSMITTANCE_LUT_H - 1}); - const auto tr00 = Ref::simd_fvec4(&lut[4 * (iuv0.get<1>() * TRANSMITTANCE_LUT_W + iuv0.get<0>())], - Ref::simd_mem_aligned), - tr01 = Ref::simd_fvec4(&lut[4 * (iuv0.get<1>() * TRANSMITTANCE_LUT_W + iuv1.get<0>())], - Ref::simd_mem_aligned), - tr10 = Ref::simd_fvec4(&lut[4 * (iuv1.get<1>() * TRANSMITTANCE_LUT_W + 
iuv0.get<0>())], - Ref::simd_mem_aligned), - tr11 = Ref::simd_fvec4(&lut[4 * (iuv1.get<1>() * TRANSMITTANCE_LUT_W + iuv1.get<0>())], - Ref::simd_mem_aligned); + const auto tr00 = Ref::fvec4(&lut[4 * (iuv0.get<1>() * TRANSMITTANCE_LUT_W + iuv0.get<0>())], + Ref::vector_aligned), + tr01 = Ref::fvec4(&lut[4 * (iuv0.get<1>() * TRANSMITTANCE_LUT_W + iuv1.get<0>())], + Ref::vector_aligned), + tr10 = Ref::fvec4(&lut[4 * (iuv1.get<1>() * TRANSMITTANCE_LUT_W + iuv0.get<0>())], + Ref::vector_aligned), + tr11 = Ref::fvec4(&lut[4 * (iuv1.get<1>() * TRANSMITTANCE_LUT_W + iuv1.get<0>())], + Ref::vector_aligned); - const Ref::simd_fvec2 k = fract(uv); + const Ref::fvec2 k = fract(uv); - const Ref::simd_fvec4 tr0 = tr01 * k.get<0>() + tr00 * (1.0f - k.get<0>()), + const Ref::fvec4 tr0 = tr01 * k.get<0>() + tr00 * (1.0f - k.get<0>()), tr1 = tr11 * k.get<0>() + tr10 * (1.0f - k.get<0>()); return (tr1 * k.get<1>() + tr0 * (1.0f - k.get<1>())); } -Ref::simd_fvec4 SampleMultiscatterLUT(Span lut, Ref::simd_fvec2 uv) { - uv = uv * Ref::simd_fvec2(MULTISCATTER_LUT_RES); - auto iuv0 = Ref::simd_ivec2(uv); - iuv0 = clamp(iuv0, Ref::simd_ivec2{0, 0}, Ref::simd_ivec2{MULTISCATTER_LUT_RES - 1}); - const Ref::simd_ivec2 iuv1 = min(iuv0 + 1, Ref::simd_ivec2{MULTISCATTER_LUT_RES - 1}); +Ref::fvec4 SampleMultiscatterLUT(Span lut, Ref::fvec2 uv) { + uv = uv * Ref::fvec2(MULTISCATTER_LUT_RES); + auto iuv0 = Ref::ivec2(uv); + iuv0 = clamp(iuv0, Ref::ivec2{0, 0}, Ref::ivec2{MULTISCATTER_LUT_RES - 1}); + const Ref::ivec2 iuv1 = min(iuv0 + 1, Ref::ivec2{MULTISCATTER_LUT_RES - 1}); - const auto ms00 = Ref::simd_fvec4(&lut[4 * (iuv0.get<1>() * MULTISCATTER_LUT_RES + iuv0.get<0>())], - Ref::simd_mem_aligned), - ms01 = Ref::simd_fvec4(&lut[4 * (iuv0.get<1>() * MULTISCATTER_LUT_RES + iuv1.get<0>())], - Ref::simd_mem_aligned), - ms10 = Ref::simd_fvec4(&lut[4 * (iuv1.get<1>() * MULTISCATTER_LUT_RES + iuv0.get<0>())], - Ref::simd_mem_aligned), - ms11 = Ref::simd_fvec4(&lut[4 * (iuv1.get<1>() * 
MULTISCATTER_LUT_RES + iuv1.get<0>())], - Ref::simd_mem_aligned); + const auto ms00 = Ref::fvec4(&lut[4 * (iuv0.get<1>() * MULTISCATTER_LUT_RES + iuv0.get<0>())], + Ref::vector_aligned), + ms01 = Ref::fvec4(&lut[4 * (iuv0.get<1>() * MULTISCATTER_LUT_RES + iuv1.get<0>())], + Ref::vector_aligned), + ms10 = Ref::fvec4(&lut[4 * (iuv1.get<1>() * MULTISCATTER_LUT_RES + iuv0.get<0>())], + Ref::vector_aligned), + ms11 = Ref::fvec4(&lut[4 * (iuv1.get<1>() * MULTISCATTER_LUT_RES + iuv1.get<0>())], + Ref::vector_aligned); - const Ref::simd_fvec2 k = fract(uv); + const Ref::fvec2 k = fract(uv); - const Ref::simd_fvec4 ms0 = ms01 * k.get<0>() + ms00 * (1.0f - k.get<0>()), + const Ref::fvec4 ms0 = ms01 * k.get<0>() + ms00 * (1.0f - k.get<0>()), ms1 = ms11 * k.get<0>() + ms10 * (1.0f - k.get<0>()); return (ms1 * k.get<1>() + ms0 * (1.0f - k.get<1>())); } -force_inline Ref::simd_fvec4 FetchWeatherTex(const int x, const int y) { - return Ref::simd_fvec4{float(__weather_tex[3 * (y * WEATHER_TEX_RES + x) + 0]), +force_inline Ref::fvec4 FetchWeatherTex(const int x, const int y) { + return Ref::fvec4{float(__weather_tex[3 * (y * WEATHER_TEX_RES + x) + 0]), float(__weather_tex[3 * (y * WEATHER_TEX_RES + x) + 1]), float(__weather_tex[3 * (y * WEATHER_TEX_RES + x) + 2]), 0.0f}; } -Ref::simd_fvec4 SampleWeatherTex(Ref::simd_fvec2 uv) { - uv = uv * Ref::simd_fvec2(WEATHER_TEX_RES); - auto iuv0 = Ref::simd_ivec2{uv}; - iuv0 = clamp(iuv0, Ref::simd_ivec2{0, 0}, Ref::simd_ivec2{WEATHER_TEX_RES - 1}); - const Ref::simd_ivec2 iuv1 = (iuv0 + 1) & Ref::simd_ivec2{WEATHER_TEX_RES - 1}; +Ref::fvec4 SampleWeatherTex(Ref::fvec2 uv) { + uv = uv * Ref::fvec2(WEATHER_TEX_RES); + auto iuv0 = Ref::ivec2{uv}; + iuv0 = clamp(iuv0, Ref::ivec2{0, 0}, Ref::ivec2{WEATHER_TEX_RES - 1}); + const Ref::ivec2 iuv1 = (iuv0 + 1) & Ref::ivec2{WEATHER_TEX_RES - 1}; - const Ref::simd_fvec4 w00 = FetchWeatherTex(iuv0.get<0>(), iuv0.get<1>()), + const Ref::fvec4 w00 = FetchWeatherTex(iuv0.get<0>(), iuv0.get<1>()), w01 = 
FetchWeatherTex(iuv1.get<0>(), iuv0.get<1>()), w10 = FetchWeatherTex(iuv0.get<0>(), iuv1.get<1>()), w11 = FetchWeatherTex(iuv1.get<0>(), iuv1.get<1>()); - const Ref::simd_fvec2 k = fract(uv); + const Ref::fvec2 k = fract(uv); - const Ref::simd_fvec4 w0 = w01 * k.get<0>() + w00 * (1.0f - k.get<0>()), + const Ref::fvec4 w0 = w01 * k.get<0>() + w00 * (1.0f - k.get<0>()), w1 = w11 * k.get<0>() + w10 * (1.0f - k.get<0>()); return (w1 * k.get<1>() + w0 * (1.0f - k.get<1>())) * (1.0f / 255.0f); @@ -256,13 +256,13 @@ Ref::simd_fvec4 SampleWeatherTex(Ref::simd_fvec2 uv) { // Taken from https://github.com/armory3d/armory_ci/blob/master/build_untitled/compiled/Shaders/world_pass.frag.glsl float GetDensityHeightGradientForPoint(float height, float cloud_type) { - const auto stratusGrad = Ref::simd_fvec4(0.02f, 0.05f, 0.09f, 0.11f); - const auto stratocumulusGrad = Ref::simd_fvec4(0.02f, 0.2f, 0.48f, 0.625f); - const auto cumulusGrad = Ref::simd_fvec4(0.01f, 0.0625f, 0.78f, 1.0f); + const auto stratusGrad = Ref::fvec4(0.02f, 0.05f, 0.09f, 0.11f); + const auto stratocumulusGrad = Ref::fvec4(0.02f, 0.2f, 0.48f, 0.625f); + const auto cumulusGrad = Ref::fvec4(0.01f, 0.0625f, 0.78f, 1.0f); float stratus = 1.0f - clamp(cloud_type * 2.0f, 0, 1); float stratocumulus = 1.0f - abs(cloud_type - 0.5f) * 2.0f; float cumulus = clamp(cloud_type - 0.5f, 0, 1) * 2.0f; - Ref::simd_fvec4 cloudGradient = stratusGrad * stratus + stratocumulusGrad * stratocumulus + cumulusGrad * cumulus; + Ref::fvec4 cloudGradient = stratusGrad * stratus + stratocumulusGrad * stratocumulus + cumulusGrad * cumulus; return smoothstep(cloudGradient.get<0>(), cloudGradient.get<1>(), height) - smoothstep(cloudGradient.get<2>(), cloudGradient.get<3>(), height); } @@ -271,11 +271,11 @@ force_inline float Fetch3dNoiseTex(const int x, const int y, const int z) { return __3d_noise_tex[z * NOISE_3D_RES * NOISE_3D_RES + y * NOISE_3D_RES + x]; } -float Sample3dNoiseTex(Ref::simd_fvec4 uvw) { +float Sample3dNoiseTex(Ref::fvec4 
uvw) { uvw *= NOISE_3D_RES; - Ref::simd_ivec4 iuvw0 = Ref::simd_ivec4(uvw); - iuvw0 = clamp(iuvw0, Ref::simd_ivec4{0}, Ref::simd_ivec4{NOISE_3D_RES - 1}); - const Ref::simd_ivec4 iuvw1 = (iuvw0 + 1) & Ref::simd_ivec4{NOISE_3D_RES - 1}; + Ref::ivec4 iuvw0 = Ref::ivec4(uvw); + iuvw0 = clamp(iuvw0, Ref::ivec4{0}, Ref::ivec4{NOISE_3D_RES - 1}); + const Ref::ivec4 iuvw1 = (iuvw0 + 1) & Ref::ivec4{NOISE_3D_RES - 1}; const float n000 = Fetch3dNoiseTex(iuvw0.get<0>(), iuvw0.get<1>(), iuvw0.get<2>()), n001 = Fetch3dNoiseTex(iuvw1.get<0>(), iuvw0.get<1>(), iuvw0.get<2>()), @@ -286,7 +286,7 @@ float Sample3dNoiseTex(Ref::simd_fvec4 uvw) { n110 = Fetch3dNoiseTex(iuvw0.get<0>(), iuvw1.get<1>(), iuvw1.get<2>()), n111 = Fetch3dNoiseTex(iuvw1.get<0>(), iuvw1.get<1>(), iuvw1.get<2>()); - const Ref::simd_fvec4 k = fract(uvw); + const Ref::fvec4 k = fract(uvw); const float n00x = (1.0f - k.get<0>()) * n000 + k.get<0>() * n001, n01x = (1.0f - k.get<0>()) * n010 + k.get<0>() * n011, @@ -299,17 +299,17 @@ float Sample3dNoiseTex(Ref::simd_fvec4 uvw) { return ((1.0f - k.get<2>()) * n0xx + k.get<2>() * n1xx) / 255.0f; } -float GetCloudsDensity(const atmosphere_params_t ¶ms, Ref::simd_fvec4 local_position, float &out_local_height, - float &out_height_fraction, Ref::simd_fvec4 &out_up_vector) { +float GetCloudsDensity(const atmosphere_params_t ¶ms, Ref::fvec4 local_position, float &out_local_height, + float &out_height_fraction, Ref::fvec4 &out_up_vector) { out_local_height = AtmosphereHeight(params, local_position, out_up_vector); out_height_fraction = (out_local_height - params.clouds_height_beg) / (params.clouds_height_end - params.clouds_height_beg); - Ref::simd_fvec2 weather_uv = {local_position.get<0>() + params.clouds_offset_x, + Ref::fvec2 weather_uv = {local_position.get<0>() + params.clouds_offset_x, local_position.get<2>() + params.clouds_offset_z}; weather_uv = fract(weather_uv * 0.00007f); - const Ref::simd_fvec4 weather_sample = SampleWeatherTex(weather_uv); + const Ref::fvec4 
weather_sample = SampleWeatherTex(weather_uv); float cloud_coverage = mix(weather_sample.get<2>(), weather_sample.get<1>(), params.clouds_variety); cloud_coverage = remap(cloud_coverage, saturate(1.0f - params.clouds_density + 0.5f * out_height_fraction)); @@ -329,19 +329,19 @@ float GetCloudsDensity(const atmosphere_params_t ¶ms, Ref::simd_fvec4 local_ powf(5.0f * remap(cloud_coverage, 0.6f * noise_read), 1.0f - out_height_fraction); } -float TraceCloudShadow(const atmosphere_params_t ¶ms, const uint32_t rand_hash, Ref::simd_fvec4 ray_start, - const Ref::simd_fvec4 &ray_dir) { - const Ref::simd_fvec4 clouds_intersection = CloudsIntersection(params, ray_start, ray_dir); +float TraceCloudShadow(const atmosphere_params_t ¶ms, const uint32_t rand_hash, Ref::fvec4 ray_start, + const Ref::fvec4 &ray_dir) { + const Ref::fvec4 clouds_intersection = CloudsIntersection(params, ray_start, ray_dir); if (clouds_intersection.get<3>() > 0) { const int SampleCount = 32; const float StepSize = 16.0f; - Ref::simd_fvec4 pos = ray_start + Ref::construct_float(rand_hash) * ray_dir * StepSize; + Ref::fvec4 pos = ray_start + Ref::construct_float(rand_hash) * ray_dir * StepSize; float ret = 0.0f; for (int i = 0; i < SampleCount; ++i) { float local_height, height_fraction; - Ref::simd_fvec4 up_vector; + Ref::fvec4 up_vector; const float local_density = GetCloudsDensity(params, pos, local_height, height_fraction, up_vector); ret += local_density; pos += ray_dir * StepSize; @@ -353,10 +353,10 @@ float TraceCloudShadow(const atmosphere_params_t ¶ms, const uint32_t rand_ha } // https://www.shadertoy.com/view/NtsBzB -Ref::simd_fvec4 stars_hash(Ref::simd_fvec4 p) { - p = Ref::simd_fvec4{dot(p, Ref::simd_fvec4{127.1f, 311.7f, 74.7f, 0.0f}), - dot(p, Ref::simd_fvec4{269.5f, 183.3f, 246.1f, 0.0f}), - dot(p, Ref::simd_fvec4{113.5f, 271.9f, 124.6f, 0.0f}), 0.0f}; +Ref::fvec4 stars_hash(Ref::fvec4 p) { + p = Ref::fvec4{dot(p, Ref::fvec4{127.1f, 311.7f, 74.7f, 0.0f}), + dot(p, Ref::fvec4{269.5f, 
183.3f, 246.1f, 0.0f}), + dot(p, Ref::fvec4{113.5f, 271.9f, 124.6f, 0.0f}), 0.0f}; p.set<0>(sinf(p.get<0>())); p.set<1>(sinf(p.get<1>())); @@ -365,81 +365,81 @@ Ref::simd_fvec4 stars_hash(Ref::simd_fvec4 p) { return -1.0f + 2.0f * fract(p * 43758.5453123f); } -float stars_noise(const Ref::simd_fvec4 &p) { - Ref::simd_fvec4 i = floor(p); - Ref::simd_fvec4 f = fract(p); +float stars_noise(const Ref::fvec4 &p) { + Ref::fvec4 i = floor(p); + Ref::fvec4 f = fract(p); - Ref::simd_fvec4 u = f * f * (3.0f - 2.0f * f); + Ref::fvec4 u = f * f * (3.0f - 2.0f * f); - return mix(mix(mix(dot(stars_hash(i + Ref::simd_fvec4(0.0f, 0.0f, 0.0f, 0.0f)), - f - Ref::simd_fvec4(0.0f, 0.0f, 0.0f, 0.0f)), - dot(stars_hash(i + Ref::simd_fvec4(1.0f, 0.0f, 0.0f, 0.0f)), - f - Ref::simd_fvec4(1.0f, 0.0f, 0.0f, 0.0f)), + return mix(mix(mix(dot(stars_hash(i + Ref::fvec4(0.0f, 0.0f, 0.0f, 0.0f)), + f - Ref::fvec4(0.0f, 0.0f, 0.0f, 0.0f)), + dot(stars_hash(i + Ref::fvec4(1.0f, 0.0f, 0.0f, 0.0f)), + f - Ref::fvec4(1.0f, 0.0f, 0.0f, 0.0f)), u.get<0>()), - mix(dot(stars_hash(i + Ref::simd_fvec4(0.0f, 1.0f, 0.0f, 0.0f)), - f - Ref::simd_fvec4(0.0f, 1.0f, 0.0f, 0.0f)), - dot(stars_hash(i + Ref::simd_fvec4(1.0f, 1.0f, 0.0f, 0.0f)), - f - Ref::simd_fvec4(1.0f, 1.0f, 0.0f, 0.0f)), + mix(dot(stars_hash(i + Ref::fvec4(0.0f, 1.0f, 0.0f, 0.0f)), + f - Ref::fvec4(0.0f, 1.0f, 0.0f, 0.0f)), + dot(stars_hash(i + Ref::fvec4(1.0f, 1.0f, 0.0f, 0.0f)), + f - Ref::fvec4(1.0f, 1.0f, 0.0f, 0.0f)), u.get<0>()), u.get<1>()), - mix(mix(dot(stars_hash(i + Ref::simd_fvec4(0.0f, 0.0f, 1.0f, 0.0f)), - f - Ref::simd_fvec4(0.0f, 0.0f, 1.0f, 0.0f)), - dot(stars_hash(i + Ref::simd_fvec4(1.0f, 0.0f, 1.0f, 0.0f)), - f - Ref::simd_fvec4(1.0f, 0.0f, 1.0f, 0.0f)), + mix(mix(dot(stars_hash(i + Ref::fvec4(0.0f, 0.0f, 1.0f, 0.0f)), + f - Ref::fvec4(0.0f, 0.0f, 1.0f, 0.0f)), + dot(stars_hash(i + Ref::fvec4(1.0f, 0.0f, 1.0f, 0.0f)), + f - Ref::fvec4(1.0f, 0.0f, 1.0f, 0.0f)), u.get<0>()), - mix(dot(stars_hash(i + Ref::simd_fvec4(0.0f, 1.0f, 
1.0f, 0.0f)), - f - Ref::simd_fvec4(0.0f, 1.0f, 1.0f, 0.0f)), - dot(stars_hash(i + Ref::simd_fvec4(1.0f, 1.0f, 1.0f, 0.0f)), - f - Ref::simd_fvec4(1.0f, 1.0f, 1.0f, 0.0f)), + mix(dot(stars_hash(i + Ref::fvec4(0.0f, 1.0f, 1.0f, 0.0f)), + f - Ref::fvec4(0.0f, 1.0f, 1.0f, 0.0f)), + dot(stars_hash(i + Ref::fvec4(1.0f, 1.0f, 1.0f, 0.0f)), + f - Ref::fvec4(1.0f, 1.0f, 1.0f, 0.0f)), u.get<0>()), u.get<1>()), u.get<2>()); } -force_inline Ref::simd_fvec4 FetchMoonTex(const int x, const int y) { - return Ref::simd_fvec4{float(__moon_tex[3 * (y * MOON_TEX_W + x) + 0]), +force_inline Ref::fvec4 FetchMoonTex(const int x, const int y) { + return Ref::fvec4{float(__moon_tex[3 * (y * MOON_TEX_W + x) + 0]), float(__moon_tex[3 * (y * MOON_TEX_W + x) + 1]), float(__moon_tex[3 * (y * MOON_TEX_W + x) + 2]), 0.0f}; } -Ref::simd_fvec4 SampleMoonTex(Ref::simd_fvec2 uv) { - uv = uv * Ref::simd_fvec2(MOON_TEX_W, MOON_TEX_H); - auto iuv0 = Ref::simd_ivec2{uv}; - iuv0 = clamp(iuv0, Ref::simd_ivec2{0, 0}, Ref::simd_ivec2{MOON_TEX_W - 1, MOON_TEX_H - 1}); - const Ref::simd_ivec2 iuv1 = (iuv0 + 1) & Ref::simd_ivec2{MOON_TEX_W - 1, MOON_TEX_H - 1}; +Ref::fvec4 SampleMoonTex(Ref::fvec2 uv) { + uv = uv * Ref::fvec2(MOON_TEX_W, MOON_TEX_H); + auto iuv0 = Ref::ivec2{uv}; + iuv0 = clamp(iuv0, Ref::ivec2{0, 0}, Ref::ivec2{MOON_TEX_W - 1, MOON_TEX_H - 1}); + const Ref::ivec2 iuv1 = (iuv0 + 1) & Ref::ivec2{MOON_TEX_W - 1, MOON_TEX_H - 1}; - const Ref::simd_fvec4 m00 = FetchMoonTex(iuv0.get<0>(), iuv0.get<1>()), + const Ref::fvec4 m00 = FetchMoonTex(iuv0.get<0>(), iuv0.get<1>()), m01 = FetchMoonTex(iuv1.get<0>(), iuv0.get<1>()), m10 = FetchMoonTex(iuv0.get<0>(), iuv1.get<1>()), m11 = FetchMoonTex(iuv1.get<0>(), iuv1.get<1>()); - const Ref::simd_fvec2 k = fract(uv); + const Ref::fvec2 k = fract(uv); - const Ref::simd_fvec4 m0 = m01 * k.get<0>() + m00 * (1.0f - k.get<0>()), + const Ref::fvec4 m0 = m01 * k.get<0>() + m00 * (1.0f - k.get<0>()), m1 = m11 * k.get<0>() + m10 * (1.0f - k.get<0>()); return 
srgb_to_rgb((m1 * k.get<1>() + m0 * (1.0f - k.get<1>())) * (1.0f / 255.0f)); } -force_inline Ref::simd_fvec2 FetchCirrusTex(const int x, const int y) { - return Ref::simd_fvec2{float(__cirrus_tex[2 * (y * CIRRUS_TEX_W + x) + 0]), +force_inline Ref::fvec2 FetchCirrusTex(const int x, const int y) { + return Ref::fvec2{float(__cirrus_tex[2 * (y * CIRRUS_TEX_W + x) + 0]), float(__cirrus_tex[2 * (y * CIRRUS_TEX_W + x) + 1])}; } -Ref::simd_fvec2 SampleCirrusTex(Ref::simd_fvec2 uv) { - uv = uv * Ref::simd_fvec2(CIRRUS_TEX_W, CIRRUS_TEX_H); - auto iuv0 = Ref::simd_ivec2{uv}; - iuv0 = clamp(iuv0, Ref::simd_ivec2{0, 0}, Ref::simd_ivec2{CIRRUS_TEX_W - 1, CIRRUS_TEX_H - 1}); - const Ref::simd_ivec2 iuv1 = (iuv0 + 1) & Ref::simd_ivec2{CIRRUS_TEX_W - 1, CIRRUS_TEX_H - 1}; +Ref::fvec2 SampleCirrusTex(Ref::fvec2 uv) { + uv = uv * Ref::fvec2(CIRRUS_TEX_W, CIRRUS_TEX_H); + auto iuv0 = Ref::ivec2{uv}; + iuv0 = clamp(iuv0, Ref::ivec2{0, 0}, Ref::ivec2{CIRRUS_TEX_W - 1, CIRRUS_TEX_H - 1}); + const Ref::ivec2 iuv1 = (iuv0 + 1) & Ref::ivec2{CIRRUS_TEX_W - 1, CIRRUS_TEX_H - 1}; - const Ref::simd_fvec2 m00 = FetchCirrusTex(iuv0.get<0>(), iuv0.get<1>()), + const Ref::fvec2 m00 = FetchCirrusTex(iuv0.get<0>(), iuv0.get<1>()), m01 = FetchCirrusTex(iuv1.get<0>(), iuv0.get<1>()), m10 = FetchCirrusTex(iuv0.get<0>(), iuv1.get<1>()), m11 = FetchCirrusTex(iuv1.get<0>(), iuv1.get<1>()); - const Ref::simd_fvec2 k = fract(uv); + const Ref::fvec2 k = fract(uv); - const Ref::simd_fvec2 m0 = m01 * k.get<0>() + m00 * (1.0f - k.get<0>()), + const Ref::fvec2 m0 = m01 * k.get<0>() + m00 * (1.0f - k.get<0>()), m1 = m11 * k.get<0>() + m10 * (1.0f - k.get<0>()); return srgb_to_rgb((m1 * k.get<1>() + m0 * (1.0f - k.get<1>())) * (1.0f / 255.0f)); @@ -447,18 +447,18 @@ Ref::simd_fvec2 SampleCirrusTex(Ref::simd_fvec2 uv) { } // namespace Ray -Ray::Ref::simd_fvec4 Ray::IntegrateOpticalDepth(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir) { - Ref::simd_fvec2 
intersection = AtmosphereIntersection(params, ray_start, ray_dir); +Ray::Ref::fvec4 Ray::IntegrateOpticalDepth(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir) { + Ref::fvec2 intersection = AtmosphereIntersection(params, ray_start, ray_dir); float ray_length = intersection[1]; const int SampleCount = 64; float step_size = ray_length / SampleCount; - Ref::simd_fvec4 optical_depth = 0.0f; + Ref::fvec4 optical_depth = 0.0f; for (int i = 0; i < SampleCount; i++) { - Ref::simd_fvec4 local_pos = ray_start + ray_dir * (i + 0.5f) * step_size, up_vector; + Ref::fvec4 local_pos = ray_start + ray_dir * (i + 0.5f) * step_size, up_vector; const float local_height = AtmosphereHeight(params, local_pos, up_vector); const atmosphere_medium_t medium = SampleAtmosphereMedium(params, local_height); optical_depth += medium.extinction * step_size; @@ -468,20 +468,20 @@ Ray::Ref::simd_fvec4 Ray::IntegrateOpticalDepth(const atmosphere_params_t ¶m } template -std::pair -Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir, float ray_length, const Ref::simd_fvec4 &light_dir, - const Ref::simd_fvec4 &moon_dir, const Ref::simd_fvec4 &light_color, +std::pair +Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir, float ray_length, const Ref::fvec4 &light_dir, + const Ref::fvec4 &moon_dir, const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, - const float rand_offset, const int sample_count, Ref::simd_fvec4 &inout_transmittance) { - const Ref::simd_fvec2 atm_intersection = AtmosphereIntersection(params, ray_start, ray_dir); + const float rand_offset, const int sample_count, Ref::fvec4 &inout_transmittance) { + const Ref::fvec2 atm_intersection = AtmosphereIntersection(params, ray_start, ray_dir); ray_length = fminf(ray_length, atm_intersection.get<1>()); - const Ref::simd_fvec2 
planet_intersection = PlanetIntersection(params, ray_start, ray_dir); + const Ref::fvec2 planet_intersection = PlanetIntersection(params, ray_start, ray_dir); if (planet_intersection.get<0>() > 0) { ray_length = fminf(ray_length, planet_intersection.get<0>()); } - Ref::simd_fvec4 _unused; + Ref::fvec4 _unused; const float ray_height = AtmosphereHeight(params, ray_start, _unused); const float sample_distribution_exponent = 1.0f + saturate(1.0f - ray_height / params.atmosphere_height) * 8.0f; @@ -493,7 +493,7 @@ Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_ const float phase_uniform = 1.0f / (4.0f * PI); - Ref::simd_fvec4 radiance = 0.0f, multiscat_as_1 = 0.0f; + Ref::fvec4 radiance = 0.0f, multiscat_as_1 = 0.0f; // // Atmosphere @@ -508,36 +508,36 @@ Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_ } const float step_size = (ray_time - prev_ray_time); - const Ref::simd_fvec4 local_position = ray_start + ray_dir * (ray_time - 0.1f * rand_offset * step_size); - Ref::simd_fvec4 up_vector; + const Ref::fvec4 local_position = ray_start + ray_dir * (ray_time - 0.1f * rand_offset * step_size); + Ref::fvec4 up_vector; const float local_height = AtmosphereHeight(params, local_position, up_vector); const atmosphere_medium_t medium = SampleAtmosphereMedium(params, local_height); - const Ref::simd_fvec4 optical_depth = medium.extinction * step_size; - const Ref::simd_fvec4 local_transmittance = exp(-optical_depth); + const Ref::fvec4 optical_depth = medium.extinction * step_size; + const Ref::fvec4 local_transmittance = exp(-optical_depth); - Ref::simd_fvec4 S = 0.0f; + Ref::fvec4 S = 0.0f; if (light_dir.get<1>() > -0.025f) { // main light contribution const float view_zenith_cos_angle = dot(light_dir, up_vector); - const Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); - const Ref::simd_fvec4 light_transmittance = 
SampleTransmittanceLUT(transmittance_lut, uv); + const Ref::fvec4 light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); - const Ref::simd_fvec2 planet_intersection = PlanetIntersection(params, local_position, light_dir); + const Ref::fvec2 planet_intersection = PlanetIntersection(params, local_position, light_dir); const float planet_shadow = planet_intersection.get<0>() > 0 ? 0.0f : 1.0f; - Ref::simd_fvec4 multiscattered_lum = 0.0f; + Ref::fvec4 multiscattered_lum = 0.0f; if (!multiscatter_lut.empty()) { - Ref::simd_fvec2 uv = saturate( - Ref::simd_fvec2(view_zenith_cos_angle * 0.5f + 0.5f, local_height / params.atmosphere_height)); - uv = Ref::simd_fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), + Ref::fvec2 uv = saturate( + Ref::fvec2(view_zenith_cos_angle * 0.5f + 0.5f, local_height / params.atmosphere_height)); + uv = Ref::fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), from_unit_to_sub_uvs(uv.get<1>(), MULTISCATTER_LUT_RES)); multiscattered_lum = SampleMultiscatterLUT(multiscatter_lut, uv); } - const Ref::simd_fvec4 phase_times_scattering = + const Ref::fvec4 phase_times_scattering = UniformPhase ? 
medium.scattering * phase_uniform : medium.scattering_ray * phase_r + medium.scattering_mie * phase_m; S += (planet_shadow * light_transmittance * phase_times_scattering + @@ -546,21 +546,21 @@ Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_ } else if (params.moon_radius > 0.0f) { // moon reflection contribution (totally fake) const float view_zenith_cos_angle = dot(moon_dir, up_vector); - const Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); - const Ref::simd_fvec4 light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); + const Ref::fvec4 light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); - Ref::simd_fvec4 multiscattered_lum = 0.0f; + Ref::fvec4 multiscattered_lum = 0.0f; if (!multiscatter_lut.empty()) { - Ref::simd_fvec2 uv = saturate( - Ref::simd_fvec2(view_zenith_cos_angle * 0.5f + 0.5f, local_height / params.atmosphere_height)); - uv = Ref::simd_fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), + Ref::fvec2 uv = saturate( + Ref::fvec2(view_zenith_cos_angle * 0.5f + 0.5f, local_height / params.atmosphere_height)); + uv = Ref::fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), from_unit_to_sub_uvs(uv.get<1>(), MULTISCATTER_LUT_RES)); multiscattered_lum = SampleMultiscatterLUT(multiscatter_lut, uv); } - const Ref::simd_fvec4 phase_times_scattering = + const Ref::fvec4 phase_times_scattering = medium.scattering_ray * moon_phase_r + medium.scattering_mie * moon_phase_m; S += MoonSunRelation * (light_transmittance * phase_times_scattering + multiscattered_lum * medium.scattering) * light_color; @@ -568,14 +568,14 @@ Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_ // 1 is the integration of luminance over the 4pi of a sphere, and assuming an isotropic phase function // of 1.0/(4*PI) - const Ref::simd_fvec4 MS = medium.scattering * 1.0f; - const Ref::simd_fvec4 
MS_int = (MS - MS * local_transmittance) / medium.extinction; + const Ref::fvec4 MS = medium.scattering * 1.0f; + const Ref::fvec4 MS_int = (MS - MS * local_transmittance) / medium.extinction; multiscat_as_1 += inout_transmittance * MS_int; #if 0 radiance += inout_transmittance * S * step_size; #else - const Ref::simd_fvec4 S_int = (S - S * local_transmittance) / medium.extinction; + const Ref::fvec4 S_int = (S - S * local_transmittance) / medium.extinction; radiance += inout_transmittance * S_int; #endif inout_transmittance *= local_transmittance; @@ -587,48 +587,48 @@ Ray::IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_ // Ground 'floor' // if (planet_intersection.get<0>() > 0) { - const Ref::simd_fvec4 local_position = ray_start + ray_dir * planet_intersection.get<0>(); - Ref::simd_fvec4 up_vector; + const Ref::fvec4 local_position = ray_start + ray_dir * planet_intersection.get<0>(); + Ref::fvec4 up_vector; const float local_height = AtmosphereHeight(params, local_position, up_vector); const float view_zenith_cos_angle = dot(light_dir, up_vector); - const Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); - const Ref::simd_fvec4 light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); - radiance += Ref::simd_fvec4{params.ground_albedo, Ref::simd_mem_aligned} * saturate(dot(up_vector, light_dir)) * + const Ref::fvec4 light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); + radiance += Ref::fvec4{params.ground_albedo, Ref::vector_aligned} * saturate(dot(up_vector, light_dir)) * inout_transmittance * light_transmittance * light_color; } return std::make_pair(radiance, multiscat_as_1); } -template std::pair Ray::IntegrateScatteringMain( - const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, const Ref::simd_fvec4 &ray_dir, - float ray_length, const Ref::simd_fvec4 &light_dir, const Ref::simd_fvec4 &moon_dir, - const 
Ref::simd_fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, - float rand_offset, int sample_count, Ref::simd_fvec4 &inout_transmittance); -template std::pair Ray::IntegrateScatteringMain( - const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, const Ref::simd_fvec4 &ray_dir, - float ray_length, const Ref::simd_fvec4 &light_dir, const Ref::simd_fvec4 &moon_dir, - const Ref::simd_fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, - float rand_offset, int sample_count, Ref::simd_fvec4 &inout_transmittance); -template std::pair Ray::IntegrateScatteringMain( - const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, const Ref::simd_fvec4 &ray_dir, - float ray_length, const Ref::simd_fvec4 &light_dir, const Ref::simd_fvec4 &moon_dir, - const Ref::simd_fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, - float rand_offset, int sample_count, Ref::simd_fvec4 &inout_transmittance); -template std::pair Ray::IntegrateScatteringMain( - const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, const Ref::simd_fvec4 &ray_dir, - float ray_length, const Ref::simd_fvec4 &light_dir, const Ref::simd_fvec4 &moon_dir, - const Ref::simd_fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, - float rand_offset, int sample_count, Ref::simd_fvec4 &inout_transmittance); - -Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, Ref::simd_fvec4 ray_start, - const Ref::simd_fvec4 &ray_dir, float ray_length, - const Ref::simd_fvec4 &light_dir, const float light_angle, - const Ref::simd_fvec4 &light_color, Span transmittance_lut, +template std::pair Ray::IntegrateScatteringMain( + const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, const Ref::fvec4 &ray_dir, + float ray_length, const Ref::fvec4 &light_dir, const Ref::fvec4 &moon_dir, + const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, + float rand_offset, int sample_count, Ref::fvec4 
&inout_transmittance); +template std::pair Ray::IntegrateScatteringMain( + const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, const Ref::fvec4 &ray_dir, + float ray_length, const Ref::fvec4 &light_dir, const Ref::fvec4 &moon_dir, + const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, + float rand_offset, int sample_count, Ref::fvec4 &inout_transmittance); +template std::pair Ray::IntegrateScatteringMain( + const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, const Ref::fvec4 &ray_dir, + float ray_length, const Ref::fvec4 &light_dir, const Ref::fvec4 &moon_dir, + const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, + float rand_offset, int sample_count, Ref::fvec4 &inout_transmittance); +template std::pair Ray::IntegrateScatteringMain( + const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, const Ref::fvec4 &ray_dir, + float ray_length, const Ref::fvec4 &light_dir, const Ref::fvec4 &moon_dir, + const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, + float rand_offset, int sample_count, Ref::fvec4 &inout_transmittance); + +Ray::Ref::fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, Ref::fvec4 ray_start, + const Ref::fvec4 &ray_dir, float ray_length, + const Ref::fvec4 &light_dir, const float light_angle, + const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, uint32_t rand_hash) { - const Ref::simd_fvec2 atm_intersection = AtmosphereIntersection(params, ray_start, ray_dir); + const Ref::fvec2 atm_intersection = AtmosphereIntersection(params, ray_start, ray_dir); ray_length = fminf(ray_length, atm_intersection.get<1>()); if (atm_intersection.get<0>() > 0) { // Advance ray to the atmosphere entry point @@ -636,33 +636,33 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, ray_length -= atm_intersection.get<0>(); } - const Ref::simd_fvec2 planet_intersection = PlanetIntersection(params, ray_start, ray_dir); + 
const Ref::fvec2 planet_intersection = PlanetIntersection(params, ray_start, ray_dir); if (planet_intersection.get<0>() > 0) { ray_length = fminf(ray_length, planet_intersection.get<0>()); } if (ray_length <= 0.0f) { - return Ref::simd_fvec4{0.0f}; + return Ref::fvec4{0.0f}; } - const Ref::simd_fvec2 moon_intersection = MoonIntersection(params, ray_start, ray_dir); - Ref::simd_fvec4 moon_dir = Ref::simd_fvec4{params.moon_dir, Ref::simd_mem_aligned}; - const Ref::simd_fvec4 moon_point = moon_dir * params.moon_distance + 0.5f * light_dir * params.moon_radius; + const Ref::fvec2 moon_intersection = MoonIntersection(params, ray_start, ray_dir); + Ref::fvec4 moon_dir = Ref::fvec4{params.moon_dir, Ref::vector_aligned}; + const Ref::fvec4 moon_point = moon_dir * params.moon_distance + 0.5f * light_dir * params.moon_radius; moon_dir = normalize(moon_point); const float costh = dot(ray_dir, light_dir); - const Ref::simd_fvec4 phase_w = PhaseWrenninge(costh); + const Ref::fvec4 phase_w = PhaseWrenninge(costh); const float moon_costh = dot(ray_dir, moon_dir); - const Ref::simd_fvec4 moon_phase_w = PhaseWrenninge(moon_costh); + const Ref::fvec4 moon_phase_w = PhaseWrenninge(moon_costh); const int PreAtmosphereSampleCount = 8, MainAtmosphereSampleCount = 24, CloudsSampleCount = 128; const float light_brightness = light_color.get<0>() + light_color.get<1>() + light_color.get<2>(); - Ref::simd_fvec4 total_radiance = 0.0f, total_transmittance = 1.0f; + Ref::fvec4 total_radiance = 0.0f, total_transmittance = 1.0f; - const Ref::simd_fvec4 clouds_intersection = CloudsIntersection(params, ray_start, ray_dir); + const Ref::fvec4 clouds_intersection = CloudsIntersection(params, ray_start, ray_dir); // // Atmosphere before clouds @@ -688,33 +688,33 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, light_brightness > 0.0f && ray_dir.get<1>() > CloudsHorizonCutoff) { float clouds_ray_length = fminf(ray_length, clouds_intersection.get<3>()); - Ref::simd_fvec4 
clouds_ray_start = ray_start + ray_dir * clouds_intersection.get<1>(); + Ref::fvec4 clouds_ray_start = ray_start + ray_dir * clouds_intersection.get<1>(); clouds_ray_length -= clouds_intersection.get<1>(); if (clouds_ray_length > 0.0f) { const float step_size = clouds_ray_length / float(CloudsSampleCount); - Ref::simd_fvec4 local_position = clouds_ray_start + ray_dir * Ref::construct_float(rand_hash) * step_size; + Ref::fvec4 local_position = clouds_ray_start + ray_dir * Ref::construct_float(rand_hash) * step_size; rand_hash = Ref::hash(rand_hash); - Ref::simd_fvec4 clouds = 0.0f; + Ref::fvec4 clouds = 0.0f; // NOTE: We assume transmittance is constant along the clouds range (~500m) - Ref::simd_fvec4 light_transmittance, moon_transmittance, multiscattered_lum = 0.0f, + Ref::fvec4 light_transmittance, moon_transmittance, multiscattered_lum = 0.0f, moon_multiscattered_lum = 0.0f; { - Ref::simd_fvec4 up_vector; + Ref::fvec4 up_vector; const float local_height = AtmosphereHeight(params, local_position, up_vector); { const float view_zenith_cos_angle = dot(light_dir, up_vector); - const Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); if (!multiscatter_lut.empty()) { - Ref::simd_fvec2 uv = saturate(Ref::simd_fvec2(view_zenith_cos_angle * 0.5f + 0.5f, + Ref::fvec2 uv = saturate(Ref::fvec2(view_zenith_cos_angle * 0.5f + 0.5f, local_height / params.atmosphere_height)); - uv = Ref::simd_fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), + uv = Ref::fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), from_unit_to_sub_uvs(uv.get<1>(), MULTISCATTER_LUT_RES)); multiscattered_lum = SampleMultiscatterLUT(multiscatter_lut, uv); @@ -722,14 +722,14 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, } { const float view_zenith_cos_angle = dot(moon_dir, up_vector); - const 
Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); moon_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); if (!multiscatter_lut.empty()) { - Ref::simd_fvec2 uv = saturate(Ref::simd_fvec2(view_zenith_cos_angle * 0.5f + 0.5f, + Ref::fvec2 uv = saturate(Ref::fvec2(view_zenith_cos_angle * 0.5f + 0.5f, local_height / params.atmosphere_height)); - uv = Ref::simd_fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), + uv = Ref::fvec2(from_unit_to_sub_uvs(uv.get<0>(), MULTISCATTER_LUT_RES), from_unit_to_sub_uvs(uv.get<1>(), MULTISCATTER_LUT_RES)); moon_multiscattered_lum = SampleMultiscatterLUT(multiscatter_lut, uv); @@ -737,11 +737,11 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, } } - Ref::simd_fvec4 transmittance_before = total_transmittance; + Ref::fvec4 transmittance_before = total_transmittance; for (int i = 0; i < CloudsSampleCount /*&& total_transmittance.get<1>() > 0.001f*/; ++i) { float local_height, height_fraction; - Ref::simd_fvec4 up_vector; + Ref::fvec4 up_vector; const float local_density = GetCloudsDensity(params, local_position, local_height, height_fraction, up_vector); if (local_density > 0.0f) { @@ -750,7 +750,7 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, if (light_dir.get<1>() > -0.025f) { // main light contribution - const Ref::simd_fvec2 planet_intersection = + const Ref::fvec2 planet_intersection = PlanetIntersection(params, local_position, light_dir); const float planet_shadow = planet_intersection.get<0>() > 0 ? 
0.0f : 1.0f; const float cloud_shadow = TraceCloudShadow(params, rand_hash, local_position, light_dir); @@ -788,12 +788,12 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, // if (planet_intersection.get<0>() < 0 && clouds_intersection.get<1>() > 0 && params.cirrus_clouds_amount > 0.0f && light_brightness > 0.0f) { - Ref::simd_fvec2 cirrus_coords = - 3e-4f * Ref::simd_fvec2{params.clouds_offset_z, params.clouds_offset_x} + - 0.8f * (Ref::simd_fvec2{ray_dir.get<2>(), ray_dir.get<0>()}) / (fabsf(ray_dir.get<1>()) + 0.02f); + Ref::fvec2 cirrus_coords = + 3e-4f * Ref::fvec2{params.clouds_offset_z, params.clouds_offset_x} + + 0.8f * (Ref::fvec2{ray_dir.get<2>(), ray_dir.get<0>()}) / (fabsf(ray_dir.get<1>()) + 0.02f); cirrus_coords.set<1>(cirrus_coords.get<1>() + 1.75f); - float noise_read = 1.0f - Sample3dNoiseTex(fract(Ref::simd_fvec4{0.0f, cirrus_coords.get<0>() * 0.03f, + float noise_read = 1.0f - Sample3dNoiseTex(fract(Ref::fvec4{0.0f, cirrus_coords.get<0>() * 0.03f, cirrus_coords.get<1>() * 0.03f, 0.0f})); noise_read = saturate(noise_read - 1.0f + params.cirrus_clouds_amount * 0.6f) / (params.cirrus_clouds_amount + 1e-9f); @@ -802,28 +802,28 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, // cirrus_coords.set<0>(cirrus_coords.get<0>() + 0.25f); - noise_read = 1.0f - Sample3dNoiseTex(fract(Ref::simd_fvec4{0.7f, cirrus_coords.get<0>() * 0.02f, + noise_read = 1.0f - Sample3dNoiseTex(fract(Ref::fvec4{0.7f, cirrus_coords.get<0>() * 0.02f, cirrus_coords.get<1>() * 0.02f, 0.0f})); noise_read = saturate(noise_read - 1.0f + params.cirrus_clouds_amount * 0.7f) / (params.cirrus_clouds_amount + 1e-9f); dC += 0.6f * smoothstep(0.0f, 1.0f, noise_read) * SampleCirrusTex(fract(cirrus_coords * 0.25f)).get<1>(); - Ref::simd_fvec4 local_position = ray_start + ray_dir * params.cirrus_clouds_height; + Ref::fvec4 local_position = ray_start + ray_dir * params.cirrus_clouds_height; - Ref::simd_fvec4 light_transmittance, 
moon_transmittance; + Ref::fvec4 light_transmittance, moon_transmittance; { - Ref::simd_fvec4 up_vector; + Ref::fvec4 up_vector; const float local_height = AtmosphereHeight(params, local_position, up_vector); { const float view_zenith_cos_angle = dot(light_dir, up_vector); - const Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); light_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); } { const float view_zenith_cos_angle = dot(moon_dir, up_vector); - const Ref::simd_fvec2 uv = + const Ref::fvec2 uv = LutTransmittanceParamsToUv(params, local_height + params.planet_radius, view_zenith_cos_angle); moon_transmittance = SampleTransmittanceLUT(transmittance_lut, uv); } @@ -844,7 +844,7 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, // if (planet_intersection.get<0>() < 0 && light_brightness > 0.0f) { float main_ray_length = ray_length; - Ref::simd_fvec4 main_ray_start = ray_start + ray_dir * clouds_intersection.get<3>(); + Ref::fvec4 main_ray_start = ray_start + ray_dir * clouds_intersection.get<3>(); main_ray_length -= clouds_intersection.get<1>(); const float rand_offset = Ref::construct_float(rand_hash); @@ -862,7 +862,7 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, if (light_angle > 0.0f && planet_intersection.get<0>() < 0.0f && light_brightness > 0.0f) { const float cos_theta = cosf(light_angle); const float BlendVal = 0.000005f; - Ref::simd_fvec4 sun_disk = total_transmittance * smoothstep(cos_theta - BlendVal, cos_theta + BlendVal, costh); + Ref::fvec4 sun_disk = total_transmittance * smoothstep(cos_theta - BlendVal, cos_theta + BlendVal, costh); // 'de-multiply' by disk area (to get original brightness) const float radius = tanf(light_angle); sun_disk /= (PI * radius * radius); @@ -885,9 +885,9 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, // if 
(planet_intersection.get<0>() < 0 && moon_intersection.get<0>() > 0 && params.moon_radius > 0.0f && light_brightness > 0.0f) { - const Ref::simd_fvec4 moon_center = - Ref::simd_fvec4{params.moon_dir, Ref::simd_mem_aligned} * params.moon_distance; - const Ref::simd_fvec4 moon_normal = normalize(ray_start + moon_intersection.get<0>() * ray_dir - moon_center); + const Ref::fvec4 moon_center = + Ref::fvec4{params.moon_dir, Ref::vector_aligned} * params.moon_distance; + const Ref::fvec4 moon_normal = normalize(ray_start + moon_intersection.get<0>() * ray_dir - moon_center); const float theta = acosf(clamp(moon_normal.get<1>(), -1.0f, 1.0f)) / PI; @@ -901,8 +901,8 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, const float u = Ref::fract(0.5f * phi / PI); - const Ref::simd_fvec2 uvs = Ref::simd_fvec2(u, theta); - const Ref::simd_fvec4 albedo = SampleMoonTex(uvs); + const Ref::fvec2 uvs = Ref::fvec2(u, theta); + const Ref::fvec4 albedo = SampleMoonTex(uvs); total_radiance += total_transmittance * fmaxf(dot(moon_normal, light_dir), 0.0f) * albedo; } @@ -910,7 +910,7 @@ Ray::Ref::simd_fvec4 Ray::IntegrateScattering(const atmosphere_params_t ¶ms, return total_radiance; } -void Ray::UvToLutTransmittanceParams(const atmosphere_params_t ¶ms, Ref::simd_fvec2 uv, float &view_height, +void Ray::UvToLutTransmittanceParams(const atmosphere_params_t ¶ms, Ref::fvec2 uv, float &view_height, float &view_zenith_cos_angle) { const float top_radius = params.planet_radius + params.atmosphere_height; @@ -927,7 +927,7 @@ void Ray::UvToLutTransmittanceParams(const atmosphere_params_t ¶ms, Ref::sim view_zenith_cos_angle = clamp(view_zenith_cos_angle, -1.0f, 1.0f); } -Ray::Ref::simd_fvec2 Ray::LutTransmittanceParamsToUv(const atmosphere_params_t ¶ms, const float view_height, +Ray::Ref::fvec2 Ray::LutTransmittanceParamsToUv(const atmosphere_params_t ¶ms, const float view_height, const float view_zenith_cos_angle) { const float top_radius = params.planet_radius + 
params.atmosphere_height; @@ -944,5 +944,5 @@ Ray::Ref::simd_fvec2 Ray::LutTransmittanceParamsToUv(const atmosphere_params_t & const float x_mu = (d - d_min) / (d_max - d_min); const float x_r = rho / H; - return Ref::simd_fvec2{x_mu, x_r}; + return Ref::fvec2{x_mu, x_r}; } diff --git a/internal/Atmosphere.h b/internal/Atmosphere.h index c686804cf..dd7644bd5 100644 --- a/internal/Atmosphere.h +++ b/internal/Atmosphere.h @@ -15,27 +15,27 @@ force_inline float from_sub_uvs_to_unit(float u, float resolution) { return (u - 0.5f / resolution) * (resolution / (resolution - 1.0f)); } -Ref::simd_fvec4 IntegrateOpticalDepth(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir); +Ref::fvec4 IntegrateOpticalDepth(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir); template -std::pair -IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::simd_fvec4 &ray_start, - const Ref::simd_fvec4 &ray_dir, float ray_length, const Ref::simd_fvec4 &light_dir, - const Ref::simd_fvec4 &moon_dir, const Ref::simd_fvec4 &light_color, +std::pair +IntegrateScatteringMain(const atmosphere_params_t ¶ms, const Ref::fvec4 &ray_start, + const Ref::fvec4 &ray_dir, float ray_length, const Ref::fvec4 &light_dir, + const Ref::fvec4 &moon_dir, const Ref::fvec4 &light_color, Span transmittance_lut, Span multiscatter_lut, float rand_offset, - int sample_count, Ref::simd_fvec4 &inout_transmittance); + int sample_count, Ref::fvec4 &inout_transmittance); -Ref::simd_fvec4 IntegrateScattering(const atmosphere_params_t ¶ms, Ref::simd_fvec4 ray_start, - const Ref::simd_fvec4 &ray_dir, float ray_length, const Ref::simd_fvec4 &light_dir, - float light_angle, const Ref::simd_fvec4 &light_color, +Ref::fvec4 IntegrateScattering(const atmosphere_params_t ¶ms, Ref::fvec4 ray_start, + const Ref::fvec4 &ray_dir, float ray_length, const Ref::fvec4 &light_dir, + float light_angle, const Ref::fvec4 &light_color, Span 
transmittance_lut, Span multiscatter_lut, uint32_t rand_hash); // Transmittance LUT function parameterisation from Bruneton 2017 // https://github.com/ebruneton/precomputed_atmospheric_scattering -void UvToLutTransmittanceParams(const atmosphere_params_t ¶ms, Ref::simd_fvec2 uv, float &view_height, +void UvToLutTransmittanceParams(const atmosphere_params_t ¶ms, Ref::fvec2 uv, float &view_height, float &view_zenith_cos_angle); -Ref::simd_fvec2 LutTransmittanceParamsToUv(const atmosphere_params_t ¶ms, float view_height, +Ref::fvec2 LutTransmittanceParamsToUv(const atmosphere_params_t ¶ms, float view_height, float view_zenith_cos_angle); } // namespace Ray diff --git a/internal/BVHSplit.cpp b/internal/BVHSplit.cpp index 217f05b9c..2e903f20d 100644 --- a/internal/BVHSplit.cpp +++ b/internal/BVHSplit.cpp @@ -15,27 +15,27 @@ const float SpatialSplitAlpha = 0.00001f; const int NumSpatialSplitBins = 256; struct bbox_t { - Ref::simd_fvec4 min = {FLT_MAX}, max = {-FLT_MAX}; + Ref::fvec4 min = {FLT_MAX}, max = {-FLT_MAX}; bbox_t() = default; - bbox_t(const Ref::simd_fvec4 &_min, const Ref::simd_fvec4 &_max) : min(_min), max(_max) {} + bbox_t(const Ref::fvec4 &_min, const Ref::fvec4 &_max) : min(_min), max(_max) {} float surface_area() const { return surface_area(min, max); } - static float surface_area(const Ref::simd_fvec4 &min, const Ref::simd_fvec4 &max) { - const Ref::simd_fvec4 e = max - min; + static float surface_area(const Ref::fvec4 &min, const Ref::fvec4 &max) { + const Ref::fvec4 e = max - min; return 2 * (e.get<0>() + e.get<1>() + e.get<2>()); // return e[0] * e[1] + e[0] * e[2] + e[1] * e[2]; } }; // stolen from Mitsuba -static int sutherland_hodgman(const Ref::simd_dvec3 *input, const int in_count, Ref::simd_dvec3 *output, const int axis, +static int sutherland_hodgman(const Ref::dvec3 *input, const int in_count, Ref::dvec3 *output, const int axis, const double split_pos, const bool is_minimum) { if (in_count < 3) { return 0; } - Ref::simd_dvec3 cur = input[0]; 
+ Ref::dvec3 cur = input[0]; const double sign = is_minimum ? 1.0 : -1.0; double distance = sign * (cur[axis] - split_pos); bool cur_is_inside = (distance >= 0); @@ -46,7 +46,7 @@ static int sutherland_hodgman(const Ref::simd_dvec3 *input, const int in_count, if (nextIdx == in_count) { nextIdx = 0; } - const Ref::simd_dvec3 &next = input[nextIdx]; + const Ref::dvec3 &next = input[nextIdx]; distance = sign * (next[axis] - split_pos); bool next_is_inside = (distance >= 0); @@ -56,13 +56,13 @@ static int sutherland_hodgman(const Ref::simd_dvec3 *input, const int in_count, } else if (cur_is_inside && !next_is_inside) { // Going outside -- add the intersection double t = (split_pos - cur[axis]) / (next[axis] - cur[axis]); - Ref::simd_dvec3 p = cur + (next - cur) * t; + Ref::dvec3 p = cur + (next - cur) * t; p.set(axis, split_pos); // Avoid roundoff errors output[out_count++] = p; } else if (!cur_is_inside && next_is_inside) { // Coming back inside -- add the intersection + next vertex double t = (split_pos - cur[axis]) / (next[axis] - cur[axis]); - Ref::simd_dvec3 &p = output[out_count++]; + Ref::dvec3 &p = output[out_count++]; p = cur + (next - cur) * t; p.set(axis, split_pos); // Avoid roundoff errors output[out_count++] = next; @@ -105,9 +105,9 @@ force_inline float castflt_up(const double val) { return a; } -bbox_t GetClippedAABB(const Ref::simd_fvec3 &_v0, const Ref::simd_fvec3 &_v1, const Ref::simd_fvec3 &_v2, +bbox_t GetClippedAABB(const Ref::fvec3 &_v0, const Ref::fvec3 &_v1, const Ref::fvec3 &_v2, const bbox_t &limits) { - Ref::simd_dvec3 vertices1[9], vertices2[9]; + Ref::dvec3 vertices1[9], vertices2[9]; int vertex_count = 3; vertices1[0] = {double(_v0[0]), double(_v0[1]), double(_v0[2])}; @@ -134,9 +134,9 @@ bbox_t GetClippedAABB(const Ref::simd_fvec3 &_v0, const Ref::simd_fvec3 &_v1, co } // namespace Ray Ray::split_data_t Ray::SplitPrimitives_SAH(const prim_t *primitives, Span prim_indices, - const vtx_attribute_t &positions, const Ref::simd_fvec4 
&bbox_min, - const Ref::simd_fvec4 &bbox_max, const Ref::simd_fvec4 &root_min, - const Ref::simd_fvec4 &root_max, const bvh_settings_t &s) { + const vtx_attribute_t &positions, const Ref::fvec4 &bbox_min, + const Ref::fvec4 &bbox_max, const Ref::fvec4 &root_min, + const Ref::fvec4 &root_max, const bvh_settings_t &s) { const int num_prims = int(prim_indices.size()); const bbox_t whole_box = {bbox_min, bbox_max}; @@ -148,9 +148,9 @@ Ray::split_data_t Ray::SplitPrimitives_SAH(const prim_t *primitives, Span(0); if (s.allow_spatial_splits && test.all_zeros() && @@ -379,9 +379,9 @@ Ray::split_data_t Ray::SplitPrimitives_SAH(const prim_t *primitives, Span left_indices, right_indices; - Ref::simd_fvec4 left_bounds[2], right_bounds[2]; + Ref::fvec4 left_bounds[2], right_bounds[2]; }; split_data_t SplitPrimitives_SAH(const prim_t *primitives, Span prim_indices, - const vtx_attribute_t &positions, const Ref::simd_fvec4 &bbox_min, - const Ref::simd_fvec4 &bbox_max, const Ref::simd_fvec4 &root_min, - const Ref::simd_fvec4 &root_max, const bvh_settings_t &s); + const vtx_attribute_t &positions, const Ref::fvec4 &bbox_min, + const Ref::fvec4 &bbox_max, const Ref::fvec4 &root_min, + const Ref::fvec4 &root_max, const bvh_settings_t &s); } // namespace Ray \ No newline at end of file diff --git a/internal/Convolution.h b/internal/Convolution.h index cde2b96d3..4b38f6c67 100644 --- a/internal/Convolution.h +++ b/internal/Convolution.h @@ -134,43 +134,40 @@ void Convolution3x3_Direct_ProcessRows(int y, const float *__restrict data, cons index(y + 8, x - 1)}; for (int i = 0; i < OutChannels; ++i) { - simd_fvec val[RowsPortion] = {}; + fvec val[RowsPortion] = {}; for (int j = 0; j < 3 * InChannels; j += S) { if (RowsPortion == 8) { UNROLLED_FOR(k, 8, { val[k % RowsPortion] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 0 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 0] + j]}, val[k % RowsPortion]); + fmadd(fvec{&weights[i * InChannels * 9 + 0 * InChannels + j], 
vector_aligned}, + fvec{&data[ii[k + 0] + j]}, val[k % RowsPortion]); val[k % RowsPortion] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 3 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 1] + j]}, val[k % RowsPortion]); + fmadd(fvec{&weights[i * InChannels * 9 + 3 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 1] + j]}, val[k % RowsPortion]); val[k % RowsPortion] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 6 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 2] + j]}, val[k % RowsPortion]); + fmadd(fvec{&weights[i * InChannels * 9 + 6 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 2] + j]}, val[k % RowsPortion]); }) } else if (RowsPortion == 4) { UNROLLED_FOR(k, 4, { val[k % RowsPortion] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 0 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 0] + j]}, val[k % RowsPortion]); + fmadd(fvec{&weights[i * InChannels * 9 + 0 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 0] + j]}, val[k % RowsPortion]); val[k % RowsPortion] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 3 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 1] + j]}, val[k % RowsPortion]); + fmadd(fvec{&weights[i * InChannels * 9 + 3 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 1] + j]}, val[k % RowsPortion]); val[k % RowsPortion] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 6 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 2] + j]}, val[k % RowsPortion]); + fmadd(fvec{&weights[i * InChannels * 9 + 6 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 2] + j]}, val[k % RowsPortion]); }) } else { for (int k = 0; k < RowsPortion; ++k) { - val[k] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 0 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 0] + j]}, val[k]); - val[k] = - fmadd(simd_fvec{&weights[i * InChannels * 9 + 3 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 1] + j]}, val[k]); - val[k] = - 
fmadd(simd_fvec{&weights[i * InChannels * 9 + 6 * InChannels + j], simd_mem_aligned}, - simd_fvec{&data[ii[k + 2] + j]}, val[k]); + val[k] = fmadd(fvec{&weights[i * InChannels * 9 + 0 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 0] + j]}, val[k]); + val[k] = fmadd(fvec{&weights[i * InChannels * 9 + 3 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 1] + j]}, val[k]); + val[k] = fmadd(fvec{&weights[i * InChannels * 9 + 6 * InChannels + j], vector_aligned}, + fvec{&data[ii[k + 2] + j]}, val[k]); } } } @@ -238,32 +235,32 @@ void ConvolutionConcat3x3_Direct_ProcessRows(int y, const float *__restrict data }; for (int i = 0; i < OutChannels; ++i) { - simd_fvec val[8] = {}; + fvec val[8] = {}; const float *p_weights = &weights[i * (InChannels1 + InChannels2) * 9]; for (int j = 0; j < InChannels1; j += S) { UNROLLED_FOR(k, 8, { if (k < RowsPortion) { - val[k] = fmadd(simd_fvec{&p_weights[0 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 0] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[1 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 0] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[2 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 0] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); - - val[k] = fmadd(simd_fvec{&p_weights[3 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 1] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[4 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 1] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[5 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 1] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); - - val[k] = fmadd(simd_fvec{&p_weights[6 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 2] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); - 
val[k] = fmadd(simd_fvec{&p_weights[7 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 2] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[8 * InChannels1 + j], simd_mem_aligned}, - simd_fvec{&data1[ii1[k + 2] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[0 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 0] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[1 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 0] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[2 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 0] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); + + val[k] = fmadd(fvec{&p_weights[3 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 1] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[4 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 1] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[5 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 1] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); + + val[k] = fmadd(fvec{&p_weights[6 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 2] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[7 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 2] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[8 * InChannels1 + j], vector_aligned}, + fvec{&data1[ii1[k + 2] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); } }) } @@ -271,30 +268,30 @@ void ConvolutionConcat3x3_Direct_ProcessRows(int y, const float *__restrict data for (int j = 0; j < 3 * InChannels2; j += S) { if (RowsPortion == 8) { UNROLLED_FOR(k, 8, { - val[k] = fmadd(simd_fvec{&p_weights[0 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 0] + j]}, val[k]); - val[k] = 
fmadd(simd_fvec{&p_weights[3 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 1] + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[6 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 2] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[0 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 0] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[3 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 1] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[6 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 2] + j]}, val[k]); }) } else if (RowsPortion == 4) { UNROLLED_FOR(k, 4, { - val[k] = fmadd(simd_fvec{&p_weights[0 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 0] + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[3 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 1] + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[6 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 2] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[0 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 0] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[3 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 1] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[6 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 2] + j]}, val[k]); }) } else { for (int k = 0; k < RowsPortion; ++k) { - val[k] = fmadd(simd_fvec{&p_weights[0 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 0] + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[3 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 1] + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[6 * InChannels2 + j], simd_mem_aligned}, - simd_fvec{&data2[ii2[k + 2] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[0 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 0] + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[3 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 1] + j]}, val[k]); 
+ val[k] = fmadd(fvec{&p_weights[6 * InChannels2 + j], vector_aligned}, + fvec{&data2[ii2[k + 2] + j]}, val[k]); } } } @@ -400,12 +397,11 @@ void Convolution3x3_GEMM(const float data1[], const float data2[], const float d } for (int i = 0; i < OutChannels; ++i) { - simd_fvec val = 0.0f; + fvec val = 0.0f; int j = 0; for (; j < InChannels * 9 - S + 1; j += S) { - val = fmadd(simd_fvec{&weights[i * InChannels * 9 + j]}, - simd_fvec{&input[j], simd_mem_aligned}, val); + val = fmadd(fvec{&weights[i * InChannels * 9 + j]}, fvec{&input[j], vector_aligned}, val); } float final_val = biases[i]; @@ -493,73 +489,73 @@ void ConvolutionConcat3x3_GEMM(const float *__restrict data1, const float *__res if ((InChannels1 % S) == 0 && InChannels1 >= S && (InChannels2 % S) == 0 && InChannels2 >= S) { for (int i = 0; i < OutChannels; ++i) { - simd_fvec val[3] = {0.0f}; + fvec val[3] = {0.0f}; for (int j = 0; j < InChannels1 * 9; j += S * 9) { - val[0] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 0 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 0 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 1 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 1 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 2 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 2 * S], simd_mem_aligned}, val[2]); - val[0] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 3 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 3 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 4 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 4 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 5 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 5 * S], simd_mem_aligned}, val[2]); - val[0] = fmadd( - simd_fvec{&weights[i * 
(InChannels1 + InChannels2) * 9 + j + 6 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 6 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 7 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 7 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 8 * S], simd_mem_aligned}, - simd_fvec{&input1[j + 8 * S], simd_mem_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 0 * S], vector_aligned}, + fvec{&input1[j + 0 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 1 * S], vector_aligned}, + fvec{&input1[j + 1 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 2 * S], vector_aligned}, + fvec{&input1[j + 2 * S], vector_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 3 * S], vector_aligned}, + fvec{&input1[j + 3 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 4 * S], vector_aligned}, + fvec{&input1[j + 4 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 5 * S], vector_aligned}, + fvec{&input1[j + 5 * S], vector_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 6 * S], vector_aligned}, + fvec{&input1[j + 6 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 7 * S], vector_aligned}, + fvec{&input1[j + 7 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 8 * S], vector_aligned}, + fvec{&input1[j + 8 * S], vector_aligned}, val[2]); } for (int j = 0; j < InChannels2 * 9; j += S * 9) { - val[0] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 0 * S], - 
simd_mem_aligned}, - simd_fvec{&input2[j + 0 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 1 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 1 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 2 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 2 * S], simd_mem_aligned}, val[2]); - val[0] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 3 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 3 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 4 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 4 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 5 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 5 * S], simd_mem_aligned}, val[2]); - val[0] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 6 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 6 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 7 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 7 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 8 * S], - simd_mem_aligned}, - simd_fvec{&input2[j + 8 * S], simd_mem_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 0 * S], + vector_aligned}, + fvec{&input2[j + 0 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 1 * S], + vector_aligned}, + fvec{&input2[j + 1 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + 
InChannels1 * 9 + j + 2 * S], + vector_aligned}, + fvec{&input2[j + 2 * S], vector_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 3 * S], + vector_aligned}, + fvec{&input2[j + 3 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 4 * S], + vector_aligned}, + fvec{&input2[j + 4 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 5 * S], + vector_aligned}, + fvec{&input2[j + 5 * S], vector_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 6 * S], + vector_aligned}, + fvec{&input2[j + 6 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 7 * S], + vector_aligned}, + fvec{&input2[j + 7 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 8 * S], + vector_aligned}, + fvec{&input2[j + 8 * S], vector_aligned}, val[2]); } val[0] += val[1]; @@ -575,46 +571,46 @@ void ConvolutionConcat3x3_GEMM(const float *__restrict data1, const float *__res } } else if ((InChannels1 % S) == 0 && InChannels1 >= S && InChannels2 == 3 && S <= 8) { for (int i = 0; i < OutChannels; ++i) { - simd_fvec val[3] = {0.0f, 0.0f, 0.0f}; + fvec val[3] = {0.0f, 0.0f, 0.0f}; for (int j = 0; j < InChannels1 * 9; j += S * 9) { - val[0] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 0 * S]}, - simd_fvec{&input1[j + 0 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 1 * S]}, - simd_fvec{&input1[j + 1 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 2 * S]}, - simd_fvec{&input1[j + 2 * S], simd_mem_aligned}, val[2]); - val[0] = fmadd(simd_fvec{&weights[i 
* (InChannels1 + InChannels2) * 9 + j + 3 * S]}, - simd_fvec{&input1[j + 3 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 4 * S]}, - simd_fvec{&input1[j + 4 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 5 * S]}, - simd_fvec{&input1[j + 5 * S], simd_mem_aligned}, val[2]); - val[0] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 6 * S]}, - simd_fvec{&input1[j + 6 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 7 * S]}, - simd_fvec{&input1[j + 7 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 8 * S]}, - simd_fvec{&input1[j + 8 * S], simd_mem_aligned}, val[2]); + val[0] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 0 * S]}, + fvec{&input1[j + 0 * S], vector_aligned}, val[0]); + val[1] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 1 * S]}, + fvec{&input1[j + 1 * S], vector_aligned}, val[1]); + val[2] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 2 * S]}, + fvec{&input1[j + 2 * S], vector_aligned}, val[2]); + val[0] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 3 * S]}, + fvec{&input1[j + 3 * S], vector_aligned}, val[0]); + val[1] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 4 * S]}, + fvec{&input1[j + 4 * S], vector_aligned}, val[1]); + val[2] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 5 * S]}, + fvec{&input1[j + 5 * S], vector_aligned}, val[2]); + val[0] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 6 * S]}, + fvec{&input1[j + 6 * S], vector_aligned}, val[0]); + val[1] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 7 * S]}, + fvec{&input1[j + 7 * S], vector_aligned}, val[1]); + val[2] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + j + 8 
* S]}, + fvec{&input1[j + 8 * S], vector_aligned}, val[2]); } int j = 0; for (; j < InChannels2 * 9 - S; j += S * 3) { - val[0] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 0 * S]}, - simd_fvec{&input2[j + 0 * S], simd_mem_aligned}, val[0]); - val[1] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 1 * S]}, - simd_fvec{&input2[j + 1 * S], simd_mem_aligned}, val[1]); - val[2] = fmadd( - simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 2 * S]}, - simd_fvec{&input2[j + 2 * S], simd_mem_aligned}, val[2]); + val[0] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 0 * S]}, + fvec{&input2[j + 0 * S], vector_aligned}, val[0]); + val[1] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 1 * S]}, + fvec{&input2[j + 1 * S], vector_aligned}, val[1]); + val[2] = + fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j + 2 * S]}, + fvec{&input2[j + 2 * S], vector_aligned}, val[2]); } - simd_fvec last_input = 0.0f; + fvec last_input = 0.0f; last_input.template set<0>(input2[j + 0]); last_input.template set<1>(input2[j + 1]); last_input.template set<2>(input2[j + 2]); - val[0] = fmadd(simd_fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j]}, + val[0] = fmadd(fvec{&weights[i * (InChannels1 + InChannels2) * 9 + InChannels1 * 9 + j]}, last_input, val[0]); val[0] += val[1]; @@ -778,32 +774,32 @@ void ConvolutionConcat3x3_1Direct_2GEMM_ProcessRows(int y, const float data1[], const int InChannels234 = (InChannels2 + InChannels3 + InChannels4); for (int i = 0; i < OutChannels; ++i) { - simd_fvec val[8] = {}; + fvec val[8] = {}; const float *p_weights = &weights[i * (InChannels1 + InChannels234) * 9]; for (int j = 0; j < InChannels1; j += S) { UNROLLED_FOR(k, 8, { if (k < RowsPortion) { - val[k] = fmadd(simd_fvec{&p_weights[0 * InChannels1 + j]}, - 
simd_fvec{&data1[ii1[k + 0] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[1 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 0] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[2 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 0] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); - - val[k] = fmadd(simd_fvec{&p_weights[3 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 1] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[4 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 1] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[5 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 1] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); - - val[k] = fmadd(simd_fvec{&p_weights[6 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 2] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[7 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 2] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); - val[k] = fmadd(simd_fvec{&p_weights[8 * InChannels1 + j]}, - simd_fvec{&data1[ii1[k + 2] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[0 * InChannels1 + j]}, + fvec{&data1[ii1[k + 0] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[1 * InChannels1 + j]}, + fvec{&data1[ii1[k + 0] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[2 * InChannels1 + j]}, + fvec{&data1[ii1[k + 0] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); + + val[k] = fmadd(fvec{&p_weights[3 * InChannels1 + j]}, + fvec{&data1[ii1[k + 1] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[4 * InChannels1 + j]}, + fvec{&data1[ii1[k + 1] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[5 * InChannels1 + j]}, + fvec{&data1[ii1[k + 1] + ((add + 2) / div1) * InChannels1 + 
j]}, val[k]); + + val[k] = fmadd(fvec{&p_weights[6 * InChannels1 + j]}, + fvec{&data1[ii1[k + 2] + ((add + 0) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[7 * InChannels1 + j]}, + fvec{&data1[ii1[k + 2] + ((add + 1) / div1) * InChannels1 + j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[8 * InChannels1 + j]}, + fvec{&data1[ii1[k + 2] + ((add + 2) / div1) * InChannels1 + j]}, val[k]); } }) } @@ -814,19 +810,19 @@ void ConvolutionConcat3x3_1Direct_2GEMM_ProcessRows(int y, const float data1[], for (; j < InChannels234 * 9 - S + 1; j += S) { UNROLLED_FOR(k, 8, { if (k < RowsPortion) { - val[k] = fmadd(simd_fvec{&p_weights[j]}, simd_fvec{&input234[k][j]}, val[k]); + val[k] = fmadd(fvec{&p_weights[j]}, fvec{&input234[k][j]}, val[k]); } }) } for (int k = 0; k < RowsPortion; ++k) { - simd_fvec last_input = 0.0f; + fvec last_input = 0.0f; UNROLLED_FOR(l, 16, { if (l < ((InChannels234 * 9) % S)) { last_input.template set(input234[k][j + l]); } }) - val[k] = fmadd(simd_fvec{&p_weights[j]}, last_input, val[k]); + val[k] = fmadd(fvec{&p_weights[j]}, last_input, val[k]); float final_val = biases[i] + hsum(val[k]); if (Activation == eActivation::ReLU) { diff --git a/internal/Core.cpp b/internal/Core.cpp index d32617be8..1fa63f684 100644 --- a/internal/Core.cpp +++ b/internal/Core.cpp @@ -14,7 +14,7 @@ namespace Ray { #include "precomputed/__pmj02_samples.inl" -force_inline Ref::simd_fvec3 cross(const Ref::simd_fvec3 &v1, const Ref::simd_fvec3 &v2) { +force_inline Ref::fvec3 cross(const Ref::fvec3 &v1, const Ref::fvec3 &v2) { return {v1[1] * v2[2] - v1[2] * v2[1], v1[2] * v2[0] - v1[0] * v2[2], v1[0] * v2[1] - v1[1] * v2[0]}; } @@ -243,7 +243,7 @@ uint32_t Ray::PreprocessMesh(const vtx_attribute_t &positions, Span &out_nodes) { if (bit_index == -1 || prim_count < 8) { - Ref::simd_fvec4 bbox_min = {FLT_MAX}, bbox_max = {-FLT_MAX}; + Ref::fvec4 bbox_min = {FLT_MAX}, bbox_max = {-FLT_MAX}; for (uint32_t i = prim_index; i < prim_index + prim_count; i++) { 
bbox_min = min(bbox_min, prims[indices[i]].bbox_min); @@ -387,7 +387,7 @@ uint32_t Ray::EmitLBVH(const prim_t *prims, const uint32_t *indices, const uint3 proc_item_t &cur = proc_stack[stack_size - 1]; if (cur.bit_index == -1 || cur.prim_count < 8) { - Ref::simd_fvec4 bbox_min = {FLT_MAX}, bbox_max = {-FLT_MAX}; + Ref::fvec4 bbox_min = {FLT_MAX}, bbox_max = {-FLT_MAX}; for (uint32_t i = cur.prim_index; i < cur.prim_index + cur.prim_count; i++) { bbox_min = min(bbox_min, prims[indices[i]].bbox_min); @@ -465,9 +465,9 @@ uint32_t Ray::PreprocessPrims_SAH(Span prims, const vtx_attribute_ std::vector &out_nodes, std::vector &out_indices) { struct prims_coll_t { std::vector indices; - Ref::simd_fvec4 min = {FLT_MAX, FLT_MAX, FLT_MAX, 0.0f}, max = {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f}; + Ref::fvec4 min = {FLT_MAX, FLT_MAX, FLT_MAX, 0.0f}, max = {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f}; prims_coll_t() = default; - prims_coll_t(std::vector &&_indices, const Ref::simd_fvec4 &_min, const Ref::simd_fvec4 &_max) + prims_coll_t(std::vector &&_indices, const Ref::fvec4 &_min, const Ref::fvec4 &_max) : indices(std::move(_indices)), min(_min), max(_max) {} }; @@ -483,7 +483,7 @@ uint32_t Ray::PreprocessPrims_SAH(Span prims, const vtx_attribute_ prim_lists.back().max = max(prim_lists.back().max, prims[j].bbox_max); } - Ref::simd_fvec4 root_min = prim_lists.back().min, root_max = prim_lists.back().max; + Ref::fvec4 root_min = prim_lists.back().min, root_max = prim_lists.back().max; while (!prim_lists.empty()) { split_data_t split_data = @@ -492,7 +492,7 @@ uint32_t Ray::PreprocessPrims_SAH(Span prims, const vtx_attribute_ prim_lists.pop_back(); if (split_data.right_indices.empty()) { - Ref::simd_fvec4 bbox_min = split_data.left_bounds[0], bbox_max = split_data.left_bounds[1]; + Ref::fvec4 bbox_min = split_data.left_bounds[0], bbox_max = split_data.left_bounds[1]; out_nodes.emplace_back(); bvh_node_t &n = out_nodes.back(); @@ -506,10 +506,10 @@ uint32_t Ray::PreprocessPrims_SAH(Span prims, 
const vtx_attribute_ const auto index = uint32_t(num_nodes); uint32_t space_axis = 0; - const Ref::simd_fvec4 c_left = (split_data.left_bounds[0] + split_data.left_bounds[1]) / 2.0f, + const Ref::fvec4 c_left = (split_data.left_bounds[0] + split_data.left_bounds[1]) / 2.0f, c_right = (split_data.right_bounds[0] + split_data.right_bounds[1]) / 2.0f; - const Ref::simd_fvec4 dist = abs(c_left - c_right); + const Ref::fvec4 dist = abs(c_left - c_right); if (dist.get<0>() > dist.get<1>() && dist.get<0>() > dist.get<2>()) { space_axis = 0; @@ -519,7 +519,7 @@ uint32_t Ray::PreprocessPrims_SAH(Span prims, const vtx_attribute_ space_axis = 2; } - const Ref::simd_fvec4 bbox_min = min(split_data.left_bounds[0], split_data.right_bounds[0]), + const Ref::fvec4 bbox_min = min(split_data.left_bounds[0], split_data.right_bounds[0]), bbox_max = max(split_data.left_bounds[1], split_data.right_bounds[1]); out_nodes.emplace_back(); @@ -544,7 +544,7 @@ uint32_t Ray::PreprocessPrims_HLBVH(Span prims, std::vector &out_indices) { std::vector morton_codes(prims.size()); - Ref::simd_fvec4 whole_min = {FLT_MAX, FLT_MAX, FLT_MAX, 0.0f}, whole_max = {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f}; + Ref::fvec4 whole_min = {FLT_MAX, FLT_MAX, FLT_MAX, 0.0f}, whole_max = {-FLT_MAX, -FLT_MAX, -FLT_MAX, 0.0f}; const auto indices_start = uint32_t(out_indices.size()); out_indices.reserve(out_indices.size() + prims.size()); @@ -558,12 +558,12 @@ uint32_t Ray::PreprocessPrims_HLBVH(Span prims, std::vector()), y = uint32_t(code.get<1>()), z = uint32_t(code.get<2>()); @@ -766,22 +766,22 @@ uint32_t Ray::FlattenBVH_r(const bvh_node_t *nodes, const uint32_t node_index, c } // Sort children in morton order - Ref::simd_fvec3 children_centers[8], whole_box_min = {FLT_MAX}, whole_box_max = {-FLT_MAX}; + Ref::fvec3 children_centers[8], whole_box_min = {FLT_MAX}, whole_box_max = {-FLT_MAX}; for (int i = 0; i < children_count; i++) { children_centers[i] = - 0.5f * (Ref::simd_fvec3{nodes[children[i]].bbox_min} + 
Ref::simd_fvec3{nodes[children[i]].bbox_max}); + 0.5f * (Ref::fvec3{nodes[children[i]].bbox_min} + Ref::fvec3{nodes[children[i]].bbox_max}); whole_box_min = min(whole_box_min, children_centers[i]); whole_box_max = max(whole_box_max, children_centers[i]); } - whole_box_max += Ref::simd_fvec3{0.001f}; + whole_box_max += Ref::fvec3{0.001f}; - const Ref::simd_fvec3 scale = 2.0f / (whole_box_max - whole_box_min); + const Ref::fvec3 scale = 2.0f / (whole_box_max - whole_box_min); uint32_t sorted_children[8] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}; for (int i = 0; i < children_count; i++) { - Ref::simd_fvec3 code = (children_centers[i] - whole_box_min) * scale; + Ref::fvec3 code = (children_centers[i] - whole_box_min) * scale; const auto x = uint32_t(code[0]), y = uint32_t(code[1]), z = uint32_t(code[2]); @@ -910,22 +910,22 @@ uint32_t Ray::FlattenBVH_r(const light_bvh_node_t *nodes, const uint32_t node_in } // Sort children in morton order - Ref::simd_fvec3 children_centers[8], whole_box_min = {FLT_MAX}, whole_box_max = {-FLT_MAX}; + Ref::fvec3 children_centers[8], whole_box_min = {FLT_MAX}, whole_box_max = {-FLT_MAX}; for (int i = 0; i < children_count; i++) { children_centers[i] = - 0.5f * (Ref::simd_fvec3{nodes[children[i]].bbox_min} + Ref::simd_fvec3{nodes[children[i]].bbox_max}); + 0.5f * (Ref::fvec3{nodes[children[i]].bbox_min} + Ref::fvec3{nodes[children[i]].bbox_max}); whole_box_min = min(whole_box_min, children_centers[i]); whole_box_max = max(whole_box_max, children_centers[i]); } - whole_box_max += Ref::simd_fvec3{0.001f}; + whole_box_max += Ref::fvec3{0.001f}; - const Ref::simd_fvec3 scale = 2.0f / (whole_box_max - whole_box_min); + const Ref::fvec3 scale = 2.0f / (whole_box_max - whole_box_min); uint32_t sorted_children[8] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}; for (int i = 0; i < children_count; i++) { - Ref::simd_fvec3 code = 
(children_centers[i] - whole_box_min) * scale; + Ref::fvec3 code = (children_centers[i] - whole_box_min) * scale; const auto x = uint32_t(code[0]), y = uint32_t(code[1]), z = uint32_t(code[2]); @@ -1023,7 +1023,7 @@ void Ray::ConstructCamera(const eCamType type, const ePixelFilter filter, const const float lens_rotation, const float lens_ratio, const int lens_blades, const float clip_start, const float clip_end, camera_t *cam) { if (type == eCamType::Persp) { - auto o = Ref::simd_fvec3{origin}, f = Ref::simd_fvec3{fwd}, u = Ref::simd_fvec3{up}; + auto o = Ref::fvec3{origin}, f = Ref::fvec3{fwd}, u = Ref::fvec3{up}; if (u.length2() < FLT_EPS) { if (fabsf(f[1]) >= 0.999f) { @@ -1033,7 +1033,7 @@ void Ray::ConstructCamera(const eCamType type, const ePixelFilter filter, const } } - const Ref::simd_fvec3 s = normalize(cross(f, u)); + const Ref::fvec3 s = normalize(cross(f, u)); u = cross(s, f); cam->type = type; diff --git a/internal/CoreDX.h b/internal/CoreDX.h index ec35a8706..0dbf7a7e5 100644 --- a/internal/CoreDX.h +++ b/internal/CoreDX.h @@ -20,7 +20,7 @@ #pragma message("Ray::Ref::simd_vec will not use SIMD") #endif -#include "simd/simd_vec.h" +#include "simd/simd.h" #undef USE_SSE2 #undef USE_NEON diff --git a/internal/CoreRef.cpp b/internal/CoreRef.cpp index 4e8808f8a..4b48481ce 100644 --- a/internal/CoreRef.cpp +++ b/internal/CoreRef.cpp @@ -58,29 +58,29 @@ force_inline void IntersectTri(const float ro[3], const float rd[3], const tri_a force_inline void IntersectTri(const float ro[3], const float rd[3], const mtri_accel_t &tri, const uint32_t prim_index, hit_data_t &inter) { #if VECTORIZE_TRI_INTERSECTION - simd_ivec4 _mask = 0, _prim_index; - simd_fvec4 _t = inter.t, _u, _v; + ivec4 _mask = 0, _prim_index; + fvec4 _t = inter.t, _u, _v; for (int i = 0; i < 8; i += 4) { - simd_fvec4 det = rd[0] * simd_fvec4{&tri.n_plane[0][i], simd_mem_aligned} + - rd[1] * simd_fvec4{&tri.n_plane[1][i], simd_mem_aligned} + - rd[2] * simd_fvec4{&tri.n_plane[2][i], 
simd_mem_aligned}; - const simd_fvec4 dett = simd_fvec4{&tri.n_plane[3][i], simd_mem_aligned} - - ro[0] * simd_fvec4{&tri.n_plane[0][i], simd_mem_aligned} - - ro[1] * simd_fvec4{&tri.n_plane[1][i], simd_mem_aligned} - - ro[2] * simd_fvec4{&tri.n_plane[2][i], simd_mem_aligned}; + fvec4 det = rd[0] * fvec4{&tri.n_plane[0][i], vector_aligned} + + rd[1] * fvec4{&tri.n_plane[1][i], vector_aligned} + + rd[2] * fvec4{&tri.n_plane[2][i], vector_aligned}; + const fvec4 dett = fvec4{&tri.n_plane[3][i], vector_aligned} - + ro[0] * fvec4{&tri.n_plane[0][i], vector_aligned} - + ro[1] * fvec4{&tri.n_plane[1][i], vector_aligned} - + ro[2] * fvec4{&tri.n_plane[2][i], vector_aligned}; // compare sign bits - simd_ivec4 is_active_lane = ~srai(simd_cast(dett ^ (det * _t - dett)), 31); + ivec4 is_active_lane = ~srai(simd_cast(dett ^ (det * _t - dett)), 31); if (is_active_lane.all_zeros()) { continue; } - const simd_fvec4 p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; + const fvec4 p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; - const simd_fvec4 detu = p[0] * simd_fvec4{&tri.u_plane[0][i], simd_mem_aligned} + - p[1] * simd_fvec4{&tri.u_plane[1][i], simd_mem_aligned} + - p[2] * simd_fvec4{&tri.u_plane[2][i], simd_mem_aligned} + - det * simd_fvec4{&tri.u_plane[3][i], simd_mem_aligned}; + const fvec4 detu = p[0] * fvec4{&tri.u_plane[0][i], vector_aligned} + + p[1] * fvec4{&tri.u_plane[1][i], vector_aligned} + + p[2] * fvec4{&tri.u_plane[2][i], vector_aligned} + + det * fvec4{&tri.u_plane[3][i], vector_aligned}; // compare sign bits is_active_lane &= ~srai(simd_cast(detu ^ (det - detu)), 31); @@ -88,10 +88,10 @@ force_inline void IntersectTri(const float ro[3], const float rd[3], const mtri_ continue; } - const simd_fvec4 detv = p[0] * simd_fvec4{&tri.v_plane[0][i], simd_mem_aligned} + - p[1] * simd_fvec4{&tri.v_plane[1][i], simd_mem_aligned} + - p[2] * simd_fvec4{&tri.v_plane[2][i], 
simd_mem_aligned} + - det * simd_fvec4{&tri.v_plane[3][i], simd_mem_aligned}; + const fvec4 detv = p[0] * fvec4{&tri.v_plane[0][i], vector_aligned} + + p[1] * fvec4{&tri.v_plane[1][i], vector_aligned} + + p[2] * fvec4{&tri.v_plane[2][i], vector_aligned} + + det * fvec4{&tri.v_plane[3][i], vector_aligned}; // compare sign bits is_active_lane &= ~srai(simd_cast(detv ^ (det - detu - detv)), 31); @@ -100,10 +100,10 @@ force_inline void IntersectTri(const float ro[3], const float rd[3], const mtri_ } where(~is_active_lane, det) = FLT_EPS; - const simd_fvec4 rdet = (1.0f / det); + const fvec4 rdet = (1.0f / det); - simd_ivec4 prim = -(int(prim_index) + i + simd_ivec4{0, 1, 2, 3}) - 1; - where(det < 0.0f, prim) = int(prim_index) + i + simd_ivec4{0, 1, 2, 3}; + ivec4 prim = -(int(prim_index) + i + ivec4{0, 1, 2, 3}) - 1; + where(det < 0.0f, prim) = int(prim_index) + i + ivec4{0, 1, 2, 3}; _mask |= is_active_lane; where(is_active_lane, _prim_index) = prim; @@ -230,12 +230,12 @@ force_inline bool bbox_test(const float p[3], const bvh_node_t &node) { force_inline long bbox_test_oct(const float p[3], const wbvh_node_t &node) { long mask = 0; UNROLLED_FOR_R(i, 2, { // NOLINT - const simd_fvec4 fmask = (simd_fvec4{&node.bbox_min[0][4 * i], simd_mem_aligned} <= p[0]) & - (simd_fvec4{&node.bbox_min[1][4 * i], simd_mem_aligned} <= p[1]) & - (simd_fvec4{&node.bbox_min[2][4 * i], simd_mem_aligned} <= p[2]) & - (simd_fvec4{&node.bbox_max[0][4 * i], simd_mem_aligned} >= p[0]) & - (simd_fvec4{&node.bbox_max[1][4 * i], simd_mem_aligned} >= p[1]) & - (simd_fvec4{&node.bbox_max[2][4 * i], simd_mem_aligned} >= p[2]); + const fvec4 fmask = (fvec4{&node.bbox_min[0][4 * i], vector_aligned} <= p[0]) & + (fvec4{&node.bbox_min[1][4 * i], vector_aligned} <= p[1]) & + (fvec4{&node.bbox_min[2][4 * i], vector_aligned} <= p[2]) & + (fvec4{&node.bbox_max[0][4 * i], vector_aligned} >= p[0]) & + (fvec4{&node.bbox_max[1][4 * i], vector_aligned} >= p[1]) & + (fvec4{&node.bbox_max[2][4 * i], vector_aligned} 
>= p[2]); mask <<= 4; mask |= simd_cast(fmask).movemask(); }) @@ -289,28 +289,28 @@ force_inline long bbox_test_oct(const float o[3], const float inv_d[3], const fl float out_dist[8]) { long mask = 0; #if VECTORIZE_BBOX_INTERSECTION - simd_fvec4 lo, hi, tmin, tmax; + fvec4 lo, hi, tmin, tmax; UNROLLED_FOR_R(i, 2, { // NOLINT - lo = inv_d[0] * (simd_fvec4{&node.bbox_min[0][4 * i], simd_mem_aligned} - o[0]); - hi = inv_d[0] * (simd_fvec4{&node.bbox_max[0][4 * i], simd_mem_aligned} - o[0]); + lo = inv_d[0] * (fvec4{&node.bbox_min[0][4 * i], vector_aligned} - o[0]); + hi = inv_d[0] * (fvec4{&node.bbox_max[0][4 * i], vector_aligned} - o[0]); tmin = min(lo, hi); tmax = max(lo, hi); - lo = inv_d[1] * (simd_fvec4{&node.bbox_min[1][4 * i], simd_mem_aligned} - o[1]); - hi = inv_d[1] * (simd_fvec4{&node.bbox_max[1][4 * i], simd_mem_aligned} - o[1]); + lo = inv_d[1] * (fvec4{&node.bbox_min[1][4 * i], vector_aligned} - o[1]); + hi = inv_d[1] * (fvec4{&node.bbox_max[1][4 * i], vector_aligned} - o[1]); tmin = max(tmin, min(lo, hi)); tmax = min(tmax, max(lo, hi)); - lo = inv_d[2] * (simd_fvec4{&node.bbox_min[2][4 * i], simd_mem_aligned} - o[2]); - hi = inv_d[2] * (simd_fvec4{&node.bbox_max[2][4 * i], simd_mem_aligned} - o[2]); + lo = inv_d[2] * (fvec4{&node.bbox_min[2][4 * i], vector_aligned} - o[2]); + hi = inv_d[2] * (fvec4{&node.bbox_max[2][4 * i], vector_aligned} - o[2]); tmin = max(tmin, min(lo, hi)); tmax = min(tmax, max(lo, hi)); tmax *= 1.00000024f; - const simd_fvec4 fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); + const fvec4 fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); mask <<= 4; mask |= simd_cast(fmask).movemask(); - tmin.store_to(&out_dist[4 * i], simd_mem_aligned); + tmin.store_to(&out_dist[4 * i], vector_aligned); }) // NOLINT #else UNROLLED_FOR(i, 8, { // NOLINT @@ -499,13 +499,13 @@ force_inline int clamp(const int val, const int min, const int max) { return val < min ? min : (val > max ? 
max : val); } -force_inline simd_fvec4 cross(const simd_fvec4 &v1, const simd_fvec4 &v2) { - return simd_fvec4{v1.get<1>() * v2.get<2>() - v1.get<2>() * v2.get<1>(), +force_inline fvec4 cross(const fvec4 &v1, const fvec4 &v2) { + return fvec4{v1.get<1>() * v2.get<2>() - v1.get<2>() * v2.get<1>(), v1.get<2>() * v2.get<0>() - v1.get<0>() * v2.get<2>(), v1.get<0>() * v2.get<1>() - v1.get<1>() * v2.get<0>(), 0.0f}; } -force_inline simd_fvec4 reflect(const simd_fvec4 &I, const simd_fvec4 &N, const float dot_N_I) { +force_inline fvec4 reflect(const fvec4 &I, const fvec4 &N, const float dot_N_I) { return I - 2 * dot_N_I * N; } @@ -557,13 +557,13 @@ force_inline void radix_sort(ray_chunk_t *begin, ray_chunk_t *end, ray_chunk_t * _radix_sort_lsb(begin, end, begin1, 24); } -force_inline simd_fvec4 YCoCg_to_RGB(const simd_fvec4 &col) { +force_inline fvec4 YCoCg_to_RGB(const fvec4 &col) { const float scale = (col.get<2>() * (255.0f / 8.0f)) + 1.0f; const float Y = col.get<3>(); const float Co = (col.get<0>() - (0.5f * 256.0f / 255.0f)) / scale; const float Cg = (col.get<1>() - (0.5f * 256.0f / 255.0f)) / scale; - simd_fvec4 col_rgb = 1.0f; + fvec4 col_rgb = 1.0f; col_rgb.set<0>(Y + Co - Cg); col_rgb.set<1>(Y + Cg); col_rgb.set<2>(Y - Co - Cg); @@ -616,7 +616,7 @@ force_inline float safe_div_neg(const float a, const float b) { #endif } -force_inline simd_fvec4 safe_normalize(const simd_fvec4 &a) { +force_inline fvec4 safe_normalize(const fvec4 &a) { #if USE_SAFE_MATH const float l = length(a); return l > 0.0f ? 
(a / l) : a; @@ -627,23 +627,23 @@ force_inline simd_fvec4 safe_normalize(const simd_fvec4 &a) { #define sqr(x) ((x) * (x)) -force_inline float lum(const simd_fvec3 &color) { +force_inline float lum(const fvec3 &color) { return 0.212671f * color.get<0>() + 0.715160f * color.get<1>() + 0.072169f * color.get<2>(); } -force_inline float lum(const simd_fvec4 &color) { +force_inline float lum(const fvec4 &color) { return 0.212671f * color.get<0>() + 0.715160f * color.get<1>() + 0.072169f * color.get<2>(); } -float get_texture_lod(const Cpu::TexStorageBase *const textures[], const uint32_t index, const simd_fvec2 &duv_dx, - const simd_fvec2 &duv_dy) { +float get_texture_lod(const Cpu::TexStorageBase *const textures[], const uint32_t index, const fvec2 &duv_dx, + const fvec2 &duv_dy) { #ifdef FORCE_TEXTURE_LOD const float lod = float(FORCE_TEXTURE_LOD); #else - simd_fvec2 sz; + fvec2 sz; textures[index >> 28]->GetFRes(index & 0x00ffffff, 0, value_ptr(sz)); - const simd_fvec2 _duv_dx = duv_dx * sz, _duv_dy = duv_dy * sz; - const simd_fvec2 _diagonal = _duv_dx + _duv_dy; + const fvec2 _duv_dx = duv_dx * sz, _duv_dy = duv_dy * sz; + const fvec2 _diagonal = _duv_dx + _duv_dy; // Find minimal dimention of parallelogram const float min_length2 = fminf(fminf(_duv_dx.length2(), _duv_dy.length2()), _diagonal.length2()); @@ -659,7 +659,7 @@ float get_texture_lod(const Cpu::TexStorageBase *const textures[], const uint32_ #ifdef FORCE_TEXTURE_LOD const float lod = float(FORCE_TEXTURE_LOD); #else - simd_fvec2 res; + fvec2 res; textures[index >> 28]->GetFRes(index & 0x00ffffff, 0, value_ptr(res)); // Find lod float lod = lambda + 0.5f * fast_log2(res.get<0>() * res.get<1>()); @@ -719,11 +719,11 @@ float fresnel_dielectric_cos(float cosi, float eta) { return result; } -force_inline simd_fvec2 calc_alpha(const float roughness, const float anisotropy, const float regularize_alpha) { +force_inline fvec2 calc_alpha(const float roughness, const float anisotropy, const float regularize_alpha) 
{ const float roughness2 = sqr(roughness); const float aspect = sqrtf(1.0f - 0.9f * anisotropy); - simd_fvec2 alpha = {roughness2 / aspect, roughness2 * aspect}; + fvec2 alpha = {roughness2 / aspect, roughness2 * aspect}; where(alpha < regularize_alpha, alpha) = clamp(2 * alpha, 0.25f * regularize_alpha, regularize_alpha); return alpha; } @@ -747,24 +747,24 @@ force_inline float int_as_float(const int32_t v) { return ret.f; } -simd_fvec4 offset_ray(const simd_fvec4 &p, const simd_fvec4 &n) { +fvec4 offset_ray(const fvec4 &p, const fvec4 &n) { const float Origin = 1.0f / 32.0f; const float FloatScale = 1.0f / 65536.0f; const float IntScale = 128.0f; // 256.0f; - const simd_ivec4 of_i(IntScale * n); + const ivec4 of_i(IntScale * n); - const simd_fvec4 p_i( + const fvec4 p_i( int_as_float(float_as_int(p.get<0>()) + ((p.get<0>() < 0.0f) ? -of_i.get<0>() : of_i.get<0>())), int_as_float(float_as_int(p.get<1>()) + ((p.get<1>() < 0.0f) ? -of_i.get<1>() : of_i.get<1>())), int_as_float(float_as_int(p.get<2>()) + ((p.get<2>() < 0.0f) ? -of_i.get<2>() : of_i.get<2>())), 0.0f); - return simd_fvec4{fabsf(p.get<0>()) < Origin ? (p.get<0>() + FloatScale * n.get<0>()) : p_i.get<0>(), + return fvec4{fabsf(p.get<0>()) < Origin ? (p.get<0>() + FloatScale * n.get<0>()) : p_i.get<0>(), fabsf(p.get<1>()) < Origin ? (p.get<1>() + FloatScale * n.get<1>()) : p_i.get<1>(), fabsf(p.get<2>()) < Origin ? 
(p.get<2>() + FloatScale * n.get<2>()) : p_i.get<2>(), 0.0f}; } -simd_fvec3 sample_GTR1(const float rgh, const float r1, const float r2) { +fvec3 sample_GTR1(const float rgh, const float r1, const float r2) { const float a = fmaxf(0.001f, rgh); const float a2 = sqr(a); @@ -774,10 +774,10 @@ simd_fvec3 sample_GTR1(const float rgh, const float r1, const float r2) { const float sinTheta = sqrtf(fmaxf(0.0f, 1.0f - (cosTheta * cosTheta))); const float sinPhi = sinf(phi), cosPhi = cosf(phi); - return simd_fvec3{sinTheta * cosPhi, sinTheta * sinPhi, cosTheta}; + return fvec3{sinTheta * cosPhi, sinTheta * sinPhi, cosTheta}; } -simd_fvec3 SampleGGX_NDF(const float rgh, const float r1, const float r2) { +fvec3 SampleGGX_NDF(const float rgh, const float r1, const float r2) { const float a = fmaxf(0.001f, rgh); const float phi = r1 * (2.0f * PI); @@ -786,16 +786,16 @@ simd_fvec3 SampleGGX_NDF(const float rgh, const float r1, const float r2) { const float sinTheta = saturate(sqrtf(1.0f - (cosTheta * cosTheta))); const float sinPhi = sinf(phi), cosPhi = cosf(phi); - return simd_fvec3{sinTheta * cosPhi, sinTheta * sinPhi, cosTheta}; + return fvec3{sinTheta * cosPhi, sinTheta * sinPhi, cosTheta}; } // http://jcgt.org/published/0007/04/01/paper.pdf -simd_fvec4 SampleVNDF_Hemisphere_CrossSect(const simd_fvec4 &Vh, float U1, float U2) { +fvec4 SampleVNDF_Hemisphere_CrossSect(const fvec4 &Vh, float U1, float U2) { // orthonormal basis (with special case if cross product is zero) const float lensq = sqr(Vh.get<0>()) + sqr(Vh.get<1>()); - const simd_fvec4 T1 = lensq > 0.0f ? simd_fvec4(-Vh.get<1>(), Vh.get<0>(), 0.0f, 0.0f) / sqrtf(lensq) - : simd_fvec4(1.0f, 0.0f, 0.0f, 0.0f); - const simd_fvec4 T2 = cross(Vh, T1); + const fvec4 T1 = lensq > 0.0f ? 
fvec4(-Vh.get<1>(), Vh.get<0>(), 0.0f, 0.0f) / sqrtf(lensq) + : fvec4(1.0f, 0.0f, 0.0f, 0.0f); + const fvec4 T2 = cross(Vh, T1); // parameterization of the projected area const float r = sqrtf(U1); const float phi = 2.0f * PI * U2; @@ -804,31 +804,31 @@ simd_fvec4 SampleVNDF_Hemisphere_CrossSect(const simd_fvec4 &Vh, float U1, float const float s = 0.5f * (1.0f + Vh.get<2>()); t2 = (1.0f - s) * sqrtf(1.0f - t1 * t1) + s * t2; // reprojection onto hemisphere - const simd_fvec4 Nh = t1 * T1 + t2 * T2 + sqrtf(fmaxf(0.0f, 1.0f - t1 * t1 - t2 * t2)) * Vh; + const fvec4 Nh = t1 * T1 + t2 * T2 + sqrtf(fmaxf(0.0f, 1.0f - t1 * t1 - t2 * t2)) * Vh; // normalization will be done later return Nh; } // https://arxiv.org/pdf/2306.05044.pdf -simd_fvec4 SampleVNDF_Hemisphere_SphCap(const simd_fvec4 &Vh, const simd_fvec2 alpha, const simd_fvec2 rand) { +fvec4 SampleVNDF_Hemisphere_SphCap(const fvec4 &Vh, const fvec2 alpha, const fvec2 rand) { // sample a spherical cap in (-Vh.z, 1] const float phi = 2.0f * PI * rand.get<0>(); const float z = fma(1.0f - rand.get<1>(), 1.0f + Vh.get<2>(), -Vh.get<2>()); const float sin_theta = sqrtf(saturate(1.0f - z * z)); const float x = sin_theta * cosf(phi); const float y = sin_theta * sinf(phi); - const simd_fvec4 c = simd_fvec4{x, y, z, 0.0f}; + const fvec4 c = fvec4{x, y, z, 0.0f}; // normalization will be done later return c + Vh; } // https://gpuopen.com/download/publications/Bounded_VNDF_Sampling_for_Smith-GGX_Reflections.pdf -simd_fvec4 SampleVNDF_Hemisphere_SphCap_Bounded(const simd_fvec4 &Ve, const simd_fvec4 &Vh, const simd_fvec2 alpha, - const simd_fvec2 rand) { +fvec4 SampleVNDF_Hemisphere_SphCap_Bounded(const fvec4 &Ve, const fvec4 &Vh, const fvec2 alpha, + const fvec2 rand) { // sample a spherical cap in (-Vh.z, 1] const float phi = 2.0f * PI * rand.get<0>(); const float a = saturate(fminf(alpha.get<0>(), alpha.get<1>())); - const float s = 1.0f + length(simd_fvec2{Ve.get<0>(), Ve.get<1>()}); + const float s = 1.0f + 
length(fvec2{Ve.get<0>(), Ve.get<1>()}); const float a2 = a * a, s2 = s * s; const float k = (1.0f - a2) * s2 / (s2 + a2 * Ve.get<2>() * Ve.get<2>()); const float b = (Ve.get<2>() > 0.0f) ? k * Vh.get<2>() : Vh.get<2>(); @@ -836,7 +836,7 @@ simd_fvec4 SampleVNDF_Hemisphere_SphCap_Bounded(const simd_fvec4 &Ve, const simd const float sin_theta = sqrtf(saturate(1.0f - z * z)); const float x = sin_theta * cosf(phi); const float y = sin_theta * sinf(phi); - const simd_fvec4 c = simd_fvec4{x, y, z, 0.0f}; + const fvec4 c = fvec4{x, y, z, 0.0f}; // normalization will be done later return c + Vh; } @@ -845,37 +845,37 @@ simd_fvec4 SampleVNDF_Hemisphere_SphCap_Bounded(const simd_fvec4 &Ve, const simd // Input alpha_x, alpha_y: roughness parameters // Input U1, U2: uniform random numbers // Output Ne: normal sampled with PDF D_Ve(Ne) = G1(Ve) * max(0, dot(Ve, Ne)) * D(Ne) / Ve.z -simd_fvec4 SampleGGX_VNDF(const simd_fvec4 &Ve, simd_fvec2 alpha, simd_fvec2 rand) { +fvec4 SampleGGX_VNDF(const fvec4 &Ve, fvec2 alpha, fvec2 rand) { // transforming the view direction to the hemisphere configuration - const simd_fvec4 Vh = - normalize(simd_fvec4(alpha.get<0>() * Ve.get<0>(), alpha.get<1>() * Ve.get<1>(), Ve.get<2>(), 0.0f)); + const fvec4 Vh = + normalize(fvec4(alpha.get<0>() * Ve.get<0>(), alpha.get<1>() * Ve.get<1>(), Ve.get<2>(), 0.0f)); // sample the hemisphere - const simd_fvec4 Nh = SampleVNDF_Hemisphere_SphCap(Vh, alpha, rand); + const fvec4 Nh = SampleVNDF_Hemisphere_SphCap(Vh, alpha, rand); // transforming the normal back to the ellipsoid configuration - const simd_fvec4 Ne = normalize( - simd_fvec4(alpha.get<0>() * Nh.get<0>(), alpha.get<1>() * Nh.get<1>(), fmaxf(0.0f, Nh.get<2>()), 0.0f)); + const fvec4 Ne = normalize( + fvec4(alpha.get<0>() * Nh.get<0>(), alpha.get<1>() * Nh.get<1>(), fmaxf(0.0f, Nh.get<2>()), 0.0f)); return Ne; } -simd_fvec4 SampleGGX_VNDF_Bounded(const simd_fvec4 &Ve, simd_fvec2 alpha, simd_fvec2 rand) { +fvec4 SampleGGX_VNDF_Bounded(const fvec4 &Ve, 
fvec2 alpha, fvec2 rand) { // transforming the view direction to the hemisphere configuration - const simd_fvec4 Vh = - normalize(simd_fvec4(alpha.get<0>() * Ve.get<0>(), alpha.get<1>() * Ve.get<1>(), Ve.get<2>(), 0.0f)); + const fvec4 Vh = + normalize(fvec4(alpha.get<0>() * Ve.get<0>(), alpha.get<1>() * Ve.get<1>(), Ve.get<2>(), 0.0f)); // sample the hemisphere - const simd_fvec4 Nh = SampleVNDF_Hemisphere_SphCap_Bounded(Ve, Vh, alpha, rand); + const fvec4 Nh = SampleVNDF_Hemisphere_SphCap_Bounded(Ve, Vh, alpha, rand); // transforming the normal back to the ellipsoid configuration - const simd_fvec4 Ne = normalize( - simd_fvec4(alpha.get<0>() * Nh.get<0>(), alpha.get<1>() * Nh.get<1>(), fmaxf(0.0f, Nh.get<2>()), 0.0f)); + const fvec4 Ne = normalize( + fvec4(alpha.get<0>() * Nh.get<0>(), alpha.get<1>() * Nh.get<1>(), fmaxf(0.0f, Nh.get<2>()), 0.0f)); return Ne; } -float GGX_VNDF_Reflection_Bounded_PDF(const float D, const simd_fvec4 &view_dir_ts, const simd_fvec2 alpha) { - const simd_fvec2 ai = alpha * simd_fvec2{view_dir_ts.get<0>(), view_dir_ts.get<1>()}; +float GGX_VNDF_Reflection_Bounded_PDF(const float D, const fvec4 &view_dir_ts, const fvec2 alpha) { + const fvec2 ai = alpha * fvec2{view_dir_ts.get<0>(), view_dir_ts.get<1>()}; const float len2 = dot(ai, ai); const float t = sqrtf(len2 + view_dir_ts.get<2>() * view_dir_ts.get<2>()); if (view_dir_ts.get<2>() >= 0.0f) { const float a = saturate(fminf(alpha.get<0>(), alpha.get<1>())); - const float s = 1.0f + length(simd_fvec2{view_dir_ts.get<0>(), view_dir_ts.get<1>()}); + const float s = 1.0f + length(fvec2{view_dir_ts.get<0>(), view_dir_ts.get<1>()}); const float a2 = a * a, s2 = s * s; const float k = (1.0f - a2) * s2 / (s2 + a2 * view_dir_ts.get<2>() * view_dir_ts.get<2>()); return D / (2.0f * (k * view_dir_ts.get<2>() + t)); @@ -884,7 +884,7 @@ float GGX_VNDF_Reflection_Bounded_PDF(const float D, const simd_fvec4 &view_dir_ } // Smith shadowing function -force_inline float G1(const simd_fvec4 &Ve, 
simd_fvec2 alpha) { +force_inline float G1(const fvec4 &Ve, fvec2 alpha) { alpha *= alpha; const float delta = (-1.0f + sqrtf(1.0f + safe_div_pos(alpha.get<0>() * sqr(Ve.get<0>()) + alpha.get<1>() * sqr(Ve.get<1>()), @@ -914,7 +914,7 @@ float D_GTR2(const float N_dot_H, const float a) { return a2 / (PI * t * t); } -float D_GGX(const simd_fvec4 &H, const simd_fvec2 alpha) { +float D_GGX(const fvec4 &H, const fvec2 alpha) { if (H.get<2>() == 0.0f) { return 0.0f; } @@ -925,15 +925,15 @@ float D_GGX(const simd_fvec4 &H, const simd_fvec2 alpha) { return 1.0f / (sqr(s1) * PI * alpha.get<0>() * alpha.get<1>() * cos_theta_h4); } -void create_tbn_matrix(const simd_fvec4 &N, simd_fvec4 out_TBN[3]) { - simd_fvec4 U; +void create_tbn_matrix(const fvec4 &N, fvec4 out_TBN[3]) { + fvec4 U; if (fabsf(N.get<1>()) < 0.999f) { U = {0.0f, 1.0f, 0.0f, 0.0f}; } else { U = {1.0f, 0.0f, 0.0f, 0.0f}; } - simd_fvec4 T = normalize(cross(U, N)); + fvec4 T = normalize(cross(U, N)); U = cross(N, T); out_TBN[0].set<0>(T.get<0>()); @@ -949,8 +949,8 @@ void create_tbn_matrix(const simd_fvec4 &N, simd_fvec4 out_TBN[3]) { out_TBN[2].set<2>(N.get<2>()); } -void create_tbn_matrix(const simd_fvec4 &N, simd_fvec4 &T, simd_fvec4 out_TBN[3]) { - simd_fvec4 U = normalize(cross(T, N)); +void create_tbn_matrix(const fvec4 &N, fvec4 &T, fvec4 out_TBN[3]) { + fvec4 U = normalize(cross(T, N)); T = cross(N, U); out_TBN[0].set<0>(T.get<0>()); @@ -966,8 +966,8 @@ void create_tbn_matrix(const simd_fvec4 &N, simd_fvec4 &T, simd_fvec4 out_TBN[3] out_TBN[2].set<2>(N.get<2>()); } -void create_tbn(const simd_fvec4 &N, simd_fvec4 &out_T, simd_fvec4 &out_B) { - simd_fvec4 U; +void create_tbn(const fvec4 &N, fvec4 &out_T, fvec4 &out_B) { + fvec4 U; if (fabsf(N.get<1>()) < 0.999f) { U = {0.0f, 1.0f, 0.0f, 0.0f}; } else { @@ -978,8 +978,8 @@ void create_tbn(const simd_fvec4 &N, simd_fvec4 &out_T, simd_fvec4 &out_B) { out_B = cross(N, out_T); } -simd_fvec4 map_to_cone(float r1, float r2, simd_fvec4 N, float radius) { - const 
simd_fvec2 offset = 2.0f * simd_fvec2(r1, r2) - simd_fvec2(1.0f); +fvec4 map_to_cone(float r1, float r2, fvec4 N, float radius) { + const fvec2 offset = 2.0f * fvec2(r1, r2) - fvec2(1.0f); if (offset.get<0>() == 0.0f && offset.get<1>() == 0.0f) { return N; } @@ -994,17 +994,17 @@ simd_fvec4 map_to_cone(float r1, float r2, simd_fvec4 N, float radius) { theta = 0.5f * PI * (1.0f - 0.5f * (offset.get<0>() / offset.get<1>())); } - const simd_fvec2 uv = simd_fvec2(radius * r * cosf(theta), radius * r * sinf(theta)); + const fvec2 uv = fvec2(radius * r * cosf(theta), radius * r * sinf(theta)); - simd_fvec4 LT, LB; + fvec4 LT, LB; create_tbn(N, LT, LB); return N + uv.get<0>() * LT + uv.get<1>() * LB; } -force_inline float sphere_intersection(const simd_fvec4 ¢er, const float radius, const simd_fvec4 &ro, - const simd_fvec4 &rd) { - const simd_fvec4 oc = ro - center; +force_inline float sphere_intersection(const fvec4 ¢er, const float radius, const fvec4 &ro, + const fvec4 &rd) { + const fvec4 oc = ro - center; const float a = dot(rd, rd); const float b = 2 * dot(oc, rd); const float c = dot(oc, oc) - radius * radius; @@ -1012,10 +1012,10 @@ force_inline float sphere_intersection(const simd_fvec4 ¢er, const float rad return (-b - sqrtf(fmaxf(discriminant, 0.0f))) / (2 * a); } -simd_fvec4 rotate_around_axis(const simd_fvec4 &p, const simd_fvec4 &axis, const float angle) { +fvec4 rotate_around_axis(const fvec4 &p, const fvec4 &axis, const float angle) { const float costheta = cosf(angle); const float sintheta = sinf(angle); - simd_fvec4 r; + fvec4 r; r.set<0>(((costheta + (1.0f - costheta) * axis.get<0>() * axis.get<0>()) * p.get<0>()) + (((1.0f - costheta) * axis.get<0>() * axis.get<1>() - axis.get<2>() * sintheta) * p.get<1>()) + @@ -1031,7 +1031,7 @@ simd_fvec4 rotate_around_axis(const simd_fvec4 &p, const simd_fvec4 &axis, const return r; } -void transpose(const simd_fvec3 in_3x3[3], simd_fvec3 out_3x3[3]) { +void transpose(const fvec3 in_3x3[3], fvec3 out_3x3[3]) { 
out_3x3[0].set<0>(in_3x3[0].get<0>()); out_3x3[0].set<1>(in_3x3[1].get<0>()); out_3x3[0].set<2>(in_3x3[2].get<0>()); @@ -1045,8 +1045,8 @@ void transpose(const simd_fvec3 in_3x3[3], simd_fvec3 out_3x3[3]) { out_3x3[2].set<2>(in_3x3[2].get<2>()); } -simd_fvec3 mul(const simd_fvec3 in_mat[3], const simd_fvec3 &in_vec) { - simd_fvec3 out_vec; +fvec3 mul(const fvec3 in_mat[3], const fvec3 &in_vec) { + fvec3 out_vec; out_vec.set<0>(in_mat[0].get<0>() * in_vec.get<0>() + in_mat[1].get<0>() * in_vec.get<1>() + in_mat[2].get<0>() * in_vec.get<2>()); out_vec.set<1>(in_mat[0].get<1>() * in_vec.get<0>() + in_mat[1].get<1>() * in_vec.get<1>() + @@ -1059,8 +1059,8 @@ simd_fvec3 mul(const simd_fvec3 in_mat[3], const simd_fvec3 &in_vec) { force_inline float safe_sqrtf(float f) { return sqrtf(fmaxf(f, 0.0f)); } // Taken from Cycles -simd_fvec4 ensure_valid_reflection(const simd_fvec4 &Ng, const simd_fvec4 &I, const simd_fvec4 &N) { - const simd_fvec4 R = 2 * dot(N, I) * N - I; +fvec4 ensure_valid_reflection(const fvec4 &Ng, const fvec4 &I, const fvec4 &N) { + const fvec4 R = 2 * dot(N, I) * N - I; // Reflection rays may always be at least as shallow as the incoming ray. const float threshold = fminf(0.9f * dot(Ng, I), 0.01f); @@ -1072,7 +1072,7 @@ simd_fvec4 ensure_valid_reflection(const simd_fvec4 &Ng, const simd_fvec4 &I, co // The X axis is found by normalizing the component of N that's orthogonal to Ng. // The Y axis isn't actually needed. const float NdotNg = dot(N, Ng); - const simd_fvec4 X = normalize(N - NdotNg * Ng); + const fvec4 X = normalize(N - NdotNg * Ng); // Calculate N.z and N.x in the local coordinate system. // @@ -1125,11 +1125,11 @@ simd_fvec4 ensure_valid_reflection(const simd_fvec4 &Ng, const simd_fvec4 &I, co bool valid1 = (N1_z2 > 1e-5f) && (N1_z2 <= (1.0f + 1e-5f)); bool valid2 = (N2_z2 > 1e-5f) && (N2_z2 <= (1.0f + 1e-5f)); - simd_fvec2 N_new; + fvec2 N_new; if (valid1 && valid2) { // If both are possible, do the expensive reflection-based check. 
- const simd_fvec2 N1 = simd_fvec2(safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2)); - const simd_fvec2 N2 = simd_fvec2(safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2)); + const fvec2 N1 = fvec2(safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2)); + const fvec2 N2 = fvec2(safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2)); const float R1 = 2 * (N1.get<0>() * Ix + N1.get<1>() * Iz) * N1.get<1>() - Iz; const float R2 = 2 * (N2.get<0>() * Ix + N2.get<1>() * Iz) * N2.get<1>() - Iz; @@ -1148,7 +1148,7 @@ simd_fvec4 ensure_valid_reflection(const simd_fvec4 &Ng, const simd_fvec4 &I, co } else if (valid1 || valid2) { // Only one solution passes the N'.z criterium, so pick that one. const float Nz2 = valid1 ? N1_z2 : N2_z2; - N_new = simd_fvec2(safe_sqrtf(1.0f - Nz2), safe_sqrtf(Nz2)); + N_new = fvec2(safe_sqrtf(1.0f - Nz2), safe_sqrtf(Nz2)); } else { return Ng; } @@ -1156,14 +1156,14 @@ simd_fvec4 ensure_valid_reflection(const simd_fvec4 &Ng, const simd_fvec4 &I, co return N_new.get<0>() * X + N_new.get<1>() * Ng; } -force_inline simd_fvec4 world_from_tangent(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &V) { +force_inline fvec4 world_from_tangent(const fvec4 &T, const fvec4 &B, const fvec4 &N, + const fvec4 &V) { return V.get<0>() * T + V.get<1>() * B + V.get<2>() * N; } -force_inline simd_fvec4 tangent_from_world(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &V) { - return simd_fvec4{dot(V, T), dot(V, B), dot(V, N), 0.0f}; +force_inline fvec4 tangent_from_world(const fvec4 &T, const fvec4 &B, const fvec4 &N, + const fvec4 &V) { + return fvec4{dot(V, T), dot(V, B), dot(V, N), 0.0f}; } force_inline bool quadratic(float a, float b, float c, float &t0, float &t1) { @@ -1187,7 +1187,7 @@ force_inline float ngon_rad(const float theta, const float n) { return cosf(PI / n) / cosf(theta - (2.0f * PI / n) * floorf((n * theta + PI) / (2.0f * PI))); } -force_inline simd_fvec4 make_fvec3(const float *f) { return simd_fvec4{f[0], 
f[1], f[2], 0.0f}; } +force_inline fvec4 make_fvec3(const float *f) { return fvec4{f[0], f[1], f[2], 0.0f}; } void push_ior_stack(float stack[4], const float val) { UNROLLED_FOR(i, 3, { @@ -1244,8 +1244,8 @@ float approx_atan2(const float y, const float x) { // max error is 0.000004f return t3; } -simd_fvec4 approx_atan2(const simd_fvec4 y, const simd_fvec4 x) { - simd_fvec4 t0, t1, t3, t4; +fvec4 approx_atan2(const fvec4 y, const fvec4 x) { + fvec4 t0, t1, t3, t4; t3 = abs(x); t1 = abs(y); @@ -1278,7 +1278,7 @@ force_inline float approx_cos(float x) { // max error is 0.056010f return x; } -force_inline simd_fvec4 approx_cos(simd_fvec4 x) { +force_inline fvec4 approx_cos(fvec4 x) { const float tp = 1.0f / (2.0f * PI); x *= tp; x -= 0.25f + floor(x + 0.25f); @@ -1301,11 +1301,11 @@ force_inline float approx_acos(float x) { // max error is 0.000068f return negate * PI + ret; } -force_inline simd_fvec4 approx_acos(simd_fvec4 x) { - simd_fvec4 negate = 0.0f; +force_inline fvec4 approx_acos(fvec4 x) { + fvec4 negate = 0.0f; where(x < 0.0f, negate) = 1.0f; x = abs(x); - simd_fvec4 ret = -0.0187293f; + fvec4 ret = -0.0187293f; ret = ret * x; ret = ret + 0.0742610f; ret = ret * x; @@ -1317,13 +1317,13 @@ force_inline simd_fvec4 approx_acos(simd_fvec4 x) { return negate * PI + ret; } -float calc_lnode_importance(const light_bvh_node_t &n, const simd_fvec4 &P) { +float calc_lnode_importance(const light_bvh_node_t &n, const fvec4 &P) { float mul = 1.0f, v_len2 = 1.0f; if (n.bbox_min[0] > -MAX_DIST) { // check if this is a local light - simd_fvec4 v = P - 0.5f * (simd_fvec4{n.bbox_min} + simd_fvec4{n.bbox_max}); + fvec4 v = P - 0.5f * (fvec4{n.bbox_min} + fvec4{n.bbox_max}); v.set<3>(0.0f); - simd_fvec4 ext = simd_fvec4{n.bbox_max} - simd_fvec4{n.bbox_min}; + fvec4 ext = fvec4{n.bbox_max} - fvec4{n.bbox_min}; ext.set<3>(0.0f); const float extent = 0.5f * length(ext); @@ -1331,7 +1331,7 @@ float calc_lnode_importance(const light_bvh_node_t &n, const simd_fvec4 &P) { const float 
v_len = sqrtf(v_len2); const float omega_u = approx_atan2(extent, v_len) + 0.000005f; - simd_fvec4 axis = simd_fvec4{n.axis}; + fvec4 axis = fvec4{n.axis}; axis.set<3>(0.0f); const float omega = approx_acos(fminf(dot(axis, v / v_len), 1.0f)) - 0.00007f; @@ -1343,48 +1343,48 @@ float calc_lnode_importance(const light_bvh_node_t &n, const simd_fvec4 &P) { return n.flux * mul / v_len2; } -force_inline simd_fvec4 dot(const simd_fvec4 v1[3], const simd_fvec4 v2[3]) { +force_inline fvec4 dot(const fvec4 v1[3], const fvec4 v2[3]) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } -force_inline simd_fvec4 length(const simd_fvec4 v[3]) { return sqrt(dot(v, v)); } +force_inline fvec4 length(const fvec4 v[3]) { return sqrt(dot(v, v)); } -void calc_lnode_importance(const light_wbvh_node_t &n, const simd_fvec4 &P, float importance[8]) { +void calc_lnode_importance(const light_wbvh_node_t &n, const fvec4 &P, float importance[8]) { for (int i = 0; i < 8; i += 4) { - simd_fvec4 mul = 1.0f, v_len2 = 1.0f; + fvec4 mul = 1.0f, v_len2 = 1.0f; - const simd_ivec4 mask = simd_cast(simd_fvec4{&n.bbox_min[0][i], simd_mem_aligned} > -MAX_DIST); + const ivec4 mask = simd_cast(fvec4{&n.bbox_min[0][i], vector_aligned} > -MAX_DIST); if (mask.not_all_zeros()) { - simd_fvec4 v[3] = {P.get<0>() - 0.5f * (simd_fvec4{&n.bbox_min[0][i], simd_mem_aligned} + - simd_fvec4{&n.bbox_max[0][i], simd_mem_aligned}), - P.get<1>() - 0.5f * (simd_fvec4{&n.bbox_min[1][i], simd_mem_aligned} + - simd_fvec4{&n.bbox_max[1][i], simd_mem_aligned}), - P.get<2>() - 0.5f * (simd_fvec4{&n.bbox_min[2][i], simd_mem_aligned} + - simd_fvec4{&n.bbox_max[2][i], simd_mem_aligned})}; - const simd_fvec4 ext[3] = { - simd_fvec4{&n.bbox_max[0][i], simd_mem_aligned} - simd_fvec4{&n.bbox_min[0][i], simd_mem_aligned}, - simd_fvec4{&n.bbox_max[1][i], simd_mem_aligned} - simd_fvec4{&n.bbox_min[1][i], simd_mem_aligned}, - simd_fvec4{&n.bbox_max[2][i], simd_mem_aligned} - simd_fvec4{&n.bbox_min[2][i], simd_mem_aligned}}; - - const 
simd_fvec4 extent = 0.5f * length(ext); + fvec4 v[3] = {P.get<0>() - 0.5f * (fvec4{&n.bbox_min[0][i], vector_aligned} + + fvec4{&n.bbox_max[0][i], vector_aligned}), + P.get<1>() - 0.5f * (fvec4{&n.bbox_min[1][i], vector_aligned} + + fvec4{&n.bbox_max[1][i], vector_aligned}), + P.get<2>() - 0.5f * (fvec4{&n.bbox_min[2][i], vector_aligned} + + fvec4{&n.bbox_max[2][i], vector_aligned})}; + const fvec4 ext[3] = { + fvec4{&n.bbox_max[0][i], vector_aligned} - fvec4{&n.bbox_min[0][i], vector_aligned}, + fvec4{&n.bbox_max[1][i], vector_aligned} - fvec4{&n.bbox_min[1][i], vector_aligned}, + fvec4{&n.bbox_max[2][i], vector_aligned} - fvec4{&n.bbox_min[2][i], vector_aligned}}; + + const fvec4 extent = 0.5f * length(ext); where(mask, v_len2) = dot(v, v); - const simd_fvec4 v_len = sqrt(v_len2); - const simd_fvec4 omega_u = approx_atan2(extent, v_len) + 0.000005f; + const fvec4 v_len = sqrt(v_len2); + const fvec4 omega_u = approx_atan2(extent, v_len) + 0.000005f; - const simd_fvec4 axis[3] = {simd_fvec4{&n.axis[0][i], simd_mem_aligned}, - simd_fvec4{&n.axis[1][i], simd_mem_aligned}, - simd_fvec4{&n.axis[2][i], simd_mem_aligned}}; + const fvec4 axis[3] = {fvec4{&n.axis[0][i], vector_aligned}, + fvec4{&n.axis[1][i], vector_aligned}, + fvec4{&n.axis[2][i], vector_aligned}}; UNROLLED_FOR(j, 3, { v[j] /= v_len; }) - const simd_fvec4 omega = approx_acos(min(dot(axis, v), 1.0f)) - 0.00007f; - const simd_fvec4 omega_ = max(0.0f, omega - simd_fvec4{&n.omega_n[i], simd_mem_aligned} - omega_u); + const fvec4 omega = approx_acos(min(dot(axis, v), 1.0f)) - 0.00007f; + const fvec4 omega_ = max(0.0f, omega - fvec4{&n.omega_n[i], vector_aligned} - omega_u); where(mask, mul) = 0.0f; - where(mask & simd_cast(omega_ < simd_fvec4{&n.omega_e[i], simd_mem_aligned}), mul) = + where(mask & simd_cast(omega_ < fvec4{&n.omega_e[i], vector_aligned}), mul) = approx_cos(omega_) + 0.057f; } - const simd_fvec4 imp = simd_fvec4{&n.flux[i], simd_mem_aligned} * mul / v_len2; - imp.store_to(&importance[i], 
simd_mem_aligned); + const fvec4 imp = fvec4{&n.flux[i], vector_aligned} * mul / v_len2; + imp.store_to(&importance[i], vector_aligned); } } @@ -1425,23 +1425,23 @@ force_inline float scramble_unorm(const uint32_t seed, uint32_t val) { return float(val >> 8) / 16777216.0f; } -simd_fvec2 get_scrambled_2d_rand(const uint32_t dim, const uint32_t seed, const int sample, const uint32_t rand_seq[]) { +fvec2 get_scrambled_2d_rand(const uint32_t dim, const uint32_t seed, const int sample, const uint32_t rand_seq[]) { const uint32_t shuffled_dim = nested_uniform_scramble_base2(dim, seed) & (RAND_DIMS_COUNT - 1); const uint32_t shuffled_i = nested_uniform_scramble_base2(sample, hash_combine(seed, dim)) & (RAND_SAMPLES_COUNT - 1); - return simd_fvec2{scramble_unorm(hash_combine(seed, 2 * dim + 0), + return fvec2{scramble_unorm(hash_combine(seed, 2 * dim + 0), rand_seq[shuffled_dim * 2 * RAND_SAMPLES_COUNT + 2 * shuffled_i + 0]), scramble_unorm(hash_combine(seed, 2 * dim + 1), rand_seq[shuffled_dim * 2 * RAND_SAMPLES_COUNT + 2 * shuffled_i + 1])}; } // Gram-Schmidt method -force_inline simd_fvec4 orthogonalize(const simd_fvec4 &a, const simd_fvec4 &b) { +force_inline fvec4 orthogonalize(const fvec4 &a, const fvec4 &b) { // we assume that a is normalized return normalize(b - dot(a, b) * a); } -force_inline simd_fvec4 slerp(const simd_fvec4 &start, const simd_fvec4 &end, const float percent) { +force_inline fvec4 slerp(const fvec4 &start, const fvec4 &end, const float percent) { // Dot product - the cosine of the angle between 2 vectors. float cos_theta = dot(start, end); // Clamp it to be in the range of Acos() @@ -1452,7 +1452,7 @@ force_inline simd_fvec4 slerp(const simd_fvec4 &start, const simd_fvec4 &end, co // And multiplying that by percent returns the angle between // start and the final result. 
const float theta = acosf(cos_theta) * percent; - simd_fvec4 relative_vec = safe_normalize(end - start * cos_theta); + fvec4 relative_vec = safe_normalize(end - start * cos_theta); // Orthonormal basis // The final result. return start * cosf(theta) + relative_vec * sinf(theta); @@ -1504,7 +1504,7 @@ float portable_acosf(float x) { // Equivalent to acosf(dot(a, b)), but more numerically stable // Taken from PBRT source code -force_inline float angle_between(const simd_fvec4 &v1, const simd_fvec4 &v2) { +force_inline float angle_between(const fvec4 &v1, const fvec4 &v2) { if (dot(v1, v2) < 0) { return PI - 2 * portable_asinf(length(v1 + v2) / 2); } else { @@ -1514,18 +1514,18 @@ force_inline float angle_between(const simd_fvec4 &v1, const simd_fvec4 &v2) { // "Stratified Sampling of Spherical Triangles" https://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf // Based on https://www.shadertoy.com/view/4tGGzd -float SampleSphericalTriangle(const simd_fvec4 &P, const simd_fvec4 &p1, const simd_fvec4 &p2, const simd_fvec4 &p3, - const simd_fvec2 Xi, simd_fvec4 *out_dir) { +float SampleSphericalTriangle(const fvec4 &P, const fvec4 &p1, const fvec4 &p2, const fvec4 &p3, + const fvec2 Xi, fvec4 *out_dir) { // Setup spherical triangle - const simd_fvec4 A = normalize(p1 - P), B = normalize(p2 - P), C = normalize(p3 - P); + const fvec4 A = normalize(p1 - P), B = normalize(p2 - P), C = normalize(p3 - P); // calculate internal angles of spherical triangle: alpha, beta and gamma - const simd_fvec4 BA = orthogonalize(A, B - A); - const simd_fvec4 CA = orthogonalize(A, C - A); - const simd_fvec4 AB = orthogonalize(B, A - B); - const simd_fvec4 CB = orthogonalize(B, C - B); - const simd_fvec4 BC = orthogonalize(C, B - C); - const simd_fvec4 AC = orthogonalize(C, A - C); + const fvec4 BA = orthogonalize(A, B - A); + const fvec4 CA = orthogonalize(A, C - A); + const fvec4 AB = orthogonalize(B, A - B); + const fvec4 CB = orthogonalize(B, C - B); + const fvec4 BC = orthogonalize(C, B - 
C); + const fvec4 AC = orthogonalize(C, A - C); const float alpha = angle_between(BA, CA); const float beta = angle_between(AB, CB); @@ -1559,7 +1559,7 @@ float SampleSphericalTriangle(const simd_fvec4 &P, const simd_fvec4 &p1, const s (1.0f / b) * portable_acosf(clamp(safe_div(((v * q - u * p) * cosf(alpha) - v), denom), -1.0f, 1.0f)); // Compute the third vertex of the sub - triangle - const simd_fvec4 C_s = slerp(A, C, s); + const fvec4 C_s = slerp(A, C, s); // Compute the t coordinate using C_s and Xi[1] const float denom2 = portable_acosf(clamp(dot(C_s, B), -1.0f, 1.0f)); @@ -1576,16 +1576,16 @@ float SampleSphericalTriangle(const simd_fvec4 &P, const simd_fvec4 &p1, const s // "An Area-Preserving Parametrization for Spherical Rectangles" // https://www.arnoldrenderer.com/research/egsr2013_spherical_rectangle.pdf // NOTE: no precomputation is done, everything is calculated in-place -float SampleSphericalRectangle(const simd_fvec4 &P, const simd_fvec4 &light_pos, const simd_fvec4 &axis_u, - const simd_fvec4 &axis_v, const simd_fvec2 Xi, simd_fvec4 *out_p) { - const simd_fvec4 corner = light_pos - 0.5f * axis_u - 0.5f * axis_v; +float SampleSphericalRectangle(const fvec4 &P, const fvec4 &light_pos, const fvec4 &axis_u, + const fvec4 &axis_v, const fvec2 Xi, fvec4 *out_p) { + const fvec4 corner = light_pos - 0.5f * axis_u - 0.5f * axis_v; float axisu_len, axisv_len; - const simd_fvec4 x = normalize_len(axis_u, axisu_len), y = normalize_len(axis_v, axisv_len); - simd_fvec4 z = cross(x, y); + const fvec4 x = normalize_len(axis_u, axisu_len), y = normalize_len(axis_v, axisv_len); + fvec4 z = cross(x, y); // compute rectangle coords in local reference system - const simd_fvec4 dir = corner - P; + const fvec4 dir = corner - P; float z0 = dot(dir, z); // flip z to make it point against Q if (z0 > 0.0f) { @@ -1597,8 +1597,8 @@ float SampleSphericalRectangle(const simd_fvec4 &P, const simd_fvec4 &light_pos, const float x1 = x0 + axisu_len; const float y1 = y0 + 
axisv_len; // compute internal angles (gamma_i) - const simd_fvec4 diff = simd_fvec4{x0, y1, x1, y0} - simd_fvec4{x1, y0, x0, y1}; - simd_fvec4 nz = simd_fvec4{y0, x1, y1, x0} * diff; + const fvec4 diff = fvec4{x0, y1, x1, y0} - fvec4{x1, y0, x0, y1}; + fvec4 nz = fvec4{y0, x1, y1, x0} * diff; nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz); const float g0 = portable_acosf(clamp(-nz.get<0>() * nz.get<1>(), -1.0f, 1.0f)); const float g1 = portable_acosf(clamp(-nz.get<1>() * nz.get<2>(), -1.0f, 1.0f)); @@ -1648,7 +1648,7 @@ void Ray::Ref::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, const i const uint32_t rand_seq[], const uint32_t rand_seed, const float filter_table[], const int iteration, const uint16_t required_samples[], aligned_vector &out_rays, aligned_vector &out_inters) { - const simd_fvec4 cam_origin = make_fvec3(cam.origin), fwd = make_fvec3(cam.fwd), side = make_fvec3(cam.side), + const fvec4 cam_origin = make_fvec3(cam.origin), fwd = make_fvec3(cam.fwd), side = make_fvec3(cam.side), up = make_fvec3(cam.up); const float focus_distance = cam.focus_distance; @@ -1657,8 +1657,8 @@ void Ray::Ref::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, const i const float fov_k = temp * focus_distance; const float spread_angle = atanf(2.0f * temp / float(h)); - auto get_pix_dir = [&](const float x, const float y, const simd_fvec4 &origin) { - simd_fvec4 p(2 * fov_k * (float(x) / float(w) + cam.shift[0] / k) - fov_k, + auto get_pix_dir = [&](const float x, const float y, const fvec4 &origin) { + fvec4 p(2 * fov_k * (float(x) / float(w) + cam.shift[0] / k) - fov_k, 2 * fov_k * (float(-y) / float(h) + cam.shift[1]) + fov_k, focus_distance, 0.0f); p = cam_origin + k * p.get<0>() * side + p.get<1>() * up + p.get<2>() * fwd; return normalize(p - origin); @@ -1695,7 +1695,7 @@ void Ray::Ref::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, const i const uint32_t px_hash = hash((x << 16) | y); const uint32_t rand_hash = hash_combine(px_hash, 
rand_seed); - const simd_fvec2 filter_rand = get_scrambled_2d_rand(RAND_DIM_FILTER, rand_hash, iteration - 1, rand_seq); + const fvec2 filter_rand = get_scrambled_2d_rand(RAND_DIM_FILTER, rand_hash, iteration - 1, rand_seq); float rx = filter_rand.get<0>(), ry = filter_rand.get<1>(); if (cam.filter != ePixelFilter::Box) { @@ -1706,12 +1706,12 @@ void Ray::Ref::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, const i fx += rx; fy += ry; - simd_fvec2 offset = 0.0f; + fvec2 offset = 0.0f; if (cam.fstop > 0.0f) { - const simd_fvec2 lens_rand = get_scrambled_2d_rand(RAND_DIM_LENS, rand_hash, iteration - 1, rand_seq); + const fvec2 lens_rand = get_scrambled_2d_rand(RAND_DIM_LENS, rand_hash, iteration - 1, rand_seq); - offset = 2.0f * lens_rand - simd_fvec2{1.0f, 1.0f}; + offset = 2.0f * lens_rand - fvec2{1.0f, 1.0f}; if (offset.get<0>() != 0.0f && offset.get<1>() != 0.0f) { float theta, r; if (fabsf(offset.get<0>()) > fabsf(offset.get<1>())) { @@ -1738,8 +1738,8 @@ void Ray::Ref::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, const i ray_data_t &out_r = out_rays[i]; - const simd_fvec4 _origin = cam_origin + side * offset.get<0>() + up * offset.get<1>(); - const simd_fvec4 _d = get_pix_dir(fx, fy, _origin); + const fvec4 _origin = cam_origin + side * offset.get<0>() + up * offset.get<1>(); + const fvec4 _d = get_pix_dir(fx, fy, _origin); const float clip_start = cam.clip_start / dot(_d, fwd); for (int j = 0; j < 3; j++) { @@ -1790,8 +1790,8 @@ void Ray::Ref::SampleMeshInTextureSpace(const int iteration, const int obj_index } } - const simd_ivec2 irect_min = {r.x, r.y}, irect_max = {r.x + r.w - 1, r.y + r.h - 1}; - const simd_fvec2 size = {float(width), float(height)}; + const ivec2 irect_min = {r.x, r.y}, irect_max = {r.x + r.w - 1, r.y + r.h - 1}; + const fvec2 size = {float(width), float(height)}; for (uint32_t tri = mesh.tris_index; tri < mesh.tris_index + mesh.tris_count; tri++) { const vertex_t &v0 = vertices[vtx_indices[tri * 3 + 0]]; @@ -1799,11 
+1799,11 @@ void Ray::Ref::SampleMeshInTextureSpace(const int iteration, const int obj_index const vertex_t &v2 = vertices[vtx_indices[tri * 3 + 2]]; // TODO: use uv_layer - const auto t0 = simd_fvec2{v0.t[0], 1.0f - v0.t[1]} * size; - const auto t1 = simd_fvec2{v1.t[0], 1.0f - v1.t[1]} * size; - const auto t2 = simd_fvec2{v2.t[0], 1.0f - v2.t[1]} * size; + const auto t0 = fvec2{v0.t[0], 1.0f - v0.t[1]} * size; + const auto t1 = fvec2{v1.t[0], 1.0f - v1.t[1]} * size; + const auto t2 = fvec2{v2.t[0], 1.0f - v2.t[1]} * size; - simd_fvec2 bbox_min = t0, bbox_max = t0; + fvec2 bbox_min = t0, bbox_max = t0; bbox_min = min(bbox_min, t1); bbox_min = min(bbox_min, t2); @@ -1811,8 +1811,8 @@ void Ray::Ref::SampleMeshInTextureSpace(const int iteration, const int obj_index bbox_max = max(bbox_max, t1); bbox_max = max(bbox_max, t2); - simd_ivec2 ibbox_min = simd_ivec2{bbox_min}, - ibbox_max = simd_ivec2{int(roundf(bbox_max.get<0>())), int(roundf(bbox_max.get<1>()))}; + ivec2 ibbox_min = ivec2{bbox_min}, + ibbox_max = ivec2{int(roundf(bbox_max.get<0>())), int(roundf(bbox_max.get<1>()))}; if (ibbox_max.get<0>() < irect_min.get<0>() || ibbox_max.get<1>() < irect_min.get<1>() || ibbox_min.get<0>() > irect_max.get<0>() || ibbox_min.get<1>() > irect_max.get<1>()) { @@ -1822,7 +1822,7 @@ void Ray::Ref::SampleMeshInTextureSpace(const int iteration, const int obj_index ibbox_min = max(ibbox_min, irect_min); ibbox_max = min(ibbox_max, irect_max); - const simd_fvec2 d01 = t0 - t1, d12 = t1 - t2, d20 = t2 - t0; + const fvec2 d01 = t0 - t1, d12 = t1 - t2, d20 = t2 - t0; const float area = d01.get<0>() * d20.get<1>() - d20.get<0>() * d01.get<1>(); if (area < FLT_EPS) { @@ -1850,17 +1850,17 @@ void Ray::Ref::SampleMeshInTextureSpace(const int iteration, const int obj_index w = d20.get<0>() * (_y - t2.get<1>()) - d20.get<1>() * (_x - t2.get<0>()); if (u >= -FLT_EPS && v >= -FLT_EPS && w >= -FLT_EPS) { - const auto p0 = simd_fvec4{v0.p}, p1 = simd_fvec4{v1.p}, p2 = simd_fvec4{v2.p}; - const 
auto n0 = simd_fvec4{v0.n}, n1 = simd_fvec4{v1.n}, n2 = simd_fvec4{v2.n}; + const auto p0 = fvec4{v0.p}, p1 = fvec4{v1.p}, p2 = fvec4{v2.p}; + const auto n0 = fvec4{v0.n}, n1 = fvec4{v1.n}, n2 = fvec4{v2.n}; u *= inv_area; v *= inv_area; w *= inv_area; - const simd_fvec4 p = TransformPoint(p0 * v + p1 * w + p2 * u, mi.xform), + const fvec4 p = TransformPoint(p0 * v + p1 * w + p2 * u, mi.xform), n = TransformNormal(n0 * v + n1 * w + n2 * u, mi.inv_xform); - const simd_fvec4 o = p + n, d = -n; + const fvec4 o = p + n, d = -n; memcpy(&out_ray.o[0], value_ptr(o), 3 * sizeof(float)); memcpy(&out_ray.d[0], value_ptr(d), 3 * sizeof(float)); @@ -2707,8 +2707,8 @@ bool Ray::Ref::Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[ return false; } -float Ray::Ref::BRDF_PrincipledDiffuse(const simd_fvec4 &V, const simd_fvec4 &N, const simd_fvec4 &L, - const simd_fvec4 &H, const float roughness) { +float Ray::Ref::BRDF_PrincipledDiffuse(const fvec4 &V, const fvec4 &N, const fvec4 &L, + const fvec4 &H, const float roughness) { const float N_dot_L = dot(N, L); const float N_dot_V = dot(N, V); if (N_dot_L <= 0.0f /*|| N_dot_V <= 0.0f*/) { @@ -2725,8 +2725,8 @@ float Ray::Ref::BRDF_PrincipledDiffuse(const simd_fvec4 &V, const simd_fvec4 &N, return Fd; } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_OrenDiffuse_BSDF(const simd_fvec4 &V, const simd_fvec4 &N, const simd_fvec4 &L, - const float roughness, const simd_fvec4 &base_color) { +Ray::Ref::fvec4 Ray::Ref::Evaluate_OrenDiffuse_BSDF(const fvec4 &V, const fvec4 &N, const fvec4 &L, + const float roughness, const fvec4 &base_color) { const float sigma = roughness; const float div = 1.0f / (PI + ((3.0f * PI - 4.0f) / 6.0f) * sigma); @@ -2744,31 +2744,31 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_OrenDiffuse_BSDF(const simd_fvec4 &V, co } const float is = nl * (a + b * t); - simd_fvec4 diff_col = is * base_color; + fvec4 diff_col = is * base_color; diff_col.set<3>(0.5f / PI); return diff_col; } -Ray::Ref::simd_fvec4 
Ray::Ref::Sample_OrenDiffuse_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &I, const float roughness, - const simd_fvec4 &base_color, const simd_fvec2 rand, - simd_fvec4 &out_V) { +Ray::Ref::fvec4 Ray::Ref::Sample_OrenDiffuse_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, + const fvec4 &I, const float roughness, + const fvec4 &base_color, const fvec2 rand, + fvec4 &out_V) { const float phi = 2 * PI * rand.get<1>(); const float cos_phi = cosf(phi), sin_phi = sinf(phi); const float dir = sqrtf(1.0f - rand.get<0>() * rand.get<1>()); - auto V = simd_fvec4{dir * cos_phi, dir * sin_phi, rand.get<0>(), 0.0f}; // in tangent-space + auto V = fvec4{dir * cos_phi, dir * sin_phi, rand.get<0>(), 0.0f}; // in tangent-space out_V = world_from_tangent(T, B, N, V); return Evaluate_OrenDiffuse_BSDF(-I, N, out_V, roughness, base_color); } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledDiffuse_BSDF(const simd_fvec4 &V, const simd_fvec4 &N, - const simd_fvec4 &L, const float roughness, - const simd_fvec4 &base_color, - const simd_fvec4 &sheen_color, +Ray::Ref::fvec4 Ray::Ref::Evaluate_PrincipledDiffuse_BSDF(const fvec4 &V, const fvec4 &N, + const fvec4 &L, const float roughness, + const fvec4 &base_color, + const fvec4 &sheen_color, const bool uniform_sampling) { float weight, pdf; if (uniform_sampling) { @@ -2779,12 +2779,12 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledDiffuse_BSDF(const simd_fvec4 pdf = dot(N, L) / PI; } - simd_fvec4 H = normalize(L + V); + fvec4 H = normalize(L + V); if (dot(V, H) < 0.0f) { H = -H; } - simd_fvec4 diff_col = base_color * (weight * BRDF_PrincipledDiffuse(V, N, L, H, roughness)); + fvec4 diff_col = base_color * (weight * BRDF_PrincipledDiffuse(V, N, L, H, roughness)); const float FH = PI * schlick_weight(dot(L, H)); diff_col += FH * sheen_color; @@ -2793,39 +2793,39 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledDiffuse_BSDF(const simd_fvec4 return diff_col; } -Ray::Ref::simd_fvec4 
Ray::Ref::Sample_PrincipledDiffuse_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, - const simd_fvec4 &N, const simd_fvec4 &I, - const float roughness, const simd_fvec4 &base_color, - const simd_fvec4 &sheen_color, const bool uniform_sampling, - const simd_fvec2 rand, simd_fvec4 &out_V) { +Ray::Ref::fvec4 Ray::Ref::Sample_PrincipledDiffuse_BSDF(const fvec4 &T, const fvec4 &B, + const fvec4 &N, const fvec4 &I, + const float roughness, const fvec4 &base_color, + const fvec4 &sheen_color, const bool uniform_sampling, + const fvec2 rand, fvec4 &out_V) { const float phi = 2 * PI * rand.get<1>(); const float cos_phi = cosf(phi), sin_phi = sinf(phi); - simd_fvec4 V; + fvec4 V; if (uniform_sampling) { const float dir = sqrtf(1.0f - rand.get<0>() * rand.get<0>()); - V = simd_fvec4{dir * cos_phi, dir * sin_phi, rand.get<0>(), 0.0f}; // in tangent-space + V = fvec4{dir * cos_phi, dir * sin_phi, rand.get<0>(), 0.0f}; // in tangent-space } else { const float dir = sqrtf(rand.get<0>()); const float k = sqrtf(1.0f - rand.get<0>()); - V = simd_fvec4{dir * cos_phi, dir * sin_phi, k, 0.0f}; // in tangent-space + V = fvec4{dir * cos_phi, dir * sin_phi, k, 0.0f}; // in tangent-space } out_V = world_from_tangent(T, B, N, V); return Evaluate_PrincipledDiffuse_BSDF(-I, N, out_V, roughness, base_color, sheen_color, uniform_sampling); } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GGXSpecular_BSDF(const simd_fvec4 &view_dir_ts, - const simd_fvec4 &sampled_normal_ts, - const simd_fvec4 &reflected_dir_ts, const simd_fvec2 alpha, +Ray::Ref::fvec4 Ray::Ref::Evaluate_GGXSpecular_BSDF(const fvec4 &view_dir_ts, + const fvec4 &sampled_normal_ts, + const fvec4 &reflected_dir_ts, const fvec2 alpha, const float spec_ior, const float spec_F0, - const simd_fvec4 &spec_col, const simd_fvec4 &spec_col_90) { + const fvec4 &spec_col, const fvec4 &spec_col_90) { const float D = D_GGX(sampled_normal_ts, alpha); const float G = G1(view_dir_ts, alpha) * G1(reflected_dir_ts, alpha); const float FH = 
(fresnel_dielectric_cos(dot(view_dir_ts, sampled_normal_ts), spec_ior) - spec_F0) / (1.0f - spec_F0); - simd_fvec4 F = mix(spec_col, spec_col_90, FH); + fvec4 F = mix(spec_col, spec_col_90, FH); const float denom = 4.0f * fabsf(view_dir_ts.get<2>() * reflected_dir_ts.get<2>()); F *= (denom != 0.0f) ? (D * G / denom) : 0.0f; @@ -2837,36 +2837,36 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GGXSpecular_BSDF(const simd_fvec4 &view_ return F; } -Ray::Ref::simd_fvec4 Ray::Ref::Sample_GGXSpecular_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &I, const simd_fvec2 alpha, +Ray::Ref::fvec4 Ray::Ref::Sample_GGXSpecular_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, + const fvec4 &I, const fvec2 alpha, const float spec_ior, const float spec_F0, - const simd_fvec4 &spec_col, const simd_fvec4 &spec_col_90, - const simd_fvec2 rand, simd_fvec4 &out_V) { + const fvec4 &spec_col, const fvec4 &spec_col_90, + const fvec2 rand, fvec4 &out_V) { if (alpha.get<0>() * alpha.get<1>() < 1e-7f) { - const simd_fvec4 V = reflect(I, N, dot(N, I)); + const fvec4 V = reflect(I, N, dot(N, I)); const float FH = (fresnel_dielectric_cos(dot(V, N), spec_ior) - spec_F0) / (1.0f - spec_F0); - simd_fvec4 F = mix(spec_col, spec_col_90, FH); + fvec4 F = mix(spec_col, spec_col_90, FH); out_V = V; - return simd_fvec4{F.get<0>() * 1e6f, F.get<1>() * 1e6f, F.get<2>() * 1e6f, 1e6f}; + return fvec4{F.get<0>() * 1e6f, F.get<1>() * 1e6f, F.get<2>() * 1e6f, 1e6f}; } - const simd_fvec4 view_dir_ts = normalize(tangent_from_world(T, B, N, -I)); - const simd_fvec4 sampled_normal_ts = SampleGGX_VNDF_Bounded(view_dir_ts, alpha, rand); + const fvec4 view_dir_ts = normalize(tangent_from_world(T, B, N, -I)); + const fvec4 sampled_normal_ts = SampleGGX_VNDF_Bounded(view_dir_ts, alpha, rand); const float dot_N_V = -dot(sampled_normal_ts, view_dir_ts); - const simd_fvec4 reflected_dir_ts = normalize(reflect(-view_dir_ts, sampled_normal_ts, dot_N_V)); + const fvec4 reflected_dir_ts = 
normalize(reflect(-view_dir_ts, sampled_normal_ts, dot_N_V)); out_V = world_from_tangent(T, B, N, reflected_dir_ts); return Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, reflected_dir_ts, alpha, spec_ior, spec_F0, spec_col, spec_col_90); } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GGXRefraction_BSDF(const simd_fvec4 &view_dir_ts, - const simd_fvec4 &sampled_normal_ts, - const simd_fvec4 &refr_dir_ts, const simd_fvec2 alpha, - float eta, const simd_fvec4 &refr_col) { +Ray::Ref::fvec4 Ray::Ref::Evaluate_GGXRefraction_BSDF(const fvec4 &view_dir_ts, + const fvec4 &sampled_normal_ts, + const fvec4 &refr_dir_ts, const fvec2 alpha, + float eta, const fvec4 &refr_col) { if (refr_dir_ts.get<2>() >= 0.0f || view_dir_ts.get<2>() <= 0.0f || alpha.get<0>() * alpha.get<1>() < 1e-7f) { - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } const float D = D_GGX(sampled_normal_ts, alpha); @@ -2884,56 +2884,56 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GGXRefraction_BSDF(const simd_fvec4 &vie // const float pdf = D * fmaxf(sampled_normal_ts.get<2>(), 0.0f) * jacobian; // const float pdf = D * sampled_normal_ts.get<2>() * fmaxf(-dot(refr_dir_ts, sampled_normal_ts), 0.0f) / denom; - simd_fvec4 ret = F * refr_col; + fvec4 ret = F * refr_col; // ret *= (-refr_dir_ts.get<2>()); ret.set<3>(pdf); return ret; } -Ray::Ref::simd_fvec4 Ray::Ref::Sample_GGXRefraction_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &I, const simd_fvec2 alpha, const float eta, - const simd_fvec4 &refr_col, const simd_fvec2 rand, - simd_fvec4 &out_V) { +Ray::Ref::fvec4 Ray::Ref::Sample_GGXRefraction_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, + const fvec4 &I, const fvec2 alpha, const float eta, + const fvec4 &refr_col, const fvec2 rand, + fvec4 &out_V) { if (alpha.get<0>() * alpha.get<1>() < 1e-7f) { const float cosi = -dot(I, N); const float cost2 = 1.0f - eta * eta * (1.0f - cosi * cosi); if (cost2 < 0) { - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } 
const float m = eta * cosi - sqrtf(cost2); - const simd_fvec4 V = normalize(eta * I + m * N); + const fvec4 V = normalize(eta * I + m * N); - out_V = simd_fvec4{V.get<0>(), V.get<1>(), V.get<2>(), m}; - return simd_fvec4{refr_col.get<0>() * 1e6f, refr_col.get<1>() * 1e6f, refr_col.get<2>() * 1e6f, 1e6f}; + out_V = fvec4{V.get<0>(), V.get<1>(), V.get<2>(), m}; + return fvec4{refr_col.get<0>() * 1e6f, refr_col.get<1>() * 1e6f, refr_col.get<2>() * 1e6f, 1e6f}; } - const simd_fvec4 view_dir_ts = normalize(tangent_from_world(T, B, N, -I)); - const simd_fvec4 sampled_normal_ts = SampleGGX_VNDF(view_dir_ts, alpha, rand); + const fvec4 view_dir_ts = normalize(tangent_from_world(T, B, N, -I)); + const fvec4 sampled_normal_ts = SampleGGX_VNDF(view_dir_ts, alpha, rand); const float cosi = dot(view_dir_ts, sampled_normal_ts); const float cost2 = 1.0f - eta * eta * (1.0f - cosi * cosi); if (cost2 < 0) { - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } const float m = eta * cosi - sqrtf(cost2); - const simd_fvec4 refr_dir_ts = normalize(-eta * view_dir_ts + m * sampled_normal_ts); + const fvec4 refr_dir_ts = normalize(-eta * view_dir_ts + m * sampled_normal_ts); - const simd_fvec4 F = Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, refr_dir_ts, alpha, eta, refr_col); + const fvec4 F = Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, refr_dir_ts, alpha, eta, refr_col); - const simd_fvec4 V = world_from_tangent(T, B, N, refr_dir_ts); - out_V = simd_fvec4{V.get<0>(), V.get<1>(), V.get<2>(), m}; + const fvec4 V = world_from_tangent(T, B, N, refr_dir_ts); + out_V = fvec4{V.get<0>(), V.get<1>(), V.get<2>(), m}; return F; } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledClearcoat_BSDF(const simd_fvec4 &view_dir_ts, - const simd_fvec4 &sampled_normal_ts, - const simd_fvec4 &reflected_dir_ts, +Ray::Ref::fvec4 Ray::Ref::Evaluate_PrincipledClearcoat_BSDF(const fvec4 &view_dir_ts, + const fvec4 &sampled_normal_ts, + const fvec4 &reflected_dir_ts, const float 
clearcoat_roughness2, const float clearcoat_ior, const float clearcoat_F0) { const float D = D_GTR1(sampled_normal_ts.get<2>(), clearcoat_roughness2); // Always assume roughness of 0.25 for clearcoat - const simd_fvec2 clearcoat_alpha = {0.25f * 0.25f}; + const fvec2 clearcoat_alpha = {0.25f * 0.25f}; const float G = G1(view_dir_ts, clearcoat_alpha) * G1(reflected_dir_ts, clearcoat_alpha); const float FH = (fresnel_dielectric_cos(dot(reflected_dir_ts, sampled_normal_ts), clearcoat_ior) - clearcoat_F0) / @@ -2945,30 +2945,30 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledClearcoat_BSDF(const simd_fvec F *= fmaxf(reflected_dir_ts.get<2>(), 0.0f); const float pdf = GGX_VNDF_Reflection_Bounded_PDF(D, view_dir_ts, clearcoat_alpha); - return simd_fvec4{F, F, F, pdf}; + return fvec4{F, F, F, pdf}; } -Ray::Ref::simd_fvec4 Ray::Ref::Sample_PrincipledClearcoat_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, - const simd_fvec4 &N, const simd_fvec4 &I, +Ray::Ref::fvec4 Ray::Ref::Sample_PrincipledClearcoat_BSDF(const fvec4 &T, const fvec4 &B, + const fvec4 &N, const fvec4 &I, const float clearcoat_roughness2, const float clearcoat_ior, const float clearcoat_F0, - const simd_fvec2 rand, simd_fvec4 &out_V) { + const fvec2 rand, fvec4 &out_V) { if (sqr(clearcoat_roughness2) < 1e-7f) { - const simd_fvec4 V = reflect(I, N, dot(N, I)); + const fvec4 V = reflect(I, N, dot(N, I)); const float FH = (fresnel_dielectric_cos(dot(V, N), clearcoat_ior) - clearcoat_F0) / (1.0f - clearcoat_F0); const float F = mix(0.04f, 1.0f, FH); out_V = V; - return simd_fvec4{F * 1e6f, F * 1e6f, F * 1e6f, 1e6f}; + return fvec4{F * 1e6f, F * 1e6f, F * 1e6f, 1e6f}; } - const simd_fvec4 view_dir_ts = normalize(tangent_from_world(T, B, N, -I)); + const fvec4 view_dir_ts = normalize(tangent_from_world(T, B, N, -I)); // NOTE: GTR1 distribution is not used for sampling because Cycles does it this way (???!) 
- const simd_fvec4 sampled_normal_ts = SampleGGX_VNDF_Bounded(view_dir_ts, clearcoat_roughness2, rand); + const fvec4 sampled_normal_ts = SampleGGX_VNDF_Bounded(view_dir_ts, clearcoat_roughness2, rand); const float dot_N_V = -dot(sampled_normal_ts, view_dir_ts); - const simd_fvec4 reflected_dir_ts = normalize(reflect(-view_dir_ts, sampled_normal_ts, dot_N_V)); + const fvec4 reflected_dir_ts = normalize(reflect(-view_dir_ts, sampled_normal_ts, dot_N_V)); out_V = world_from_tangent(T, B, N, reflected_dir_ts); @@ -2976,12 +2976,12 @@ Ray::Ref::simd_fvec4 Ray::Ref::Sample_PrincipledClearcoat_BSDF(const simd_fvec4 clearcoat_ior, clearcoat_F0); } -float Ray::Ref::Evaluate_EnvQTree(const float y_rotation, const simd_fvec4 *const *qtree_mips, const int qtree_levels, - const simd_fvec4 &L) { +float Ray::Ref::Evaluate_EnvQTree(const float y_rotation, const fvec4 *const *qtree_mips, const int qtree_levels, + const fvec4 &L) { int res = 2; int lod = qtree_levels - 1; - simd_fvec2 p; + fvec2 p; DirToCanonical(value_ptr(L), -y_rotation, value_ptr(p)); float factor = 1.0f; @@ -2996,7 +2996,7 @@ float Ray::Ref::Evaluate_EnvQTree(const float y_rotation, const simd_fvec4 *cons const int qx = x / 2; const int qy = y / 2; - const simd_fvec4 quad = qtree_mips[lod][qy * res / 2 + qx]; + const fvec4 quad = qtree_mips[lod][qy * res / 2 + qx]; const float total = quad.get<0>() + quad.get<1>() + quad.get<2>() + quad.get<3>(); if (total <= 0.0f) { break; @@ -3011,7 +3011,7 @@ float Ray::Ref::Evaluate_EnvQTree(const float y_rotation, const simd_fvec4 *cons return factor / (4.0f * PI); } -Ray::Ref::simd_fvec4 Ray::Ref::Sample_EnvQTree(const float y_rotation, const simd_fvec4 *const *qtree_mips, +Ray::Ref::fvec4 Ray::Ref::Sample_EnvQTree(const float y_rotation, const fvec4 *const *qtree_mips, const int qtree_levels, const float rand, const float rx, const float ry) { int res = 2; @@ -3020,14 +3020,14 @@ Ray::Ref::simd_fvec4 Ray::Ref::Sample_EnvQTree(const float y_rotation, const sim float 
sample = rand; int lod = qtree_levels - 1; - simd_fvec2 origin = {0.0f, 0.0f}; + fvec2 origin = {0.0f, 0.0f}; float factor = 1.0f; while (lod >= 0) { const int qx = int(origin.get<0>() * float(res)) / 2; const int qy = int(origin.get<1>() * float(res)) / 2; - const simd_fvec4 quad = qtree_mips[lod][qy * res / 2 + qx]; + const fvec4 quad = qtree_mips[lod][qy * res / 2 + qx]; const float top_left = quad.get<0>(); const float top_right = quad.get<1>(); @@ -3068,12 +3068,12 @@ Ray::Ref::simd_fvec4 Ray::Ref::Sample_EnvQTree(const float y_rotation, const sim step *= 0.5f; } - origin += 2 * step * simd_fvec2{rx, ry}; + origin += 2 * step * fvec2{rx, ry}; - // origin = simd_fvec2{rx, ry}; + // origin = fvec2{rx, ry}; // factor = 1.0f; - simd_fvec4 dir_and_pdf; + fvec4 dir_and_pdf; CanonicalToDir(value_ptr(origin), y_rotation, value_ptr(dir_and_pdf)); dir_and_pdf.set<3>(factor / (4.0f * PI)); @@ -3091,40 +3091,40 @@ void Ray::Ref::TransformRay(const float ro[3], const float rd[3], const float *x out_rd[2] = xform[2] * rd[0] + xform[6] * rd[1] + xform[10] * rd[2]; } -Ray::Ref::simd_fvec4 Ray::Ref::TransformPoint(const simd_fvec4 &p, const float *xform) { - return simd_fvec4{xform[0] * p.get<0>() + xform[4] * p.get<1>() + xform[8] * p.get<2>() + xform[12], +Ray::Ref::fvec4 Ray::Ref::TransformPoint(const fvec4 &p, const float *xform) { + return fvec4{xform[0] * p.get<0>() + xform[4] * p.get<1>() + xform[8] * p.get<2>() + xform[12], xform[1] * p.get<0>() + xform[5] * p.get<1>() + xform[9] * p.get<2>() + xform[13], xform[2] * p.get<0>() + xform[6] * p.get<1>() + xform[10] * p.get<2>() + xform[14], 0.0f}; } -Ray::Ref::simd_fvec4 Ray::Ref::TransformDirection(const simd_fvec4 &p, const float *xform) { - return simd_fvec4{xform[0] * p.get<0>() + xform[4] * p.get<1>() + xform[8] * p.get<2>(), +Ray::Ref::fvec4 Ray::Ref::TransformDirection(const fvec4 &p, const float *xform) { + return fvec4{xform[0] * p.get<0>() + xform[4] * p.get<1>() + xform[8] * p.get<2>(), xform[1] * p.get<0>() + 
xform[5] * p.get<1>() + xform[9] * p.get<2>(), xform[2] * p.get<0>() + xform[6] * p.get<1>() + xform[10] * p.get<2>(), 0.0f}; } -Ray::Ref::simd_fvec4 Ray::Ref::TransformNormal(const simd_fvec4 &n, const float *inv_xform) { - return simd_fvec4{inv_xform[0] * n.get<0>() + inv_xform[1] * n.get<1>() + inv_xform[2] * n.get<2>(), +Ray::Ref::fvec4 Ray::Ref::TransformNormal(const fvec4 &n, const float *inv_xform) { + return fvec4{inv_xform[0] * n.get<0>() + inv_xform[1] * n.get<1>() + inv_xform[2] * n.get<2>(), inv_xform[4] * n.get<0>() + inv_xform[5] * n.get<1>() + inv_xform[6] * n.get<2>(), inv_xform[8] * n.get<0>() + inv_xform[9] * n.get<1>() + inv_xform[10] * n.get<2>(), 0.0f}; } -Ray::Ref::simd_fvec4 Ray::Ref::SampleNearest(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec2 &uvs, const int lod) { +Ray::Ref::fvec4 Ray::Ref::SampleNearest(const Cpu::TexStorageBase *const textures[], const uint32_t index, + const fvec2 &uvs, const int lod) { const Cpu::TexStorageBase &storage = *textures[index >> 28]; const auto &pix = storage.Fetch(int(index & 0x00ffffff), uvs.get<0>(), uvs.get<1>(), lod); - return simd_fvec4{pix.v[0], pix.v[1], pix.v[2], pix.v[3]}; + return fvec4{pix.v[0], pix.v[1], pix.v[2], pix.v[3]}; } -Ray::Ref::simd_fvec4 Ray::Ref::SampleBilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec2 &uvs, const int lod, const simd_fvec2 &rand) { +Ray::Ref::fvec4 Ray::Ref::SampleBilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, + const fvec2 &uvs, const int lod, const fvec2 &rand) { const Cpu::TexStorageBase &storage = *textures[index >> 28]; const int tex = int(index & 0x00ffffff); - simd_fvec2 img_size; + fvec2 img_size; storage.GetFRes(tex, lod, value_ptr(img_size)); - simd_fvec2 _uvs = fract(uvs); + fvec2 _uvs = fract(uvs); _uvs = _uvs * img_size - 0.5f; #if USE_STOCH_TEXTURE_FILTERING @@ -3132,7 +3132,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::SampleBilinear(const 
Cpu::TexStorageBase *const t _uvs += rand; const auto &p00 = storage.Fetch(tex, int(_uvs.get<0>()), int(_uvs.get<1>()), lod); - return simd_fvec4{p00.v}; + return fvec4{p00.v}; #else // USE_STOCH_TEXTURE_FILTERING const auto &p00 = storage.Fetch(tex, int(_uvs.get<0>()) + 0, int(_uvs.get<1>()) + 0, lod); const auto &p01 = storage.Fetch(tex, int(_uvs.get<0>()) + 1, int(_uvs.get<1>()) + 0, lod); @@ -3141,70 +3141,70 @@ Ray::Ref::simd_fvec4 Ray::Ref::SampleBilinear(const Cpu::TexStorageBase *const t const float kx = fract(_uvs.get<0>()), ky = fract(_uvs.get<1>()); - const auto p0 = simd_fvec4{p01.v[0] * kx + p00.v[0] * (1 - kx), p01.v[1] * kx + p00.v[1] * (1 - kx), + const auto p0 = fvec4{p01.v[0] * kx + p00.v[0] * (1 - kx), p01.v[1] * kx + p00.v[1] * (1 - kx), p01.v[2] * kx + p00.v[2] * (1 - kx), p01.v[3] * kx + p00.v[3] * (1 - kx)}; - const auto p1 = simd_fvec4{p11.v[0] * kx + p10.v[0] * (1 - kx), p11.v[1] * kx + p10.v[1] * (1 - kx), + const auto p1 = fvec4{p11.v[0] * kx + p10.v[0] * (1 - kx), p11.v[1] * kx + p10.v[1] * (1 - kx), p11.v[2] * kx + p10.v[2] * (1 - kx), p11.v[3] * kx + p10.v[3] * (1 - kx)}; return (p1 * ky + p0 * (1.0f - ky)); #endif // USE_STOCH_TEXTURE_FILTERING } -Ray::Ref::simd_fvec4 Ray::Ref::SampleBilinear(const Cpu::TexStorageBase &storage, const uint32_t tex, - const simd_fvec2 &iuvs, const int lod, const simd_fvec2 &rand) { +Ray::Ref::fvec4 Ray::Ref::SampleBilinear(const Cpu::TexStorageBase &storage, const uint32_t tex, + const fvec2 &iuvs, const int lod, const fvec2 &rand) { #if USE_STOCH_TEXTURE_FILTERING // Jitter UVs - simd_fvec2 _uvs = iuvs + rand; + fvec2 _uvs = iuvs + rand; const auto &p00 = storage.Fetch(tex, int(_uvs.get<0>()), int(_uvs.get<1>()), lod); - return simd_fvec4{p00.v}; + return fvec4{p00.v}; #else // USE_STOCH_TEXTURE_FILTERING const auto &p00 = storage.Fetch(int(tex), int(iuvs.get<0>()) + 0, int(iuvs.get<1>()) + 0, lod); const auto &p01 = storage.Fetch(int(tex), int(iuvs.get<0>()) + 1, int(iuvs.get<1>()) + 0, lod); const 
auto &p10 = storage.Fetch(int(tex), int(iuvs.get<0>()) + 0, int(iuvs.get<1>()) + 1, lod); const auto &p11 = storage.Fetch(int(tex), int(iuvs.get<0>()) + 1, int(iuvs.get<1>()) + 1, lod); - const simd_fvec2 k = fract(iuvs); + const fvec2 k = fract(iuvs); - const auto _p00 = simd_fvec4{p00.v[0], p00.v[1], p00.v[2], p00.v[3]}; - const auto _p01 = simd_fvec4{p01.v[0], p01.v[1], p01.v[2], p01.v[3]}; - const auto _p10 = simd_fvec4{p10.v[0], p10.v[1], p10.v[2], p10.v[3]}; - const auto _p11 = simd_fvec4{p11.v[0], p11.v[1], p11.v[2], p11.v[3]}; + const auto _p00 = fvec4{p00.v[0], p00.v[1], p00.v[2], p00.v[3]}; + const auto _p01 = fvec4{p01.v[0], p01.v[1], p01.v[2], p01.v[3]}; + const auto _p10 = fvec4{p10.v[0], p10.v[1], p10.v[2], p10.v[3]}; + const auto _p11 = fvec4{p11.v[0], p11.v[1], p11.v[2], p11.v[3]}; - const simd_fvec4 p0X = _p01 * k.get<0>() + _p00 * (1 - k.get<0>()); - const simd_fvec4 p1X = _p11 * k.get<0>() + _p10 * (1 - k.get<0>()); + const fvec4 p0X = _p01 * k.get<0>() + _p00 * (1 - k.get<0>()); + const fvec4 p1X = _p11 * k.get<0>() + _p10 * (1 - k.get<0>()); return (p1X * k.get<1>() + p0X * (1 - k.get<1>())); #endif // USE_STOCH_TEXTURE_FILTERING } -Ray::Ref::simd_fvec4 Ray::Ref::SampleTrilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec2 &uvs, const float lod, const simd_fvec2 &rand) { - const simd_fvec4 col1 = SampleBilinear(textures, index, uvs, int(floorf(lod)), rand); - const simd_fvec4 col2 = SampleBilinear(textures, index, uvs, int(ceilf(lod)), rand); +Ray::Ref::fvec4 Ray::Ref::SampleTrilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, + const fvec2 &uvs, const float lod, const fvec2 &rand) { + const fvec4 col1 = SampleBilinear(textures, index, uvs, int(floorf(lod)), rand); + const fvec4 col2 = SampleBilinear(textures, index, uvs, int(ceilf(lod)), rand); const float k = fract(lod); return col1 * (1 - k) + col2 * k; } -Ray::Ref::simd_fvec4 Ray::Ref::SampleAnisotropic(const 
Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec2 &uvs, const simd_fvec2 &duv_dx, - const simd_fvec2 &duv_dy) { +Ray::Ref::fvec4 Ray::Ref::SampleAnisotropic(const Cpu::TexStorageBase *const textures[], const uint32_t index, + const fvec2 &uvs, const fvec2 &duv_dx, + const fvec2 &duv_dy) { const Cpu::TexStorageBase &storage = *textures[index >> 28]; const int tex = int(index & 0x00ffffff); - simd_fvec2 sz; + fvec2 sz; storage.GetFRes(tex, 0, value_ptr(sz)); - const simd_fvec2 _duv_dx = abs(duv_dx * sz); - const simd_fvec2 _duv_dy = abs(duv_dy * sz); + const fvec2 _duv_dx = abs(duv_dx * sz); + const fvec2 _duv_dy = abs(duv_dy * sz); const float l1 = length(_duv_dx); const float l2 = length(_duv_dy); float lod, k; - simd_fvec2 step; + fvec2 step; if (l1 <= l2) { lod = fast_log2(fminf(_duv_dx.get<0>(), _duv_dx.get<1>())); @@ -3218,19 +3218,19 @@ Ray::Ref::simd_fvec4 Ray::Ref::SampleAnisotropic(const Cpu::TexStorageBase *cons lod = clamp(lod, 0.0f, float(MAX_MIP_LEVEL)); - simd_fvec2 _uvs = uvs - step * 0.5f; + fvec2 _uvs = uvs - step * 0.5f; int num = int(2.0f / k); num = clamp(num, 1, 4); step = step / float(num); - auto res = simd_fvec4{0.0f}; + auto res = fvec4{0.0f}; const int lod1 = int(floorf(lod)); const int lod2 = int(ceilf(lod)); - simd_fvec2 size1, size2; + fvec2 size1, size2; storage.GetFRes(tex, lod1, value_ptr(size1)); storage.GetFRes(tex, lod2, value_ptr(size2)); @@ -3239,11 +3239,11 @@ Ray::Ref::simd_fvec4 Ray::Ref::SampleAnisotropic(const Cpu::TexStorageBase *cons for (int i = 0; i < num; ++i) { _uvs = fract(_uvs); - const simd_fvec2 _uvs1 = _uvs * size1; + const fvec2 _uvs1 = _uvs * size1; res += (1 - kz) * SampleBilinear(storage, tex, _uvs1, lod1, {}); if (kz > 0.0001f) { - const simd_fvec2 _uvs2 = _uvs * size2; + const fvec2 _uvs2 = _uvs * size2; res += kz * SampleBilinear(storage, tex, _uvs2, lod2, {}); } @@ -3253,8 +3253,8 @@ Ray::Ref::simd_fvec4 Ray::Ref::SampleAnisotropic(const Cpu::TexStorageBase *cons return res / 
float(num); } -Ray::Ref::simd_fvec4 Ray::Ref::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint32_t index, - const simd_fvec4 &dir, float y_rotation, const simd_fvec2 &rand) { +Ray::Ref::fvec4 Ray::Ref::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint32_t index, + const fvec4 &dir, float y_rotation, const fvec2 &rand) { const float theta = acosf(clamp(dir.get<1>(), -1.0f, 1.0f)) / PI; float phi = atan2f(dir.get<2>(), dir.get<0>()) + y_rotation; if (phi < 0) { @@ -3267,33 +3267,33 @@ Ray::Ref::simd_fvec4 Ray::Ref::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &sto const float u = fract(0.5f * phi / PI); const int tex = int(index & 0x00ffffff); - simd_fvec2 size; + fvec2 size; storage.GetFRes(tex, 0, value_ptr(size)); - simd_fvec2 uvs = simd_fvec2{u, theta} * size; + fvec2 uvs = fvec2{u, theta} * size; #if USE_STOCH_TEXTURE_FILTERING // Jitter UVs uvs += rand - 0.5f; - const simd_ivec2 iuvs = simd_ivec2(uvs); + const ivec2 iuvs = ivec2(uvs); const auto &p00 = storage.Get(tex, iuvs.get<0>(), iuvs.get<1>(), 0); return rgbe_to_rgb(p00); #else // USE_STOCH_TEXTURE_FILTERING - const simd_ivec2 iuvs = simd_ivec2(uvs); + const ivec2 iuvs = ivec2(uvs); const auto &p00 = storage.Get(tex, iuvs.get<0>() + 0, iuvs.get<1>() + 0, 0); const auto &p01 = storage.Get(tex, iuvs.get<0>() + 1, iuvs.get<1>() + 0, 0); const auto &p10 = storage.Get(tex, iuvs.get<0>() + 0, iuvs.get<1>() + 1, 0); const auto &p11 = storage.Get(tex, iuvs.get<0>() + 1, iuvs.get<1>() + 1, 0); - const simd_fvec2 k = fract(uvs); + const fvec2 k = fract(uvs); - const simd_fvec4 _p00 = rgbe_to_rgb(p00), _p01 = rgbe_to_rgb(p01); - const simd_fvec4 _p10 = rgbe_to_rgb(p10), _p11 = rgbe_to_rgb(p11); + const fvec4 _p00 = rgbe_to_rgb(p00), _p01 = rgbe_to_rgb(p01); + const fvec4 _p10 = rgbe_to_rgb(p10), _p11 = rgbe_to_rgb(p11); - const simd_fvec4 p0X = _p01 * k.get<0>() + _p00 * (1 - k.get<0>()); - const simd_fvec4 p1X = _p11 * k.get<0>() + _p10 * (1 - k.get<0>()); + const fvec4 p0X = _p01 * 
k.get<0>() + _p00 * (1 - k.get<0>()); + const fvec4 p1X = _p11 * k.get<0>() + _p10 * (1 - k.get<0>()); return (p1X * k.get<1>() + p0X * (1 - k.get<1>())); #endif // USE_STOCH_TEXTURE_FILTERING @@ -3307,8 +3307,8 @@ void Ray::Ref::IntersectScene(Span rays, const int min_transp_depth, ray_data_t &r = rays[i]; hit_data_t &inter = out_inter[i]; - const simd_fvec4 rd = make_fvec3(r.d); - simd_fvec4 ro = make_fvec3(r.o); + const fvec4 rd = make_fvec3(r.d); + fvec4 ro = make_fvec3(r.o); const uint32_t ray_flags = (1u << get_ray_type(r.depth)); @@ -3352,11 +3352,11 @@ void Ray::Ref::IntersectScene(Span rays, const int min_transp_depth, const vertex_t &v3 = sc.vertices[sc.vtx_indices[tri_index * 3 + 2]]; const float w = 1.0f - inter.u - inter.v; - const simd_fvec2 uvs = simd_fvec2(v1.t) * w + simd_fvec2(v2.t) * inter.u + simd_fvec2(v3.t) * inter.v; + const fvec2 uvs = fvec2(v1.t) * w + fvec2(v2.t) * inter.u + fvec2(v3.t) * inter.v; - const simd_fvec2 mix_term_rand = + const fvec2 mix_term_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_BSDF_PICK, rand_hash, iteration - 1, rand_seq); - const simd_fvec2 tex_rand = + const fvec2 tex_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq); float trans_r = mix_term_rand.get<0>(); @@ -3366,7 +3366,7 @@ void Ray::Ref::IntersectScene(Span rays, const int min_transp_depth, float mix_val = mat->strength; const uint32_t base_texture = mat->textures[BASE_TEXTURE]; if (base_texture != 0xffffffff) { - simd_fvec4 tex_color = SampleBilinear(textures, base_texture, uvs, 0, tex_rand); + fvec4 tex_color = SampleBilinear(textures, base_texture, uvs, 0, tex_rand); if (base_texture & TEX_YCOCG_BIT) { tex_color = YCoCg_to_RGB(tex_color); } @@ -3423,13 +3423,13 @@ void Ray::Ref::IntersectScene(Span rays, const int min_transp_depth, } } -Ray::Ref::simd_fvec4 Ray::Ref::IntersectScene(const shadow_ray_t &r, const int max_transp_depth, const scene_data_t &sc, +Ray::Ref::fvec4 Ray::Ref::IntersectScene(const 
shadow_ray_t &r, const int max_transp_depth, const scene_data_t &sc, const uint32_t root_index, const uint32_t rand_seq[], const uint32_t rand_seed, const int iteration, const Cpu::TexStorageBase *const textures[]) { - const simd_fvec4 rd = make_fvec3(r.d); - simd_fvec4 ro = make_fvec3(r.o); - simd_fvec4 rc = make_fvec3(r.c); + const fvec4 rd = make_fvec3(r.d); + fvec4 ro = make_fvec3(r.o); + fvec4 rc = make_fvec3(r.c); int depth = get_transp_depth(r.depth); const uint32_t px_hash = hash(r.xy); @@ -3472,9 +3472,9 @@ Ray::Ref::simd_fvec4 Ray::Ref::IntersectScene(const shadow_ray_t &r, const int m const vertex_t &v3 = sc.vertices[sc.vtx_indices[tri_index * 3 + 2]]; const float w = 1.0f - inter.u - inter.v; - const simd_fvec2 sh_uvs = simd_fvec2(v1.t) * w + simd_fvec2(v2.t) * inter.u + simd_fvec2(v3.t) * inter.v; + const fvec2 sh_uvs = fvec2(v1.t) * w + fvec2(v2.t) * inter.u + fvec2(v3.t) * inter.v; - const simd_fvec2 tex_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq); + const fvec2 tex_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq); struct { uint32_t index; @@ -3484,7 +3484,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::IntersectScene(const shadow_ray_t &r, const int m stack[stack_size++] = {mat_index, 1.0f}; - simd_fvec4 throughput = 0.0f; + fvec4 throughput = 0.0f; while (stack_size--) { const material_t *mat = &sc.materials[stack[stack_size].index]; @@ -3495,7 +3495,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::IntersectScene(const shadow_ray_t &r, const int m float mix_val = mat->strength; const uint32_t base_texture = mat->textures[BASE_TEXTURE]; if (base_texture != 0xffffffff) { - simd_fvec4 tex_color = SampleBilinear(textures, base_texture, sh_uvs, 0, tex_rand); + fvec4 tex_color = SampleBilinear(textures, base_texture, sh_uvs, 0, tex_rand); if (base_texture & TEX_YCOCG_BIT) { tex_color = YCoCg_to_RGB(tex_color); } @@ -3528,10 +3528,10 @@ Ray::Ref::simd_fvec4 Ray::Ref::IntersectScene(const 
shadow_ray_t &r, const int m return rc; } -void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, +void Ray::Ref::SampleLightSource(const fvec4 &P, const fvec4 &T, const fvec4 &B, const fvec4 &N, const scene_data_t &sc, const Cpu::TexStorageBase *const textures[], - const float rand_pick_light, const simd_fvec2 rand_light_uv, - const simd_fvec2 rand_tex_uv, light_sample_t &ls) { + const float rand_pick_light, const fvec2 rand_light_uv, + const fvec2 rand_tex_uv, light_sample_t &ls) { float u1 = rand_pick_light; #if USE_HIERARCHICAL_NEE @@ -3543,7 +3543,7 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const calc_lnode_importance(sc.light_wnodes[i], P, importance); const float total_importance = - hsum(simd_fvec4{&importance[0], simd_mem_aligned} + simd_fvec4{&importance[4], simd_mem_aligned}); + hsum(fvec4{&importance[0], vector_aligned} + fvec4{&importance[4], vector_aligned}); if (total_importance == 0.0f) { // failed to find lightsource for sampling return; @@ -3561,9 +3561,9 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const } }) - simd_ivec4 less_eq[2] = {}; - where(simd_fvec4{&factors_cdf[1]} <= u1, less_eq[0]) = 1; - where(simd_fvec4{&factors_cdf[5]} <= u1, less_eq[1]) = 1; + ivec4 less_eq[2] = {}; + where(fvec4{&factors_cdf[1]} <= u1, less_eq[0]) = 1; + where(fvec4{&factors_cdf[5]} <= u1, less_eq[1]) = 1; const int next = hsum(less_eq[0] + less_eq[1]); assert(next < 8); @@ -3589,16 +3589,16 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const if (l.type == LIGHT_TYPE_SPHERE) { const float r1 = rand_light_uv.get<0>(), r2 = rand_light_uv.get<1>(); - const simd_fvec4 center = make_fvec3(l.sph.pos); - const simd_fvec4 surface_to_center = center - P; + const fvec4 center = make_fvec3(l.sph.pos); + const fvec4 surface_to_center = center - P; float disk_dist; - const simd_fvec4 sampled_dir = 
normalize_len(map_to_cone(r1, r2, surface_to_center, l.sph.radius), disk_dist); + const fvec4 sampled_dir = normalize_len(map_to_cone(r1, r2, surface_to_center, l.sph.radius), disk_dist); if (l.sph.radius > 0.0f) { const float ls_dist = sphere_intersection(center, l.sph.radius, P, sampled_dir); - const simd_fvec4 light_surf_pos = P + sampled_dir * ls_dist; - const simd_fvec4 light_forward = normalize(light_surf_pos - center); + const fvec4 light_surf_pos = P + sampled_dir * ls_dist; + const fvec4 light_forward = normalize(light_surf_pos - center); ls.lp = offset_ray(light_surf_pos, light_forward); ls.pdf = (disk_dist * disk_dist) / (PI * l.sph.radius * l.sph.radius); @@ -3614,7 +3614,7 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const } if (l.sph.spot > 0.0f) { - const float _dot = -dot(ls.L, simd_fvec4{l.sph.dir}); + const float _dot = -dot(ls.L, fvec4{l.sph.dir}); if (_dot > 0.0f) { const float _angle = acosf(saturate(_dot)); ls.col *= saturate((l.sph.spot - _angle) / l.sph.blend); @@ -3643,11 +3643,11 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const ls.area = 0.0f; } } else if (l.type == LIGHT_TYPE_RECT) { - const simd_fvec4 light_pos = make_fvec3(l.rect.pos); - const simd_fvec4 light_u = make_fvec3(l.rect.u), light_v = make_fvec3(l.rect.v); - const simd_fvec4 light_forward = normalize(cross(light_u, light_v)); + const fvec4 light_pos = make_fvec3(l.rect.pos); + const fvec4 light_u = make_fvec3(l.rect.u), light_v = make_fvec3(l.rect.v); + const fvec4 light_forward = normalize(cross(light_u, light_v)); - simd_fvec4 lp; + fvec4 lp; float pdf; #if USE_SPHERICAL_AREA_LIGHT_SAMPLING @@ -3668,7 +3668,7 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const ls.pdf = (pdf > 0.0f) ? pdf : (ls_dist * ls_dist) / (l.rect.area * cos_theta); ls.area = l.visible ? 
l.rect.area : 0.0f; if (l.sky_portal != 0) { - simd_fvec4 env_col = make_fvec3(sc.env.env_col); + fvec4 env_col = make_fvec3(sc.env.env_col); if (sc.env.env_map != 0xffffffff) { env_col *= SampleLatlong_RGBE(*static_cast(textures[0]), sc.env.env_map, ls.L, sc.env.env_map_rotation, rand_tex_uv); @@ -3678,12 +3678,12 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const } } } else if (l.type == LIGHT_TYPE_DISK) { - const simd_fvec4 light_pos = make_fvec3(l.disk.pos); - const simd_fvec4 light_u = make_fvec3(l.disk.u), light_v = make_fvec3(l.disk.v); + const fvec4 light_pos = make_fvec3(l.disk.pos); + const fvec4 light_u = make_fvec3(l.disk.u), light_v = make_fvec3(l.disk.v); const float r1 = rand_light_uv.get<0>(), r2 = rand_light_uv.get<1>(); - simd_fvec2 offset = 2.0f * simd_fvec2{r1, r2} - simd_fvec2{1.0f, 1.0f}; + fvec2 offset = 2.0f * fvec2{r1, r2} - fvec2{1.0f, 1.0f}; if (offset.get<0>() != 0.0f && offset.get<1>() != 0.0f) { float theta, r; if (fabsf(offset.get<0>()) > fabsf(offset.get<1>())) { @@ -3698,8 +3698,8 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const offset.set(1, 0.5f * r * sinf(theta)); } - const simd_fvec4 lp = light_pos + light_u * offset.get<0>() + light_v * offset.get<1>(); - const simd_fvec4 light_forward = normalize(cross(light_u, light_v)); + const fvec4 lp = light_pos + light_u * offset.get<0>() + light_v * offset.get<1>(); + const fvec4 light_forward = normalize(cross(light_u, light_v)); ls.lp = offset_ray(lp, light_forward); float ls_dist; @@ -3716,7 +3716,7 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const } if (l.sky_portal != 0) { - simd_fvec4 env_col = make_fvec3(sc.env.env_col); + fvec4 env_col = make_fvec3(sc.env.env_col); if (sc.env.env_map != 0xffffffff) { env_col *= SampleLatlong_RGBE(*static_cast(textures[0]), sc.env.env_map, ls.L, sc.env.env_map_rotation, rand_tex_uv); @@ -3725,20 +3725,20 @@ void Ray::Ref::SampleLightSource(const 
simd_fvec4 &P, const simd_fvec4 &T, const ls.from_env = 1; } } else if (l.type == LIGHT_TYPE_LINE) { - const simd_fvec4 light_pos = make_fvec3(l.line.pos); - const simd_fvec4 light_dir = make_fvec3(l.line.v); + const fvec4 light_pos = make_fvec3(l.line.pos); + const fvec4 light_dir = make_fvec3(l.line.v); const float r1 = rand_light_uv.get<0>(), r2 = rand_light_uv.get<1>(); - const simd_fvec4 center_to_surface = P - light_pos; + const fvec4 center_to_surface = P - light_pos; - simd_fvec4 light_u = normalize(cross(center_to_surface, light_dir)); - simd_fvec4 light_v = cross(light_u, light_dir); + fvec4 light_u = normalize(cross(center_to_surface, light_dir)); + fvec4 light_v = cross(light_u, light_dir); const float phi = PI * r1; - const simd_fvec4 normal = cosf(phi) * light_u + sinf(phi) * light_v; + const fvec4 normal = cosf(phi) * light_u + sinf(phi) * light_v; - const simd_fvec4 lp = light_pos + normal * l.line.radius + (r2 - 0.5f) * light_dir * l.line.height; + const fvec4 lp = light_pos + normal * l.line.radius + (r2 - 0.5f) * light_dir * l.line.height; ls.lp = lp; float ls_dist; @@ -3761,18 +3761,18 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const &v2 = sc.vertices[sc.vtx_indices[ltri_index * 3 + 1]], &v3 = sc.vertices[sc.vtx_indices[ltri_index * 3 + 2]]; - const simd_fvec4 p1 = TransformPoint(simd_fvec4(v1.p[0], v1.p[1], v1.p[2], 0.0f), lmi.xform), - p2 = TransformPoint(simd_fvec4(v2.p[0], v2.p[1], v2.p[2], 0.0f), lmi.xform), - p3 = TransformPoint(simd_fvec4(v3.p[0], v3.p[1], v3.p[2], 0.0f), lmi.xform); - const simd_fvec2 uv1 = simd_fvec2(v1.t), uv2 = simd_fvec2(v2.t), uv3 = simd_fvec2(v3.t); + const fvec4 p1 = TransformPoint(fvec4(v1.p[0], v1.p[1], v1.p[2], 0.0f), lmi.xform), + p2 = TransformPoint(fvec4(v2.p[0], v2.p[1], v2.p[2], 0.0f), lmi.xform), + p3 = TransformPoint(fvec4(v3.p[0], v3.p[1], v3.p[2], 0.0f), lmi.xform); + const fvec2 uv1 = fvec2(v1.t), uv2 = fvec2(v2.t), uv3 = fvec2(v3.t); - const simd_fvec4 e1 = p2 - p1, 
e2 = p3 - p1; + const fvec4 e1 = p2 - p1, e2 = p3 - p1; float light_fwd_len; - const simd_fvec4 light_forward = normalize_len(cross(e1, e2), light_fwd_len); + const fvec4 light_forward = normalize_len(cross(e1, e2), light_fwd_len); ls.area = 0.5f * light_fwd_len; - simd_fvec4 lp; - simd_fvec2 luvs; + fvec4 lp; + fvec2 luvs; float pdf; #if USE_SPHERICAL_AREA_LIGHT_SAMPLING @@ -3780,8 +3780,8 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const pdf = SampleSphericalTriangle(P, p1, p2, p3, rand_light_uv, &ls.L); if (pdf > 0.0f) { // find u, v of intersection point - const simd_fvec4 pvec = cross(ls.L, e2); - const simd_fvec4 tvec = P - p1, qvec = cross(tvec, e1); + const fvec4 pvec = cross(ls.L, e2); + const fvec4 tvec = P - p1, qvec = cross(tvec, e1); const float inv_det = 1.0f / dot(e1, pvec); const float tri_u = dot(tvec, pvec) * inv_det, tri_v = dot(ls.L, qvec) * inv_det; @@ -3812,7 +3812,7 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const if (cos_theta > 0.0f) { ls.pdf = pdf; if (l.tri.tex_index != 0xffffffff) { - simd_fvec4 tex_color = SampleBilinear(textures, l.tri.tex_index, luvs, 0 /* lod */, rand_tex_uv); + fvec4 tex_color = SampleBilinear(textures, l.tri.tex_index, luvs, 0 /* lod */, rand_tex_uv); if (l.tri.tex_index & TEX_YCOCG_BIT) { tex_color = YCoCg_to_RGB(tex_color); } @@ -3825,10 +3825,10 @@ void Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const } else if (l.type == LIGHT_TYPE_ENV) { const float rx = rand_light_uv.get<0>(), ry = rand_light_uv.get<1>(); - simd_fvec4 dir_and_pdf; + fvec4 dir_and_pdf; if (sc.env.qtree_levels) { // Sample environment using quadtree - const auto *qtree_mips = reinterpret_cast(sc.env.qtree_mips); + const auto *qtree_mips = reinterpret_cast(sc.env.qtree_mips); dir_and_pdf = Sample_EnvQTree(sc.env.env_map_rotation, qtree_mips, sc.env.qtree_levels, u1, rx, ry); } else { // Sample environment as hemishpere @@ -3836,13 +3836,13 @@ void 
Ray::Ref::SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const const float cos_phi = cosf(phi), sin_phi = sinf(phi); const float dir = sqrtf(1.0f - rx * rx); - auto V = simd_fvec4{dir * cos_phi, dir * sin_phi, rx, 0.0f}; // in tangent-space + auto V = fvec4{dir * cos_phi, dir * sin_phi, rx, 0.0f}; // in tangent-space dir_and_pdf = world_from_tangent(T, B, N, V); dir_and_pdf.set<3>(0.5f / PI); } - ls.L = simd_fvec4{dir_and_pdf.get<0>(), dir_and_pdf.get<1>(), dir_and_pdf.get<2>(), 0.0f}; + ls.L = fvec4{dir_and_pdf.get<0>(), dir_and_pdf.get<1>(), dir_and_pdf.get<2>(), 0.0f}; ls.col *= {sc.env.env_col[0], sc.env.env_col[1], sc.env.env_col[2], 0.0f}; if (sc.env.env_map != 0xffffffff) { @@ -3866,8 +3866,8 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span rays, Span rays, Span rays, Span= 0.0f) { @@ -3987,7 +3987,7 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span HIT_EPS && (t1 < inout_inter.t || no_shadow)) { bool accept = true; if (l.sph.spot > 0.0f) { - const float _dot = -dot(rd, simd_fvec4{l.sph.dir}); + const float _dot = -dot(rd, fvec4{l.sph.dir}); if (_dot > 0.0f) { const float _angle = acosf(saturate(_dot)); accept &= (_angle <= l.sph.spot); @@ -4009,7 +4009,7 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span cosf(l.dir.angle)) { inout_inter.v = 0.0f; @@ -4018,10 +4018,10 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span rays, Span= -0.5f && a1 <= 0.5f) { const float a2 = dot(light_v, vi); @@ -4045,10 +4045,10 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span rays, Span rays, Span() * _rd.get<2>() + _rd.get<1>() * _rd.get<1>(); const float B = 2.0f * (_rd.get<2>() * _ro.get<2>() + _rd.get<1>() * _ro.get<1>()); @@ -4088,7 +4088,7 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span HIT_EPS && t1 > HIT_EPS) { const float t = fminf(t0, t1); - const simd_fvec4 p = _ro + t * _rd; + const fvec4 p = _ro + t * _rd; if (fabsf(p.get<0>()) < 0.5f * l.line.height && (t < inout_inter.t || no_shadow)) { inout_inter.v = 0.0f; 
inout_inter.obj_index = -int(light_index) - 1; @@ -4112,8 +4112,8 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span rays, Span= 0.0f) { @@ -4183,7 +4183,7 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span HIT_EPS && (t1 < inout_inter.t || no_shadow)) { bool accept = true; if (l.sph.spot > 0.0f) { - const float _dot = -dot(rd, simd_fvec4{l.sph.dir}); + const float _dot = -dot(rd, fvec4{l.sph.dir}); if (_dot > 0.0f) { const float _angle = acosf(saturate(_dot)); accept &= (_angle <= l.sph.spot); @@ -4205,7 +4205,7 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span cosf(l.dir.angle)) { inout_inter.v = 0.0f; @@ -4214,10 +4214,10 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span rays, Span= -0.5f && a1 <= 0.5f) { const float a2 = dot(light_v, vi); @@ -4241,10 +4241,10 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span rays, Span rays, Span() * _rd.get<2>() + _rd.get<1>() * _rd.get<1>(); const float B = 2.0f * (_rd.get<2>() * _ro.get<2>() + _rd.get<1>() * _ro.get<1>()); @@ -4284,7 +4284,7 @@ void Ray::Ref::IntersectAreaLights(Span rays, Span HIT_EPS && t1 > HIT_EPS) { const float t = fminf(t0, t1); - const simd_fvec4 p = _ro + t * _rd; + const fvec4 p = _ro + t * _rd; if (fabsf(p.get<0>()) < 0.5f * l.line.height && (t < inout_inter.t || no_shadow)) { inout_inter.v = 0.0f; inout_inter.obj_index = -int(light_index) - 1; @@ -4306,8 +4306,8 @@ float Ray::Ref::IntersectAreaLights(const shadow_ray_t &ray, Span Span nodes) { const float rdist = fabsf(ray.dist); - const simd_fvec4 ro = make_fvec3(ray.o); - const simd_fvec4 rd = make_fvec3(ray.d); + const fvec4 ro = make_fvec3(ray.o); + const fvec4 rd = make_fvec3(ray.d); float inv_d[3]; safe_invert(value_ptr(rd), inv_d); @@ -4395,9 +4395,9 @@ float Ray::Ref::IntersectAreaLights(const shadow_ray_t &ray, Span continue; } if (l.type == LIGHT_TYPE_RECT) { - const simd_fvec4 light_pos = make_fvec3(l.rect.pos); - simd_fvec4 light_u = make_fvec3(l.rect.u), light_v = make_fvec3(l.rect.v); - const simd_fvec4 
light_forward = normalize(cross(light_u, light_v)); + const fvec4 light_pos = make_fvec3(l.rect.pos); + fvec4 light_u = make_fvec3(l.rect.u), light_v = make_fvec3(l.rect.v); + const fvec4 light_forward = normalize(cross(light_u, light_v)); const float plane_dist = dot(light_forward, light_pos); const float cos_theta = dot(rd, light_forward); @@ -4408,7 +4408,7 @@ float Ray::Ref::IntersectAreaLights(const shadow_ray_t &ray, Span light_v /= dot(light_v, light_v); const auto p = ro + rd * t; - const simd_fvec4 vi = p - light_pos; + const fvec4 vi = p - light_pos; const float a1 = dot(light_u, vi); if (a1 >= -0.5f && a1 <= 0.5f) { const float a2 = dot(light_v, vi); @@ -4418,10 +4418,10 @@ float Ray::Ref::IntersectAreaLights(const shadow_ray_t &ray, Span } } } else if (l.type == LIGHT_TYPE_DISK) { - const simd_fvec4 light_pos = make_fvec3(l.disk.pos); - simd_fvec4 light_u = make_fvec3(l.disk.u), light_v = make_fvec3(l.disk.v); + const fvec4 light_pos = make_fvec3(l.disk.pos); + fvec4 light_u = make_fvec3(l.disk.u), light_v = make_fvec3(l.disk.v); - const simd_fvec4 light_forward = normalize(cross(light_u, light_v)); + const fvec4 light_forward = normalize(cross(light_u, light_v)); const float plane_dist = dot(light_forward, light_pos); const float cos_theta = dot(rd, light_forward); @@ -4432,7 +4432,7 @@ float Ray::Ref::IntersectAreaLights(const shadow_ray_t &ray, Span light_v /= dot(light_v, light_v); const auto p = ro + rd * t; - const simd_fvec4 vi = p - light_pos; + const fvec4 vi = p - light_pos; const float a1 = dot(light_u, vi); const float a2 = dot(light_v, vi); @@ -4446,7 +4446,7 @@ float Ray::Ref::IntersectAreaLights(const shadow_ray_t &ray, Span return 1.0f; } -float Ray::Ref::EvalTriLightFactor(const simd_fvec4 &P, const simd_fvec4 &ro, const uint32_t tri_index, +float Ray::Ref::EvalTriLightFactor(const fvec4 &P, const fvec4 &ro, const uint32_t tri_index, Span lights, Span nodes) { uint32_t stack[MAX_STACK_SIZE]; float stack_factors[MAX_STACK_SIZE]; @@ 
-4496,7 +4496,7 @@ float Ray::Ref::EvalTriLightFactor(const simd_fvec4 &P, const simd_fvec4 &ro, co return 1.0f; } -float Ray::Ref::EvalTriLightFactor(const simd_fvec4 &P, const simd_fvec4 &ro, uint32_t tri_index, +float Ray::Ref::EvalTriLightFactor(const fvec4 &P, const fvec4 &ro, uint32_t tri_index, Span lights, Span nodes) { uint32_t stack[MAX_STACK_SIZE]; float stack_factors[MAX_STACK_SIZE]; @@ -4516,7 +4516,7 @@ float Ray::Ref::EvalTriLightFactor(const simd_fvec4 &P, const simd_fvec4 &ro, ui calc_lnode_importance(nodes[cur], ro, importance); const float total_importance = - hsum(simd_fvec4{&importance[0], simd_mem_aligned} + simd_fvec4{&importance[4], simd_mem_aligned}); + hsum(fvec4{&importance[0], vector_aligned} + fvec4{&importance[4], vector_aligned}); assert(total_importance > 0.0f); do { @@ -4568,7 +4568,7 @@ void Ray::Ref::TraceShadowRays(Span rays, int max_transp_dep const int x = (sh_r.xy >> 16) & 0x0000ffff; const int y = sh_r.xy & 0x0000ffff; - simd_fvec4 rc = + fvec4 rc = IntersectScene(sh_r, max_transp_depth, sc, node_index, rand_seq, rand_seed, iteration, textures); if (sc.blocker_lights_count) { rc *= IntersectAreaLights(sh_r, sc.lights, sc.light_wnodes); @@ -4580,17 +4580,17 @@ void Ray::Ref::TraceShadowRays(Span rays, int max_transp_dep rc *= (limit / sum); } - auto old_val = simd_fvec4{out_color[y * img_w + x].v, simd_mem_aligned}; + auto old_val = fvec4{out_color[y * img_w + x].v, vector_aligned}; old_val += rc; - old_val.store_to(out_color[y * img_w + x].v, simd_mem_aligned); + old_val.store_to(out_color[y * img_w + x].v, vector_aligned); } } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_EnvColor(const ray_data_t &ray, const environment_t &env, +Ray::Ref::fvec4 Ray::Ref::Evaluate_EnvColor(const ray_data_t &ray, const environment_t &env, const Cpu::TexStorageRGBA &tex_storage, const float pdf_factor, - const simd_fvec2 &rand) { - const simd_fvec4 I = make_fvec3(ray.d); - simd_fvec4 env_col = 1.0f; + const fvec2 &rand) { + const fvec4 I = 
make_fvec3(ray.d); + fvec4 env_col = 1.0f; const uint32_t env_map = is_indirect(ray.depth) ? env.env_map : env.back_map; const float env_map_rotation = is_indirect(ray.depth) ? env.env_map_rotation : env.back_map_rotation; @@ -4601,7 +4601,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_EnvColor(const ray_data_t &ray, const en #if USE_NEE if (env.light_index != 0xffffffff && pdf_factor >= 0.0f && is_indirect(ray.depth)) { if (env.qtree_levels) { - const auto *qtree_mips = reinterpret_cast(env.qtree_mips); + const auto *qtree_mips = reinterpret_cast(env.qtree_mips); const float light_pdf = safe_div_pos(Evaluate_EnvQTree(env_map_rotation, qtree_mips, env.qtree_levels, I), pdf_factor); @@ -4619,18 +4619,18 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_EnvColor(const ray_data_t &ray, const en } #endif - env_col *= is_indirect(ray.depth) ? simd_fvec4{env.env_col[0], env.env_col[1], env.env_col[2], 1.0f} - : simd_fvec4{env.back_col[0], env.back_col[1], env.back_col[2], 1.0f}; + env_col *= is_indirect(ray.depth) ? 
fvec4{env.env_col[0], env.env_col[1], env.env_col[2], 1.0f} + : fvec4{env.back_col[0], env.back_col[1], env.back_col[2], 1.0f}; env_col.set<3>(1.0f); return env_col; } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const hit_data_t &inter, +Ray::Ref::fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const hit_data_t &inter, const environment_t &env, const Cpu::TexStorageRGBA &tex_storage, Span lights, const uint32_t lights_count, - const simd_fvec2 &rand) { - const simd_fvec4 ro = make_fvec3(ray.o), I = make_fvec3(ray.d); + const fvec2 &rand) { + const fvec4 ro = make_fvec3(ray.o), I = make_fvec3(ray.d); const light_t &l = lights[-inter.obj_index - 1]; #if USE_HIERARCHICAL_NEE @@ -4639,9 +4639,9 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const const float pdf_factor = float(lights_count); #endif - simd_fvec4 lcol = make_fvec3(l.col); + fvec4 lcol = make_fvec3(l.col); if (l.sky_portal != 0) { - simd_fvec4 env_col = make_fvec3(env.env_col); + fvec4 env_col = make_fvec3(env.env_col); if (env.env_map != 0xffffffff) { env_col *= SampleLatlong_RGBE(tex_storage, env.env_map, I, env.env_map_rotation, rand); } @@ -4649,9 +4649,9 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const } #if USE_NEE if (l.type == LIGHT_TYPE_SPHERE) { - const simd_fvec4 light_pos = make_fvec3(l.sph.pos); + const fvec4 light_pos = make_fvec3(l.sph.pos); - const simd_fvec4 disk_normal = normalize(ro - light_pos); + const fvec4 disk_normal = normalize(ro - light_pos); const float disk_dist = dot(ro, disk_normal) - dot(light_pos, disk_normal); const float light_pdf = (disk_dist * disk_dist) / (PI * l.sph.radius * l.sph.radius * pdf_factor); @@ -4661,7 +4661,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const lcol *= mis_weight; if (l.sph.spot > 0.0f && l.sph.blend > 0.0f) { - const float _dot = -dot(I, simd_fvec4{l.sph.dir}); + const float _dot = -dot(I, 
fvec4{l.sph.dir}); assert(_dot > 0.0f); const float _angle = acosf(saturate(_dot)); assert(_angle <= l.sph.spot); @@ -4681,8 +4681,8 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const const float mis_weight = power_heuristic(bsdf_pdf, light_pdf); lcol *= mis_weight; } else if (l.type == LIGHT_TYPE_RECT) { - const simd_fvec4 light_pos = make_fvec3(l.rect.pos); - const simd_fvec4 light_u = make_fvec3(l.rect.u), light_v = make_fvec3(l.rect.v); + const fvec4 light_pos = make_fvec3(l.rect.pos); + const fvec4 light_u = make_fvec3(l.rect.u), light_v = make_fvec3(l.rect.v); float light_pdf; #if USE_SPHERICAL_AREA_LIGHT_SAMPLING @@ -4690,7 +4690,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const if (light_pdf == 0.0f) #endif { - const simd_fvec4 light_forward = normalize(cross(light_u, light_v)); + const fvec4 light_forward = normalize(cross(light_u, light_v)); const float light_area = l.rect.area; const float cos_theta = dot(I, light_forward); light_pdf = (inter.t * inter.t) / (light_area * cos_theta * pdf_factor); @@ -4700,9 +4700,9 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const const float mis_weight = power_heuristic(bsdf_pdf, light_pdf); lcol *= mis_weight; } else if (l.type == LIGHT_TYPE_DISK) { - simd_fvec4 light_u = make_fvec3(l.disk.u), light_v = make_fvec3(l.disk.v); + fvec4 light_u = make_fvec3(l.disk.u), light_v = make_fvec3(l.disk.v); - const simd_fvec4 light_forward = normalize(cross(light_u, light_v)); + const fvec4 light_forward = normalize(cross(light_u, light_v)); const float light_area = l.disk.area; const float cos_theta = dot(I, light_forward); @@ -4713,7 +4713,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const const float mis_weight = power_heuristic(bsdf_pdf, light_pdf); lcol *= mis_weight; } else if (l.type == LIGHT_TYPE_LINE) { - const simd_fvec4 light_dir = make_fvec3(l.line.v); + const fvec4 light_dir = 
make_fvec3(l.line.v); const float light_area = l.line.area; const float cos_theta = 1.0f - fabsf(dot(I, light_dir)); @@ -4728,13 +4728,13 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_LightColor(const ray_data_t &ray, const return lcol; } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, - const surface_t &surf, const simd_fvec4 &base_color, +Ray::Ref::fvec4 Ray::Ref::Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, + const surface_t &surf, const fvec4 &base_color, const float roughness, const float mix_weight, const bool use_mis, shadow_ray_t &sh_r) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); - const simd_fvec4 diff_col = Evaluate_OrenDiffuse_BSDF(-I, surf.N, ls.L, roughness, base_color); + const fvec4 diff_col = Evaluate_OrenDiffuse_BSDF(-I, surf.N, ls.L, roughness, base_color); const float bsdf_pdf = diff_col[3]; float mis_weight = 1.0f; @@ -4742,7 +4742,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_DiffuseNode(const light_sample_t &ls, co mis_weight = power_heuristic(ls.pdf, bsdf_pdf); } - const simd_fvec4 lcol = ls.col * diff_col * (mix_weight * mis_weight / ls.pdf); + const fvec4 lcol = ls.col * diff_col * (mix_weight * mis_weight / ls.pdf); if (!ls.cast_shadow) { // apply light immediately @@ -4752,16 +4752,16 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_DiffuseNode(const light_sample_t &ls, co // schedule shadow ray memcpy(&sh_r.o[0], value_ptr(offset_ray(surf.P, surf.plane_N)), 3 * sizeof(float)); UNROLLED_FOR(i, 3, { sh_r.c[i] = ray.c[i] * lcol[i]; }) - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } -void Ray::Ref::Sample_DiffuseNode(const ray_data_t &ray, const surface_t &surf, const simd_fvec4 &base_color, - const float roughness, const simd_fvec2 rand, const float mix_weight, +void Ray::Ref::Sample_DiffuseNode(const ray_data_t &ray, const surface_t &surf, const fvec4 &base_color, + const float roughness, const fvec2 rand, const float mix_weight, 
ray_data_t &new_ray) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); - simd_fvec4 V; - const simd_fvec4 F = Sample_OrenDiffuse_BSDF(surf.T, surf.B, surf.N, I, roughness, base_color, rand, V); + fvec4 V; + const fvec4 F = Sample_OrenDiffuse_BSDF(surf.T, surf.B, surf.N, I, roughness, base_color, rand, V); new_ray.depth = pack_ray_type(RAY_TYPE_DIFFUSE); new_ray.depth |= mask_ray_depth(ray.depth) + pack_ray_depth(1, 0, 0, 0); @@ -4772,24 +4772,24 @@ void Ray::Ref::Sample_DiffuseNode(const ray_data_t &ray, const surface_t &surf, new_ray.pdf = F[3]; } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, - const surface_t &surf, const simd_fvec4 &base_color, +Ray::Ref::fvec4 Ray::Ref::Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, + const surface_t &surf, const fvec4 &base_color, const float roughness, const float regularize_alpha, const float spec_ior, const float spec_F0, const float mix_weight, const bool use_mis, shadow_ray_t &sh_r) { - const simd_fvec4 I = make_fvec3(ray.d); - const simd_fvec4 H = normalize(ls.L - I); + const fvec4 I = make_fvec3(ray.d); + const fvec4 H = normalize(ls.L - I); - const simd_fvec4 view_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, -I); - const simd_fvec4 light_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, ls.L); - const simd_fvec4 sampled_normal_ts = tangent_from_world(surf.T, surf.B, surf.N, H); + const fvec4 view_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, -I); + const fvec4 light_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, ls.L); + const fvec4 sampled_normal_ts = tangent_from_world(surf.T, surf.B, surf.N, H); - const simd_fvec2 alpha = calc_alpha(roughness, 0.0f, regularize_alpha); + const fvec2 alpha = calc_alpha(roughness, 0.0f, regularize_alpha); if (alpha.get<0>() * alpha.get<1>() < 1e-7f) { - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } - const simd_fvec4 spec_col = 
Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, alpha, spec_ior, + const fvec4 spec_col = Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, alpha, spec_ior, spec_F0, base_color, base_color); const float bsdf_pdf = spec_col[3]; @@ -4797,7 +4797,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GlossyNode(const light_sample_t &ls, con if (use_mis && ls.area > 0.0f) { mis_weight = power_heuristic(ls.pdf, bsdf_pdf); } - const simd_fvec4 lcol = ls.col * spec_col * (mix_weight * mis_weight / ls.pdf); + const fvec4 lcol = ls.col * spec_col * (mix_weight * mis_weight / ls.pdf); if (!ls.cast_shadow) { // apply light immediately @@ -4809,17 +4809,17 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_GlossyNode(const light_sample_t &ls, con sh_r.c[0] = ray.c[0] * lcol.get<0>(); sh_r.c[1] = ray.c[1] * lcol.get<1>(); sh_r.c[2] = ray.c[2] * lcol.get<2>(); - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } -void Ray::Ref::Sample_GlossyNode(const ray_data_t &ray, const surface_t &surf, const simd_fvec4 &base_color, +void Ray::Ref::Sample_GlossyNode(const ray_data_t &ray, const surface_t &surf, const fvec4 &base_color, const float roughness, const float regularize_alpha, const float spec_ior, - const float spec_F0, const simd_fvec2 rand, const float mix_weight, + const float spec_F0, const fvec2 rand, const float mix_weight, ray_data_t &new_ray) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); - simd_fvec4 V; - const simd_fvec4 F = + fvec4 V; + const fvec4 F = Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, I, calc_alpha(roughness, 0.0f, regularize_alpha), spec_ior, spec_F0, base_color, base_color, rand, V); @@ -4833,19 +4833,19 @@ void Ray::Ref::Sample_GlossyNode(const ray_data_t &ray, const surface_t &surf, c new_ray.pdf = F[3]; } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, - const surface_t &surf, const simd_fvec4 &base_color, +Ray::Ref::fvec4 
Ray::Ref::Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, + const surface_t &surf, const fvec4 &base_color, const float roughness, const float regularize_alpha, const float eta, const float mix_weight, const bool use_mis, shadow_ray_t &sh_r) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); - const simd_fvec4 H = normalize(ls.L - I * eta); - const simd_fvec4 view_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, -I); - const simd_fvec4 light_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, ls.L); - const simd_fvec4 sampled_normal_ts = tangent_from_world(surf.T, surf.B, surf.N, H); + const fvec4 H = normalize(ls.L - I * eta); + const fvec4 view_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, -I); + const fvec4 light_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, ls.L); + const fvec4 sampled_normal_ts = tangent_from_world(surf.T, surf.B, surf.N, H); - const simd_fvec4 refr_col = Evaluate_GGXRefraction_BSDF( + const fvec4 refr_col = Evaluate_GGXRefraction_BSDF( view_dir_ts, sampled_normal_ts, light_dir_ts, calc_alpha(roughness, 0.0f, regularize_alpha), eta, base_color); const float bsdf_pdf = refr_col[3]; @@ -4853,7 +4853,7 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_RefractiveNode(const light_sample_t &ls, if (use_mis && ls.area > 0.0f) { mis_weight = power_heuristic(ls.pdf, bsdf_pdf); } - const simd_fvec4 lcol = ls.col * refr_col * (mix_weight * mis_weight / ls.pdf); + const fvec4 lcol = ls.col * refr_col * (mix_weight * mis_weight / ls.pdf); if (!ls.cast_shadow) { // apply light immediately @@ -4863,18 +4863,18 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_RefractiveNode(const light_sample_t &ls, // schedule shadow ray memcpy(&sh_r.o[0], value_ptr(offset_ray(surf.P, -surf.plane_N)), 3 * sizeof(float)); UNROLLED_FOR(i, 3, { sh_r.c[i] = ray.c[i] * lcol.get(); }) - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } -void Ray::Ref::Sample_RefractiveNode(const ray_data_t &ray, const surface_t &surf, const 
simd_fvec4 &base_color, +void Ray::Ref::Sample_RefractiveNode(const ray_data_t &ray, const surface_t &surf, const fvec4 &base_color, const float roughness, const float regularize_alpha, const bool is_backfacing, - const float int_ior, const float ext_ior, const simd_fvec2 rand, + const float int_ior, const float ext_ior, const fvec2 rand, const float mix_weight, ray_data_t &new_ray) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); const float eta = is_backfacing ? (int_ior / ext_ior) : (ext_ior / int_ior); - simd_fvec4 V; - const simd_fvec4 F = Sample_GGXRefraction_BSDF( + fvec4 V; + const fvec4 F = Sample_GGXRefraction_BSDF( surf.T, surf.B, surf.N, I, calc_alpha(roughness, 0.0f, regularize_alpha), eta, base_color, rand, V); new_ray.depth = pack_ray_type(RAY_TYPE_REFR); @@ -4895,18 +4895,18 @@ void Ray::Ref::Sample_RefractiveNode(const ray_data_t &ray, const surface_t &sur memcpy(&new_ray.d[0], value_ptr(V), 3 * sizeof(float)); } -Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledNode( +Ray::Ref::fvec4 Ray::Ref::Evaluate_PrincipledNode( const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, const clearcoat_params_t &coat, const transmission_params_t &trans, const float metallic, const float transmission, const float N_dot_L, const float mix_weight, const bool use_mis, const float regularize_alpha, shadow_ray_t &sh_r) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); - simd_fvec4 lcol = 0.0f; + fvec4 lcol = 0.0f; float bsdf_pdf = 0.0f; if (lobe_weights.diffuse > 0.0f && N_dot_L > 0.0f) { - simd_fvec4 diff_col = + fvec4 diff_col = Evaluate_PrincipledDiffuse_BSDF(-I, surf.N, ls.L, diff.roughness, diff.base_color, diff.sheen_color, false); bsdf_pdf += lobe_weights.diffuse * diff_col.get<3>(); diff_col *= (1.0f - metallic) * (1.0f - transmission); @@ -4914,29 +4914,29 @@ Ray::Ref::simd_fvec4 
Ray::Ref::Evaluate_PrincipledNode( lcol += ls.col * N_dot_L * diff_col / (PI * ls.pdf); } - simd_fvec4 H; + fvec4 H; if (N_dot_L > 0.0f) { H = normalize(ls.L - I); } else { H = normalize(ls.L - I * trans.eta); } - const simd_fvec4 view_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, -I); - const simd_fvec4 light_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, ls.L); - const simd_fvec4 sampled_normal_ts = tangent_from_world(surf.T, surf.B, surf.N, H); + const fvec4 view_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, -I); + const fvec4 light_dir_ts = tangent_from_world(surf.T, surf.B, surf.N, ls.L); + const fvec4 sampled_normal_ts = tangent_from_world(surf.T, surf.B, surf.N, H); - const simd_fvec2 spec_alpha = calc_alpha(spec.roughness, spec.anisotropy, regularize_alpha); + const fvec2 spec_alpha = calc_alpha(spec.roughness, spec.anisotropy, regularize_alpha); if (lobe_weights.specular > 0.0f && spec_alpha.get<0>() * spec_alpha.get<1>() >= 1e-7f && N_dot_L > 0.0f) { - const simd_fvec4 spec_col = Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, spec_alpha, - spec.ior, spec.F0, spec.tmp_col, simd_fvec4{1.0f}); + const fvec4 spec_col = Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, spec_alpha, + spec.ior, spec.F0, spec.tmp_col, fvec4{1.0f}); bsdf_pdf += lobe_weights.specular * spec_col.get<3>(); lcol += ls.col * spec_col / ls.pdf; } - const simd_fvec2 coat_alpha = calc_alpha(coat.roughness, 0.0f, regularize_alpha); + const fvec2 coat_alpha = calc_alpha(coat.roughness, 0.0f, regularize_alpha); if (lobe_weights.clearcoat > 0.0f && coat_alpha.get<0>() * coat_alpha.get<1>() >= 1e-7f && N_dot_L > 0.0f) { - const simd_fvec4 clearcoat_col = Evaluate_PrincipledClearcoat_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, + const fvec4 clearcoat_col = Evaluate_PrincipledClearcoat_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, coat_alpha.get<0>(), coat.ior, coat.F0); bsdf_pdf += lobe_weights.clearcoat * 
clearcoat_col.get<3>(); @@ -4944,19 +4944,19 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledNode( } if (lobe_weights.refraction > 0.0f) { - const simd_fvec2 refr_spec_alpha = calc_alpha(spec.roughness, 0.0f, regularize_alpha); + const fvec2 refr_spec_alpha = calc_alpha(spec.roughness, 0.0f, regularize_alpha); if (trans.fresnel != 0.0f && refr_spec_alpha.get<0>() * refr_spec_alpha.get<1>() >= 1e-7f && N_dot_L > 0.0f) { - const simd_fvec4 spec_col = + const fvec4 spec_col = Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, refr_spec_alpha, 1.0f /* ior */, - 0.0f /* F0 */, simd_fvec4{1.0f}, simd_fvec4{1.0f}); + 0.0f /* F0 */, fvec4{1.0f}, fvec4{1.0f}); bsdf_pdf += lobe_weights.refraction * trans.fresnel * spec_col.get<3>(); lcol += ls.col * spec_col * (trans.fresnel / ls.pdf); } - const simd_fvec2 refr_trans_alpha = calc_alpha(trans.roughness, 0.0f, regularize_alpha); + const fvec2 refr_trans_alpha = calc_alpha(trans.roughness, 0.0f, regularize_alpha); if (trans.fresnel != 1.0f && refr_trans_alpha.get<0>() * refr_trans_alpha.get<1>() >= 1e-7f && N_dot_L < 0.0f) { - const simd_fvec4 refr_col = Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, + const fvec4 refr_col = Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, refr_trans_alpha, trans.eta, diff.base_color); bsdf_pdf += lobe_weights.refraction * (1.0f - trans.fresnel) * refr_col.get<3>(); @@ -4978,16 +4978,16 @@ Ray::Ref::simd_fvec4 Ray::Ref::Evaluate_PrincipledNode( // schedule shadow ray memcpy(&sh_r.o[0], value_ptr(offset_ray(surf.P, N_dot_L < 0.0f ? 
-surf.plane_N : surf.plane_N)), 3 * sizeof(float)); UNROLLED_FOR(i, 3, { sh_r.c[i] = ray.c[i] * lcol.get(); }) - return simd_fvec4{0.0f}; + return fvec4{0.0f}; } void Ray::Ref::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t &ray, const surface_t &surf, const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, const clearcoat_params_t &coat, const transmission_params_t &trans, const float metallic, const float transmission, - const simd_fvec2 rand, float mix_rand, const float mix_weight, + const fvec2 rand, float mix_rand, const float mix_weight, const float regularize_alpha, ray_data_t &new_ray) { - const simd_fvec4 I = make_fvec3(ray.d); + const fvec4 I = make_fvec3(ray.d); const int diff_depth = get_diff_depth(ray.depth), spec_depth = get_spec_depth(ray.depth), refr_depth = get_refr_depth(ray.depth); @@ -4999,8 +4999,8 @@ void Ray::Ref::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t // Diffuse lobe // if (diff_depth < ps.max_diff_depth && total_depth < ps.max_total_depth) { - simd_fvec4 V; - simd_fvec4 F = Sample_PrincipledDiffuse_BSDF(surf.T, surf.B, surf.N, I, diff.roughness, diff.base_color, + fvec4 V; + fvec4 F = Sample_PrincipledDiffuse_BSDF(surf.T, surf.B, surf.N, I, diff.roughness, diff.base_color, diff.sheen_color, false, rand, V); const float pdf = F.get<3>(); // * lobe_weights.diffuse; @@ -5021,10 +5021,10 @@ void Ray::Ref::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t // Main specular lobe // if (spec_depth < ps.max_spec_depth && total_depth < ps.max_total_depth) { - simd_fvec4 V; - simd_fvec4 F = Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, I, + fvec4 V; + fvec4 F = Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, I, calc_alpha(spec.roughness, spec.anisotropy, regularize_alpha), - spec.ior, spec.F0, spec.tmp_col, simd_fvec4{1.0f}, rand, V); + spec.ior, spec.F0, spec.tmp_col, fvec4{1.0f}, rand, V); const float pdf = F.get<3>() * lobe_weights.specular; 
new_ray.depth = pack_ray_type(RAY_TYPE_SPECULAR); @@ -5041,8 +5041,8 @@ void Ray::Ref::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t // Clearcoat lobe (secondary specular) // if (spec_depth < ps.max_spec_depth && total_depth < ps.max_total_depth) { - simd_fvec4 V; - simd_fvec4 F = Sample_PrincipledClearcoat_BSDF(surf.T, surf.B, surf.N, I, + fvec4 V; + fvec4 F = Sample_PrincipledClearcoat_BSDF(surf.T, surf.B, surf.N, I, calc_alpha(coat.roughness, 0.0f, regularize_alpha).get<0>(), coat.ior, coat.F0, rand, V); const float pdf = F.get<3>() * lobe_weights.clearcoat; @@ -5068,11 +5068,11 @@ void Ray::Ref::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t (mix_rand < trans.fresnel && spec_depth < ps.max_spec_depth)) && total_depth < ps.max_total_depth) { - simd_fvec4 F, V; + fvec4 F, V; if (mix_rand < trans.fresnel) { F = Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, I, calc_alpha(spec.roughness, 0.0f, regularize_alpha), 1.0f /* ior */, - 0.0f /* F0 */, simd_fvec4{1.0f}, simd_fvec4{1.0f}, rand, V); + 0.0f /* F0 */, fvec4{1.0f}, fvec4{1.0f}, rand, V); new_ray.depth = pack_ray_type(RAY_TYPE_SPECULAR); new_ray.depth |= mask_ray_depth(ray.depth) + pack_ray_depth(0, 1, 0, 0); @@ -5112,15 +5112,15 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float int *out_secondary_rays_count, shadow_ray_t *out_shadow_rays, int *out_shadow_rays_count, color_rgba_t *out_base_color, color_rgba_t *out_depth_normal) { - const simd_fvec4 I = make_fvec3(ray.d); - const simd_fvec4 ro = make_fvec3(ray.o); + const fvec4 I = make_fvec3(ray.d); + const fvec4 ro = make_fvec3(ray.o); // used to randomize random sequence among pixels const uint32_t px_hash = hash(ray.xy); const uint32_t rand_hash = hash_combine(px_hash, rand_seed); const uint32_t rand_dim = RAND_DIM_BASE_COUNT + get_total_depth(ray.depth) * RAND_DIM_BOUNCE_COUNT; - const simd_fvec2 tex_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, 
rand_seq); + const fvec2 tex_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq); if (inter.v < 0.0f) { #if USE_HIERARCHICAL_NEE @@ -5131,9 +5131,9 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float (get_total_depth(ray.depth) < ps.max_total_depth) ? float(sc.li_indices.size()) : -1.0f; #endif - simd_fvec4 env_col = Evaluate_EnvColor(ray, sc.env, *static_cast(textures[0]), + fvec4 env_col = Evaluate_EnvColor(ray, sc.env, *static_cast(textures[0]), pdf_factor, tex_rand); - env_col *= simd_fvec4{ray.c[0], ray.c[1], ray.c[2], 0.0f}; + env_col *= fvec4{ray.c[0], ray.c[1], ray.c[2], 0.0f}; const float sum = hsum(env_col); if (sum > limits[0]) { @@ -5147,10 +5147,10 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float surf.P = ro + inter.t * I; if (inter.obj_index < 0) { // Area light intersection - simd_fvec4 lcol = + fvec4 lcol = Evaluate_LightColor(ray, inter, sc.env, *static_cast(textures[0]), sc.lights, uint32_t(sc.li_indices.size()), tex_rand); - lcol *= simd_fvec4{ray.c[0], ray.c[1], ray.c[2], 0.0f}; + lcol *= fvec4{ray.c[0], ray.c[1], ray.c[2], 0.0f}; const float sum = hsum(lcol); if (sum > limits[0]) { @@ -5172,10 +5172,10 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float const float w = 1.0f - inter.u - inter.v; surf.N = normalize(make_fvec3(v1.n) * w + make_fvec3(v2.n) * inter.u + make_fvec3(v3.n) * inter.v); - surf.uvs = simd_fvec2(v1.t) * w + simd_fvec2(v2.t) * inter.u + simd_fvec2(v3.t) * inter.v; + surf.uvs = fvec2(v1.t) * w + fvec2(v2.t) * inter.u + fvec2(v3.t) * inter.v; float pa; - surf.plane_N = normalize_len(cross(simd_fvec4{v2.p} - simd_fvec4{v1.p}, simd_fvec4{v3.p} - simd_fvec4{v1.p}), pa); + surf.plane_N = normalize_len(cross(fvec4{v2.p} - fvec4{v1.p}, fvec4{v3.p} - fvec4{v1.p}), pa); surf.B = make_fvec3(v1.b) * w + make_fvec3(v2.b) * inter.u + make_fvec3(v3.b) * inter.v; surf.T = cross(surf.B, surf.N); @@ -5214,14 
+5214,14 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float const float ext_ior = peek_ior_stack(ray.ior, is_backfacing); - simd_fvec4 col = {0.0f}; + fvec4 col = {0.0f}; const int diff_depth = get_diff_depth(ray.depth), spec_depth = get_spec_depth(ray.depth), refr_depth = get_refr_depth(ray.depth); // NOTE: transparency depth is not accounted here const int total_depth = diff_depth + spec_depth + refr_depth; - const simd_fvec2 mix_term_rand = + const fvec2 mix_term_rand = get_scrambled_2d_rand(rand_dim + RAND_DIM_BSDF_PICK, rand_hash, iteration - 1, rand_seq); float mix_rand = mix_term_rand.get<0>(); @@ -5232,7 +5232,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float float mix_val = mat->strength; const uint32_t base_texture = mat->textures[BASE_TEXTURE]; if (base_texture != 0xffffffff) { - simd_fvec4 tex_color = SampleBilinear(textures, base_texture, surf.uvs, 0, tex_rand); + fvec4 tex_color = SampleBilinear(textures, base_texture, surf.uvs, 0, tex_rand); if (base_texture & TEX_YCOCG_BIT) { tex_color = YCoCg_to_RGB(tex_color); } @@ -5262,13 +5262,13 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float // apply normal map if (mat->textures[NORMALS_TEXTURE] != 0xffffffff) { - simd_fvec4 normals = SampleBilinear(textures, mat->textures[NORMALS_TEXTURE], surf.uvs, 0, tex_rand); + fvec4 normals = SampleBilinear(textures, mat->textures[NORMALS_TEXTURE], surf.uvs, 0, tex_rand); normals = normals * 2.0f - 1.0f; normals.set<2>(1.0f); if (mat->textures[NORMALS_TEXTURE] & TEX_RECONSTRUCT_Z_BIT) { normals.set<2>(safe_sqrt(1.0f - normals.get<0>() * normals.get<0>() - normals.get<1>() * normals.get<1>())); } - simd_fvec4 in_normal = surf.N; + fvec4 in_normal = surf.N; surf.N = normalize(normals.get<0>() * surf.T + normals.get<2>() * surf.N + normals.get<1>() * surf.B); if (mat->normal_map_strength_unorm != 0xffff) { surf.N = normalize(in_normal + (surf.N - in_normal) * 
unpack_unorm_16(mat->normal_map_strength_unorm)); @@ -5280,9 +5280,9 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float create_tbn_matrix(N, _tangent_from_world); #else // Find radial tangent in local space - const simd_fvec4 P_ls = make_fvec3(v1.p) * w + make_fvec3(v2.p) * inter.u + make_fvec3(v3.p) * inter.v; + const fvec4 P_ls = make_fvec3(v1.p) * w + make_fvec3(v2.p) * inter.u + make_fvec3(v3.p) * inter.v; // rotate around Y axis by 90 degrees in 2d - simd_fvec4 tangent = {-P_ls.get<2>(), 0.0f, P_ls.get<0>(), 0.0f}; + fvec4 tangent = {-P_ls.get<2>(), 0.0f, P_ls.get<0>(), 0.0f}; tangent = TransformNormal(tangent, mi->inv_xform); if (length2(cross(tangent, surf.N)) == 0.0f) { tangent = TransformNormal(P_ls, mi->inv_xform); @@ -5300,7 +5300,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float if ((!sc.light_wnodes.empty() || !sc.light_nodes.empty()) && mat->type != eShadingNode::Emissive) { const float rand_pick_light = get_scrambled_2d_rand(rand_dim + RAND_DIM_LIGHT_PICK, rand_hash, iteration - 1, rand_seq).get<0>(); - const simd_fvec2 rand_light_uv = + const fvec2 rand_light_uv = get_scrambled_2d_rand(rand_dim + RAND_DIM_LIGHT, rand_hash, iteration - 1, rand_seq); SampleLightSource(surf.P, surf.T, surf.B, surf.N, sc, textures, rand_pick_light, rand_light_uv, tex_rand, ls); @@ -5309,11 +5309,11 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float #endif // sample base texture - simd_fvec4 base_color = simd_fvec4{mat->base_color[0], mat->base_color[1], mat->base_color[2], 1.0f}; + fvec4 base_color = fvec4{mat->base_color[0], mat->base_color[1], mat->base_color[2], 1.0f}; if (mat->textures[BASE_TEXTURE] != 0xffffffff) { const uint32_t base_texture = mat->textures[BASE_TEXTURE]; const float base_lod = get_texture_lod(textures, base_texture, lambda); - simd_fvec4 tex_color = SampleBilinear(textures, base_texture, surf.uvs, int(base_lod), tex_rand); + fvec4 tex_color = 
SampleBilinear(textures, base_texture, surf.uvs, int(base_lod), tex_rand); if (base_texture & TEX_YCOCG_BIT) { tex_color = YCoCg_to_RGB(tex_color); } @@ -5331,7 +5331,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float out_depth_normal->v[3] = inter.t; } - simd_fvec4 tint_color = {0.0f}; + fvec4 tint_color = {0.0f}; const float base_color_lum = lum(base_color); if (base_color_lum > 0.0f) { @@ -5342,7 +5342,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float if (mat->textures[ROUGH_TEXTURE] != 0xffffffff) { const uint32_t roughness_tex = mat->textures[ROUGH_TEXTURE]; const float roughness_lod = get_texture_lod(textures, roughness_tex, lambda); - simd_fvec4 roughness_color = + fvec4 roughness_color = SampleBilinear(textures, roughness_tex, surf.uvs, int(roughness_lod), tex_rand).get<0>(); if (roughness_tex & TEX_SRGB_BIT) { roughness_color = srgb_to_rgb(roughness_color); @@ -5350,7 +5350,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float roughness *= roughness_color.get<0>(); } - const simd_fvec2 rand_bsdf = get_scrambled_2d_rand(rand_dim + RAND_DIM_BSDF, rand_hash, iteration - 1, rand_seq); + const fvec2 rand_bsdf = get_scrambled_2d_rand(rand_dim + RAND_DIM_BSDF, rand_hash, iteration - 1, rand_seq); ray_data_t &new_ray = out_secondary_rays[*out_secondary_rays_count]; memcpy(new_ray.ior, ray.ior, 4 * sizeof(float)); @@ -5422,7 +5422,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float const auto p1 = make_fvec3(v1.p), p2 = make_fvec3(v2.p), p3 = make_fvec3(v3.p); float light_forward_len; - simd_fvec4 light_forward = + fvec4 light_forward = normalize_len(TransformDirection(cross(p2 - p1, p3 - p1), mi->xform), light_forward_len); const float tri_area = 0.5f * light_forward_len; @@ -5430,7 +5430,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float if (cos_theta > 0.0f) { float light_pdf; #if 
USE_SPHERICAL_AREA_LIGHT_SAMPLING - const simd_fvec4 P = TransformPoint(ro, mi->inv_xform); + const fvec4 P = TransformPoint(ro, mi->inv_xform); light_pdf = SampleSphericalTriangle(P, p1, p2, p3, {}, nullptr) / pdf_factor; if (light_pdf == 0.0f) #endif @@ -5456,7 +5456,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float if (mat->textures[SPECULAR_TEXTURE] != 0xffffffff) { const uint32_t specular_tex = mat->textures[SPECULAR_TEXTURE]; const float specular_lod = get_texture_lod(textures, specular_tex, lambda); - simd_fvec4 specular_color = SampleBilinear(textures, specular_tex, surf.uvs, int(specular_lod), tex_rand); + fvec4 specular_color = SampleBilinear(textures, specular_tex, surf.uvs, int(specular_lod), tex_rand); if (specular_tex & TEX_SRGB_BIT) { specular_color = srgb_to_rgb(specular_color); } @@ -5472,11 +5472,11 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float diff_params_t diff = {}; diff.base_color = base_color; - diff.sheen_color = sheen * mix(simd_fvec4{1.0f}, tint_color, sheen_tint); + diff.sheen_color = sheen * mix(fvec4{1.0f}, tint_color, sheen_tint); diff.roughness = roughness; spec_params_t spec = {}; - spec.tmp_col = mix(simd_fvec4{1.0f}, tint_color, specular_tint); + spec.tmp_col = mix(fvec4{1.0f}, tint_color, specular_tint); spec.tmp_col = mix(specular * 0.08f * spec.tmp_col, base_color, metallic); spec.roughness = roughness; spec.ior = (2.0f / (1.0f - sqrtf(0.08f * specular))) - 1.0f; @@ -5498,7 +5498,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float // Approximation of FH (using shading normal) const float FN = (fresnel_dielectric_cos(dot(I, surf.N), spec.ior) - spec.F0) / (1.0f - spec.F0); - const simd_fvec4 approx_spec_col = mix(spec.tmp_col, simd_fvec4(1.0f), FN); + const fvec4 approx_spec_col = mix(spec.tmp_col, fvec4(1.0f), FN); const float spec_color_lum = lum(approx_spec_col); const auto lobe_weights = 
get_lobe_weights(mix(base_color_lum, 1.0f, sheen), spec_color_lum, specular, metallic, @@ -5538,7 +5538,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float const float sh_lum = fmaxf(sh_r.c[0], fmaxf(sh_r.c[1], sh_r.c[2])); if (sh_lum > 0.0f) { // actual ray direction accouning for bias from both ends - const simd_fvec4 to_light = normalize_len(ls.lp - simd_fvec4{sh_r.o[0], sh_r.o[1], sh_r.o[2], 0.0f}, sh_r.dist); + const fvec4 to_light = normalize_len(ls.lp - fvec4{sh_r.o[0], sh_r.o[1], sh_r.o[2], 0.0f}, sh_r.dist); memcpy(&sh_r.d[0], value_ptr(to_light), 3 * sizeof(float)); sh_r.dist *= ls.dist_mul; if (ls.from_env) { @@ -5549,7 +5549,7 @@ Ray::color_rgba_t Ray::Ref::ShadeSurface(const pass_settings_t &ps, const float } #endif - col *= simd_fvec4{ray.c[0], ray.c[1], ray.c[2], 0.0f}; + col *= fvec4{ray.c[0], ray.c[1], ray.c[2], 0.0f}; const float sum = hsum(col); if (sum > limits[1]) { @@ -5581,14 +5581,14 @@ void Ray::Ref::ShadePrimary(const pass_settings_t &ps, Span in out_color[y * img_w + x] = col; { // base color - auto old_val = Ref::simd_fvec4{out_base_color[y * img_w + x].v, Ref::simd_mem_aligned}; - old_val += (Ref::simd_fvec4{base_color.v, Ref::simd_mem_aligned} - old_val) * mix_factor; - old_val.store_to(out_base_color[y * img_w + x].v, Ref::simd_mem_aligned); + auto old_val = Ref::fvec4{out_base_color[y * img_w + x].v, Ref::vector_aligned}; + old_val += (Ref::fvec4{base_color.v, Ref::vector_aligned} - old_val) * mix_factor; + old_val.store_to(out_base_color[y * img_w + x].v, Ref::vector_aligned); } { // depth-normals - auto old_val = Ref::simd_fvec4{out_depth_normal[y * img_w + x].v, Ref::simd_mem_aligned}; - old_val += (Ref::simd_fvec4{depth_normal.v, Ref::simd_mem_aligned} - old_val) * mix_factor; - old_val.store_to(out_depth_normal[y * img_w + x].v, Ref::simd_mem_aligned); + auto old_val = Ref::fvec4{out_depth_normal[y * img_w + x].v, Ref::vector_aligned}; + old_val += (Ref::fvec4{depth_normal.v, Ref::vector_aligned} 
- old_val) * mix_factor; + old_val.store_to(out_depth_normal[y * img_w + x].v, Ref::vector_aligned); } } } @@ -5613,11 +5613,11 @@ void Ray::Ref::ShadeSecondary(const pass_settings_t &ps, const float clamp_direc out_shadow_rays_count, nullptr, nullptr); col.v[3] = 0.0f; - auto vcol = Ref::simd_fvec4{col.v}; + auto vcol = Ref::fvec4{col.v}; - auto old_val = Ref::simd_fvec4{out_color[y * img_w + x].v, Ref::simd_mem_aligned}; + auto old_val = Ref::fvec4{out_color[y * img_w + x].v, Ref::vector_aligned}; old_val += vcol; - old_val.store_to(out_color[y * img_w + x].v, Ref::simd_mem_aligned); + old_val.store_to(out_color[y * img_w + x].v, Ref::vector_aligned); } } @@ -5637,7 +5637,7 @@ void JointNLMFilter(const color_rgba_t *restrict input, const rect_t &rect, cons for (int iy = rect.y; iy < rect.y + rect.h; ++iy) { for (int ix = rect.x; ix < rect.x + rect.w; ++ix) { - simd_fvec4 sum_output = {}; + fvec4 sum_output = {}; float sum_weight = 0.0f; for (int k = -WindowRadius; k <= WindowRadius; ++k) { @@ -5646,16 +5646,16 @@ void JointNLMFilter(const color_rgba_t *restrict input, const rect_t &rect, cons for (int l = -WindowRadius; l <= WindowRadius; ++l) { const int jx = ix + l; - simd_fvec4 color_distance = {}; + fvec4 color_distance = {}; for (int q = -NeighborRadius; q <= NeighborRadius; ++q) { for (int p = -NeighborRadius; p <= NeighborRadius; ++p) { - const simd_fvec4 ipx = {input[(iy + q) * input_stride + (ix + p)].v, simd_mem_aligned}; - const simd_fvec4 jpx = {input[(jy + q) * input_stride + (jx + p)].v, simd_mem_aligned}; + const fvec4 ipx = {input[(iy + q) * input_stride + (ix + p)].v, vector_aligned}; + const fvec4 jpx = {input[(jy + q) * input_stride + (jx + p)].v, vector_aligned}; - const simd_fvec4 ivar = {variance[(iy + q) * input_stride + (ix + p)].v, simd_mem_aligned}; - const simd_fvec4 jvar = {variance[(jy + q) * input_stride + (jx + p)].v, simd_mem_aligned}; - const simd_fvec4 min_var = min(ivar, jvar); + const fvec4 ivar = {variance[(iy + q) * 
input_stride + (ix + p)].v, vector_aligned}; + const fvec4 jvar = {variance[(jy + q) * input_stride + (jx + p)].v, vector_aligned}; + const fvec4 min_var = min(ivar, jvar); color_distance += ((ipx - jpx) * (ipx - jpx) - alpha * (ivar + min_var)) / (0.0001f + damping * damping * (ivar + jvar)); @@ -5668,16 +5668,16 @@ void JointNLMFilter(const color_rgba_t *restrict input, const rect_t &rect, cons float weight = expf(-fmaxf(0.0f, patch_distance)); if (FEATURE0 || FEATURE1) { - simd_fvec4 feature_distance = {}; + fvec4 feature_distance = {}; if (FEATURE0) { - const simd_fvec4 ipx = {feature0[iy * input_stride + ix].v, simd_mem_aligned}; - const simd_fvec4 jpx = {feature0[jy * input_stride + jx].v, simd_mem_aligned}; + const fvec4 ipx = {feature0[iy * input_stride + ix].v, vector_aligned}; + const fvec4 jpx = {feature0[jy * input_stride + jx].v, vector_aligned}; feature_distance = feature0_weight * (ipx - jpx) * (ipx - jpx); } if (FEATURE1) { - const simd_fvec4 ipx = {feature1[iy * input_stride + ix].v, simd_mem_aligned}; - const simd_fvec4 jpx = {feature1[jy * input_stride + jx].v, simd_mem_aligned}; + const fvec4 ipx = {feature1[iy * input_stride + ix].v, vector_aligned}; + const fvec4 jpx = {feature1[jy * input_stride + jx].v, vector_aligned}; feature_distance = max(feature_distance, feature1_weight * (ipx - jpx) * (ipx - jpx)); } @@ -5690,7 +5690,7 @@ void JointNLMFilter(const color_rgba_t *restrict input, const rect_t &rect, cons weight = fminf(weight, feature_weight); } - sum_output += simd_fvec4{input[jy * input_stride + jx].v, simd_mem_aligned} * weight; + sum_output += fvec4{input[jy * input_stride + jx].v, vector_aligned} * weight; sum_weight += weight; } } @@ -5700,7 +5700,7 @@ void JointNLMFilter(const color_rgba_t *restrict input, const rect_t &rect, cons } sum_output.store_to(output[(output_rect.y + iy - rect.y) * output_stride + (output_rect.x + ix - rect.x)].v, - simd_mem_aligned); + vector_aligned); } } } @@ -5984,37 +5984,37 @@ const uint32_t 
*transform_luts[] = { static_assert(sizeof(transform_luts) / sizeof(transform_luts[0]) == int(eViewTransform::_Count), "!"); namespace Ref { -force_inline simd_fvec4 FetchLUT(const eViewTransform view_transform, const int ix, const int iy, const int iz) { +force_inline fvec4 FetchLUT(const eViewTransform view_transform, const int ix, const int iy, const int iz) { const uint32_t packed_val = transform_luts[int(view_transform)][iz * LUT_DIMS * LUT_DIMS + iy * LUT_DIMS + ix]; - const simd_ivec4 ret = simd_ivec4{int((packed_val >> 0) & 0x3ff), int((packed_val >> 10) & 0x3ff), + const ivec4 ret = ivec4{int((packed_val >> 0) & 0x3ff), int((packed_val >> 10) & 0x3ff), int((packed_val >> 20) & 0x3ff), int((packed_val >> 30) & 0x3)}; - return simd_fvec4(ret) * simd_fvec4{1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}; + return fvec4(ret) * fvec4{1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}; } } // namespace Ref } // namespace Ray -Ray::Ref::simd_fvec4 vectorcall Ray::Ref::TonemapFilmic(const eViewTransform view_transform, simd_fvec4 color) { - const simd_fvec4 encoded = color / (color + 1.0f); - const simd_fvec4 uv = encoded * float(LUT_DIMS - 1); - const simd_ivec4 xyz = simd_ivec4(uv); - const simd_fvec4 f = fract(uv); - const simd_ivec4 xyz_next = min(xyz + 1, simd_ivec4{LUT_DIMS - 1}); +Ray::Ref::fvec4 vectorcall Ray::Ref::TonemapFilmic(const eViewTransform view_transform, fvec4 color) { + const fvec4 encoded = color / (color + 1.0f); + const fvec4 uv = encoded * float(LUT_DIMS - 1); + const ivec4 xyz = ivec4(uv); + const fvec4 f = fract(uv); + const ivec4 xyz_next = min(xyz + 1, ivec4{LUT_DIMS - 1}); const int ix = xyz.get<0>(), iy = xyz.get<1>(), iz = xyz.get<2>(); const int jx = xyz_next.get<0>(), jy = xyz_next.get<1>(), jz = xyz_next.get<2>(); const float fx = f.get<0>(), fy = f.get<1>(), fz = f.get<2>(); - const simd_fvec4 c000 = FetchLUT(view_transform, ix, iy, iz), c001 = FetchLUT(view_transform, jx, iy, iz), + const fvec4 c000 = 
FetchLUT(view_transform, ix, iy, iz), c001 = FetchLUT(view_transform, jx, iy, iz), c010 = FetchLUT(view_transform, ix, jy, iz), c011 = FetchLUT(view_transform, jx, jy, iz), c100 = FetchLUT(view_transform, ix, iy, jz), c101 = FetchLUT(view_transform, jx, iy, jz), c110 = FetchLUT(view_transform, ix, jy, jz), c111 = FetchLUT(view_transform, jx, jy, jz); - const simd_fvec4 c00x = (1.0f - fx) * c000 + fx * c001, c01x = (1.0f - fx) * c010 + fx * c011, + const fvec4 c00x = (1.0f - fx) * c000 + fx * c001, c01x = (1.0f - fx) * c010 + fx * c011, c10x = (1.0f - fx) * c100 + fx * c101, c11x = (1.0f - fx) * c110 + fx * c111; - const simd_fvec4 c0xx = (1.0f - fy) * c00x + fy * c01x, c1xx = (1.0f - fy) * c10x + fy * c11x; + const fvec4 c0xx = (1.0f - fy) * c00x + fy * c01x, c1xx = (1.0f - fy) * c10x + fy * c11x; - simd_fvec4 cxxx = (1.0f - fz) * c0xx + fz * c1xx; + fvec4 cxxx = (1.0f - fz) * c0xx + fz * c1xx; cxxx.set<3>(color.get<3>()); return cxxx; diff --git a/internal/CoreRef.h b/internal/CoreRef.h index a578a7aa3..43137de68 100644 --- a/internal/CoreRef.h +++ b/internal/CoreRef.h @@ -22,7 +22,7 @@ #pragma message("Ray::Ref::simd_vec will not use SIMD") #endif -#include "simd/simd_vec.h" +#include "simd/simd.h" #undef USE_SSE2 #undef USE_NEON @@ -96,21 +96,21 @@ struct hit_data_t { // Surface at the hit point struct surface_t { // position, tangent, bitangent, smooth normal and planar normal - simd_fvec4 P, T, B, N, plane_N; + fvec4 P, T, B, N, plane_N; // texture coordinates - simd_fvec2 uvs; + fvec2 uvs; }; // Surface derivatives at the hit point struct derivatives_t { - simd_fvec4 do_dx, do_dy, dd_dx, dd_dy; - simd_fvec2 duv_dx, duv_dy; - simd_fvec4 dndx, dndy; + fvec4 do_dx, do_dy, dd_dx, dd_dy; + fvec2 duv_dx, duv_dy; + fvec4 dndx, dndy; float ddn_dx, ddn_dy; }; struct light_sample_t { - simd_fvec4 col, L, lp; + fvec4 col, L, lp; float area = 0, dist_mul = 1, pdf = 0; uint32_t cast_shadow : 1; uint32_t from_env : 1; @@ -143,13 +143,13 @@ force_inline float 
construct_float(uint32_t m) { } force_inline float fract(const float v) { - //float _unused; - //return modff(v, &_unused); + // float _unused; + // return modff(v, &_unused); return v - floorf(v); } -force_inline simd_fvec4 srgb_to_rgb(const simd_fvec4 &col) { - simd_fvec4 ret; +force_inline fvec4 srgb_to_rgb(const fvec4 &col) { + fvec4 ret; UNROLLED_FOR(i, 3, { if (col.get() > 0.04045f) { ret.set(powf((col.get() + 0.055f) / 1.055f, 2.4f)); @@ -162,8 +162,8 @@ force_inline simd_fvec4 srgb_to_rgb(const simd_fvec4 &col) { return ret; } -force_inline simd_fvec2 srgb_to_rgb(const simd_fvec2 &col) { - simd_fvec2 ret; +force_inline fvec2 srgb_to_rgb(const fvec2 &col) { + fvec2 ret; UNROLLED_FOR(i, 2, { if (col.get() > 0.04045f) { ret.set(powf((col.get() + 0.055f) / 1.055f, 2.4f)); @@ -174,9 +174,9 @@ force_inline simd_fvec2 srgb_to_rgb(const simd_fvec2 &col) { return ret; } -force_inline simd_fvec4 rgbe_to_rgb(const color_t &rgbe) { +force_inline fvec4 rgbe_to_rgb(const color_t &rgbe) { const float f = exp2f(float(rgbe.v[3]) - 128.0f); - return simd_fvec4{to_norm_float(rgbe.v[0]) * f, to_norm_float(rgbe.v[1]) * f, to_norm_float(rgbe.v[2]) * f, 1.0f}; + return fvec4{to_norm_float(rgbe.v[0]) * f, to_norm_float(rgbe.v[1]) * f, to_norm_float(rgbe.v[2]) * f, 1.0f}; } force_inline uint32_t mask_ray_depth(const uint32_t depth) { return depth & 0x0fffffff; } @@ -274,78 +274,72 @@ bool Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const hit_data_t &inter); // BRDFs -float BRDF_PrincipledDiffuse(const simd_fvec4 &V, const simd_fvec4 &N, const simd_fvec4 &L, const simd_fvec4 &H, - float roughness); - -simd_fvec4 Evaluate_OrenDiffuse_BSDF(const simd_fvec4 &V, const simd_fvec4 &N, const simd_fvec4 &L, float roughness, - const simd_fvec4 &base_color); -simd_fvec4 Sample_OrenDiffuse_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, const simd_fvec4 &I, - float roughness, const simd_fvec4 &base_color, simd_fvec2 rand, simd_fvec4 &out_V); - 
-simd_fvec4 Evaluate_PrincipledDiffuse_BSDF(const simd_fvec4 &V, const simd_fvec4 &N, const simd_fvec4 &L, - float roughness, const simd_fvec4 &base_color, const simd_fvec4 &sheen_color, - bool uniform_sampling); -simd_fvec4 Sample_PrincipledDiffuse_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &I, float roughness, const simd_fvec4 &base_color, - const simd_fvec4 &sheen_color, bool uniform_sampling, simd_fvec2 rand, - simd_fvec4 &out_V); - -simd_fvec4 Evaluate_GGXSpecular_BSDF(const simd_fvec4 &view_dir_ts, const simd_fvec4 &sampled_normal_ts, - const simd_fvec4 &reflected_dir_ts, simd_fvec2 alpha, float spec_ior, - float spec_F0, const simd_fvec4 &spec_col, const simd_fvec4 &spec_col_90); -simd_fvec4 Sample_GGXSpecular_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, const simd_fvec4 &I, - simd_fvec2 alpha, float spec_ior, float spec_F0, const simd_fvec4 &spec_col, - const simd_fvec4 &spec_col_90, simd_fvec2 rand, simd_fvec4 &out_V); - -simd_fvec4 Evaluate_GGXRefraction_BSDF(const simd_fvec4 &view_dir_ts, const simd_fvec4 &sampled_normal_ts, - const simd_fvec4 &refr_dir_ts, simd_fvec2 slpha, float eta, - const simd_fvec4 &refr_col); -simd_fvec4 Sample_GGXRefraction_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, const simd_fvec4 &I, - simd_fvec2 alpha, float eta, const simd_fvec4 &refr_col, simd_fvec2 rand, - simd_fvec4 &out_V); - -simd_fvec4 Evaluate_PrincipledClearcoat_BSDF(const simd_fvec4 &view_dir_ts, const simd_fvec4 &sampled_normal_ts, - const simd_fvec4 &reflected_dir_ts, float clearcoat_roughness2, - float clearcoat_ior, float clearcoat_F0); -simd_fvec4 Sample_PrincipledClearcoat_BSDF(const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const simd_fvec4 &I, float clearcoat_roughness2, float clearcoat_ior, - float clearcoat_F0, simd_fvec2 rand, simd_fvec4 &out_V); - -float Evaluate_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_mips, int qtree_levels, 
const simd_fvec4 &L); -simd_fvec4 Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_mips, int qtree_levels, float rand, - float rx, float ry); +float BRDF_PrincipledDiffuse(const fvec4 &V, const fvec4 &N, const fvec4 &L, const fvec4 &H, float roughness); + +fvec4 Evaluate_OrenDiffuse_BSDF(const fvec4 &V, const fvec4 &N, const fvec4 &L, float roughness, + const fvec4 &base_color); +fvec4 Sample_OrenDiffuse_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, const fvec4 &I, float roughness, + const fvec4 &base_color, fvec2 rand, fvec4 &out_V); + +fvec4 Evaluate_PrincipledDiffuse_BSDF(const fvec4 &V, const fvec4 &N, const fvec4 &L, float roughness, + const fvec4 &base_color, const fvec4 &sheen_color, bool uniform_sampling); +fvec4 Sample_PrincipledDiffuse_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, const fvec4 &I, float roughness, + const fvec4 &base_color, const fvec4 &sheen_color, bool uniform_sampling, + fvec2 rand, fvec4 &out_V); + +fvec4 Evaluate_GGXSpecular_BSDF(const fvec4 &view_dir_ts, const fvec4 &sampled_normal_ts, const fvec4 &reflected_dir_ts, + fvec2 alpha, float spec_ior, float spec_F0, const fvec4 &spec_col, + const fvec4 &spec_col_90); +fvec4 Sample_GGXSpecular_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, const fvec4 &I, fvec2 alpha, + float spec_ior, float spec_F0, const fvec4 &spec_col, const fvec4 &spec_col_90, + fvec2 rand, fvec4 &out_V); + +fvec4 Evaluate_GGXRefraction_BSDF(const fvec4 &view_dir_ts, const fvec4 &sampled_normal_ts, const fvec4 &refr_dir_ts, + fvec2 alpha, float eta, const fvec4 &refr_col); +fvec4 Sample_GGXRefraction_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, const fvec4 &I, fvec2 alpha, float eta, + const fvec4 &refr_col, fvec2 rand, fvec4 &out_V); + +fvec4 Evaluate_PrincipledClearcoat_BSDF(const fvec4 &view_dir_ts, const fvec4 &sampled_normal_ts, + const fvec4 &reflected_dir_ts, float clearcoat_roughness2, float clearcoat_ior, + float clearcoat_F0); +fvec4
Sample_PrincipledClearcoat_BSDF(const fvec4 &T, const fvec4 &B, const fvec4 &N, const fvec4 &I, + float clearcoat_roughness2, float clearcoat_ior, float clearcoat_F0, fvec2 rand, + fvec4 &out_V); + +float Evaluate_EnvQTree(float y_rotation, const fvec4 *const *qtree_mips, int qtree_levels, const fvec4 &L); +fvec4 Sample_EnvQTree(float y_rotation, const fvec4 *const *qtree_mips, int qtree_levels, float rand, float rx, + float ry); // Transform void TransformRay(const float ro[3], const float rd[3], const float *xform, float out_ro[3], float out_rd[3]); -simd_fvec4 TransformPoint(const simd_fvec4 &p, const float *xform); -simd_fvec4 TransformDirection(const simd_fvec4 &p, const float *xform); -simd_fvec4 TransformNormal(const simd_fvec4 &n, const float *inv_xform); +fvec4 TransformPoint(const fvec4 &p, const float *xform); +fvec4 TransformDirection(const fvec4 &p, const float *xform); +fvec4 TransformNormal(const fvec4 &n, const float *inv_xform); // Sample Texture -simd_fvec4 SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec2 &uvs, int lod); -simd_fvec4 SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec2 &uvs, int lod, - const simd_fvec2 &rand); -simd_fvec4 SampleBilinear(const Cpu::TexStorageBase &storage, uint32_t tex, const simd_fvec2 &iuvs, int lod, - const simd_fvec2 &rand); -simd_fvec4 SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec2 &uvs, - float lod, const simd_fvec2 &rand); -simd_fvec4 SampleAnisotropic(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec2 &uvs, - const simd_fvec2 &duv_dx, const simd_fvec2 &duv_dy); -simd_fvec4 SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const simd_fvec4 &dir, - float y_rotation, const simd_fvec2 &rand); +fvec4 SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, const fvec2 &uvs, int lod); +fvec4 SampleBilinear(const 
Cpu::TexStorageBase *const textures[], uint32_t index, const fvec2 &uvs, int lod, + const fvec2 &rand); +fvec4 SampleBilinear(const Cpu::TexStorageBase &storage, uint32_t tex, const fvec2 &iuvs, int lod, const fvec2 &rand); +fvec4 SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const fvec2 &uvs, float lod, + const fvec2 &rand); +fvec4 SampleAnisotropic(const Cpu::TexStorageBase *const textures[], uint32_t index, const fvec2 &uvs, + const fvec2 &duv_dx, const fvec2 &duv_dy); +fvec4 SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec4 &dir, float y_rotation, + const fvec2 &rand); // Trace rays through scene hierarchy void IntersectScene(Span rays, int min_transp_depth, int max_transp_depth, const uint32_t rand_seq[], uint32_t random_seed, int iteration, const scene_data_t &sc, uint32_t root_index, const Cpu::TexStorageBase *const textures[], Span out_inter); -simd_fvec4 IntersectScene(const shadow_ray_t &r, int max_transp_depth, const scene_data_t &sc, uint32_t node_index, - const uint32_t rand_seq[], uint32_t random_seed, int iteration, - const Cpu::TexStorageBase *const textures[]); +fvec4 IntersectScene(const shadow_ray_t &r, int max_transp_depth, const scene_data_t &sc, uint32_t node_index, + const uint32_t rand_seq[], uint32_t random_seed, int iteration, + const Cpu::TexStorageBase *const textures[]); // Pick point on any light source for evaluation -void SampleLightSource(const simd_fvec4 &P, const simd_fvec4 &T, const simd_fvec4 &B, const simd_fvec4 &N, - const scene_data_t &sc, const Cpu::TexStorageBase *const textures[], float rand_pick_light, - simd_fvec2 rand_light_uv, simd_fvec2 rand_tex_uv, light_sample_t &ls); +void SampleLightSource(const fvec4 &P, const fvec4 &T, const fvec4 &B, const fvec4 &N, const scene_data_t &sc, + const Cpu::TexStorageBase *const textures[], float rand_pick_light, fvec2 rand_light_uv, + fvec2 rand_tex_uv, light_sample_t &ls); // Account for visible lights contribution 
void IntersectAreaLights(Span rays, Span lights, Span nodes, @@ -353,9 +347,9 @@ void IntersectAreaLights(Span rays, Span lights void IntersectAreaLights(Span rays, Span lights, Span nodes, Span inout_inters); float IntersectAreaLights(const shadow_ray_t &ray, Span lights, Span nodes); -float EvalTriLightFactor(const simd_fvec4 &P, const simd_fvec4 &ro, uint32_t tri_index, Span lights, +float EvalTriLightFactor(const fvec4 &P, const fvec4 &ro, uint32_t tri_index, Span lights, Span nodes); -float EvalTriLightFactor(const simd_fvec4 &P, const simd_fvec4 &ro, uint32_t tri_index, Span lights, +float EvalTriLightFactor(const fvec4 &P, const fvec4 &ro, uint32_t tri_index, Span lights, Span nodes); void TraceRays(Span rays, int min_transp_depth, int max_transp_depth, const scene_data_t &sc, @@ -366,42 +360,42 @@ void TraceShadowRays(Span rays, int max_transp_depth, float const Cpu::TexStorageBase *const textures[], int img_w, color_rgba_t *out_color); // Get environment collor at direction -simd_fvec4 Evaluate_EnvColor(const ray_data_t &ray, const environment_t &env, const Cpu::TexStorageRGBA &tex_storage, - float pdf_factor, const simd_fvec2 &rand); +fvec4 Evaluate_EnvColor(const ray_data_t &ray, const environment_t &env, const Cpu::TexStorageRGBA &tex_storage, + float pdf_factor, const fvec2 &rand); // Get light color at intersection point -simd_fvec4 Evaluate_LightColor(const ray_data_t &ray, const hit_data_t &inter, const environment_t &env, - const Cpu::TexStorageRGBA &tex_storage, Span lights, - uint32_t lights_count, const simd_fvec2 &rand); +fvec4 Evaluate_LightColor(const ray_data_t &ray, const hit_data_t &inter, const environment_t &env, + const Cpu::TexStorageRGBA &tex_storage, Span lights, uint32_t lights_count, + const fvec2 &rand); // Evaluate individual nodes -simd_fvec4 Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, - const simd_fvec4 &base_color, float roughness, float mix_weight, bool use_mis, - shadow_ray_t 
&sh_r); -void Sample_DiffuseNode(const ray_data_t &ray, const surface_t &surf, const simd_fvec4 &base_color, float roughness, - simd_fvec2 rand, float mix_weight, ray_data_t &new_ray); - -simd_fvec4 Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, - const simd_fvec4 &base_color, float roughness, float regularize_alpha, float spec_ior, - float spec_F0, float mix_weight, bool use_mis, shadow_ray_t &sh_r); -void Sample_GlossyNode(const ray_data_t &ray, const surface_t &surf, const simd_fvec4 &base_color, float roughness, - float regularize_alpha, float spec_ior, float spec_F0, simd_fvec2 rand, float mix_weight, +fvec4 Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, + const fvec4 &base_color, float roughness, float mix_weight, bool use_mis, + shadow_ray_t &sh_r); +void Sample_DiffuseNode(const ray_data_t &ray, const surface_t &surf, const fvec4 &base_color, float roughness, + fvec2 rand, float mix_weight, ray_data_t &new_ray); + +fvec4 Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, + const fvec4 &base_color, float roughness, float regularize_alpha, float spec_ior, + float spec_F0, float mix_weight, bool use_mis, shadow_ray_t &sh_r); +void Sample_GlossyNode(const ray_data_t &ray, const surface_t &surf, const fvec4 &base_color, float roughness, + float regularize_alpha, float spec_ior, float spec_F0, fvec2 rand, float mix_weight, ray_data_t &new_ray); -simd_fvec4 Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, - const simd_fvec4 &base_color, float roughness, float regularize_alpha, float eta, - float mix_weight, bool use_mis, shadow_ray_t &sh_r); -void Sample_RefractiveNode(const ray_data_t &ray, const surface_t &surf, const simd_fvec4 &base_color, float roughness, - float regularize_alpha, bool is_backfacing, float int_ior, float ext_ior, simd_fvec2 rand, +fvec4 Evaluate_RefractiveNode(const 
light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, + const fvec4 &base_color, float roughness, float regularize_alpha, float eta, + float mix_weight, bool use_mis, shadow_ray_t &sh_r); +void Sample_RefractiveNode(const ray_data_t &ray, const surface_t &surf, const fvec4 &base_color, float roughness, + float regularize_alpha, bool is_backfacing, float int_ior, float ext_ior, fvec2 rand, float mix_weight, ray_data_t &new_ray); struct diff_params_t { - simd_fvec4 base_color; - simd_fvec4 sheen_color; + fvec4 base_color; + fvec4 sheen_color; float roughness; }; struct spec_params_t { - simd_fvec4 tmp_col; + fvec4 tmp_col; float roughness; float ior; float F0; @@ -426,17 +420,16 @@ struct lobe_weights_t { float diffuse, specular, clearcoat, refraction; }; -simd_fvec4 Evaluate_PrincipledNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, - const lobe_weights_t &lobe_weights, const diff_params_t &diff, - const spec_params_t &spec, const clearcoat_params_t &coat, - const transmission_params_t &trans, float metallic, float transmission, - float N_dot_L, float mix_weight, bool use_mis, float regularize_alpha, - shadow_ray_t &sh_r); +fvec4 Evaluate_PrincipledNode(const light_sample_t &ls, const ray_data_t &ray, const surface_t &surf, + const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, + const clearcoat_params_t &coat, const transmission_params_t &trans, float metallic, + float transmission, float N_dot_L, float mix_weight, bool use_mis, float regularize_alpha, + shadow_ray_t &sh_r); void Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t &ray, const surface_t &surf, const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, const clearcoat_params_t &coat, const transmission_params_t &trans, float metallic, - float transmission, simd_fvec2 rand, float mix_rand, float mix_weight, - float regularize_alpha, ray_data_t &new_ray); + float transmission, fvec2 
rand, float mix_rand, float mix_weight, float regularize_alpha, + ray_data_t &new_ray); // Shade color_rgba_t ShadeSurface(const pass_settings_t &ps, const float limits[2], const hit_data_t &inter, @@ -495,11 +488,11 @@ void ClearBorders(const rect_t &rect, int w, int h, bool downscaled, int out_cha // Tonemap // https://gpuopen.com/learn/optimized-reversible-tonemapper-for-resolve/ -force_inline simd_fvec4 vectorcall reversible_tonemap(const simd_fvec4 c) { +force_inline fvec4 vectorcall reversible_tonemap(const fvec4 c) { return c / (fmaxf(c.get<0>(), fmaxf(c.get<1>(), c.get<2>())) + 1.0f); } -force_inline simd_fvec4 vectorcall reversible_tonemap_invert(const simd_fvec4 c) { +force_inline fvec4 vectorcall reversible_tonemap_invert(const fvec4 c) { return c / (1.0f - fmaxf(c.get<0>(), fmaxf(c.get<1>(), c.get<2>()))); } @@ -508,7 +501,7 @@ struct tonemap_params_t { float inv_gamma; }; -force_inline simd_fvec4 vectorcall TonemapStandard(simd_fvec4 c) { +force_inline fvec4 vectorcall TonemapStandard(fvec4 c) { UNROLLED_FOR(i, 3, { if (c.get() < 0.0031308f) { c.set(12.92f * c.get()); @@ -519,9 +512,9 @@ force_inline simd_fvec4 vectorcall TonemapStandard(simd_fvec4 c) { return c; } -simd_fvec4 vectorcall TonemapFilmic(eViewTransform view_transform, simd_fvec4 color); +fvec4 vectorcall TonemapFilmic(eViewTransform view_transform, fvec4 color); -force_inline simd_fvec4 vectorcall Tonemap(const tonemap_params_t ¶ms, simd_fvec4 c) { +force_inline fvec4 vectorcall Tonemap(const tonemap_params_t ¶ms, fvec4 c) { if (params.view_transform == eViewTransform::Standard) { c = TonemapStandard(c); } else { @@ -529,7 +522,7 @@ force_inline simd_fvec4 vectorcall Tonemap(const tonemap_params_t ¶ms, simd_ } if (params.inv_gamma != 1.0f) { - c = pow(c, simd_fvec4{params.inv_gamma, params.inv_gamma, params.inv_gamma, 1.0f}); + c = pow(c, fvec4{params.inv_gamma, params.inv_gamma, params.inv_gamma, 1.0f}); } return saturate(c); diff --git a/internal/CoreSIMD.h b/internal/CoreSIMD.h index 
d740d1891..645ae77a3 100644 --- a/internal/CoreSIMD.h +++ b/internal/CoreSIMD.h @@ -8,7 +8,7 @@ #include -#include "simd/simd_vec.h" +#include "simd/simd.h" #include "Convolution.h" #include "TextureStorageCPU.h" @@ -58,42 +58,42 @@ alignas(64) const int ascending_counter[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1 template struct ray_data_t { // active rays mask - simd_ivec mask; + ivec mask; // origins of rays in packet - simd_fvec o[3]; + fvec o[3]; // directions of rays in packet - simd_fvec d[3], pdf; + fvec d[3], pdf; // throughput color of ray - simd_fvec c[3]; + fvec c[3]; // stack of ior values - simd_fvec ior[4]; + fvec ior[4]; // ray cone params - simd_fvec cone_width, cone_spread; + fvec cone_width, cone_spread; // 16-bit pixel coordinates of rays in packet ((x << 16) | y) - simd_uvec xy; + uvec xy; // four 8-bit ray depth counters - simd_uvec depth; + uvec depth; }; template struct shadow_ray_t { // active rays mask - simd_ivec mask; + ivec mask; // origins of rays in packet - simd_fvec o[3]; + fvec o[3]; // four 8-bit ray depth counters - simd_uvec depth; + uvec depth; // directions of rays in packet - simd_fvec d[3], dist; + fvec d[3], dist; // throughput color of ray - simd_fvec c[3]; + fvec c[3]; // 16-bit pixel coordinates of rays in packet ((x << 16) | y) - simd_uvec xy; + uvec xy; }; template struct hit_data_t { - simd_ivec obj_index; - simd_ivec prim_index; - simd_fvec t, u, v; + ivec obj_index; + ivec prim_index; + fvec t, u, v; explicit hit_data_t(eUninitialize) {} force_inline hit_data_t() { @@ -106,60 +106,60 @@ template struct hit_data_t { }; template struct surface_t { - simd_fvec P[3] = {0.0f, 0.0f, 0.0f}, T[3], B[3], N[3], plane_N[3]; - simd_fvec uvs[2]; + fvec P[3] = {0.0f, 0.0f, 0.0f}, T[3], B[3], N[3], plane_N[3]; + fvec uvs[2]; force_inline surface_t() = default; }; template struct light_sample_t { - simd_fvec col[3] = {0.0f, 0.0f, 0.0f}, L[3] = {0.0f, 0.0f, 0.0f}, lp[3] = {0.0f, 0.0f, 0.0f}; - simd_fvec area = 0.0f, dist_mul = 1.0f, 
pdf = 0.0f; + fvec col[3] = {0.0f, 0.0f, 0.0f}, L[3] = {0.0f, 0.0f, 0.0f}, lp[3] = {0.0f, 0.0f, 0.0f}; + fvec area = 0.0f, dist_mul = 1.0f, pdf = 0.0f; // TODO: merge these two into bitflags - simd_ivec cast_shadow = -1, from_env = 0; + ivec cast_shadow = -1, from_env = 0; force_inline light_sample_t() = default; }; -template force_inline simd_uvec mask_ray_depth(const simd_uvec depth) { return depth & 0x0fffffff; } +template force_inline uvec mask_ray_depth(const uvec depth) { return depth & 0x0fffffff; } force_inline uint32_t pack_ray_type(const int ray_type) { assert(ray_type < 0xf); return uint32_t(ray_type << 28); } template -force_inline simd_uvec pack_depth(const simd_ivec &diff_depth, const simd_ivec &spec_depth, - const simd_ivec &refr_depth, const simd_ivec &transp_depth) { +force_inline uvec pack_depth(const ivec &diff_depth, const ivec &spec_depth, + const ivec &refr_depth, const ivec &transp_depth) { assert((diff_depth >= 0x7f).all_zeros() && (spec_depth >= 0x7f).all_zeros() && (refr_depth >= 0x7f).all_zeros() && (transp_depth >= 0x7f).all_zeros()); - simd_uvec ret = 0u; - ret |= simd_uvec(diff_depth) << 0u; - ret |= simd_uvec(spec_depth) << 7u; - ret |= simd_uvec(refr_depth) << 14u; - ret |= simd_uvec(transp_depth) << 21u; + uvec ret = 0u; + ret |= uvec(diff_depth) << 0u; + ret |= uvec(spec_depth) << 7u; + ret |= uvec(refr_depth) << 14u; + ret |= uvec(transp_depth) << 21u; return ret; } -template force_inline simd_ivec get_diff_depth(const simd_uvec &depth) { - return simd_ivec(depth & 0x7f); +template force_inline ivec get_diff_depth(const uvec &depth) { + return ivec(depth & 0x7f); } -template force_inline simd_ivec get_spec_depth(const simd_uvec &depth) { - return simd_ivec(depth >> 7) & 0x7f; +template force_inline ivec get_spec_depth(const uvec &depth) { + return ivec(depth >> 7) & 0x7f; } -template force_inline simd_ivec get_refr_depth(const simd_uvec &depth) { - return simd_ivec(depth >> 14) & 0x7f; +template force_inline ivec 
get_refr_depth(const uvec &depth) { + return ivec(depth >> 14) & 0x7f; } -template force_inline simd_ivec get_transp_depth(const simd_uvec &depth) { - return simd_ivec(depth >> 21) & 0x7f; +template force_inline ivec get_transp_depth(const uvec &depth) { + return ivec(depth >> 21) & 0x7f; } -template force_inline simd_ivec get_total_depth(const simd_uvec &depth) { +template force_inline ivec get_total_depth(const uvec &depth) { return get_diff_depth(depth) + get_spec_depth(depth) + get_refr_depth(depth) + get_transp_depth(depth); } -template force_inline simd_ivec get_ray_type(const simd_uvec &depth) { - return simd_ivec(depth >> 28) & 0xf; +template force_inline ivec get_ray_type(const uvec &depth) { + return ivec(depth >> 28) & 0xf; } -template force_inline simd_ivec is_indirect(const simd_uvec &depth) { +template force_inline ivec is_indirect(const uvec &depth) { // not only transparency ray - return simd_ivec((depth & 0x001fffff) != 0u); + return ivec((depth & 0x001fffff) != 0u); } // Generating rays @@ -176,19 +176,19 @@ void SampleMeshInTextureSpace(int iteration, int obj_index, int uv_layer, const // Sorting rays template -int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], simd_ivec *hash_values, +int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template -int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], simd_ivec *hash_values, +int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); // Intersect primitives template -bool IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool IntersectTris_ClosestHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, 
uint32_t num_tris, int obj_index, hit_data_t &out_inter); template -bool IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool IntersectTris_ClosestHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, int tri_start, int tri_end, int obj_index, hit_data_t &out_inter); template @@ -198,10 +198,10 @@ template bool IntersectTris_ClosestHit(const float o[3], const float d[3], const mtri_accel_t *mtris, int tri_start, int tri_end, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); template -bool IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool IntersectTris_AnyHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, uint32_t num_tris, int obj_index, hit_data_t &out_inter); template -bool IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool IntersectTris_AnyHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, int tri_start, int tri_end, int obj_index, hit_data_t &out_inter); template bool IntersectTris_AnyHit(const float o[3], const float d[3], int i, const tri_accel_t *tris, @@ -214,37 +214,37 @@ bool IntersectTris_AnyHit(const float o[3], const float d[3], const mtri_accel_t // Traverse acceleration structure template -bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, const simd_ivec &ray_mask, +bool Traverse_TLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const uvec &ray_flags, const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); template -bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec 
&ray_flags, const simd_ivec &ray_mask, +bool Traverse_TLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const uvec &ray_flags, const ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); template -simd_ivec Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, +ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], int ray_type, + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, const uint32_t *tri_indices, hit_data_t &inter); template -simd_ivec Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, +ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], int ray_type, + const ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, hit_data_t &inter); // traditional bvh traversal with stack for inner nodes template -bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, +bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); template @@ -253,8 +253,8 @@ bool Traverse_BLAS_WithStack_ClosestHit(const 
float ro[3], const float rd[3], co int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); // returns 0 - no hit, 1 - hit, 2 - solid hit (no need to check for transparency) template -simd_ivec Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, +ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const tri_mat_data_t *materials, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); template @@ -265,107 +265,107 @@ int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const w // BRDFs template -simd_fvec BRDF_PrincipledDiffuse(const simd_fvec V[3], const simd_fvec N[3], const simd_fvec L[3], - const simd_fvec H[3], const simd_fvec &roughness); +fvec BRDF_PrincipledDiffuse(const fvec V[3], const fvec N[3], const fvec L[3], + const fvec H[3], const fvec &roughness); template -void Evaluate_OrenDiffuse_BSDF(const simd_fvec V[3], const simd_fvec N[3], const simd_fvec L[3], - const simd_fvec &roughness, const simd_fvec base_color[3], - simd_fvec out_color[4]); +void Evaluate_OrenDiffuse_BSDF(const fvec V[3], const fvec N[3], const fvec L[3], + const fvec &roughness, const fvec base_color[3], + fvec out_color[4]); template -void Sample_OrenDiffuse_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec &roughness, const simd_fvec base_color[3], - const simd_fvec &rand_u, const simd_fvec &rand_v, simd_fvec out_V[3], - simd_fvec out_color[4]); +void Sample_OrenDiffuse_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec &roughness, const fvec base_color[3], + const fvec &rand_u, const fvec &rand_v, fvec out_V[3], + fvec out_color[4]); template -void Evaluate_PrincipledDiffuse_BSDF(const simd_fvec V[3], const simd_fvec N[3], const 
simd_fvec L[3], - const simd_fvec &roughness, const simd_fvec base_color[3], - const simd_fvec sheen_color[3], bool uniform_sampling, - simd_fvec out_color[4]); +void Evaluate_PrincipledDiffuse_BSDF(const fvec V[3], const fvec N[3], const fvec L[3], + const fvec &roughness, const fvec base_color[3], + const fvec sheen_color[3], bool uniform_sampling, + fvec out_color[4]); template -void Sample_PrincipledDiffuse_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec &roughness, - const simd_fvec base_color[3], const simd_fvec sheen_color[3], - bool uniform_sampling, const simd_fvec rand[2], simd_fvec out_V[3], - simd_fvec out_color[4]); +void Sample_PrincipledDiffuse_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec &roughness, + const fvec base_color[3], const fvec sheen_color[3], + bool uniform_sampling, const fvec rand[2], fvec out_V[3], + fvec out_color[4]); template -void Evaluate_GGXSpecular_BSDF(const simd_fvec view_dir_ts[3], const simd_fvec sampled_normal_ts[3], - const simd_fvec reflected_dir_ts[3], const simd_fvec alpha[2], - const simd_fvec &spec_ior, const simd_fvec &spec_F0, - const simd_fvec spec_col[3], const simd_fvec spec_col_90[3], - simd_fvec out_color[4]); +void Evaluate_GGXSpecular_BSDF(const fvec view_dir_ts[3], const fvec sampled_normal_ts[3], + const fvec reflected_dir_ts[3], const fvec alpha[2], + const fvec &spec_ior, const fvec &spec_F0, + const fvec spec_col[3], const fvec spec_col_90[3], + fvec out_color[4]); template -void Sample_GGXSpecular_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec alpha[2], const simd_fvec &spec_ior, - const simd_fvec &spec_F0, const simd_fvec spec_col[3], - const simd_fvec spec_col_90[3], const simd_fvec rand[2], simd_fvec out_V[3], - simd_fvec out_color[4]); +void Sample_GGXSpecular_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], 
const fvec alpha[2], const fvec &spec_ior, + const fvec &spec_F0, const fvec spec_col[3], + const fvec spec_col_90[3], const fvec rand[2], fvec out_V[3], + fvec out_color[4]); template -void Evaluate_GGXRefraction_BSDF(const simd_fvec view_dir_ts[3], const simd_fvec sampled_normal_ts[3], - const simd_fvec refr_dir_ts[3], const simd_fvec alpha[2], - const simd_fvec &eta, const simd_fvec refr_col[3], simd_fvec out_color[4]); +void Evaluate_GGXRefraction_BSDF(const fvec view_dir_ts[3], const fvec sampled_normal_ts[3], + const fvec refr_dir_ts[3], const fvec alpha[2], + const fvec &eta, const fvec refr_col[3], fvec out_color[4]); template -void Sample_GGXRefraction_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec alpha[2], const simd_fvec &eta, - const simd_fvec refr_col[3], const simd_fvec rand[2], simd_fvec out_V[4], - simd_fvec out_color[4]); +void Sample_GGXRefraction_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec alpha[2], const fvec &eta, + const fvec refr_col[3], const fvec rand[2], fvec out_V[4], + fvec out_color[4]); template -void Evaluate_PrincipledClearcoat_BSDF(const simd_fvec view_dir_ts[3], const simd_fvec sampled_normal_ts[3], - const simd_fvec reflected_dir_ts[3], const simd_fvec &clearcoat_roughness2, - const simd_fvec &clearcoat_ior, const simd_fvec &clearcoat_F0, - simd_fvec out_color[4]); +void Evaluate_PrincipledClearcoat_BSDF(const fvec view_dir_ts[3], const fvec sampled_normal_ts[3], + const fvec reflected_dir_ts[3], const fvec &clearcoat_roughness2, + const fvec &clearcoat_ior, const fvec &clearcoat_F0, + fvec out_color[4]); template -void Sample_PrincipledClearcoat_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec &clearcoat_roughness2, - const simd_fvec &clearcoat_ior, const simd_fvec &clearcoat_F0, - const simd_fvec rand[2], simd_fvec out_V[3], simd_fvec out_color[4]); +void 
Sample_PrincipledClearcoat_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec &clearcoat_roughness2, + const fvec &clearcoat_ior, const fvec &clearcoat_F0, + const fvec rand[2], fvec out_V[3], fvec out_color[4]); template -simd_fvec Evaluate_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_mips, int qtree_levels, - const simd_fvec L[3]); +fvec Evaluate_EnvQTree(float y_rotation, const fvec4 *const *qtree_mips, int qtree_levels, + const fvec L[3]); template -void Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_mips, int qtree_levels, const simd_fvec &rand, - const simd_fvec &rx, const simd_fvec &ry, simd_fvec out_V[4]); +void Sample_EnvQTree(float y_rotation, const fvec4 *const *qtree_mips, int qtree_levels, const fvec &rand, + const fvec &rx, const fvec &ry, fvec out_V[4]); // Transform template -void TransformRay(const simd_fvec ro[3], const simd_fvec rd[3], const float *xform, simd_fvec out_ro[3], - simd_fvec out_rd[3]); -template void TransformPoint(const simd_fvec p[3], const float *xform, simd_fvec out_p[3]); -template void TransformPoint(const simd_fvec xform[16], simd_fvec out_p[3]); -template void TransformDirection(const simd_fvec xform[16], simd_fvec p[3]); -template void TransformNormal(const simd_fvec n[3], const float *inv_xform, simd_fvec out_n[3]); -template void TransformNormal(const simd_fvec n[3], const simd_fvec inv_xform[16], simd_fvec out_n[3]); -template void TransformNormal(const simd_fvec inv_xform[16], simd_fvec inout_n[3]); +void TransformRay(const fvec ro[3], const fvec rd[3], const float *xform, fvec out_ro[3], + fvec out_rd[3]); +template void TransformPoint(const fvec p[3], const float *xform, fvec out_p[3]); +template void TransformPoint(const fvec xform[16], fvec out_p[3]); +template void TransformDirection(const fvec xform[16], fvec p[3]); +template void TransformNormal(const fvec n[3], const float *inv_xform, fvec out_n[3]); +template void TransformNormal(const fvec n[3], 
const fvec inv_xform[16], fvec out_n[3]); +template void TransformNormal(const fvec inv_xform[16], fvec inout_n[3]); void TransformRay(const float ro[3], const float rd[3], const float *xform, float out_ro[3], float out_rd[3]); -template void CanonicalToDir(const simd_fvec p[2], float y_rotation, simd_fvec out_d[3]); -template void DirToCanonical(const simd_fvec d[3], float y_rotation, simd_fvec out_p[2]); +template void CanonicalToDir(const fvec p[2], float y_rotation, fvec out_d[3]); +template void DirToCanonical(const fvec d[3], float y_rotation, fvec out_p[2]); template -void rotate_around_axis(const simd_fvec p[3], const simd_fvec axis[3], const simd_fvec &angle, - simd_fvec out_p[3]); +void rotate_around_axis(const fvec p[3], const fvec axis[3], const fvec &angle, + fvec out_p[3]); // Sample texture template -void SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec uvs[2], - const simd_fvec &lod, const simd_ivec &mask, simd_fvec out_rgba[4]); +void SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, const fvec uvs[2], + const fvec &lod, const ivec &mask, fvec out_rgba[4]); template -void SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec uvs[2], - const simd_ivec &lod, const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); +void SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const fvec uvs[2], + const ivec &lod, const fvec rand[2], const ivec &mask, + fvec out_rgba[4]); template -void SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const simd_fvec uvs[2], - const simd_fvec &lod, const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); +void SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, const fvec uvs[2], + const fvec &lod, const fvec rand[2], const ivec &mask, + fvec out_rgba[4]); template -void SampleLatlong_RGBE(const 
Cpu::TexStorageRGBA &storage, uint32_t index, const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, simd_fvec out_rgb[3]); +void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec dir[3], float y_rotation, + const fvec rand[2], const ivec &mask, fvec out_rgb[3]); // Trace rays through scene hierarchy template @@ -375,25 +375,25 @@ void IntersectScene(ray_data_t &r, int min_transp_depth, int max_transp_depth template void IntersectScene(const shadow_ray_t &r, int max_transp_depth, const scene_data_t &sc, uint32_t node_index, const uint32_t rand_seq[], uint32_t rand_seed, int iteration, - const Cpu::TexStorageBase *const textures[], simd_fvec rc[3]); + const Cpu::TexStorageBase *const textures[], fvec rc[3]); // Pick point on any light source for evaluation template -void SampleLightSource(const simd_fvec P[3], const simd_fvec T[3], const simd_fvec B[3], - const simd_fvec N[3], const scene_data_t &sc, const Cpu::TexStorageBase *const tex_atlases[], - const simd_fvec &rand_pick_light, const simd_fvec rand_light_uv[2], - const simd_fvec rand_tex_uv[2], simd_ivec ray_mask, light_sample_t &ls); +void SampleLightSource(const fvec P[3], const fvec T[3], const fvec B[3], + const fvec N[3], const scene_data_t &sc, const Cpu::TexStorageBase *const tex_atlases[], + const fvec &rand_pick_light, const fvec rand_light_uv[2], + const fvec rand_tex_uv[2], ivec ray_mask, light_sample_t &ls); // Account for visible lights contribution template void IntersectAreaLights(const ray_data_t &r, Span lights, Span nodes, hit_data_t &inout_inter); template -simd_fvec IntersectAreaLights(const shadow_ray_t &r, Span lights, +fvec IntersectAreaLights(const shadow_ray_t &r, Span lights, Span nodes); template -simd_fvec EvalTriLightFactor(const simd_fvec P[3], const simd_fvec ro[3], const simd_ivec &mask, - const simd_ivec &tri_index, Span lights, +fvec EvalTriLightFactor(const fvec P[3], const fvec ro[3], const ivec &mask, + 
const ivec &tri_index, Span lights, Span nodes); template @@ -407,109 +407,109 @@ void TraceShadowRays(Span> rays, int max_transp_depth, flo // Get environment collor at direction template -void Evaluate_EnvColor(const ray_data_t &ray, const simd_ivec &mask, const environment_t &env, - const Cpu::TexStorageRGBA &tex_storage, const simd_fvec &pdf_factor, - const simd_fvec rand[2], simd_fvec env_col[4]); +void Evaluate_EnvColor(const ray_data_t &ray, const ivec &mask, const environment_t &env, + const Cpu::TexStorageRGBA &tex_storage, const fvec &pdf_factor, + const fvec rand[2], fvec env_col[4]); // Get light color at intersection point template -void Evaluate_LightColor(const simd_fvec P[3], const ray_data_t &ray, const simd_ivec &mask, +void Evaluate_LightColor(const fvec P[3], const ray_data_t &ray, const ivec &mask, const hit_data_t &inter, const environment_t &env, Span lights, - uint32_t lights_count, const Cpu::TexStorageRGBA &tex_storage, const simd_fvec rand[2], - simd_fvec light_col[3]); + uint32_t lights_count, const Cpu::TexStorageRGBA &tex_storage, const fvec rand[2], + fvec light_col[3]); // Evaluate individual nodes template -simd_ivec Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, const simd_ivec &mask, - const surface_t &surf, const simd_fvec base_color[3], - const simd_fvec &roughness, const simd_fvec &mix_weight, - const simd_ivec &mis_mask, simd_fvec out_col[3], shadow_ray_t &sh_r); +ivec Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, const ivec &mask, + const surface_t &surf, const fvec base_color[3], + const fvec &roughness, const fvec &mix_weight, + const ivec &mis_mask, fvec out_col[3], shadow_ray_t &sh_r); template -void Sample_DiffuseNode(const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, const simd_fvec &rand_u, - const simd_fvec &rand_v, const simd_fvec &mix_weight, ray_data_t &new_ray); +void 
Sample_DiffuseNode(const ray_data_t &ray, const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, const fvec &rand_u, + const fvec &rand_v, const fvec &mix_weight, ray_data_t &new_ray); template -simd_ivec Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, simd_ivec mask, - const surface_t &surf, const simd_fvec base_color[3], - const simd_fvec &roughness, const simd_fvec ®ularize_alpha, - const simd_fvec &spec_ior, const simd_fvec &spec_F0, - const simd_fvec &mix_weight, const simd_ivec &mis_mask, simd_fvec out_col[3], +ivec Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, ivec mask, + const surface_t &surf, const fvec base_color[3], + const fvec &roughness, const fvec ®ularize_alpha, + const fvec &spec_ior, const fvec &spec_F0, + const fvec &mix_weight, const ivec &mis_mask, fvec out_col[3], shadow_ray_t &sh_r); template -void Sample_GlossyNode(const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec ®ularize_alpha, const simd_fvec &spec_ior, const simd_fvec &spec_F0, - const simd_fvec rand[2], const simd_fvec &mix_weight, ray_data_t &new_ray); +void Sample_GlossyNode(const ray_data_t &ray, const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec ®ularize_alpha, const fvec &spec_ior, const fvec &spec_F0, + const fvec rand[2], const fvec &mix_weight, ray_data_t &new_ray); template -simd_ivec Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, const simd_ivec &mask, - const surface_t &surf, const simd_fvec base_color[3], - const simd_fvec &roughness, const simd_fvec ®ularize_alpha, - const simd_fvec &eta, const simd_fvec &mix_weight, - const simd_ivec &mis_mask, simd_fvec out_col[3], shadow_ray_t &sh_r); +ivec Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, const ivec &mask, + const surface_t &surf, const fvec 
base_color[3], + const fvec &roughness, const fvec ®ularize_alpha, + const fvec &eta, const fvec &mix_weight, + const ivec &mis_mask, fvec out_col[3], shadow_ray_t &sh_r); template -void Sample_RefractiveNode(const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec ®ularize_alpha, const simd_ivec &is_backfacing, - const simd_fvec &int_ior, const simd_fvec &ext_ior, const simd_fvec rand[2], - const simd_fvec &mix_weight, ray_data_t &new_ray); +void Sample_RefractiveNode(const ray_data_t &ray, const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec ®ularize_alpha, const ivec &is_backfacing, + const fvec &int_ior, const fvec &ext_ior, const fvec rand[2], + const fvec &mix_weight, ray_data_t &new_ray); template struct diff_params_t { - simd_fvec base_color[3]; - simd_fvec sheen_color[3]; - simd_fvec roughness; + fvec base_color[3]; + fvec sheen_color[3]; + fvec roughness; }; template struct spec_params_t { - simd_fvec tmp_col[3]; - simd_fvec roughness; - simd_fvec ior; - simd_fvec F0; - simd_fvec anisotropy; + fvec tmp_col[3]; + fvec roughness; + fvec ior; + fvec F0; + fvec anisotropy; }; template struct clearcoat_params_t { - simd_fvec roughness; - simd_fvec ior; - simd_fvec F0; + fvec roughness; + fvec ior; + fvec F0; }; template struct transmission_params_t { - simd_fvec roughness; - simd_fvec int_ior; - simd_fvec eta; - simd_fvec fresnel; - simd_ivec backfacing; + fvec roughness; + fvec int_ior; + fvec eta; + fvec fresnel; + ivec backfacing; }; template struct lobe_weights_t { - simd_fvec diffuse, specular, clearcoat, refraction; + fvec diffuse, specular, clearcoat, refraction; }; template -simd_ivec -Evaluate_PrincipledNode(const light_sample_t &ls, const ray_data_t &ray, const simd_ivec &mask, +ivec +Evaluate_PrincipledNode(const light_sample_t &ls, const ray_data_t &ray, const ivec &mask, const surface_t &surf, const 
lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, const clearcoat_params_t &coat, - const transmission_params_t &trans, const simd_fvec &metallic, float transmission, - const simd_fvec &N_dot_L, const simd_fvec &mix_weight, const simd_ivec &mis_mask, - const simd_fvec ®ularize_alpha, simd_fvec out_col[3], shadow_ray_t &sh_r); + const transmission_params_t &trans, const fvec &metallic, float transmission, + const fvec &N_dot_L, const fvec &mix_weight, const ivec &mis_mask, + const fvec ®ularize_alpha, fvec out_col[3], shadow_ray_t &sh_r); template -void Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t &ray, const simd_ivec &mask, +void Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t &ray, const ivec &mask, const surface_t &surf, const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, const clearcoat_params_t &coat, const transmission_params_t &trans, - const simd_fvec &metallic, float transmission, const simd_fvec rand[2], - simd_fvec mix_rand, const simd_fvec &mix_weight, const simd_fvec ®ularize_alpha, - simd_ivec &secondary_mask, ray_data_t &new_ray); + const fvec &metallic, float transmission, const fvec rand[2], + fvec mix_rand, const fvec &mix_weight, const fvec ®ularize_alpha, + ivec &secondary_mask, ray_data_t &new_ray); // Shade template void ShadeSurface(const pass_settings_t &ps, const float limits[2], const uint32_t rand_seq[], uint32_t rand_seed, int iteration, const hit_data_t &inter, const ray_data_t &ray, const scene_data_t &sc, - uint32_t node_index, const Cpu::TexStorageBase *const tex_atlases[], simd_fvec out_rgba[4], + uint32_t node_index, const Cpu::TexStorageBase *const tex_atlases[], fvec out_rgba[4], ray_data_t out_secondary_rays[], int *out_secondary_rays_count, shadow_ray_t out_shadow_rays[], - int *out_shadow_rays_count, simd_fvec out_base_color[4], simd_fvec out_depth_normals[4]); + int *out_shadow_rays_count, fvec out_base_color[4], 
fvec out_depth_normals[4]); template void ShadePrimary(const pass_settings_t &ps, Span> inters, Span> rays, const uint32_t rand_seq[], uint32_t rans_seed, int iteration, const scene_data_t &sc, @@ -547,7 +547,7 @@ class SIMDPolicyBase { using RayDataType = ray_data_t; using ShadowRayType = shadow_ray_t; using HitDataType = hit_data_t; - using RayHashType = simd_ivec; + using RayHashType = ivec; protected: static force_inline void GeneratePrimaryRays(const camera_t &cam, const rect_t &r, const int w, const int h, @@ -675,16 +675,16 @@ class SIMDPolicyBase { namespace Ray { namespace NS { -template force_inline simd_fvec safe_inv(const simd_fvec &a) { +template force_inline fvec safe_inv(const fvec &a) { #if USE_SAFE_MATH - const simd_fvec denom = select(a != 0.0f, a, simd_fvec{FLT_EPS}); + const fvec denom = select(a != 0.0f, a, fvec{FLT_EPS}); return 1.0f / denom; #else return 1.0f / a; #endif } -template force_inline simd_fvec safe_inv_pos(const simd_fvec &a) { +template force_inline fvec safe_inv_pos(const fvec &a) { #if USE_SAFE_MATH return 1.0f / max(a, FLT_EPS); #else @@ -692,16 +692,16 @@ template force_inline simd_fvec safe_inv_pos(const simd_fvec &a) { #endif } -template force_inline simd_fvec safe_div(const simd_fvec &a, const simd_fvec &b) { +template force_inline fvec safe_div(const fvec &a, const fvec &b) { #if USE_SAFE_MATH - const simd_fvec denom = select(b != 0.0f, b, simd_fvec{FLT_EPS}); + const fvec denom = select(b != 0.0f, b, fvec{FLT_EPS}); return a / denom; #else return a / b; #endif } -template force_inline simd_fvec safe_div_pos(const simd_fvec &a, const simd_fvec &b) { +template force_inline fvec safe_div_pos(const fvec &a, const fvec &b) { #if USE_SAFE_MATH return a / max(b, FLT_EPS); #else @@ -709,7 +709,7 @@ template force_inline simd_fvec safe_div_pos(const simd_fvec &a, c #endif } -template force_inline simd_fvec safe_div_pos(const float a, const simd_fvec &b) { +template force_inline fvec safe_div_pos(const float a, const fvec &b) { 
#if USE_SAFE_MATH return a / max(b, FLT_EPS); #else @@ -717,7 +717,7 @@ template force_inline simd_fvec safe_div_pos(const float a, const sim #endif } -template force_inline simd_fvec safe_div_pos(const simd_fvec &a, const float b) { +template force_inline fvec safe_div_pos(const fvec &a, const float b) { #if USE_SAFE_MATH return a / fmaxf(b, FLT_EPS); #else @@ -733,7 +733,7 @@ force_inline float safe_div_pos(const float a, const float b) { #endif } -template force_inline simd_fvec safe_div_neg(const simd_fvec &a, const simd_fvec &b) { +template force_inline fvec safe_div_neg(const fvec &a, const fvec &b) { #if USE_SAFE_MATH return a / min(b, -FLT_EPS); #else @@ -741,7 +741,7 @@ template force_inline simd_fvec safe_div_neg(const simd_fvec &a, c #endif } -template force_inline simd_fvec safe_sqrt(const simd_fvec &a) { +template force_inline fvec safe_sqrt(const fvec &a) { #if USE_SAFE_MATH return sqrt(max(a, 0.0f)); #else @@ -749,10 +749,10 @@ template force_inline simd_fvec safe_sqrt(const simd_fvec &a) { #endif } -template force_inline void safe_normalize(simd_fvec v[3]) { - simd_fvec l = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); +template force_inline void safe_normalize(fvec v[3]) { + fvec l = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); #if USE_SAFE_MATH - const simd_fvec mask = (l != 0.0f); + const fvec mask = (l != 0.0f); where(~mask, l) = FLT_EPS; where(mask, v[0]) /= l; @@ -765,12 +765,12 @@ template force_inline void safe_normalize(simd_fvec v[3]) { #endif } -template force_inline simd_fvec safe_sqrtf(const simd_fvec &f) { return sqrt(max(f, 0.0f)); } +template force_inline fvec safe_sqrtf(const fvec &f) { return sqrt(max(f, 0.0f)); } #define sqr(x) ((x) * (x)) template -force_inline void swap_elements(simd_vec &v1, const int i1, simd_vec &v2, const int i2) { +force_inline void swap_elements(fixed_size_simd &v1, const int i1, fixed_size_simd &v2, const int i2) { const T temp = v1[i1]; v1.set(i1, v2[i2]); v2.set(i2, temp); @@ -779,38 +779,38 @@ 
force_inline void swap_elements(simd_vec &v1, const int i1, simd_vec #define _dot(x, y) ((x)[0] * (y)[0] + (x)[1] * (y)[1] + (x)[2] * (y)[2]) template -force_inline simd_ivec IntersectTri(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +force_inline ivec IntersectTri(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t &tri, uint32_t prim_index, hit_data_t &inter) { - const simd_fvec det = _dot(rd, tri.n_plane); - const simd_fvec dett = tri.n_plane[3] - _dot(ro, tri.n_plane); + const fvec det = _dot(rd, tri.n_plane); + const fvec dett = tri.n_plane[3] - _dot(ro, tri.n_plane); - const simd_ivec imask = simd_cast(dett >= 0.0f) != simd_cast(det * inter.t - dett >= 0.0f); + const ivec imask = simd_cast(dett >= 0.0f) != simd_cast(det * inter.t - dett >= 0.0f); if (imask.all_zeros()) { - return simd_ivec{0}; + return ivec{0}; } - const simd_fvec p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; - const simd_fvec detu = _dot(p, tri.u_plane) + det * tri.u_plane[3]; - const simd_ivec imask1 = simd_cast(detu >= 0.0f) != simd_cast(det - detu >= 0.0f); + const fvec p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; + const fvec detu = _dot(p, tri.u_plane) + det * tri.u_plane[3]; + const ivec imask1 = simd_cast(detu >= 0.0f) != simd_cast(det - detu >= 0.0f); if (imask1.all_zeros()) { - return simd_ivec{0}; + return ivec{0}; } - const simd_fvec detv = _dot(p, tri.v_plane) + det * tri.v_plane[3]; - const simd_ivec imask2 = simd_cast(detv >= 0.0f) != simd_cast(det - detu - detv >= 0.0f); + const fvec detv = _dot(p, tri.v_plane) + det * tri.v_plane[3]; + const ivec imask2 = simd_cast(detv >= 0.0f) != simd_cast(det - detu - detv >= 0.0f); if (imask2.all_zeros()) { - return simd_ivec{0}; + return ivec{0}; } - const simd_fvec rdet = 1.0f / det; - const simd_fvec t = dett * rdet; + const fvec rdet = 1.0f / det; + const fvec t = dett * rdet; - const 
simd_fvec bar_u = detu * rdet; - const simd_fvec bar_v = detv * rdet; + const fvec bar_u = detu * rdet; + const fvec bar_v = detv * rdet; - const simd_fvec &fmask = simd_cast(imask); + const fvec &fmask = simd_cast(imask); - where(imask, inter.prim_index) = simd_ivec{reinterpret_cast(prim_index)}; + where(imask, inter.prim_index) = ivec{reinterpret_cast(prim_index)}; where(fmask, inter.t) = t; where(fmask, inter.u) = bar_u; where(fmask, inter.v) = bar_v; @@ -865,30 +865,30 @@ force_inline bool IntersectTri(const float o[3], const float d[3], int i, const template bool IntersectTri(const float ro[3], const float rd[3], const mtri_accel_t &tri, const uint32_t prim_index, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v) { - simd_ivec _mask = 0, _prim_index; - simd_fvec _t = inter_t, _u, _v; + ivec _mask = 0, _prim_index; + fvec _t = inter_t, _u, _v; for (int i = 0; i < 8; i += S) { - const simd_fvec det = rd[0] * simd_fvec{&tri.n_plane[0][i], simd_mem_aligned} + - rd[1] * simd_fvec{&tri.n_plane[1][i], simd_mem_aligned} + - rd[2] * simd_fvec{&tri.n_plane[2][i], simd_mem_aligned}; - const simd_fvec dett = simd_fvec{&tri.n_plane[3][i], simd_mem_aligned} - - ro[0] * simd_fvec{&tri.n_plane[0][i], simd_mem_aligned} - - ro[1] * simd_fvec{&tri.n_plane[1][i], simd_mem_aligned} - - ro[2] * simd_fvec{&tri.n_plane[2][i], simd_mem_aligned}; + const fvec det = rd[0] * fvec{&tri.n_plane[0][i], vector_aligned} + + rd[1] * fvec{&tri.n_plane[1][i], vector_aligned} + + rd[2] * fvec{&tri.n_plane[2][i], vector_aligned}; + const fvec dett = fvec{&tri.n_plane[3][i], vector_aligned} - + ro[0] * fvec{&tri.n_plane[0][i], vector_aligned} - + ro[1] * fvec{&tri.n_plane[1][i], vector_aligned} - + ro[2] * fvec{&tri.n_plane[2][i], vector_aligned}; // compare sign bits - simd_ivec is_active_lane = ~srai(simd_cast(dett ^ (det * _t - dett)), 31); + ivec is_active_lane = ~srai(simd_cast(dett ^ (det * _t - dett)), 31); if (is_active_lane.all_zeros()) { continue; } - const simd_fvec 
p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; + const fvec p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; - const simd_fvec detu = p[0] * simd_fvec{&tri.u_plane[0][i], simd_mem_aligned} + - p[1] * simd_fvec{&tri.u_plane[1][i], simd_mem_aligned} + - p[2] * simd_fvec{&tri.u_plane[2][i], simd_mem_aligned} + - det * simd_fvec{&tri.u_plane[3][i], simd_mem_aligned}; + const fvec detu = p[0] * fvec{&tri.u_plane[0][i], vector_aligned} + + p[1] * fvec{&tri.u_plane[1][i], vector_aligned} + + p[2] * fvec{&tri.u_plane[2][i], vector_aligned} + + det * fvec{&tri.u_plane[3][i], vector_aligned}; // compare sign bits is_active_lane &= ~srai(simd_cast(detu ^ (det - detu)), 31); @@ -896,10 +896,10 @@ bool IntersectTri(const float ro[3], const float rd[3], const mtri_accel_t &tri, continue; } - const simd_fvec detv = p[0] * simd_fvec{&tri.v_plane[0][i], simd_mem_aligned} + - p[1] * simd_fvec{&tri.v_plane[1][i], simd_mem_aligned} + - p[2] * simd_fvec{&tri.v_plane[2][i], simd_mem_aligned} + - det * simd_fvec{&tri.v_plane[3][i], simd_mem_aligned}; + const fvec detv = p[0] * fvec{&tri.v_plane[0][i], vector_aligned} + + p[1] * fvec{&tri.v_plane[1][i], vector_aligned} + + p[2] * fvec{&tri.v_plane[2][i], vector_aligned} + + det * fvec{&tri.v_plane[3][i], vector_aligned}; // compare sign bits is_active_lane &= ~srai(simd_cast(detv ^ (det - detu - detv)), 31); @@ -907,10 +907,10 @@ bool IntersectTri(const float ro[3], const float rd[3], const mtri_accel_t &tri, continue; } - const simd_fvec rdet = safe_inv(det); + const fvec rdet = safe_inv(det); - simd_ivec prim = -(int(prim_index) + simd_ivec{&ascending_counter[i], simd_mem_aligned}) - 1; - where(det < 0.0f, prim) = int(prim_index) + simd_ivec{&ascending_counter[i], simd_mem_aligned}; + ivec prim = -(int(prim_index) + ivec{&ascending_counter[i], vector_aligned}) - 1; + where(det < 0.0f, prim) = int(prim_index) + ivec{&ascending_counter[i], 
vector_aligned}; _mask |= is_active_lane; where(is_active_lane, _prim_index) = prim; @@ -956,44 +956,44 @@ bool IntersectTri(const float ro[3], const float rd[3], const mtri_accel_t &tri, template <> bool IntersectTri<16>(const float ro[3], const float rd[3], const mtri_accel_t &tri, const uint32_t prim_index, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v) { - simd_ivec<8> _mask = 0, _prim_index; - simd_fvec<8> _t = inter_t, _u, _v; + ivec<8> _mask = 0, _prim_index; + fvec<8> _t = inter_t, _u, _v; { // intersect 8 triangles - const simd_fvec<8> det = rd[0] * simd_fvec<8>{&tri.n_plane[0][0], simd_mem_aligned} + - rd[1] * simd_fvec<8>{&tri.n_plane[1][0], simd_mem_aligned} + - rd[2] * simd_fvec<8>{&tri.n_plane[2][0], simd_mem_aligned}; - const simd_fvec<8> dett = simd_fvec<8>{&tri.n_plane[3][0], simd_mem_aligned} - - ro[0] * simd_fvec<8>{&tri.n_plane[0][0], simd_mem_aligned} - - ro[1] * simd_fvec<8>{&tri.n_plane[1][0], simd_mem_aligned} - - ro[2] * simd_fvec<8>{&tri.n_plane[2][0], simd_mem_aligned}; + const fvec<8> det = rd[0] * fvec<8>{&tri.n_plane[0][0], vector_aligned} + + rd[1] * fvec<8>{&tri.n_plane[1][0], vector_aligned} + + rd[2] * fvec<8>{&tri.n_plane[2][0], vector_aligned}; + const fvec<8> dett = fvec<8>{&tri.n_plane[3][0], vector_aligned} - + ro[0] * fvec<8>{&tri.n_plane[0][0], vector_aligned} - + ro[1] * fvec<8>{&tri.n_plane[1][0], vector_aligned} - + ro[2] * fvec<8>{&tri.n_plane[2][0], vector_aligned}; // compare sign bits - simd_ivec<8> is_active_lane = ~srai(simd_cast(dett ^ (det * _t - dett)), 31); + ivec<8> is_active_lane = ~srai(simd_cast(dett ^ (det * _t - dett)), 31); if (!is_active_lane.all_zeros()) { - const simd_fvec<8> p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], + const fvec<8> p[3] = {det * ro[0] + dett * rd[0], det * ro[1] + dett * rd[1], det * ro[2] + dett * rd[2]}; - const simd_fvec<8> detu = p[0] * simd_fvec<8>{&tri.u_plane[0][0], simd_mem_aligned} + - p[1] * simd_fvec<8>{&tri.u_plane[1][0], 
simd_mem_aligned} + - p[2] * simd_fvec<8>{&tri.u_plane[2][0], simd_mem_aligned} + - det * simd_fvec<8>{&tri.u_plane[3][0], simd_mem_aligned}; + const fvec<8> detu = p[0] * fvec<8>{&tri.u_plane[0][0], vector_aligned} + + p[1] * fvec<8>{&tri.u_plane[1][0], vector_aligned} + + p[2] * fvec<8>{&tri.u_plane[2][0], vector_aligned} + + det * fvec<8>{&tri.u_plane[3][0], vector_aligned}; // compare sign bits is_active_lane &= ~srai(simd_cast(detu ^ (det - detu)), 31); if (!is_active_lane.all_zeros()) { - const simd_fvec<8> detv = p[0] * simd_fvec<8>{&tri.v_plane[0][0], simd_mem_aligned} + - p[1] * simd_fvec<8>{&tri.v_plane[1][0], simd_mem_aligned} + - p[2] * simd_fvec<8>{&tri.v_plane[2][0], simd_mem_aligned} + - det * simd_fvec<8>{&tri.v_plane[3][0], simd_mem_aligned}; + const fvec<8> detv = p[0] * fvec<8>{&tri.v_plane[0][0], vector_aligned} + + p[1] * fvec<8>{&tri.v_plane[1][0], vector_aligned} + + p[2] * fvec<8>{&tri.v_plane[2][0], vector_aligned} + + det * fvec<8>{&tri.v_plane[3][0], vector_aligned}; // compare sign bits is_active_lane &= ~srai(simd_cast(detv ^ (det - detu - detv)), 31); if (!is_active_lane.all_zeros()) { - const simd_fvec<8> rdet = safe_inv(det); + const fvec<8> rdet = safe_inv(det); - simd_ivec<8> prim = -(int(prim_index) + simd_ivec<8>{&ascending_counter[0], simd_mem_aligned}) - 1; - where(det < 0.0f, prim) = int(prim_index) + simd_ivec<8>{&ascending_counter[0], simd_mem_aligned}; + ivec<8> prim = -(int(prim_index) + ivec<8>{&ascending_counter[0], vector_aligned}) - 1; + where(det < 0.0f, prim) = int(prim_index) + ivec<8>{&ascending_counter[0], vector_aligned}; _mask |= is_active_lane; where(is_active_lane, _prim_index) = prim; @@ -1040,9 +1040,9 @@ bool IntersectTri<16>(const float ro[3], const float rd[3], const mtri_accel_t & } template -force_inline simd_ivec bbox_test(const simd_fvec o[3], const simd_fvec inv_d[3], const simd_fvec &t, +force_inline ivec bbox_test(const fvec o[3], const fvec inv_d[3], const fvec &t, const float _bbox_min[3], const 
float _bbox_max[3]) { - simd_fvec low, high, tmin, tmax; + fvec low, high, tmin, tmax; low = inv_d[0] * (_bbox_min[0] - o[0]); high = inv_d[0] * (_bbox_max[0] - o[0]); @@ -1060,14 +1060,14 @@ force_inline simd_ivec bbox_test(const simd_fvec o[3], const simd_fvec tmax = min(tmax, max(low, high)); tmax *= 1.00000024f; - const simd_fvec mask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); - return reinterpret_cast &>(mask); + const fvec mask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); + return reinterpret_cast &>(mask); } template -force_inline simd_ivec bbox_test_fma(const simd_fvec inv_d[3], const simd_fvec inv_d_o[3], - const simd_fvec &t, const float _bbox_min[3], const float _bbox_max[3]) { - simd_fvec low, high, tmin, tmax; +force_inline ivec bbox_test_fma(const fvec inv_d[3], const fvec inv_d_o[3], + const fvec &t, const float _bbox_min[3], const float _bbox_max[3]) { + fvec low, high, tmin, tmax; low = fmsub(inv_d[0], _bbox_min[0], inv_d_o[0]); high = fmsub(inv_d[0], _bbox_max[0], inv_d_o[0]); @@ -1085,16 +1085,16 @@ force_inline simd_ivec bbox_test_fma(const simd_fvec inv_d[3], const simd_ tmax = min(tmax, max(low, high)); tmax *= 1.00000024f; - simd_fvec mask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); + fvec mask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); return simd_cast(mask); } template force_inline void bbox_test_oct(const float inv_d[3], const float inv_d_o[3], const float t, - const simd_fvec bbox_min[3], const simd_fvec bbox_max[3], simd_ivec &out_mask, - simd_fvec &out_dist) { - simd_fvec low, high, tmin, tmax; + const fvec bbox_min[3], const fvec bbox_max[3], ivec &out_mask, + fvec &out_dist) { + fvec low, high, tmin, tmax; low = fmsub(inv_d[0], bbox_min[0], inv_d_o[0]); high = fmsub(inv_d[0], bbox_max[0], inv_d_o[0]); @@ -1112,40 +1112,40 @@ force_inline void bbox_test_oct(const float inv_d[3], const float inv_d_o[3], co tmax = min(tmax, max(low, high)); tmax *= 1.00000024f; - const simd_fvec fmask = (tmin <= tmax) & (tmin <= t) & 
(tmax > 0.0f); - out_mask = reinterpret_cast &>(fmask); + const fvec fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); + out_mask = reinterpret_cast &>(fmask); out_dist = tmin; } template force_inline long bbox_test_oct(const float inv_d[3], const float inv_d_o[3], const float t, const float bbox_min[3][8], const float bbox_max[3][8], float out_dist[8]) { - simd_fvec low, high, tmin, tmax; + fvec low, high, tmin, tmax; long res = 0; static const int LanesCount = (8 / S); UNROLLED_FOR_R(i, LanesCount, { - low = fmsub(inv_d[0], simd_fvec{&bbox_min[0][S * i], simd_mem_aligned}, inv_d_o[0]); - high = fmsub(inv_d[0], simd_fvec{&bbox_max[0][S * i], simd_mem_aligned}, inv_d_o[0]); + low = fmsub(inv_d[0], fvec{&bbox_min[0][S * i], vector_aligned}, inv_d_o[0]); + high = fmsub(inv_d[0], fvec{&bbox_max[0][S * i], vector_aligned}, inv_d_o[0]); tmin = min(low, high); tmax = max(low, high); - low = fmsub(inv_d[1], simd_fvec{&bbox_min[1][S * i], simd_mem_aligned}, inv_d_o[1]); - high = fmsub(inv_d[1], simd_fvec{&bbox_max[1][S * i], simd_mem_aligned}, inv_d_o[1]); + low = fmsub(inv_d[1], fvec{&bbox_min[1][S * i], vector_aligned}, inv_d_o[1]); + high = fmsub(inv_d[1], fvec{&bbox_max[1][S * i], vector_aligned}, inv_d_o[1]); tmin = max(tmin, min(low, high)); tmax = min(tmax, max(low, high)); - low = fmsub(inv_d[2], simd_fvec{&bbox_min[2][S * i], simd_mem_aligned}, inv_d_o[2]); - high = fmsub(inv_d[2], simd_fvec{&bbox_max[2][S * i], simd_mem_aligned}, inv_d_o[2]); + low = fmsub(inv_d[2], fvec{&bbox_min[2][S * i], vector_aligned}, inv_d_o[2]); + high = fmsub(inv_d[2], fvec{&bbox_max[2][S * i], vector_aligned}, inv_d_o[2]); tmin = max(tmin, min(low, high)); tmax = min(tmax, max(low, high)); tmax *= 1.00000024f; - const simd_fvec fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); + const fvec fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); res <<= S; res |= simd_cast(fmask).movemask(); - tmin.store_to(&out_dist[S * i], simd_mem_aligned); + tmin.store_to(&out_dist[S * i], 
vector_aligned); }) return res; @@ -1154,34 +1154,34 @@ force_inline long bbox_test_oct(const float inv_d[3], const float inv_d_o[3], co template <> force_inline long bbox_test_oct<16>(const float inv_d[3], const float inv_d_o[3], const float t, const float bbox_min[3][8], const float bbox_max[3][8], float out_dist[8]) { - simd_fvec<8> low = fmsub(inv_d[0], simd_fvec<8>{&bbox_min[0][0], simd_mem_aligned}, inv_d_o[0]); - simd_fvec<8> high = fmsub(inv_d[0], simd_fvec<8>{&bbox_max[0][0], simd_mem_aligned}, inv_d_o[0]); - simd_fvec<8> tmin = min(low, high); - simd_fvec<8> tmax = max(low, high); + fvec<8> low = fmsub(inv_d[0], fvec<8>{&bbox_min[0][0], vector_aligned}, inv_d_o[0]); + fvec<8> high = fmsub(inv_d[0], fvec<8>{&bbox_max[0][0], vector_aligned}, inv_d_o[0]); + fvec<8> tmin = min(low, high); + fvec<8> tmax = max(low, high); - low = fmsub(inv_d[1], simd_fvec<8>{&bbox_min[1][0], simd_mem_aligned}, inv_d_o[1]); - high = fmsub(inv_d[1], simd_fvec<8>{&bbox_max[1][0], simd_mem_aligned}, inv_d_o[1]); + low = fmsub(inv_d[1], fvec<8>{&bbox_min[1][0], vector_aligned}, inv_d_o[1]); + high = fmsub(inv_d[1], fvec<8>{&bbox_max[1][0], vector_aligned}, inv_d_o[1]); tmin = max(tmin, min(low, high)); tmax = min(tmax, max(low, high)); - low = fmsub(inv_d[2], simd_fvec<8>{&bbox_min[2][0], simd_mem_aligned}, inv_d_o[2]); - high = fmsub(inv_d[2], simd_fvec<8>{&bbox_max[2][0], simd_mem_aligned}, inv_d_o[2]); + low = fmsub(inv_d[2], fvec<8>{&bbox_min[2][0], vector_aligned}, inv_d_o[2]); + high = fmsub(inv_d[2], fvec<8>{&bbox_max[2][0], vector_aligned}, inv_d_o[2]); tmin = max(tmin, min(low, high)); tmax = min(tmax, max(low, high)); tmax *= 1.00000024f; - const simd_fvec<8> fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); + const fvec<8> fmask = (tmin <= tmax) & (tmin <= t) & (tmax > 0.0f); long res = simd_cast(fmask).movemask(); - tmin.store_to(&out_dist[0], simd_mem_aligned); + tmin.store_to(&out_dist[0], vector_aligned); return res; } template -force_inline void 
bbox_test_oct(const float p[3], const simd_fvec bbox_min[3], const simd_fvec bbox_max[3], - simd_ivec &out_mask) { - const simd_fvec mask = (bbox_min[0] < p[0]) & (bbox_max[0] > p[0]) & (bbox_min[1] < p[1]) & +force_inline void bbox_test_oct(const float p[3], const fvec bbox_min[3], const fvec bbox_max[3], + ivec &out_mask) { + const fvec mask = (bbox_min[0] < p[0]) & (bbox_max[0] > p[0]) & (bbox_min[1] < p[1]) & (bbox_max[1] > p[1]) & (bbox_min[2] < p[2]) & (bbox_max[2] > p[2]); out_mask = simd_cast(mask); } @@ -1193,12 +1193,12 @@ force_inline long bbox_test_oct(const float p[3], const float bbox_min[3][8], co static const int LanesCount = (8 / S); UNROLLED_FOR_R(i, LanesCount, { - const simd_fvec fmask = (simd_fvec{&bbox_min[0][S * i], simd_mem_aligned} <= p[0]) & - (simd_fvec{&bbox_max[0][S * i], simd_mem_aligned} >= p[0]) & - (simd_fvec{&bbox_min[1][S * i], simd_mem_aligned} <= p[1]) & - (simd_fvec{&bbox_max[1][S * i], simd_mem_aligned} >= p[1]) & - (simd_fvec{&bbox_min[2][S * i], simd_mem_aligned} <= p[2]) & - (simd_fvec{&bbox_max[2][S * i], simd_mem_aligned} >= p[2]); + const fvec fmask = (fvec{&bbox_min[0][S * i], vector_aligned} <= p[0]) & + (fvec{&bbox_max[0][S * i], vector_aligned} >= p[0]) & + (fvec{&bbox_min[1][S * i], vector_aligned} <= p[1]) & + (fvec{&bbox_max[1][S * i], vector_aligned} >= p[1]) & + (fvec{&bbox_min[2][S * i], vector_aligned} <= p[2]) & + (fvec{&bbox_max[2][S * i], vector_aligned} >= p[2]); res <<= S; res |= simd_cast(fmask).movemask(); @@ -1209,12 +1209,12 @@ force_inline long bbox_test_oct(const float p[3], const float bbox_min[3][8], co template <> force_inline long bbox_test_oct<16>(const float p[3], const float bbox_min[3][8], const float bbox_max[3][8]) { - const simd_fvec<8> fmask = (simd_fvec<8>{&bbox_min[0][0], simd_mem_aligned} <= p[0]) & - (simd_fvec<8>{&bbox_max[0][0], simd_mem_aligned} >= p[0]) & - (simd_fvec<8>{&bbox_min[1][0], simd_mem_aligned} <= p[1]) & - (simd_fvec<8>{&bbox_max[1][0], simd_mem_aligned} >= p[1]) & - 
(simd_fvec<8>{&bbox_min[2][0], simd_mem_aligned} <= p[2]) & - (simd_fvec<8>{&bbox_max[2][0], simd_mem_aligned} >= p[2]); + const fvec<8> fmask = (fvec<8>{&bbox_min[0][0], vector_aligned} <= p[0]) & + (fvec<8>{&bbox_max[0][0], vector_aligned} >= p[0]) & + (fvec<8>{&bbox_min[1][0], vector_aligned} <= p[1]) & + (fvec<8>{&bbox_max[1][0], vector_aligned} >= p[1]) & + (fvec<8>{&bbox_min[2][0], vector_aligned} <= p[2]) & + (fvec<8>{&bbox_max[2][0], vector_aligned} >= p[2]); return simd_cast(fmask).movemask(); } @@ -1258,32 +1258,32 @@ force_inline bool bbox_test(const float inv_d[3], const float inv_do[3], const f } template -force_inline simd_ivec bbox_test(const simd_fvec p[3], const float _bbox_min[3], const float _bbox_max[3]) { - const simd_fvec mask = (p[0] > _bbox_min[0]) & (p[0] < _bbox_max[0]) & (p[1] > _bbox_min[1]) & +force_inline ivec bbox_test(const fvec p[3], const float _bbox_min[3], const float _bbox_max[3]) { + const fvec mask = (p[0] > _bbox_min[0]) & (p[0] < _bbox_max[0]) & (p[1] > _bbox_min[1]) & (p[1] < _bbox_max[1]) & (p[2] > _bbox_min[2]) & (p[2] < _bbox_max[2]); - return reinterpret_cast &>(mask); + return reinterpret_cast &>(mask); } template -force_inline simd_ivec bbox_test(const simd_fvec o[3], const simd_fvec inv_d[3], const simd_fvec &t, +force_inline ivec bbox_test(const fvec o[3], const fvec inv_d[3], const fvec &t, const bvh_node_t &node) { return bbox_test(o, inv_d, t, node.bbox_min, node.bbox_max); } template -force_inline simd_ivec bbox_test_fma(const simd_fvec inv_d[3], const simd_fvec inv_d_o[3], - const simd_fvec &t, const bvh_node_t &node) { +force_inline ivec bbox_test_fma(const fvec inv_d[3], const fvec inv_d_o[3], + const fvec &t, const bvh_node_t &node) { return bbox_test_fma(inv_d, inv_d_o, t, node.bbox_min, node.bbox_max); } -template force_inline simd_ivec bbox_test(const simd_fvec p[3], const bvh_node_t &node) { +template force_inline ivec bbox_test(const fvec p[3], const bvh_node_t &node) { return bbox_test(p, 
node.bbox_min, node.bbox_max); } template -force_inline uint32_t near_child(const simd_fvec rd[3], const simd_ivec &ray_mask, const bvh_node_t &node) { - const simd_fvec dir_neg_fmask = rd[node.prim_count >> 30] < 0.0f; - const auto dir_neg_imask = reinterpret_cast &>(dir_neg_fmask); +force_inline uint32_t near_child(const fvec rd[3], const ivec &ray_mask, const bvh_node_t &node) { + const fvec dir_neg_fmask = rd[node.prim_count >> 30] < 0.0f; + const auto dir_neg_imask = reinterpret_cast &>(dir_neg_fmask); if (dir_neg_imask.all_zeros(ray_mask)) { return node.left_child; } else { @@ -1301,19 +1301,19 @@ force_inline bool is_leaf_node(const wbvh_node_t &node) { return (node.child[0] template struct TraversalStateStack_Multi { struct { - simd_ivec mask; + ivec mask; uint32_t stack[StackSize]; uint32_t stack_size; } queue[S]; - force_inline void push_children(const simd_fvec rd[3], const bvh_node_t &node) { - const simd_fvec dir_neg_mask = rd[node.prim_count >> 30] < 0.0f; + force_inline void push_children(const fvec rd[3], const bvh_node_t &node) { + const fvec dir_neg_mask = rd[node.prim_count >> 30] < 0.0f; const auto mask1 = simd_cast(dir_neg_mask) & queue[index].mask; if (mask1.all_zeros()) { queue[index].stack[queue[index].stack_size++] = (node.right_child & RIGHT_CHILD_BITS); queue[index].stack[queue[index].stack_size++] = node.left_child; } else { - const simd_ivec mask2 = and_not(mask1, queue[index].mask); + const ivec mask2 = and_not(mask1, queue[index].mask); if (mask2.all_zeros()) { queue[index].stack[queue[index].stack_size++] = node.left_child; queue[index].stack[queue[index].stack_size++] = (node.right_child & RIGHT_CHILD_BITS); @@ -1453,19 +1453,19 @@ template class TraversalStateStack_S }; template -force_inline void comp_aux_inv_values(const simd_fvec o[3], const simd_fvec d[3], simd_fvec inv_d[3], - simd_fvec inv_d_o[3]) { +force_inline void comp_aux_inv_values(const fvec o[3], const fvec d[3], fvec inv_d[3], + fvec inv_d_o[3]) { for (int i = 0; i < 
3; i++) { - const simd_fvec denom = select(d[i] != 0.0f, d[i], simd_fvec{FLT_EPS}); + const fvec denom = select(d[i] != 0.0f, d[i], fvec{FLT_EPS}); inv_d[i] = 1.0f / denom; inv_d_o[i] = o[i] * inv_d[i]; - const simd_fvec d_is_plus_zero = (d[i] <= FLT_EPS) & (d[i] >= 0.0f); + const fvec d_is_plus_zero = (d[i] <= FLT_EPS) & (d[i] >= 0.0f); where(d_is_plus_zero, inv_d[i]) = MAX_DIST; where(d_is_plus_zero, inv_d_o[i]) = MAX_DIST; - const simd_fvec d_is_minus_zero = (d[i] >= -FLT_EPS) & (d[i] < 0.0f); + const fvec d_is_minus_zero = (d[i] >= -FLT_EPS) & (d[i] < 0.0f); where(d_is_minus_zero, inv_d[i]) = -MAX_DIST; where(d_is_minus_zero, inv_d_o[i]) = -MAX_DIST; } @@ -1506,27 +1506,27 @@ force_inline void comp_aux_inv_values(const float o[3], const float d[3], float } } -template force_inline simd_fvec dot3(const simd_fvec v1[3], const simd_fvec v2[3]) { +template force_inline fvec dot3(const fvec v1[3], const fvec v2[3]) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } -template force_inline simd_fvec dot3(const simd_fvec v1[3], const float v2[3]) { +template force_inline fvec dot3(const fvec v1[3], const float v2[3]) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } -template force_inline simd_fvec dot3(const float v1[3], const simd_fvec v2[3]) { +template force_inline fvec dot3(const float v1[3], const fvec v2[3]) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } force_inline float dot3(const float v1[3], const float v2[3]) { return v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]; } -template force_inline void cross(const simd_fvec v1[3], const simd_fvec v2[3], simd_fvec res[3]) { +template force_inline void cross(const fvec v1[3], const fvec v2[3], fvec res[3]) { res[0] = v1[1] * v2[2] - v1[2] * v2[1]; res[1] = v1[2] * v2[0] - v1[0] * v2[2]; res[2] = v1[0] * v2[1] - v1[1] * v2[0]; } -template force_inline void cross(const simd_fvec v1[3], const float v2[3], simd_fvec res[3]) { +template force_inline void cross(const fvec v1[3], const float v2[3], 
fvec res[3]) { res[0] = v1[1] * v2[2] - v1[2] * v2[1]; res[1] = v1[2] * v2[0] - v1[0] * v2[2]; res[2] = v1[0] * v2[1] - v1[1] * v2[0]; @@ -1538,8 +1538,8 @@ force_inline void cross(const float v1[3], const float v2[3], float res[3]) { res[2] = v1[0] * v2[1] - v1[1] * v2[0]; } -template force_inline simd_fvec normalize(simd_fvec v[3]) { - const simd_fvec l = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); +template force_inline fvec normalize(fvec v[3]) { + const fvec l = sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); v[0] = safe_div_pos(v[0], l); v[1] = safe_div_pos(v[1], l); v[2] = safe_div_pos(v[2], l); @@ -1553,20 +1553,20 @@ force_inline void normalize(float v[3]) { v[2] /= l; } -template force_inline simd_fvec length2(const simd_fvec v[3]) { +template force_inline fvec length2(const fvec v[3]) { return (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); } -template force_inline simd_fvec length(const simd_fvec v[3]) { return sqrt(length2(v)); } +template force_inline fvec length(const fvec v[3]) { return sqrt(length2(v)); } -template force_inline simd_fvec length2_2d(const simd_fvec v[2]) { return v[0] * v[0] + v[1] * v[1]; } +template force_inline fvec length2_2d(const fvec v[2]) { return v[0] * v[0] + v[1] * v[1]; } -template force_inline simd_fvec distance(const simd_fvec p1[3], const simd_fvec p2[3]) { - const simd_fvec temp[3] = {p1[0] - p2[0], p1[1] - p2[1], p1[2] - p2[2]}; +template force_inline fvec distance(const fvec p1[3], const fvec p2[3]) { + const fvec temp[3] = {p1[0] - p2[0], p1[1] - p2[1], p1[2] - p2[2]}; return length(temp); } -template force_inline simd_uvec hash(simd_uvec x) { +template force_inline uvec hash(uvec x) { // finalizer from murmurhash3 x ^= x >> 16; x *= 0x85ebca6bu; @@ -1576,15 +1576,15 @@ template force_inline simd_uvec hash(simd_uvec x) { return x; } -template force_inline simd_uvec hash_combine(const simd_uvec &seed, const simd_uvec &v) { +template force_inline uvec hash_combine(const uvec &seed, const uvec &v) { return seed ^ (v + 
(seed << 6) + (seed >> 2)); } -template force_inline simd_uvec hash_combine(const simd_uvec &seed, const uint32_t v) { +template force_inline uvec hash_combine(const uvec &seed, const uint32_t v) { return seed ^ (v + (seed << 6) + (seed >> 2)); } -template force_inline simd_uvec reverse_bits(simd_uvec x) { +template force_inline uvec reverse_bits(uvec x) { x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1)); x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2)); x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4)); @@ -1592,7 +1592,7 @@ template force_inline simd_uvec reverse_bits(simd_uvec x) { return ((x >> 16) | (x << 16)); } -template force_inline simd_uvec laine_karras_permutation(simd_uvec x, const simd_uvec &seed) { +template force_inline uvec laine_karras_permutation(uvec x, const uvec &seed) { x += seed; x ^= x * 0x6c50b47cu; x ^= x * 0xb82f1e52u; @@ -1601,33 +1601,33 @@ template force_inline simd_uvec laine_karras_permutation(simd_uvec return x; } -template force_inline simd_uvec nested_uniform_scramble_base2(simd_uvec x, const simd_uvec &seed) { +template force_inline uvec nested_uniform_scramble_base2(uvec x, const uvec &seed) { x = reverse_bits(x); x = laine_karras_permutation(x, seed); x = reverse_bits(x); return x; } -template force_inline simd_fvec scramble_flt(const simd_uvec &seed, const simd_fvec &val) { - simd_uvec u = simd_uvec(val * 16777216.0f) << 8; +template force_inline fvec scramble_flt(const uvec &seed, const fvec &val) { + uvec u = uvec(val * 16777216.0f) << 8; u = nested_uniform_scramble_base2(u, seed); - return simd_fvec(u >> 8) / 16777216.0f; + return fvec(u >> 8) / 16777216.0f; } -template force_inline simd_fvec scramble_unorm(const simd_uvec &seed, simd_uvec val) { +template force_inline fvec scramble_unorm(const uvec &seed, uvec val) { val = nested_uniform_scramble_base2(val, seed); - return simd_fvec(val >> 8) / 16777216.0f; + return fvec(val >> 8) / 16777216.0f; } template -void get_scrambled_2d_rand(const simd_uvec 
&dim, const simd_uvec &seed, const int sample, - const uint32_t rand_seq[], simd_fvec out_val[2]) { - const simd_uvec i_seed = hash_combine(seed, dim), x_seed = hash_combine(seed, 2 * dim + 0u), +void get_scrambled_2d_rand(const uvec &dim, const uvec &seed, const int sample, + const uint32_t rand_seq[], fvec out_val[2]) { + const uvec i_seed = hash_combine(seed, dim), x_seed = hash_combine(seed, 2 * dim + 0u), y_seed = hash_combine(seed, 2 * dim + 1); - const auto shuffled_dim = simd_ivec(nested_uniform_scramble_base2(dim, seed) & (RAND_DIMS_COUNT - 1)); + const auto shuffled_dim = ivec(nested_uniform_scramble_base2(dim, seed) & (RAND_DIMS_COUNT - 1)); const auto shuffled_i = - simd_ivec(nested_uniform_scramble_base2(simd_uvec(uint32_t(sample)), i_seed) & (RAND_SAMPLES_COUNT - 1)); + ivec(nested_uniform_scramble_base2(uvec(uint32_t(sample)), i_seed) & (RAND_SAMPLES_COUNT - 1)); out_val[0] = scramble_unorm(x_seed, gather(rand_seq, shuffled_dim * 2 * RAND_SAMPLES_COUNT + 2 * shuffled_i + 0)); out_val[1] = scramble_unorm(y_seed, gather(rand_seq, shuffled_dim * 2 * RAND_SAMPLES_COUNT + 2 * shuffled_i + 1)); @@ -1635,30 +1635,30 @@ void get_scrambled_2d_rand(const simd_uvec &dim, const simd_uvec &seed, co // Gram-Schmidt method template -force_inline void orthogonalize(const simd_fvec a[3], const simd_fvec b[3], simd_fvec out_v[3]) { +force_inline void orthogonalize(const fvec a[3], const fvec b[3], fvec out_v[3]) { // we assume that a is normalized - const simd_fvec temp = dot3(a, b); + const fvec temp = dot3(a, b); UNROLLED_FOR(i, 3, { out_v[i] = b[i] - temp * a[i]; }) normalize(out_v); } -template force_inline simd_fvec acos(const simd_fvec &v) { - simd_fvec ret; +template force_inline fvec acos(const fvec &v) { + fvec ret; UNROLLED_FOR_S(i, S, { ret.set(i, acosf(v[i])); }) return ret; } -template force_inline simd_fvec asin(const simd_fvec &v) { - simd_fvec ret; +template force_inline fvec asin(const fvec &v) { + fvec ret; UNROLLED_FOR_S(i, S, { ret.set(i, 
asinf(v[i])); }) return ret; } template -force_inline void slerp(const simd_fvec start[3], const simd_fvec end[3], const simd_fvec &percent, - simd_fvec out_v[3]) { +force_inline void slerp(const fvec start[3], const fvec end[3], const fvec &percent, + fvec out_v[3]) { // Dot product - the cosine of the angle between 2 vectors. - simd_fvec cos_theta = dot3(start, end); + fvec cos_theta = dot3(start, end); // Clamp it to be in the range of Acos() // This may be unnecessary, but floating point // precision can be a fickle mistress. @@ -1666,30 +1666,30 @@ force_inline void slerp(const simd_fvec start[3], const simd_fvec end[3], // Acos(dot) returns the angle between start and end, // And multiplying that by percent returns the angle between // start and the final result. - const simd_fvec theta = acos(cos_theta) * percent; - simd_fvec relative_vec[3]; + const fvec theta = acos(cos_theta) * percent; + fvec relative_vec[3]; UNROLLED_FOR(i, 3, { relative_vec[i] = end[i] - start[i] * cos_theta; }) safe_normalize(relative_vec); // Orthonormal basis // The final result. 
- const simd_fvec cos_theta2 = cos(theta), sin_theta = sin(theta); + const fvec cos_theta2 = cos(theta), sin_theta = sin(theta); UNROLLED_FOR(i, 3, { out_v[i] = start[i] * cos_theta2 + relative_vec[i] * sin_theta; }) } // Return arcsine(x) given that .57 < x -template force_inline simd_fvec asin_tail(const simd_fvec &x) { +template force_inline fvec asin_tail(const fvec &x) { return (PI / 2) - ((x + 2.71745038f) * x + 14.0375338f) * (0.00440413551f * ((x - 8.31223679f) * x + 25.3978882f)) * sqrt(1 - x); } -template force_inline simd_fvec portable_asinf(const simd_fvec &x) { - simd_fvec ret; +template force_inline fvec portable_asinf(const fvec &x) { + fvec ret; - const simd_fvec mask = abs(x) > 0.57f; + const fvec mask = abs(x) > 0.57f; where(mask, ret) = asin_tail(abs(x)); where(x < 0.0f, ret) = -ret; - const simd_fvec x2 = x * x; + const fvec x2 = x * x; where(~mask, ret) = x + (0.0517513789f * ((x2 + 1.83372748f) * x2 + 1.56678128f)) * x * (x2 * ((x2 - 1.48268414f) * x2 + 2.05554748f)); @@ -1698,39 +1698,39 @@ template force_inline simd_fvec portable_asinf(const simd_fvec &x) // Equivalent to acosf(dot(a, b)), but more numerically stable // Taken from PBRT source code -template simd_fvec angle_between(const simd_fvec v1[3], const simd_fvec v2[3]) { - const simd_fvec dot_mask = dot3(v1, v2) < 0.0f; +template fvec angle_between(const fvec v1[3], const fvec v2[3]) { + const fvec dot_mask = dot3(v1, v2) < 0.0f; - simd_fvec arg[3]; + fvec arg[3]; UNROLLED_FOR(i, 3, { arg[i] = v2[i] - v1[i]; where(dot_mask, arg[i]) = v1[i] + v2[i]; }) - simd_fvec ret = 2 * portable_asinf(length(arg) / 2); + fvec ret = 2 * portable_asinf(length(arg) / 2); where(dot_mask, ret) = PI - ret; return ret; } -template force_inline simd_fvec acos_positive_tail(const simd_fvec &x) { +template force_inline fvec acos_positive_tail(const fvec &x) { return (((x + 2.71850395f) * x + 14.7303705f)) * (0.00393401226f * ((x - 8.60734272f) * x + 27.0927486f)) * sqrt(1 - x); } -template force_inline 
simd_fvec acos_negative_tail(const simd_fvec &x) { +template force_inline fvec acos_negative_tail(const fvec &x) { return PI - (((x - 2.71850395f) * x + 14.7303705f)) * (0.00393401226f * ((x + 8.60734272f) * x + 27.0927486f)) * sqrt(1 + x); } -template force_inline simd_fvec portable_acosf(const simd_fvec &x) { - const simd_fvec mask1 = (x < -0.62f); - const simd_fvec mask2 = (x <= 0.62f); +template force_inline fvec portable_acosf(const fvec &x) { + const fvec mask1 = (x < -0.62f); + const fvec mask2 = (x <= 0.62f); - simd_fvec ret; + fvec ret; where(mask1, ret) = acos_negative_tail(x); - const simd_fvec x2 = x * x; + const fvec x2 = x * x; where(~mask1 & mask2, ret) = (PI / 2) - x - (0.0700945929f * x * ((x2 + 1.57144082f) * x2 + 1.25210774f)) * (x2 * ((x2 - 1.53757966f) * x2 + 1.89929986f)); @@ -1744,10 +1744,10 @@ template force_inline simd_fvec portable_acosf(const simd_fvec &x) // https://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf // Based on https://www.shadertoy.com/view/4tGGzd template -simd_fvec SampleSphericalTriangle(const simd_fvec P[3], const simd_fvec p1[3], const simd_fvec p2[3], - const simd_fvec p3[3], const simd_fvec Xi[2], simd_fvec out_dir[3]) { +fvec SampleSphericalTriangle(const fvec P[3], const fvec p1[3], const fvec p2[3], + const fvec p3[3], const fvec Xi[2], fvec out_dir[3]) { // setup spherical triangle - simd_fvec A[3], B[3], C[3]; + fvec A[3], B[3], C[3]; UNROLLED_FOR(i, 3, { A[i] = p1[i] - P[i]; }) UNROLLED_FOR(i, 3, { B[i] = p2[i] - P[i]; }) UNROLLED_FOR(i, 3, { C[i] = p3[i] - P[i]; }) @@ -1755,7 +1755,7 @@ simd_fvec SampleSphericalTriangle(const simd_fvec P[3], const simd_fvec normalize(B); normalize(C); - simd_fvec BA[3], CA[3], AB[3], CB[3], BC[3], AC[3]; + fvec BA[3], CA[3], AB[3], CB[3], BC[3], AC[3]; // calculate internal angles of spherical triangle: alpha, beta and gamma for (int i = 0; i < 3; ++i) { BA[i] = B[i] - A[i]; @@ -1771,138 +1771,138 @@ simd_fvec SampleSphericalTriangle(const simd_fvec P[3], const simd_fvec 
orthogonalize(B, CB, CB); orthogonalize(C, BC, BC); orthogonalize(C, AC, AC); - const simd_fvec alpha = angle_between(BA, CA); - const simd_fvec beta = angle_between(AB, CB); - const simd_fvec gamma = angle_between(BC, AC); + const fvec alpha = angle_between(BA, CA); + const fvec beta = angle_between(AB, CB); + const fvec gamma = angle_between(BC, AC); - const simd_fvec area = alpha + beta + gamma - PI; - simd_ivec mask = simd_cast(area > SPHERICAL_AREA_THRESHOLD); + const fvec area = alpha + beta + gamma - PI; + ivec mask = simd_cast(area > SPHERICAL_AREA_THRESHOLD); if (mask.all_zeros()) { return 0.0f; } if (out_dir) { // calculate arc lengths for edges of spherical triangle - const simd_fvec b = portable_acosf(clamp(dot3(C, A), -1.0f, 1.0f)); - const simd_fvec c = portable_acosf(clamp(dot3(A, B), -1.0f, 1.0f)); + const fvec b = portable_acosf(clamp(dot3(C, A), -1.0f, 1.0f)); + const fvec c = portable_acosf(clamp(dot3(A, B), -1.0f, 1.0f)); // Use one random variable to select the new area - const simd_fvec area_S = Xi[0] * area; + const fvec area_S = Xi[0] * area; // Save the sine and cosine of the angle delta - const simd_fvec p = sin(area_S - alpha); - const simd_fvec q = cos(area_S - alpha); + const fvec p = sin(area_S - alpha); + const fvec q = cos(area_S - alpha); // Compute the pair(u; v) that determines sin(beta_s) and cos(beta_s) - const simd_fvec u = q - cos(alpha); - const simd_fvec v = p + sin(alpha) * cos(c); + const fvec u = q - cos(alpha); + const fvec v = p + sin(alpha) * cos(c); // Compute the s coordinate as normalized arc length from A to C_s - const simd_fvec denom = ((v * p + u * q) * sin(alpha)); - const simd_fvec s = safe_div(simd_fvec{1.0f}, b) * + const fvec denom = ((v * p + u * q) * sin(alpha)); + const fvec s = safe_div(fvec{1.0f}, b) * portable_acosf(clamp(safe_div(((v * q - u * p) * cos(alpha) - v), denom), -1.0f, 1.0f)); // Compute the third vertex of the sub - triangle. 
- simd_fvec C_s[3]; + fvec C_s[3]; slerp(A, C, s, C_s); // Compute the t coordinate using C_s and Xi[1] - const simd_fvec denom2 = portable_acosf(clamp(dot3(C_s, B), -1.0f, 1.0f)); - const simd_fvec t = + const fvec denom2 = portable_acosf(clamp(dot3(C_s, B), -1.0f, 1.0f)); + const fvec t = safe_div(portable_acosf(clamp(1.0f - Xi[1] * (1.0f - dot3(C_s, B)), -1.0f, 1.0f)), denom2); // Construct the corresponding point on the sphere slerp(B, C_s, t, out_dir); } - return select(mask, safe_div_pos(1.0f, area), simd_fvec{0.0f}); + return select(mask, safe_div_pos(1.0f, area), fvec{0.0f}); } // "An Area-Preserving Parametrization for Spherical Rectangles" // https://www.arnoldrenderer.com/research/egsr2013_spherical_rectangle.pdf // NOTE: no precomputation is done, everything is calculated in-place template -simd_fvec SampleSphericalRectangle(const simd_fvec P[3], const simd_fvec light_pos[3], - const simd_fvec axis_u[3], const simd_fvec axis_v[3], - const simd_fvec Xi[2], simd_fvec out_p[3]) { - simd_fvec corner[3], x[3], y[3], z[3]; +fvec SampleSphericalRectangle(const fvec P[3], const fvec light_pos[3], + const fvec axis_u[3], const fvec axis_v[3], + const fvec Xi[2], fvec out_p[3]) { + fvec corner[3], x[3], y[3], z[3]; UNROLLED_FOR(i, 3, { corner[i] = light_pos[i] - 0.5f * axis_u[i] - 0.5f * axis_v[i]; x[i] = axis_u[i]; y[i] = axis_v[i]; }) - const simd_fvec axisu_len = normalize(x), axisv_len = normalize(y); + const fvec axisu_len = normalize(x), axisv_len = normalize(y); cross(x, y, z); // compute rectangle coords in local reference system - simd_fvec dir[3]; + fvec dir[3]; UNROLLED_FOR(i, 3, { dir[i] = corner[i] - P[i]; }) - simd_fvec z0 = dot3(dir, z); + fvec z0 = dot3(dir, z); // flip z to make it point against Q UNROLLED_FOR(i, 3, { where(z0 > 0.0f, z[i]) = -z[i]; }) where(z0 > 0.0f, z0) = -z0; - const simd_fvec x0 = dot3(dir, x); - const simd_fvec y0 = dot3(dir, y); - const simd_fvec x1 = x0 + axisu_len; - const simd_fvec y1 = y0 + axisv_len; + const fvec x0 = 
dot3(dir, x); + const fvec y0 = dot3(dir, y); + const fvec x1 = x0 + axisu_len; + const fvec y1 = y0 + axisv_len; // compute internal angles (gamma_i) - simd_fvec diff[4] = {x0 - x1, y1 - y0, x1 - x0, y0 - y1}, nz[4] = {y0, x1, y1, x0}; + fvec diff[4] = {x0 - x1, y1 - y0, x1 - x0, y0 - y1}, nz[4] = {y0, x1, y1, x0}; UNROLLED_FOR(i, 4, { nz[i] *= diff[i]; nz[i] /= sqrt(z0 * z0 * diff[i] * diff[i] + nz[i] * nz[i]); }) - const simd_fvec g0 = portable_acosf(clamp(-nz[0] * nz[1], -1.0f, 1.0f)); - const simd_fvec g1 = portable_acosf(clamp(-nz[1] * nz[2], -1.0f, 1.0f)); - const simd_fvec g2 = portable_acosf(clamp(-nz[2] * nz[3], -1.0f, 1.0f)); - const simd_fvec g3 = portable_acosf(clamp(-nz[3] * nz[0], -1.0f, 1.0f)); + const fvec g0 = portable_acosf(clamp(-nz[0] * nz[1], -1.0f, 1.0f)); + const fvec g1 = portable_acosf(clamp(-nz[1] * nz[2], -1.0f, 1.0f)); + const fvec g2 = portable_acosf(clamp(-nz[2] * nz[3], -1.0f, 1.0f)); + const fvec g3 = portable_acosf(clamp(-nz[3] * nz[0], -1.0f, 1.0f)); // compute predefined constants - const simd_fvec b0 = nz[0]; - const simd_fvec b1 = nz[2]; - const simd_fvec b0sq = b0 * b0; - const simd_fvec k = 2 * PI - g2 - g3; + const fvec b0 = nz[0]; + const fvec b1 = nz[2]; + const fvec b0sq = b0 * b0; + const fvec k = 2 * PI - g2 - g3; // compute solid angle from internal angles - const simd_fvec area = g0 + g1 - k; - const simd_ivec mask = simd_cast(area > SPHERICAL_AREA_THRESHOLD); + const fvec area = g0 + g1 - k; + const ivec mask = simd_cast(area > SPHERICAL_AREA_THRESHOLD); if (mask.all_zeros()) { return 0.0f; } if (out_p) { // compute cu - const simd_fvec au = Xi[0] * area + k; - const simd_fvec fu = safe_div((cos(au) * b0 - b1), sin(au)); - simd_fvec cu = 1.0f / sqrt(fu * fu + b0sq); + const fvec au = Xi[0] * area + k; + const fvec fu = safe_div((cos(au) * b0 - b1), sin(au)); + fvec cu = 1.0f / sqrt(fu * fu + b0sq); where(fu <= 0.0f, cu) = -cu; cu = clamp(cu, -1.0f, 1.0f); // compute xu - simd_fvec xu = -(cu * z0) / max(sqrt(1.0f - cu 
* cu), 1e-7f); + fvec xu = -(cu * z0) / max(sqrt(1.0f - cu * cu), 1e-7f); xu = min(max(xu, x0), x1); // compute yv - const simd_fvec z0sq = z0 * z0; - const simd_fvec y0sq = y0 * y0; - const simd_fvec y1sq = y1 * y1; - const simd_fvec d = sqrt(xu * xu + z0sq); - const simd_fvec h0 = y0 / sqrt(d * d + y0sq); - const simd_fvec h1 = y1 / sqrt(d * d + y1sq); - const simd_fvec hv = h0 + Xi[1] * (h1 - h0), hv2 = hv * hv; - simd_fvec yv = y1; + const fvec z0sq = z0 * z0; + const fvec y0sq = y0 * y0; + const fvec y1sq = y1 * y1; + const fvec d = sqrt(xu * xu + z0sq); + const fvec h0 = y0 / sqrt(d * d + y0sq); + const fvec h1 = y1 / sqrt(d * d + y1sq); + const fvec hv = h0 + Xi[1] * (h1 - h0), hv2 = hv * hv; + fvec yv = y1; where(hv2 < 1.0f - 1e-6f, yv) = safe_div_pos(hv * d, sqrt(1.0f - hv2)); // transform (xu, yv, z0) to world coords UNROLLED_FOR(i, 3, { out_p[i] = P[i] + xu * x[i] + yv * y[i] + z0 * z[i]; }) } - return select(mask, safe_div_pos(1.0f, area), simd_fvec{0.0f}); + return select(mask, safe_div_pos(1.0f, area), fvec{0.0f}); } force_inline float floor(float x) { return float(int(x) - (x < 0.0f)); } template -force_inline void reflect(const simd_fvec I[3], const simd_fvec N[3], const simd_fvec &dot_N_I, - simd_fvec res[3]) { +force_inline void reflect(const fvec I[3], const fvec N[3], const fvec &dot_N_I, + fvec res[3]) { res[0] = I[0] - 2.0f * dot_N_I * N[0]; res[1] = I[1] - 2.0f * dot_N_I * N[1]; res[2] = I[2] - 2.0f * dot_N_I * N[2]; @@ -1922,18 +1922,18 @@ force_inline void TransformDirection(const float d[3], const float *xform, float out_d[2] = xform[2] * d[0] + xform[6] * d[1] + xform[10] * d[2]; } -template force_inline simd_fvec pow5(const simd_fvec &v) { return (v * v) * (v * v) * v; } +template force_inline fvec pow5(const fvec &v) { return (v * v) * (v * v) * v; } -template simd_ivec get_ray_hash(const ray_data_t &r, const float root_min[3], const float cell_size[3]) { - simd_ivec x = clamp(simd_ivec((r.o[0] - root_min[0]) / cell_size[0]), 0, 255), - 
y = clamp(simd_ivec((r.o[1] - root_min[1]) / cell_size[1]), 0, 255), - z = clamp(simd_ivec((r.o[2] - root_min[2]) / cell_size[2]), 0, 255); +template ivec get_ray_hash(const ray_data_t &r, const float root_min[3], const float cell_size[3]) { + ivec x = clamp(ivec((r.o[0] - root_min[0]) / cell_size[0]), 0, 255), + y = clamp(ivec((r.o[1] - root_min[1]) / cell_size[1]), 0, 255), + z = clamp(ivec((r.o[2] - root_min[2]) / cell_size[2]), 0, 255); - simd_ivec omega_index = clamp(simd_ivec((1.0f + r.d[2]) / omega_step), 0, 32), - phi_index_i = clamp(simd_ivec((1.0f + r.d[1]) / phi_step), 0, 16), - phi_index_j = clamp(simd_ivec((1.0f + r.d[0]) / phi_step), 0, 16); + ivec omega_index = clamp(ivec((1.0f + r.d[2]) / omega_step), 0, 32), + phi_index_i = clamp(ivec((1.0f + r.d[1]) / phi_step), 0, 16), + phi_index_j = clamp(ivec((1.0f + r.d[0]) / phi_step), 0, 16); - simd_ivec o, p; + ivec o, p; UNROLLED_FOR_S(i, S, { if (r.mask[i]) { @@ -1979,15 +1979,15 @@ force_inline void radix_sort(ray_chunk_t *begin, ray_chunk_t *end, ray_chunk_t * _radix_sort_lsb(begin, end, begin1, 24); } -template force_inline simd_fvec construct_float(const simd_ivec &_m) { - const simd_ivec ieeeMantissa = {0x007FFFFF}; // binary32 mantissa bitmask - const simd_ivec ieeeOne = {0x3F800000}; // 1.0 in IEEE binary32 +template force_inline fvec construct_float(const ivec &_m) { + const ivec ieeeMantissa = {0x007FFFFF}; // binary32 mantissa bitmask + const ivec ieeeOne = {0x3F800000}; // 1.0 in IEEE binary32 - simd_ivec m = _m & ieeeMantissa; // Keep only mantissa bits (fractional part) + ivec m = _m & ieeeMantissa; // Keep only mantissa bits (fractional part) m = m | ieeeOne; // Add fractional part to 1.0 - const simd_fvec f = simd_cast(m); // Range [1:2] - return f - simd_fvec{1.0f}; // Range [0:1] + const fvec f = simd_cast(m); // Range [1:2] + return f - fvec{1.0f}; // Range [0:1] } force_inline float fast_log2(float val) { @@ -2003,34 +2003,34 @@ force_inline float fast_log2(float val) { return log_2; } 
-template force_inline simd_fvec fast_log2(const simd_fvec &val) { +template force_inline fvec fast_log2(const fvec &val) { // From https://stackoverflow.com/questions/9411823/fast-log2float-x-implementation-c union { - simd_fvec val; - simd_ivec x; + fvec val; + ivec x; } u = {val}; - simd_fvec log_2 = simd_fvec(((u.x >> 23) & 255) - 128); + fvec log_2 = fvec(((u.x >> 23) & 255) - 128); u.x &= ~(255 << 23); u.x += 127 << 23; log_2 += ((-0.34484843f) * u.val + 2.02466578f) * u.val - 0.67487759f; return log_2; } -template force_inline simd_fvec lum(const simd_fvec color[3]) { +template force_inline fvec lum(const fvec color[3]) { return 0.212671f * color[0] + 0.715160f * color[1] + 0.072169f * color[2]; } -template force_inline void srgb_to_rgb(const simd_fvec in_col[4], simd_fvec out_col[4]) { +template force_inline void srgb_to_rgb(const fvec in_col[4], fvec out_col[4]) { UNROLLED_FOR(i, 3, { out_col[i] = select(in_col[i] > 0.04045f, pow((in_col[i] + 0.055f) / 1.055f, 2.4f), in_col[i] / 12.92f); }) out_col[3] = in_col[3]; } -template force_inline void YCoCg_to_RGB(const simd_fvec in_col[4], simd_fvec out_col[3]) { - const simd_fvec scale = (in_col[2] * (255.0f / 8.0f)) + 1.0f; - const simd_fvec Y = in_col[3]; - const simd_fvec Co = (in_col[0] - (0.5f * 256.0f / 255.0f)) / scale; - const simd_fvec Cg = (in_col[1] - (0.5f * 256.0f / 255.0f)) / scale; +template force_inline void YCoCg_to_RGB(const fvec in_col[4], fvec out_col[3]) { + const fvec scale = (in_col[2] * (255.0f / 8.0f)) + 1.0f; + const fvec Y = in_col[3]; + const fvec Co = (in_col[0] - (0.5f * 256.0f / 255.0f)) / scale; + const fvec Cg = (in_col[1] - (0.5f * 256.0f / 255.0f)) / scale; out_col[0] = saturate(Y + Co - Cg); out_col[1] = saturate(Y + Cg); @@ -2038,22 +2038,22 @@ template force_inline void YCoCg_to_RGB(const simd_fvec in_col[4], si } template -simd_fvec get_texture_lod(const Cpu::TexStorageBase *textures[], const uint32_t index, const simd_fvec duv_dx[2], - const simd_fvec duv_dy[2], const 
simd_ivec &mask) { +fvec get_texture_lod(const Cpu::TexStorageBase *textures[], const uint32_t index, const fvec duv_dx[2], + const fvec duv_dy[2], const ivec &mask) { #if FORCE_TEXTURE_LOD - const simd_fvec lod = float(FORCE_TEXTURE_LOD); + const fvec lod = float(FORCE_TEXTURE_LOD); #else float sz[2]; textures[index >> 28]->GetFRes(int(index & 0x00ffffff), 0, sz); - const simd_fvec _duv_dx[2] = {duv_dx[0] * sz[0], duv_dx[1] * sz[1]}; - const simd_fvec _duv_dy[2] = {duv_dy[0] * sz[0], duv_dy[1] * sz[1]}; + const fvec _duv_dx[2] = {duv_dx[0] * sz[0], duv_dx[1] * sz[1]}; + const fvec _duv_dy[2] = {duv_dy[0] * sz[0], duv_dy[1] * sz[1]}; - const simd_fvec _diagonal[2] = {_duv_dx[0] + _duv_dy[0], _duv_dx[1] + _duv_dy[1]}; + const fvec _diagonal[2] = {_duv_dx[0] + _duv_dy[0], _duv_dx[1] + _duv_dy[1]}; - const simd_fvec dim = min(min(length2_2d(_duv_dx), length2_2d(_duv_dy)), length2_2d(_diagonal)); + const fvec dim = min(min(length2_2d(_duv_dx), length2_2d(_duv_dy)), length2_2d(_diagonal)); - simd_fvec lod = 0.5f * fast_log2(dim) - 1.0f; + fvec lod = 0.5f * fast_log2(dim) - 1.0f; where(lod < 0.0f, lod) = 0.0f; where(lod > float(MAX_MIP_LEVEL), lod) = float(MAX_MIP_LEVEL); @@ -2062,18 +2062,18 @@ simd_fvec get_texture_lod(const Cpu::TexStorageBase *textures[], const uint32 } template -simd_fvec get_texture_lod(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec &lambda, const simd_ivec &mask) { +fvec get_texture_lod(const Cpu::TexStorageBase *const textures[], const uint32_t index, + const fvec &lambda, const ivec &mask) { #if FORCE_TEXTURE_LOD - const simd_fvec lod = float(FORCE_TEXTURE_LOD); + const fvec lod = float(FORCE_TEXTURE_LOD); #else float sz[2]; textures[index >> 28]->GetFRes(int(index & 0x00ffffff), 0, sz); - simd_fvec lod = 0.0f; + fvec lod = 0.0f; UNROLLED_FOR_S(i, S, { - if (reinterpret_cast &>(mask).template get()) { + if (reinterpret_cast &>(mask).template get()) { lod.template set(lambda.template get() + 0.5f * 
fast_log2(sz[0] * sz[1]) - 1.0f); } }) @@ -2085,22 +2085,22 @@ simd_fvec get_texture_lod(const Cpu::TexStorageBase *const textures[], const } template -simd_fvec get_texture_lod(const simd_ivec &width, const simd_ivec &height, const simd_fvec duv_dx[2], - const simd_fvec duv_dy[2], const simd_ivec &mask) { +fvec get_texture_lod(const ivec &width, const ivec &height, const fvec duv_dx[2], + const fvec duv_dy[2], const ivec &mask) { #if FORCE_TEXTURE_LOD - const simd_fvec lod = float(FORCE_TEXTURE_LOD); + const fvec lod = float(FORCE_TEXTURE_LOD); #else - const simd_fvec _duv_dx[2] = {duv_dx[0] * simd_fvec(width), duv_dx[1] * simd_fvec(height)}; - const simd_fvec _duv_dy[2] = {duv_dy[0] * simd_fvec(width), duv_dy[1] * simd_fvec(height)}; + const fvec _duv_dx[2] = {duv_dx[0] * fvec(width), duv_dx[1] * fvec(height)}; + const fvec _duv_dy[2] = {duv_dy[0] * fvec(width), duv_dy[1] * fvec(height)}; - const simd_fvec _diagonal[2] = {_duv_dx[0] + _duv_dy[0], _duv_dx[1] + _duv_dy[1]}; + const fvec _diagonal[2] = {_duv_dx[0] + _duv_dy[0], _duv_dx[1] + _duv_dy[1]}; - const simd_fvec dim = min(min(length2_2d(_duv_dx), length2_2d(_duv_dy)), length2_2d(_diagonal)); + const fvec dim = min(min(length2_2d(_duv_dx), length2_2d(_duv_dy)), length2_2d(_diagonal)); - simd_fvec lod = 0.0f; + fvec lod = 0.0f; UNROLLED_FOR_S(i, S, { - if (reinterpret_cast &>(mask).template get()) { + if (reinterpret_cast &>(mask).template get()) { lod.template set(0.5f * fast_log2(dim.template get()) - 1.0f); } }) @@ -2112,15 +2112,15 @@ simd_fvec get_texture_lod(const simd_ivec &width, const simd_ivec &heig } template -simd_fvec get_texture_lod(const simd_ivec &width, const simd_ivec &height, const simd_fvec &lambda, - const simd_ivec &mask) { +fvec get_texture_lod(const ivec &width, const ivec &height, const fvec &lambda, + const ivec &mask) { #if FORCE_TEXTURE_LOD - const simd_fvec lod = float(FORCE_TEXTURE_LOD); + const fvec lod = float(FORCE_TEXTURE_LOD); #else - simd_fvec lod; + fvec lod; 
UNROLLED_FOR_S(i, S, { - if (reinterpret_cast &>(mask).template get()) { + if (reinterpret_cast &>(mask).template get()) { lod[i] = lambda.template get() + 0.5f * fast_log2(width * height) - 1.0f; } else { lod[i] = 0.0f; @@ -2133,26 +2133,26 @@ simd_fvec get_texture_lod(const simd_ivec &width, const simd_ivec &heig return lod; } -template force_inline simd_fvec conv_unorm_16(const simd_ivec &v) { return simd_fvec(v) / 65535.0f; } +template force_inline fvec conv_unorm_16(const ivec &v) { return fvec(v) / 65535.0f; } template -void FetchTransformAndRecalcBasis(const mesh_instance_t *sc_mesh_instances, const simd_ivec &mi_index, - const simd_fvec P_ls[3], simd_fvec inout_plane_N[3], simd_fvec inout_N[3], - simd_fvec inout_B[3], simd_fvec inout_T[3], simd_fvec inout_tangent[3], - simd_fvec inout_ro_ls[3], simd_fvec out_transform[16]) { +void FetchTransformAndRecalcBasis(const mesh_instance_t *sc_mesh_instances, const ivec &mi_index, + const fvec P_ls[3], fvec inout_plane_N[3], fvec inout_N[3], + fvec inout_B[3], fvec inout_T[3], fvec inout_tangent[3], + fvec inout_ro_ls[3], fvec out_transform[16]) { const float *transforms = &sc_mesh_instances[0].xform[0]; const float *inv_transforms = &sc_mesh_instances[0].inv_xform[0]; const int MeshInstancesStride = sizeof(mesh_instance_t) / sizeof(float); - simd_fvec inv_transform[16]; + fvec inv_transform[16]; UNROLLED_FOR(i, 16, { out_transform[i] = gather(transforms + i, mi_index * MeshInstancesStride); inv_transform[i] = gather(inv_transforms + i, mi_index * MeshInstancesStride); }) - simd_fvec temp[3]; + fvec temp[3]; cross(inout_tangent, inout_N, temp); - const simd_fvec mask = length2(temp) == 0.0f; + const fvec mask = length2(temp) == 0.0f; UNROLLED_FOR(i, 3, { where(mask, inout_tangent[i]) = P_ls[i]; }) TransformNormal(inv_transform, inout_plane_N); @@ -2164,17 +2164,17 @@ void FetchTransformAndRecalcBasis(const mesh_instance_t *sc_mesh_instances, cons } template -void FetchVertexAttribute3(const float *attribs, const 
simd_ivec vtx_indices[3], const simd_fvec &u, - const simd_fvec &v, const simd_fvec &w, simd_fvec out_A[3]) { +void FetchVertexAttribute3(const float *attribs, const ivec vtx_indices[3], const fvec &u, + const fvec &v, const fvec &w, fvec out_A[3]) { static const int VtxStride = sizeof(vertex_t) / sizeof(float); - const simd_fvec A1[3] = {gather(attribs + 0, vtx_indices[0] * VtxStride), + const fvec A1[3] = {gather(attribs + 0, vtx_indices[0] * VtxStride), gather(attribs + 1, vtx_indices[0] * VtxStride), gather(attribs + 2, vtx_indices[0] * VtxStride)}; - const simd_fvec A2[3] = {gather(attribs + 0, vtx_indices[1] * VtxStride), + const fvec A2[3] = {gather(attribs + 0, vtx_indices[1] * VtxStride), gather(attribs + 1, vtx_indices[1] * VtxStride), gather(attribs + 2, vtx_indices[1] * VtxStride)}; - const simd_fvec A3[3] = {gather(attribs + 0, vtx_indices[2] * VtxStride), + const fvec A3[3] = {gather(attribs + 0, vtx_indices[2] * VtxStride), gather(attribs + 1, vtx_indices[2] * VtxStride), gather(attribs + 2, vtx_indices[2] * VtxStride)}; @@ -2182,14 +2182,14 @@ void FetchVertexAttribute3(const float *attribs, const simd_ivec vtx_indices[ } template -void EnsureValidReflection(const simd_fvec Ng[3], const simd_fvec I[3], simd_fvec inout_N[3]) { - simd_fvec R[3]; +void EnsureValidReflection(const fvec Ng[3], const fvec I[3], fvec inout_N[3]) { + fvec R[3]; UNROLLED_FOR(i, 3, { R[i] = 2.0f * dot3(inout_N, I) * inout_N[i] - I[i]; }) // Reflection rays may always be at least as shallow as the incoming ray. - const simd_fvec threshold = min(0.9f * dot3(Ng, I), 0.01f); + const fvec threshold = min(0.9f * dot3(Ng, I), 0.01f); - const simd_ivec early_mask = simd_cast(dot3(Ng, R) < threshold); + const ivec early_mask = simd_cast(dot3(Ng, R) < threshold); if (early_mask.all_zeros()) { return; } @@ -2197,60 +2197,60 @@ void EnsureValidReflection(const simd_fvec Ng[3], const simd_fvec I[3], si // Form coordinate system with Ng as the Z axis and N inside the X-Z-plane. 
// The X axis is found by normalizing the component of N that's orthogonal to Ng. // The Y axis isn't actually needed. - const simd_fvec NdotNg = dot3(inout_N, Ng); + const fvec NdotNg = dot3(inout_N, Ng); - simd_fvec X[3]; + fvec X[3]; UNROLLED_FOR(i, 3, { X[i] = inout_N[i] - NdotNg * Ng[i]; }) safe_normalize(X); - const simd_fvec Ix = dot3(I, X), Iz = dot3(I, Ng); - const simd_fvec Ix2 = sqr(Ix), Iz2 = sqr(Iz); - const simd_fvec a = Ix2 + Iz2; + const fvec Ix = dot3(I, X), Iz = dot3(I, Ng); + const fvec Ix2 = sqr(Ix), Iz2 = sqr(Iz); + const fvec a = Ix2 + Iz2; - const simd_fvec b = safe_sqrtf(Ix2 * (a - (threshold * threshold))); - const simd_fvec c = Iz * threshold + a; + const fvec b = safe_sqrtf(Ix2 * (a - (threshold * threshold))); + const fvec c = Iz * threshold + a; // Evaluate both solutions. // In many cases one can be immediately discarded (if N'.z would be imaginary or larger than // one), so check for that first. If no option is viable (might happen in extreme cases like N // being in the wrong hemisphere), give up and return Ng. - const simd_fvec fac = safe_div(simd_fvec{0.5f}, a); - const simd_fvec N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c); + const fvec fac = safe_div(fvec{0.5f}, a); + const fvec N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c); - simd_ivec valid1 = simd_cast((N1_z2 > 1e-5f) & (N1_z2 <= (1.0f + 1e-5f))); - simd_ivec valid2 = simd_cast((N2_z2 > 1e-5f) & (N2_z2 <= (1.0f + 1e-5f))); + ivec valid1 = simd_cast((N1_z2 > 1e-5f) & (N1_z2 <= (1.0f + 1e-5f))); + ivec valid2 = simd_cast((N2_z2 > 1e-5f) & (N2_z2 <= (1.0f + 1e-5f))); - simd_fvec N_new[2]; + fvec N_new[2]; if ((valid1 & valid2).not_all_zeros()) { // If both are possible, do the expensive reflection-based check. 
- const simd_fvec N1[2] = {safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2)}; - const simd_fvec N2[2] = {safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2)}; + const fvec N1[2] = {safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2)}; + const fvec N2[2] = {safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2)}; - const simd_fvec R1 = 2 * (N1[0] * Ix + N1[1] * Iz) * N1[1] - Iz; - const simd_fvec R2 = 2 * (N2[0] * Ix + N2[1] * Iz) * N2[1] - Iz; + const fvec R1 = 2 * (N1[0] * Ix + N1[1] * Iz) * N1[1] - Iz; + const fvec R2 = 2 * (N2[0] * Ix + N2[1] * Iz) * N2[1] - Iz; valid1 = simd_cast(R1 >= 1e-5f); valid2 = simd_cast(R2 >= 1e-5f); - const simd_ivec mask = valid1 & valid2; + const ivec mask = valid1 & valid2; - const simd_ivec mask1 = mask & simd_cast(R1 < R2); + const ivec mask1 = mask & simd_cast(R1 < R2); UNROLLED_FOR(i, 2, { where(mask1, N_new[i]) = N1[i]; }) - const simd_ivec mask2 = mask & ~simd_cast(R1 < R2); + const ivec mask2 = mask & ~simd_cast(R1 < R2); UNROLLED_FOR(i, 2, { where(mask2, N_new[i]) = N2[i]; }) - const simd_ivec mask3 = ~mask & simd_cast(R1 > R2); + const ivec mask3 = ~mask & simd_cast(R1 > R2); UNROLLED_FOR(i, 2, { where(mask3, N_new[i]) = N1[i]; }) - const simd_ivec mask4 = ~mask & ~simd_cast(R1 > R2); + const ivec mask4 = ~mask & ~simd_cast(R1 > R2); UNROLLED_FOR(i, 2, { where(mask4, N_new[i]) = N2[i]; }) } if ((valid1 | valid2).not_all_zeros()) { - const simd_ivec exclude = ~(valid1 & valid2); + const ivec exclude = ~(valid1 & valid2); // Only one solution passes the N'.z criterium, so pick that one. 
- const simd_fvec Nz2 = select(valid1, N1_z2, N2_z2); + const fvec Nz2 = select(valid1, N1_z2, N2_z2); where(exclude & (valid1 | valid2), N_new[0]) = safe_sqrtf(1.0f - Nz2); where(exclude & (valid1 | valid2), N_new[1]) = safe_sqrtf(Nz2); @@ -2263,36 +2263,36 @@ void EnsureValidReflection(const simd_fvec Ng[3], const simd_fvec I[3], si } template -force_inline void world_from_tangent(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec V[3], simd_fvec out_V[3]) { +force_inline void world_from_tangent(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec V[3], fvec out_V[3]) { UNROLLED_FOR(i, 3, { out_V[i] = V[0] * T[i] + V[1] * B[i] + V[2] * N[i]; }) } template -force_inline void tangent_from_world(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec V[3], simd_fvec out_V[3]) { +force_inline void tangent_from_world(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec V[3], fvec out_V[3]) { out_V[0] = dot3(V, T); out_V[1] = dot3(V, B); out_V[2] = dot3(V, N); } -template force_inline simd_fvec cos(const simd_fvec &v) { - simd_fvec ret; +template force_inline fvec cos(const fvec &v) { + fvec ret; UNROLLED_FOR_S(i, S, { ret.template set(cosf(v.template get())); }) return ret; } -template force_inline simd_fvec sin(const simd_fvec &v) { - simd_fvec ret; +template force_inline fvec sin(const fvec &v) { + fvec ret; UNROLLED_FOR_S(i, S, { ret.template set(sinf(v.template get())); }) return ret; } template -force_inline void calc_alpha(const simd_fvec &roughness, const simd_fvec &anisotropy, - const simd_fvec ®ularize_alpha, simd_fvec out_alpha[2]) { - const simd_fvec roughness2 = sqr(roughness); - const simd_fvec aspect = sqrt(1.0f - 0.9f * anisotropy); +force_inline void calc_alpha(const fvec &roughness, const fvec &anisotropy, + const fvec ®ularize_alpha, fvec out_alpha[2]) { + const fvec roughness2 = sqr(roughness); + const fvec aspect = sqrt(1.0f - 0.9f * anisotropy); out_alpha[0] = 
(roughness2 / aspect); out_alpha[1] = (roughness2 * aspect); @@ -2306,46 +2306,46 @@ force_inline void calc_alpha(const simd_fvec &roughness, const simd_fvec & // // From "A Fast and Robust Method for Avoiding Self-Intersection" // -template void offset_ray(const simd_fvec p[3], const simd_fvec n[3], simd_fvec out_p[3]) { +template void offset_ray(const fvec p[3], const fvec n[3], fvec out_p[3]) { static const float Origin = 1.0f / 32.0f; static const float FloatScale = 1.0f / 65536.0f; static const float IntScale = 128.0f; // 256.0f; - simd_ivec of_i[3] = {simd_ivec{IntScale * n[0]}, simd_ivec{IntScale * n[1]}, - simd_ivec{IntScale * n[2]}}; + ivec of_i[3] = {ivec{IntScale * n[0]}, ivec{IntScale * n[1]}, + ivec{IntScale * n[2]}}; UNROLLED_FOR(i, 3, { where(p[i] < 0.0f, of_i[i]) = -of_i[i]; }) - const simd_fvec p_i[3] = {simd_cast(simd_cast(p[0]) + of_i[0]), simd_cast(simd_cast(p[1]) + of_i[1]), + const fvec p_i[3] = {simd_cast(simd_cast(p[0]) + of_i[0]), simd_cast(simd_cast(p[1]) + of_i[1]), simd_cast(simd_cast(p[2]) + of_i[2])}; UNROLLED_FOR(i, 3, { out_p[i] = p_i[i]; - where(abs(p[i]) < Origin, out_p[i]) = fmadd(simd_fvec{FloatScale}, n[i], p[i]); + where(abs(p[i]) < Origin, out_p[i]) = fmadd(fvec{FloatScale}, n[i], p[i]); }) } // http://jcgt.org/published/0007/04/01/paper.pdf template -void SampleVNDF_Hemisphere_CrossSect(const simd_fvec Vh[3], const simd_fvec &U1, const simd_fvec &U2, - simd_fvec out_Nh[3]) { +void SampleVNDF_Hemisphere_CrossSect(const fvec Vh[3], const fvec &U1, const fvec &U2, + fvec out_Nh[3]) { // orthonormal basis (with special case if cross product is zero) - const simd_fvec lensq = Vh[0] * Vh[0] + Vh[1] * Vh[1]; + const fvec lensq = Vh[0] * Vh[0] + Vh[1] * Vh[1]; - simd_fvec T1[3] = {{1.0f}, {0.0f}, {0.0f}}; - const simd_fvec denom = safe_sqrt(lensq); + fvec T1[3] = {{1.0f}, {0.0f}, {0.0f}}; + const fvec denom = safe_sqrt(lensq); where(lensq > 0.0f, T1[0]) = -safe_div_pos(Vh[1], denom); where(lensq > 0.0f, T1[1]) = safe_div_pos(Vh[0], 
denom); - simd_fvec T2[3]; + fvec T2[3]; cross(Vh, T1, T2); // parameterization of the projected area - const simd_fvec r = sqrt(U1); - const simd_fvec phi = 2.0f * PI * U2; - simd_fvec t1; + const fvec r = sqrt(U1); + const fvec phi = 2.0f * PI * U2; + fvec t1; UNROLLED_FOR_S(i, S, { t1.template set(r.template get() * cosf(phi.template get())); }) - simd_fvec t2; + fvec t2; UNROLLED_FOR_S(i, S, { t2.template set(r.template get() * sinf(phi.template get())); }) - const simd_fvec s = 0.5f * (1.0f + Vh[2]); + const fvec s = 0.5f * (1.0f + Vh[2]); t2 = (1.0f - s) * sqrt(1.0f - t1 * t1) + s * t2; // reprojection onto hemisphere UNROLLED_FOR(i, 3, { out_Nh[i] = t1 * T1[i] + t2 * T2[i] + sqrt(max(0.0f, 1.0f - t1 * t1 - t2 * t2)) * Vh[i]; }) @@ -2353,10 +2353,10 @@ void SampleVNDF_Hemisphere_CrossSect(const simd_fvec Vh[3], const simd_fvec -void SampleVNDF_Hemisphere_SphCap(const simd_fvec Vh[3], const simd_fvec rand[2], simd_fvec out_Nh[3]) { - const simd_fvec phi = 2.0f * PI * rand[0]; - const simd_fvec z = fmadd(1.0f - rand[1], 1.0f + Vh[2], -Vh[2]); - const simd_fvec sin_theta = sqrt(saturate(1.0f - z * z)); +void SampleVNDF_Hemisphere_SphCap(const fvec Vh[3], const fvec rand[2], fvec out_Nh[3]) { + const fvec phi = 2.0f * PI * rand[0]; + const fvec z = fmadd(1.0f - rand[1], 1.0f + Vh[2], -Vh[2]); + const fvec sin_theta = sqrt(saturate(1.0f - z * z)); out_Nh[0] = Vh[0] + sin_theta * cos(phi); out_Nh[1] = Vh[1] + sin_theta * sin(phi); out_Nh[2] = Vh[2] + z; @@ -2364,20 +2364,20 @@ void SampleVNDF_Hemisphere_SphCap(const simd_fvec Vh[3], const simd_fvec r // https://gpuopen.com/download/publications/Bounded_VNDF_Sampling_for_Smith-GGX_Reflections.pdf template -void SampleVNDF_Hemisphere_SphCap_Bounded(const simd_fvec Ve[3], const simd_fvec Vh[3], - const simd_fvec alpha[2], const simd_fvec rand[2], - simd_fvec out_Nh[3]) { +void SampleVNDF_Hemisphere_SphCap_Bounded(const fvec Ve[3], const fvec Vh[3], + const fvec alpha[2], const fvec rand[2], + fvec out_Nh[3]) { // 
sample a spherical cap in (-Vh.z, 1] - const simd_fvec phi = 2.0f * PI * rand[0]; - const simd_fvec a = saturate(min(alpha[0], alpha[1])); - const simd_fvec s = 1.0f + sqrt(Ve[0] * Ve[0] + Ve[1] * Ve[1]); - const simd_fvec a2 = a * a, s2 = s * s; - const simd_fvec k = (1.0f - a2) * s2 / (s2 + a2 * Ve[2] * Ve[2]); - const simd_fvec b = select(Ve[2] > 0.0f, k * Vh[2], Vh[2]); - const simd_fvec z = fmadd(1.0f - rand[1], 1.0f + b, -b); - const simd_fvec sin_theta = sqrt(saturate(1.0f - z * z)); - const simd_fvec x = sin_theta * cos(phi); - const simd_fvec y = sin_theta * sin(phi); + const fvec phi = 2.0f * PI * rand[0]; + const fvec a = saturate(min(alpha[0], alpha[1])); + const fvec s = 1.0f + sqrt(Ve[0] * Ve[0] + Ve[1] * Ve[1]); + const fvec a2 = a * a, s2 = s * s; + const fvec k = (1.0f - a2) * s2 / (s2 + a2 * Ve[2] * Ve[2]); + const fvec b = select(Ve[2] > 0.0f, k * Vh[2], Vh[2]); + const fvec z = fmadd(1.0f - rand[1], 1.0f + b, -b); + const fvec sin_theta = sqrt(saturate(1.0f - z * z)); + const fvec x = sin_theta * cos(phi); + const fvec y = sin_theta * sin(phi); out_Nh[0] = x + Vh[0]; out_Nh[1] = y + Vh[1]; out_Nh[2] = z + Vh[2]; @@ -2388,13 +2388,13 @@ void SampleVNDF_Hemisphere_SphCap_Bounded(const simd_fvec Ve[3], const simd_f // Input U1, U2: uniform random numbers // Output Ne: normal sampled with PDF D_Ve(Ne) = G1(Ve) * max(0, dot(Ve, Ne)) * D(Ne) / Ve.z template -void SampleGGX_VNDF(const simd_fvec Ve[3], const simd_fvec alpha[2], const simd_fvec rand[2], - simd_fvec out_V[3]) { +void SampleGGX_VNDF(const fvec Ve[3], const fvec alpha[2], const fvec rand[2], + fvec out_V[3]) { // transforming the view direction to the hemisphere configuration - simd_fvec Vh[3] = {alpha[0] * Ve[0], alpha[1] * Ve[1], Ve[2]}; + fvec Vh[3] = {alpha[0] * Ve[0], alpha[1] * Ve[1], Ve[2]}; safe_normalize(Vh); // sample the hemisphere - simd_fvec Nh[3]; + fvec Nh[3]; SampleVNDF_Hemisphere_SphCap(Vh, rand, Nh); // transforming the normal back to the ellipsoid configuration out_V[0] = 
alpha[0] * Nh[0]; @@ -2404,13 +2404,13 @@ void SampleGGX_VNDF(const simd_fvec Ve[3], const simd_fvec alpha[2], const } template -void SampleGGX_VNDF_Bounded(const simd_fvec Ve[3], const simd_fvec alpha[2], const simd_fvec rand[2], - simd_fvec out_V[3]) { +void SampleGGX_VNDF_Bounded(const fvec Ve[3], const fvec alpha[2], const fvec rand[2], + fvec out_V[3]) { // transforming the view direction to the hemisphere configuration - simd_fvec Vh[3] = {alpha[0] * Ve[0], alpha[1] * Ve[1], Ve[2]}; + fvec Vh[3] = {alpha[0] * Ve[0], alpha[1] * Ve[1], Ve[2]}; safe_normalize(Vh); // sample the hemisphere - simd_fvec Nh[3]; + fvec Nh[3]; SampleVNDF_Hemisphere_SphCap_Bounded(Ve, Vh, alpha, rand, Nh); // transforming the normal back to the ellipsoid configuration out_V[0] = alpha[0] * Nh[0]; @@ -2420,49 +2420,49 @@ void SampleGGX_VNDF_Bounded(const simd_fvec Ve[3], const simd_fvec alpha[2 } template -simd_fvec GGX_VNDF_Reflection_Bounded_PDF(const simd_fvec &D, const simd_fvec view_dir_ts[3], - const simd_fvec alpha[2]) { - const simd_fvec ai[2] = {alpha[0] * view_dir_ts[0], alpha[1] * view_dir_ts[1]}; - const simd_fvec len2 = ai[0] * ai[0] + ai[1] * ai[1]; - const simd_fvec t = sqrt(len2 + view_dir_ts[2] * view_dir_ts[2]); +fvec GGX_VNDF_Reflection_Bounded_PDF(const fvec &D, const fvec view_dir_ts[3], + const fvec alpha[2]) { + const fvec ai[2] = {alpha[0] * view_dir_ts[0], alpha[1] * view_dir_ts[1]}; + const fvec len2 = ai[0] * ai[0] + ai[1] * ai[1]; + const fvec t = sqrt(len2 + view_dir_ts[2] * view_dir_ts[2]); - simd_fvec ret = D * safe_div_pos(t - view_dir_ts[2], 2.0f * len2); + fvec ret = D * safe_div_pos(t - view_dir_ts[2], 2.0f * len2); - const simd_fvec a = saturate(min(alpha[0], alpha[1])); - const simd_fvec s = 1.0f + sqrt(view_dir_ts[0] * view_dir_ts[0] + view_dir_ts[1] * view_dir_ts[1]); - const simd_fvec a2 = a * a, s2 = s * s; - const simd_fvec k = (1.0f - a2) * s2 / (s2 + a2 * view_dir_ts[2] * view_dir_ts[2]); + const fvec a = saturate(min(alpha[0], alpha[1])); + 
const fvec s = 1.0f + sqrt(view_dir_ts[0] * view_dir_ts[0] + view_dir_ts[1] * view_dir_ts[1]); + const fvec a2 = a * a, s2 = s * s; + const fvec k = (1.0f - a2) * s2 / (s2 + a2 * view_dir_ts[2] * view_dir_ts[2]); where(view_dir_ts[2] >= 0.0f, ret) = safe_div(D, 2.0f * (k * view_dir_ts[2] + t)); return ret; } // Smith shadowing function -template force_inline simd_fvec G1(const simd_fvec Ve[3], simd_fvec alpha_x, simd_fvec alpha_y) { +template force_inline fvec G1(const fvec Ve[3], fvec alpha_x, fvec alpha_y) { alpha_x *= alpha_x; alpha_y *= alpha_y; - const simd_fvec delta = + const fvec delta = (-1.0f + safe_sqrt(1.0f + safe_div_pos(alpha_x * Ve[0] * Ve[0] + alpha_y * Ve[1] * Ve[1], Ve[2] * Ve[2]))) / 2.0f; return 1.0f / (1.0f + delta); } -template simd_fvec D_GTR1(const simd_fvec &NDotH, const simd_fvec &a) { - const simd_fvec a2 = sqr(a); - const simd_fvec t = 1.0f + (a2 - 1.0f) * NDotH * NDotH; - return select(a < 1.0f, safe_div(a2 - 1.0f, PI * log(a2) * t), simd_fvec{1.0f / PI}); +template fvec D_GTR1(const fvec &NDotH, const fvec &a) { + const fvec a2 = sqr(a); + const fvec t = 1.0f + (a2 - 1.0f) * NDotH * NDotH; + return select(a < 1.0f, safe_div(a2 - 1.0f, PI * log(a2) * t), fvec{1.0f / PI}); } -template simd_fvec D_GGX(const simd_fvec H[3], const simd_fvec &alpha_x, const simd_fvec &alpha_y) { - const simd_fvec sx = -safe_div(H[0], H[2] * alpha_x); - const simd_fvec sy = -safe_div(H[1], H[2] * alpha_y); - const simd_fvec s1 = 1.0f + sx * sx + sy * sy; - const simd_fvec cos_theta_h4 = H[2] * H[2] * H[2] * H[2]; - return select(H[2] != 0.0f, safe_inv_pos((s1 * s1) * PI * alpha_x * alpha_y * cos_theta_h4), simd_fvec{0.0f}); +template fvec D_GGX(const fvec H[3], const fvec &alpha_x, const fvec &alpha_y) { + const fvec sx = -safe_div(H[0], H[2] * alpha_x); + const fvec sy = -safe_div(H[1], H[2] * alpha_y); + const fvec s1 = 1.0f + sx * sx + sy * sy; + const fvec cos_theta_h4 = H[2] * H[2] * H[2] * H[2]; + return select(H[2] != 0.0f, safe_inv_pos((s1 * s1) * PI 
* alpha_x * alpha_y * cos_theta_h4), fvec{0.0f}); } -template void create_tbn(const simd_fvec N[3], simd_fvec out_T[3], simd_fvec out_B[3]) { - simd_fvec U[3] = {1.0f, 0.0f, 0.0f}; +template void create_tbn(const fvec N[3], fvec out_T[3], fvec out_B[3]) { + fvec U[3] = {1.0f, 0.0f, 0.0f}; where(N[1] < 0.999f, U[0]) = 0.0f; where(N[1] < 0.999f, U[1]) = 1.0f; @@ -2473,42 +2473,42 @@ template void create_tbn(const simd_fvec N[3], simd_fvec out_T[3], } template -void map_to_cone(const simd_fvec &r1, const simd_fvec &r2, const simd_fvec N[3], float radius, - simd_fvec out_V[3]) { - const simd_fvec offset[2] = {2.0f * r1 - 1.0f, 2.0f * r2 - 1.0f}; +void map_to_cone(const fvec &r1, const fvec &r2, const fvec N[3], float radius, + fvec out_V[3]) { + const fvec offset[2] = {2.0f * r1 - 1.0f, 2.0f * r2 - 1.0f}; UNROLLED_FOR(i, 3, { out_V[i] = N[i]; }) - simd_fvec r = offset[1]; - simd_fvec theta = 0.5f * PI * (1.0f - 0.5f * safe_div(offset[0], offset[1])); + fvec r = offset[1]; + fvec theta = 0.5f * PI * (1.0f - 0.5f * safe_div(offset[0], offset[1])); where(abs(offset[0]) > abs(offset[1]), r) = offset[0]; where(abs(offset[0]) > abs(offset[1]), theta) = 0.25f * PI * safe_div(offset[1], offset[0]); - const simd_fvec uv[2] = {radius * r * cos(theta), radius * r * sin(theta)}; + const fvec uv[2] = {radius * r * cos(theta), radius * r * sin(theta)}; - simd_fvec LT[3], LB[3]; + fvec LT[3], LB[3]; create_tbn(N, LT, LB); UNROLLED_FOR(i, 3, { out_V[i] = N[i] + uv[0] * LT[i] + uv[1] * LB[i]; }) - const simd_fvec mask = (offset[0] == 0.0f & offset[1] == 0.0f); + const fvec mask = (offset[0] == 0.0f & offset[1] == 0.0f); UNROLLED_FOR(i, 3, { where(mask, out_V[i]) = N[i]; }) } template -force_inline simd_fvec sphere_intersection(const float center[3], const float radius, const simd_fvec ro[3], - const simd_fvec rd[3]) { - const simd_fvec oc[3] = {ro[0] - center[0], ro[1] - center[1], ro[2] - center[2]}; - const simd_fvec a = dot3(rd, rd); - const simd_fvec b = 2 * dot3(oc, rd); - const 
simd_fvec c = dot3(oc, oc) - radius * radius; - const simd_fvec discriminant = b * b - 4 * a * c; +force_inline fvec sphere_intersection(const float center[3], const float radius, const fvec ro[3], + const fvec rd[3]) { + const fvec oc[3] = {ro[0] - center[0], ro[1] - center[1], ro[2] - center[2]}; + const fvec a = dot3(rd, rd); + const fvec b = 2 * dot3(oc, rd); + const fvec c = dot3(oc, oc) - radius * radius; + const fvec discriminant = b * b - 4 * a * c; return (-b - sqrt(max(discriminant, 0.0f))) / (2 * a); } -template force_inline simd_fvec schlick_weight(const simd_fvec &u) { - const simd_fvec m = saturate(1.0f - u); +template force_inline fvec schlick_weight(const fvec &u) { + const fvec m = saturate(1.0f - u); return pow5(m); } @@ -2530,26 +2530,26 @@ force_inline float fresnel_dielectric_cos(float cosi, float eta) { return result; } -template simd_fvec fresnel_dielectric_cos(const simd_fvec &cosi, const simd_fvec &eta) { +template fvec fresnel_dielectric_cos(const fvec &cosi, const fvec &eta) { // compute fresnel reflectance without explicitly computing the refracted direction - simd_fvec c = abs(cosi); - simd_fvec g = eta * eta - 1 + c * c; - const simd_fvec mask = (g > 0.0f); + fvec c = abs(cosi); + fvec g = eta * eta - 1 + c * c; + const fvec mask = (g > 0.0f); g = safe_sqrt(g); - const simd_fvec A = safe_div(g - c, g + c); - const simd_fvec B = safe_div(c * (g + c) - 1, c * (g - c) + 1); + const fvec A = safe_div(g - c, g + c); + const fvec B = safe_div(c * (g + c) - 1, c * (g - c) + 1); - return select(mask, 0.5f * A * A * (1 + B * B), simd_fvec{1.0f} /* TIR (no refracted component) */); + return select(mask, 0.5f * A * A * (1 + B * B), fvec{1.0f} /* TIR (no refracted component) */); } template -void get_lobe_weights(const simd_fvec &base_color_lum, const simd_fvec &spec_color_lum, - const simd_fvec &specular, const simd_fvec &metallic, const float transmission, +void get_lobe_weights(const fvec &base_color_lum, const fvec &spec_color_lum, + const 
fvec &specular, const fvec &metallic, const float transmission, const float clearcoat, lobe_weights_t &out_weights) { // taken from Cycles out_weights.diffuse = base_color_lum * (1.0f - metallic) * (1.0f - transmission); - const simd_fvec final_transmission = transmission * (1.0f - metallic); + const fvec final_transmission = transmission * (1.0f - metallic); //(*out_specular_weight) = // (specular != 0.0f || metallic != 0.0f) ? spec_color_lum * (1.0f - final_transmission) : 0.0f; out_weights.specular = 0.0f; @@ -2560,7 +2560,7 @@ void get_lobe_weights(const simd_fvec &base_color_lum, const simd_fvec &sp out_weights.clearcoat = 0.25f * clearcoat * (1.0f - metallic); out_weights.refraction = final_transmission * base_color_lum; - const simd_fvec total_weight = + const fvec total_weight = out_weights.diffuse + out_weights.specular + out_weights.clearcoat + out_weights.refraction; where(total_weight != 0.0f, out_weights.diffuse) = safe_div_pos(out_weights.diffuse, total_weight); @@ -2569,24 +2569,24 @@ void get_lobe_weights(const simd_fvec &base_color_lum, const simd_fvec &sp where(total_weight != 0.0f, out_weights.refraction) = safe_div_pos(out_weights.refraction, total_weight); } -template force_inline simd_fvec power_heuristic(const simd_fvec &a, const simd_fvec &b) { - const simd_fvec t = a * a; +template force_inline fvec power_heuristic(const fvec &a, const fvec &b) { + const fvec t = a * a; return safe_div_pos(t, b * b + t); } template -force_inline simd_ivec quadratic(const simd_fvec &a, const simd_fvec &b, const simd_fvec &c, - simd_fvec &t0, simd_fvec &t1) { - const simd_fvec d = b * b - 4.0f * a * c; - const simd_fvec sqrt_d = safe_sqrt(d); - const simd_fvec q = select(b < 0.0f, -0.5f * (b - sqrt_d), -0.5f * (b + sqrt_d)); +force_inline ivec quadratic(const fvec &a, const fvec &b, const fvec &c, + fvec &t0, fvec &t1) { + const fvec d = b * b - 4.0f * a * c; + const fvec sqrt_d = safe_sqrt(d); + const fvec q = select(b < 0.0f, -0.5f * (b - sqrt_d), -0.5f * 
(b + sqrt_d)); t0 = safe_div(q, a); t1 = safe_div(c, q); return simd_cast(d >= 0.0f); } -template force_inline simd_fvec ngon_rad(const simd_fvec &theta, const float n) { - simd_fvec ret; +template force_inline fvec ngon_rad(const fvec &theta, const float n) { + fvec ret; UNROLLED_FOR_S(i, S, { ret.template set( cosf(PI / n) / @@ -2597,9 +2597,9 @@ template force_inline simd_fvec ngon_rad(const simd_fvec &theta, c template void get_pix_dirs(const float w, const float h, const camera_t &cam, const float k, const float fov_k, - const simd_fvec &x, const simd_fvec &y, const simd_fvec origin[3], simd_fvec d[3]) { - simd_fvec _dx = 2 * fov_k * (x / w + cam.shift[0] / k) - fov_k; - simd_fvec _dy = 2 * fov_k * (-y / h + cam.shift[1]) + fov_k; + const fvec &x, const fvec &y, const fvec origin[3], fvec d[3]) { + fvec _dx = 2 * fov_k * (x / w + cam.shift[0] / k) - fov_k; + fvec _dy = 2 * fov_k * (-y / h + cam.shift[1]) + fov_k; d[0] = cam.origin[0] + k * _dx * cam.side[0] + _dy * cam.up[0] + cam.fwd[0] * cam.focus_distance; d[1] = cam.origin[1] + k * _dx * cam.side[1] + _dy * cam.up[1] + cam.fwd[1] * cam.focus_distance; @@ -2612,10 +2612,10 @@ void get_pix_dirs(const float w, const float h, const camera_t &cam, const float normalize(d); } -template void push_ior_stack(const simd_ivec &_mask, simd_fvec stack[4], const simd_fvec &val) { - simd_fvec active_lanes = simd_cast(_mask); +template void push_ior_stack(const ivec &_mask, fvec stack[4], const fvec &val) { + fvec active_lanes = simd_cast(_mask); // 0 - simd_fvec mask = active_lanes & (stack[0] < 0.0f); + fvec mask = active_lanes & (stack[0] < 0.0f); where(mask, stack[0]) = val; active_lanes &= ~mask; // 1 @@ -2632,12 +2632,12 @@ template void push_ior_stack(const simd_ivec &_mask, simd_fvec sta } template -simd_fvec pop_ior_stack(const simd_ivec &_mask, simd_fvec stack[4], - const simd_fvec &default_value = {1.0f}) { - simd_fvec ret = default_value; - simd_fvec active_lanes = simd_cast(_mask); +fvec pop_ior_stack(const 
ivec &_mask, fvec stack[4], + const fvec &default_value = {1.0f}) { + fvec ret = default_value; + fvec active_lanes = simd_cast(_mask); // 3 - simd_fvec mask = active_lanes & (stack[3] > 0.0f); + fvec mask = active_lanes & (stack[3] > 0.0f); where(mask, ret) = stack[3]; where(mask, stack[3]) = -1.0f; active_lanes &= ~mask; @@ -2660,15 +2660,15 @@ simd_fvec pop_ior_stack(const simd_ivec &_mask, simd_fvec stack[4], } template -simd_fvec peek_ior_stack(const simd_fvec stack[4], const simd_ivec &_skip_first, - const simd_fvec &default_value = {1.0f}) { - simd_fvec ret = default_value; - simd_fvec skip_first = simd_cast(_skip_first); +fvec peek_ior_stack(const fvec stack[4], const ivec &_skip_first, + const fvec &default_value = {1.0f}) { + fvec ret = default_value; + fvec skip_first = simd_cast(_skip_first); // 3 - simd_fvec mask = (stack[3] > 0.0f); + fvec mask = (stack[3] > 0.0f); mask &= ~std::exchange(skip_first, skip_first & ~mask); where(mask, ret) = stack[3]; - simd_fvec active_lanes = ~mask; + fvec active_lanes = ~mask; // 2 mask = active_lanes & (stack[2] > 0.0f); mask &= ~std::exchange(skip_first, skip_first & ~mask); @@ -2687,8 +2687,8 @@ simd_fvec peek_ior_stack(const simd_fvec stack[4], const simd_ivec &_sk return ret; } -template simd_fvec approx_atan2(const simd_fvec &y, const simd_fvec &x) { - simd_fvec t0, t1, t3, t4; +template fvec approx_atan2(const fvec &y, const fvec &x) { + fvec t0, t1, t3, t4; t3 = abs(x); t1 = abs(y); @@ -2713,7 +2713,7 @@ template simd_fvec approx_atan2(const simd_fvec &y, const simd_fve return t3; } -template force_inline simd_fvec approx_cos(simd_fvec x) { // max error is 0.056010f +template force_inline fvec approx_cos(fvec x) { // max error is 0.056010f const float tp = 1.0f / (2.0f * PI); x *= tp; x -= 0.25f + floor(x + 0.25f); @@ -2721,10 +2721,10 @@ template force_inline simd_fvec approx_cos(simd_fvec x) { // max e return x; } -template force_inline simd_fvec approx_acos(simd_fvec x) { // max error is 0.000068f - const 
simd_fvec negate = select(x < 0.0f, simd_fvec{1.0f}, simd_fvec{0.0f}); +template force_inline fvec approx_acos(fvec x) { // max error is 0.000068f + const fvec negate = select(x < 0.0f, fvec{1.0f}, fvec{0.0f}); x = abs(x); - simd_fvec ret = -0.0187293f; + fvec ret = -0.0187293f; ret = ret * x; ret = ret + 0.0742610f; ret = ret * x; @@ -2738,50 +2738,50 @@ template force_inline simd_fvec approx_acos(simd_fvec x) { // max template void calc_lnode_importance(const light_wbvh_node_t &n, const float P[3], float importance[8]) { for (int i = 0; i < 8; i += S) { - simd_fvec mul = 1.0f, v_len2 = 1.0f; + fvec mul = 1.0f, v_len2 = 1.0f; - const simd_ivec mask = simd_cast(simd_fvec{&n.bbox_min[0][i], simd_mem_aligned} > -MAX_DIST); + const ivec mask = simd_cast(fvec{&n.bbox_min[0][i], vector_aligned} > -MAX_DIST); if (mask.not_all_zeros()) { - simd_fvec v[3] = {P[0] - 0.5f * (simd_fvec{&n.bbox_min[0][i], simd_mem_aligned} + - simd_fvec{&n.bbox_max[0][i], simd_mem_aligned}), - P[1] - 0.5f * (simd_fvec{&n.bbox_min[1][i], simd_mem_aligned} + - simd_fvec{&n.bbox_max[1][i], simd_mem_aligned}), - P[2] - 0.5f * (simd_fvec{&n.bbox_min[2][i], simd_mem_aligned} + - simd_fvec{&n.bbox_max[2][i], simd_mem_aligned})}; - const simd_fvec ext[3] = { - simd_fvec{&n.bbox_max[0][i], simd_mem_aligned} - simd_fvec{&n.bbox_min[0][i], simd_mem_aligned}, - simd_fvec{&n.bbox_max[1][i], simd_mem_aligned} - simd_fvec{&n.bbox_min[1][i], simd_mem_aligned}, - simd_fvec{&n.bbox_max[2][i], simd_mem_aligned} - simd_fvec{&n.bbox_min[2][i], simd_mem_aligned}}; - - const simd_fvec extent = 0.5f * length(ext); + fvec v[3] = {P[0] - 0.5f * (fvec{&n.bbox_min[0][i], vector_aligned} + + fvec{&n.bbox_max[0][i], vector_aligned}), + P[1] - 0.5f * (fvec{&n.bbox_min[1][i], vector_aligned} + + fvec{&n.bbox_max[1][i], vector_aligned}), + P[2] - 0.5f * (fvec{&n.bbox_min[2][i], vector_aligned} + + fvec{&n.bbox_max[2][i], vector_aligned})}; + const fvec ext[3] = { + fvec{&n.bbox_max[0][i], vector_aligned} - 
fvec{&n.bbox_min[0][i], vector_aligned}, + fvec{&n.bbox_max[1][i], vector_aligned} - fvec{&n.bbox_min[1][i], vector_aligned}, + fvec{&n.bbox_max[2][i], vector_aligned} - fvec{&n.bbox_min[2][i], vector_aligned}}; + + const fvec extent = 0.5f * length(ext); where(mask, v_len2) = length2(v); - const simd_fvec v_len = sqrt(v_len2); - const simd_fvec omega_u = approx_atan2(extent, v_len) + 0.000005f; + const fvec v_len = sqrt(v_len2); + const fvec omega_u = approx_atan2(extent, v_len) + 0.000005f; - const simd_fvec axis[3] = {simd_fvec{&n.axis[0][i], simd_mem_aligned}, - simd_fvec{&n.axis[1][i], simd_mem_aligned}, - simd_fvec{&n.axis[2][i], simd_mem_aligned}}; + const fvec axis[3] = {fvec{&n.axis[0][i], vector_aligned}, + fvec{&n.axis[1][i], vector_aligned}, + fvec{&n.axis[2][i], vector_aligned}}; UNROLLED_FOR(j, 3, { v[j] /= v_len; }) - const simd_fvec omega = approx_acos(min(dot3(axis, v), 1.0f)) - 0.00007f; - const simd_fvec omega_ = max(0.0f, omega - simd_fvec{&n.omega_n[i], simd_mem_aligned} - omega_u); + const fvec omega = approx_acos(min(dot3(axis, v), 1.0f)) - 0.00007f; + const fvec omega_ = max(0.0f, omega - fvec{&n.omega_n[i], vector_aligned} - omega_u); where(mask, mul) = 0.0f; - where(mask & simd_cast(omega_ < simd_fvec{&n.omega_e[i], simd_mem_aligned}), mul) = + where(mask & simd_cast(omega_ < fvec{&n.omega_e[i], vector_aligned}), mul) = approx_cos(omega_) + 0.057f; } - const simd_fvec imp = simd_fvec{&n.flux[i], simd_mem_aligned} * mul / v_len2; - imp.store_to(&importance[i], simd_mem_aligned); + const fvec imp = fvec{&n.flux[i], vector_aligned} * mul / v_len2; + imp.store_to(&importance[i], vector_aligned); } } template -void calc_lnode_importance(const light_wbvh_node_t &n, const simd_fvec P[3], simd_fvec importance[8]) { +void calc_lnode_importance(const light_wbvh_node_t &n, const fvec P[3], fvec importance[8]) { for (int i = 0; i < 8; ++i) { - simd_fvec mul = 1.0f, v_len2 = 1.0f; + fvec mul = 1.0f, v_len2 = 1.0f; if (n.bbox_min[0][i] > -MAX_DIST) { - 
simd_fvec v[3] = {P[0] - 0.5f * (n.bbox_min[0][i] + n.bbox_max[0][i]), + fvec v[3] = {P[0] - 0.5f * (n.bbox_min[0][i] + n.bbox_max[0][i]), P[1] - 0.5f * (n.bbox_min[1][i] + n.bbox_max[1][i]), P[2] - 0.5f * (n.bbox_min[2][i] + n.bbox_max[2][i])}; @@ -2790,15 +2790,15 @@ void calc_lnode_importance(const light_wbvh_node_t &n, const simd_fvec P[3], const float extent = 0.5f * sqrtf(ext[0] * ext[0] + ext[1] * ext[1] + ext[2] * ext[2]); v_len2 = length2(v); - const simd_fvec v_len = sqrt(v_len2); - const simd_fvec omega_u = approx_atan2(simd_fvec{extent}, v_len) + 0.000005f; + const fvec v_len = sqrt(v_len2); + const fvec omega_u = approx_atan2(fvec{extent}, v_len) + 0.000005f; const float axis[3] = {n.axis[0][i], n.axis[1][i], n.axis[2][i]}; UNROLLED_FOR(j, 3, { v[j] = safe_div_pos(v[j], v_len); }) - const simd_fvec omega = approx_acos(min(dot3(axis, v), 1.0f)) - 0.00007f; - const simd_fvec omega_ = max(0.0f, omega - n.omega_n[i] - omega_u); - mul = select(omega_ < n.omega_e[i], approx_cos(omega_) + 0.057f, simd_fvec{0.0f}); + const fvec omega = approx_acos(min(dot3(axis, v), 1.0f)) - 0.00007f; + const fvec omega_ = max(0.0f, omega - n.omega_n[i] - omega_u); + mul = select(omega_ < n.omega_e[i], approx_cos(omega_) + 0.057f, fvec{0.0f}); } importance[i] = safe_div_pos(n.flux[i] * mul, v_len2); @@ -2822,8 +2822,8 @@ void Ray::NS::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, int w, i const float fov_k = temp * cam.focus_distance; const float spread_angle = atanf(2.0f * temp / float(h)); - const auto off_x = simd_ivec{rays_layout_x, simd_mem_aligned}, - off_y = simd_ivec{rays_layout_y, simd_mem_aligned}; + const auto off_x = ivec{rays_layout_x, vector_aligned}, + off_y = ivec{rays_layout_y, vector_aligned}; const int x_res = (r.w + DimX - 1) / DimX, y_res = (r.h + DimY - 1) / DimY; @@ -2835,10 +2835,10 @@ void Ray::NS::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, int w, i for (int x = r.x; x < r.x + r.w; x += DimX) { ray_data_t &out_r = 
out_rays[i]; - const simd_ivec ixx = x + off_x, iyy = y + off_y; - const simd_ivec ixx_clamped = min(ixx, w - 1), iyy_clamped = min(iyy, h - 1); + const ivec ixx = x + off_x, iyy = y + off_y; + const ivec ixx_clamped = min(ixx, w - 1), iyy_clamped = min(iyy, h - 1); - simd_ivec req_samples; + ivec req_samples; UNROLLED_FOR_S(i, S, { req_samples.template set( required_samples[iyy_clamped.template get() * w + ixx_clamped.template get()]); @@ -2846,30 +2846,30 @@ void Ray::NS::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, int w, i out_r.mask = (ixx < w) & (iyy < h) & (req_samples >= iteration); - auto fxx = simd_fvec(ixx), fyy = simd_fvec(iyy); + auto fxx = fvec(ixx), fyy = fvec(iyy); - const simd_uvec px_hash = hash(simd_uvec((ixx << 16) | iyy)); - const simd_uvec rand_hash = hash_combine(px_hash, rand_seed); + const uvec px_hash = hash(uvec((ixx << 16) | iyy)); + const uvec rand_hash = hash_combine(px_hash, rand_seed); - simd_fvec filter_rand[2]; - get_scrambled_2d_rand(simd_uvec(uint32_t(RAND_DIM_FILTER)), rand_hash, iteration - 1, rand_seq, + fvec filter_rand[2]; + get_scrambled_2d_rand(uvec(uint32_t(RAND_DIM_FILTER)), rand_hash, iteration - 1, rand_seq, filter_rand); if (cam.filter != ePixelFilter::Box) { filter_rand[0] *= float(FILTER_TABLE_SIZE - 1); filter_rand[1] *= float(FILTER_TABLE_SIZE - 1); - const simd_ivec index_x = min(simd_ivec(filter_rand[0]), FILTER_TABLE_SIZE - 1), - index_y = min(simd_ivec(filter_rand[1]), FILTER_TABLE_SIZE - 1); + const ivec index_x = min(ivec(filter_rand[0]), FILTER_TABLE_SIZE - 1), + index_y = min(ivec(filter_rand[1]), FILTER_TABLE_SIZE - 1); - const simd_ivec nindex_x = min(index_x + 1, FILTER_TABLE_SIZE - 1), + const ivec nindex_x = min(index_x + 1, FILTER_TABLE_SIZE - 1), nindex_y = min(index_y + 1, FILTER_TABLE_SIZE - 1); - const simd_fvec tx = filter_rand[0] - simd_fvec(index_x), - ty = filter_rand[1] - simd_fvec(index_y); + const fvec tx = filter_rand[0] - fvec(index_x), + ty = filter_rand[1] - fvec(index_y); 
- const simd_fvec data0_x = gather(filter_table, index_x), data1_x = gather(filter_table, nindex_x); - const simd_fvec data0_y = gather(filter_table, index_y), data1_y = gather(filter_table, nindex_y); + const fvec data0_x = gather(filter_table, index_x), data1_x = gather(filter_table, nindex_x); + const fvec data0_y = gather(filter_table, index_y), data1_y = gather(filter_table, nindex_y); filter_rand[0] = (1.0f - tx) * data0_x + tx * data1_x; filter_rand[1] = (1.0f - ty) * data0_y + ty * data1_y; @@ -2878,16 +2878,16 @@ void Ray::NS::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, int w, i fxx += filter_rand[0]; fyy += filter_rand[1]; - simd_fvec offset[2] = {0.0f, 0.0f}; + fvec offset[2] = {0.0f, 0.0f}; if (cam.fstop > 0.0f) { - simd_fvec lens_rand[2]; - get_scrambled_2d_rand(simd_uvec(uint32_t(RAND_DIM_LENS)), rand_hash, iteration - 1, rand_seq, + fvec lens_rand[2]; + get_scrambled_2d_rand(uvec(uint32_t(RAND_DIM_LENS)), rand_hash, iteration - 1, rand_seq, lens_rand); offset[0] = 2.0f * lens_rand[0] - 1.0f; offset[1] = 2.0f * lens_rand[1] - 1.0f; - simd_fvec r = offset[1], theta = 0.5f * PI - 0.25f * PI * safe_div(offset[0], offset[1]); + fvec r = offset[1], theta = 0.5f * PI - 0.25f * PI * safe_div(offset[0], offset[1]); where(abs(offset[0]) > abs(offset[1]), r) = offset[0]; where(abs(offset[0]) > abs(offset[1]), theta) = 0.25f * PI * safe_div(offset[1], offset[0]); @@ -2905,16 +2905,16 @@ void Ray::NS::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, int w, i offset[1] *= coc * cam.sensor_height; } - const simd_fvec _origin[3] = {{cam.origin[0] + cam.side[0] * offset[0] + cam.up[0] * offset[1]}, + const fvec _origin[3] = {{cam.origin[0] + cam.side[0] * offset[0] + cam.up[0] * offset[1]}, {cam.origin[1] + cam.side[1] * offset[0] + cam.up[1] * offset[1]}, {cam.origin[2] + cam.side[2] * offset[0] + cam.up[2] * offset[1]}}; - simd_fvec _d[3], _dx[3], _dy[3]; + fvec _d[3], _dx[3], _dy[3]; get_pix_dirs(float(w), float(h), cam, k, fov_k, fxx, fyy, 
_origin, _d); get_pix_dirs(float(w), float(h), cam, k, fov_k, fxx + 1.0f, fyy, _origin, _dx); get_pix_dirs(float(w), float(h), cam, k, fov_k, fxx, fyy + 1.0f, _origin, _dy); - const simd_fvec clip_start = cam.clip_start / dot3(_d, cam.fwd); + const fvec clip_start = cam.clip_start / dot3(_d, cam.fwd); for (int j = 0; j < 3; j++) { out_r.d[j] = _d[j]; @@ -2929,9 +2929,9 @@ void Ray::NS::GeneratePrimaryRays(const camera_t &cam, const rect_t &r, int w, i out_r.cone_spread = spread_angle; out_r.pdf = {1e6f}; - out_r.xy = simd_uvec((ixx << 16) | iyy); + out_r.xy = uvec((ixx << 16) | iyy); out_r.depth = pack_ray_type(RAY_TYPE_CAMERA); - out_r.depth |= pack_depth(simd_ivec{0}, simd_ivec{0}, simd_ivec{0}, simd_ivec{0}); + out_r.depth |= pack_depth(ivec{0}, ivec{0}, ivec{0}, ivec{0}); hit_data_t &out_i = out_inters[i++]; out_i = {}; @@ -2955,18 +2955,18 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye out_rays.resize(r.w * r.h / S + ((r.w * r.h) % S != 0)); out_inters.resize(out_rays.size()); - const auto off_x = simd_ivec{rays_layout_x}, off_y = simd_ivec{rays_layout_y}; + const auto off_x = ivec{rays_layout_x}, off_y = ivec{rays_layout_y}; size_t count = 0; for (int y = r.y; y < r.y + r.h - (r.h & (DimY - 1)); y += DimY) { for (int x = r.x; x < r.x + r.w - (r.w & (DimX - 1)); x += DimX) { - const simd_ivec ixx = x + off_x, iyy = simd_ivec(y) + off_y; + const ivec ixx = x + off_x, iyy = ivec(y) + off_y; ray_data_t &out_ray = out_rays[count]; hit_data_t &out_inter = out_inters[count]; count++; - out_ray.xy = simd_uvec((ixx << 16) | iyy); + out_ray.xy = uvec((ixx << 16) | iyy); out_ray.c[0] = out_ray.c[1] = out_ray.c[2] = 1.0f; out_ray.cone_width = 0.0f; out_ray.cone_spread = 0.0f; @@ -2974,8 +2974,8 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye } } - const simd_ivec4 irect_min = {r.x, r.y, 0, 0}, irect_max = {r.x + r.w - 1, r.y + r.h - 1, 0, 0}; - const simd_fvec4 size = {float(width), float(height), 
0.0f, 0.0f}; + const ivec4 irect_min = {r.x, r.y, 0, 0}, irect_max = {r.x + r.w - 1, r.y + r.h - 1, 0, 0}; + const fvec4 size = {float(width), float(height), 0.0f, 0.0f}; for (uint32_t tri = mesh.tris_index; tri < mesh.tris_index + mesh.tris_count; tri++) { const vertex_t &v0 = vertices[vtx_indices[tri * 3 + 0]]; @@ -2983,11 +2983,11 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye const vertex_t &v2 = vertices[vtx_indices[tri * 3 + 2]]; // TODO: use uv_layer - const auto t0 = simd_fvec4{v0.t[0], 1.0f - v0.t[1], 0.0f, 0.0f} * size; - const auto t1 = simd_fvec4{v1.t[0], 1.0f - v1.t[1], 0.0f, 0.0f} * size; - const auto t2 = simd_fvec4{v2.t[0], 1.0f - v2.t[1], 0.0f, 0.0f} * size; + const auto t0 = fvec4{v0.t[0], 1.0f - v0.t[1], 0.0f, 0.0f} * size; + const auto t1 = fvec4{v1.t[0], 1.0f - v1.t[1], 0.0f, 0.0f} * size; + const auto t2 = fvec4{v2.t[0], 1.0f - v2.t[1], 0.0f, 0.0f} * size; - simd_fvec4 bbox_min = t0, bbox_max = t0; + fvec4 bbox_min = t0, bbox_max = t0; bbox_min = min(bbox_min, t1); bbox_min = min(bbox_min, t2); @@ -2995,8 +2995,8 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye bbox_max = max(bbox_max, t1); bbox_max = max(bbox_max, t2); - simd_ivec4 ibbox_min = simd_ivec4(bbox_min), - ibbox_max = simd_ivec4{int(roundf(bbox_max[0])), int(roundf(bbox_max[1])), 0, 0}; + ivec4 ibbox_min = ivec4(bbox_min), + ibbox_max = ivec4{int(roundf(bbox_max[0])), int(roundf(bbox_max[1])), 0, 0}; if (ibbox_max[0] < irect_min[0] || ibbox_max[1] < irect_min[1] || ibbox_min[0] > irect_max[0] || ibbox_min[1] > irect_max[1]) { @@ -3011,7 +3011,7 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye ibbox_max.set<0>(ibbox_max[0] + (((ibbox_max[0] + 1) % DimX) ? (DimX - (ibbox_max[0] + 1) % DimX) : 0)); ibbox_max.set<1>(ibbox_max[1] + (((ibbox_max[1] + 1) % DimY) ? 
(DimY - (ibbox_max[1] + 1) % DimY) : 0)); - const simd_fvec4 d01 = t0 - t1, d12 = t1 - t2, d20 = t2 - t0; + const fvec4 d01 = t0 - t1, d12 = t1 - t2, d20 = t2 - t0; float area = d01[0] * d20[1] - d20[0] * d01[1]; if (area < FLT_EPS) { @@ -3022,15 +3022,15 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye for (int y = ibbox_min[1]; y <= ibbox_max[1]; y += DimY) { for (int x = ibbox_min[0]; x <= ibbox_max[0]; x += DimX) { - const simd_ivec ixx = x + off_x, iyy = simd_ivec(y) + off_y; + const ivec ixx = x + off_x, iyy = ivec(y) + off_y; const int ndx = ((y - r.y) / DimY) * (r.w / DimX) + (x - r.x) / DimX; ray_data_t &out_ray = out_rays[ndx]; hit_data_t &out_inter = out_inters[ndx]; // NOTE: temporarily broken - simd_fvec rxx = 0.0f; - simd_fvec ryy = 0.0f; + fvec rxx = 0.0f; + fvec ryy = 0.0f; // UNROLLED_FOR_S(i, S, { // float _unused; @@ -3038,28 +3038,28 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye // ryy.template set(modff(rand_seq[RAND_DIM_FILTER_V] + ryy.template get(), &_unused)); // }) - const simd_fvec fxx = simd_fvec{ixx} + rxx, fyy = simd_fvec{iyy} + ryy; + const fvec fxx = fvec{ixx} + rxx, fyy = fvec{iyy} + ryy; - simd_fvec u = d01[0] * (fyy - t0[1]) - d01[1] * (fxx - t0[0]), + fvec u = d01[0] * (fyy - t0[1]) - d01[1] * (fxx - t0[0]), v = d12[0] * (fyy - t1[1]) - d12[1] * (fxx - t1[0]), w = d20[0] * (fyy - t2[1]) - d20[1] * (fxx - t2[0]); - const simd_fvec fmask = (u >= -FLT_EPS) & (v >= -FLT_EPS) & (w >= -FLT_EPS); - const simd_ivec imask = simd_cast(fmask); + const fvec fmask = (u >= -FLT_EPS) & (v >= -FLT_EPS) & (w >= -FLT_EPS); + const ivec imask = simd_cast(fmask); if (imask.not_all_zeros()) { u *= inv_area; v *= inv_area; w *= inv_area; - const simd_fvec _p[3] = {v0.p[0] * v + v1.p[0] * w + v2.p[0] * u, + const fvec _p[3] = {v0.p[0] * v + v1.p[0] * w + v2.p[0] * u, v0.p[1] * v + v1.p[1] * w + v2.p[1] * u, v0.p[2] * v + v1.p[2] * w + v2.p[2] * u}, _n[3] = {v0.n[0] * v + v1.n[0] * 
w + v2.n[0] * u, v0.n[1] * v + v1.n[1] * w + v2.n[1] * u, v0.n[2] * v + v1.n[2] * w + v2.n[2] * u}; - simd_fvec p[3], n[3]; + fvec p[3], n[3]; TransformPoint(_p, mi.xform, p); TransformNormal(_n, mi.inv_xform, n); @@ -3069,7 +3069,7 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye // where(fmask, out_ray.ior) = 1.0f; where(fmask, out_ray.depth) = pack_ray_type(RAY_TYPE_DIFFUSE); where(fmask, out_ray.depth) |= - pack_depth(simd_ivec{0}, simd_ivec{0}, simd_ivec{0}, simd_ivec{0}); + pack_depth(ivec{0}, ivec{0}, ivec{0}, ivec{0}); where(imask, out_inter.prim_index) = tri; where(imask, out_inter.obj_index) = obj_index; @@ -3084,7 +3084,7 @@ void Ray::NS::SampleMeshInTextureSpace(int iteration, int obj_index, int uv_laye template int Ray::NS::SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp) { // From "Fast Ray Sorting and Breadth-First Packet Traversal for GPU Ray Tracing" [2010] int rays_count = int(rays.size()); @@ -3170,7 +3170,7 @@ int Ray::NS::SortRays_CPU(Span> rays, const float root_min[3], con template int Ray::NS::SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton) { // From "Fast Ray Sorting and Breadth-First Packet Traversal for GPU Ray Tracing" [2010] int rays_count = int(rays.size()); @@ -3288,7 +3288,7 @@ int Ray::NS::SortRays_GPU(Span> rays, const float root_min[3], con } template -bool Ray::NS::IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool Ray::NS::IntersectTris_ClosestHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const 
tri_accel_t *tris, uint32_t num_tris, int obj_index, hit_data_t &out_inter) { hit_data_t inter = {Uninitialize}; @@ -3300,7 +3300,7 @@ bool Ray::NS::IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec _IntersectTri(ro, rd, ray_mask, tris[i], i, inter); } - const simd_ivec inter_mask = simd_cast(inter.v >= 0.0f); + const ivec inter_mask = simd_cast(inter.v >= 0.0f); where(inter_mask, out_inter.obj_index) = inter.obj_index; where(inter_mask, out_inter.prim_index) = inter.prim_index; @@ -3314,7 +3314,7 @@ bool Ray::NS::IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec } template -bool Ray::NS::IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool Ray::NS::IntersectTris_ClosestHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, const int tri_start, const int tri_end, const int obj_index, hit_data_t &out_inter) { hit_data_t inter{Uninitialize}; @@ -3326,7 +3326,7 @@ bool Ray::NS::IntersectTris_ClosestHit(const simd_fvec ro[3], const simd_fvec IntersectTri(ro, rd, ray_mask, tris[i], i, inter); } - const simd_ivec inter_mask = simd_cast(inter.v >= 0.0f); + const ivec inter_mask = simd_cast(inter.v >= 0.0f); where(inter_mask, out_inter.obj_index) = inter.obj_index; where(inter_mask, out_inter.prim_index) = inter.prim_index; @@ -3370,7 +3370,7 @@ bool Ray::NS::IntersectTris_ClosestHit(const float o[3], const float d[3], const } template -bool Ray::NS::IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool Ray::NS::IntersectTris_AnyHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, uint32_t num_tris, int obj_index, hit_data_t &out_inter) { hit_data_t inter = {Uninitialize}; @@ -3382,7 +3382,7 @@ bool Ray::NS::IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec _IntersectTri(ro, rd, ray_mask, tris[i], i, inter); } - const simd_ivec inter_mask = simd_cast(inter.v >= 0.0f); + const 
ivec inter_mask = simd_cast(inter.v >= 0.0f); where(inter_mask, out_inter.obj_index) = inter.obj_index; where(inter_mask, out_inter.prim_index) = inter.prim_index; @@ -3396,7 +3396,7 @@ bool Ray::NS::IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec } template -bool Ray::NS::IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], const simd_ivec &ray_mask, +bool Ray::NS::IntersectTris_AnyHit(const fvec ro[3], const fvec rd[3], const ivec &ray_mask, const tri_accel_t *tris, const int tri_start, const int tri_end, const int obj_index, hit_data_t &out_inter) { hit_data_t inter{Uninitialize}; @@ -3408,7 +3408,7 @@ bool Ray::NS::IntersectTris_AnyHit(const simd_fvec ro[3], const simd_fvec IntersectTri(ro, rd, ray_mask, tris[i], i, inter); } - const simd_ivec inter_mask = simd_cast(inter.v >= 0.0f); + const ivec inter_mask = simd_cast(inter.v >= 0.0f); where(inter_mask, out_inter.obj_index) = inter.obj_index; where(inter_mask, out_inter.prim_index) = inter.prim_index; @@ -3461,15 +3461,15 @@ bool Ray::NS::IntersectTris_AnyHit(const float o[3], const float d[3], const mtr } template -bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, const simd_ivec &ray_mask, +bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const uvec &ray_flags, const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter) { bool res = false; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(ro, rd, inv_d, inv_d_o); TraversalStateStack_Multi st; @@ -3484,12 +3484,12 @@ bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const while (stack_size) { uint32_t cur = stack[--stack_size]; - simd_ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & 
st.queue[st.index].mask; + ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & st.queue[st.index].mask; if (mask1.all_zeros()) { continue; } - simd_ivec mask2 = and_not(mask1, st.queue[st.index].mask); + ivec mask2 = and_not(mask1, st.queue[st.index].mask); if (mask2.not_all_zeros()) { st.queue[st.num].mask = mask2; st.queue[st.num].stack_size = stack_size; @@ -3506,14 +3506,14 @@ bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const const mesh_instance_t &mi = mesh_instances[mi_indices[i]]; const mesh_t &m = meshes[mi.mesh_index]; - simd_ivec bbox_mask = simd_ivec((mi.ray_visibility & ray_flags) != 0u) & + ivec bbox_mask = ivec((mi.ray_visibility & ray_flags) != 0u) & bbox_test_fma(inv_d, inv_d_o, inter.t, mi.bbox_min, mi.bbox_max) & st.queue[st.index].mask; if (bbox_mask.all_zeros()) { continue; } - simd_fvec _ro[3], _rd[3]; + fvec _ro[3], _rd[3]; TransformRay(ro, rd, mi.inv_xform, _ro, _rd); res |= Traverse_BLAS_WithStack_ClosestHit(_ro, _rd, bbox_mask, nodes, m.node_index, tris, @@ -3525,7 +3525,7 @@ bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const } // resolve primitive index indirection - const simd_ivec is_backfacing = (inter.prim_index < 0); + const ivec is_backfacing = (inter.prim_index < 0); where(is_backfacing, inter.prim_index) = -inter.prim_index - 1; inter.prim_index = gather(reinterpret_cast(tri_indices), inter.prim_index); @@ -3535,36 +3535,36 @@ bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const } template -bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, const simd_ivec &ray_mask, +bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const uvec &ray_flags, const ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const mtri_accel_t *mtris, const 
uint32_t *tri_indices, hit_data_t &inter) { bool res = false; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(ro, rd, inv_d, inv_d_o); alignas(S * 4) float _ro[3][S], _rd[3][S]; - ro[0].store_to(_ro[0], simd_mem_aligned); - ro[1].store_to(_ro[1], simd_mem_aligned); - ro[2].store_to(_ro[2], simd_mem_aligned); - rd[0].store_to(_rd[0], simd_mem_aligned); - rd[1].store_to(_rd[1], simd_mem_aligned); - rd[2].store_to(_rd[2], simd_mem_aligned); + ro[0].store_to(_ro[0], vector_aligned); + ro[1].store_to(_ro[1], vector_aligned); + ro[2].store_to(_ro[2], vector_aligned); + rd[0].store_to(_rd[0], vector_aligned); + rd[1].store_to(_rd[1], vector_aligned); + rd[2].store_to(_rd[2], vector_aligned); alignas(S * 4) int ray_masks[S], inter_prim_index[S], inter_obj_index[S]; alignas(S * 4) float inter_t[S], inter_u[S], inter_v[S]; - ray_mask.store_to(ray_masks, simd_mem_aligned); - inter.prim_index.store_to(inter_prim_index, simd_mem_aligned); - inter.obj_index.store_to(inter_obj_index, simd_mem_aligned); - inter.t.store_to(inter_t, simd_mem_aligned); - inter.u.store_to(inter_u, simd_mem_aligned); - inter.v.store_to(inter_v, simd_mem_aligned); + ray_mask.store_to(ray_masks, vector_aligned); + inter.prim_index.store_to(inter_prim_index, vector_aligned); + inter.obj_index.store_to(inter_obj_index, vector_aligned); + inter.t.store_to(inter_t, vector_aligned); + inter.u.store_to(inter_u, vector_aligned); + inter.v.store_to(inter_v, vector_aligned); alignas(S * 4) unsigned _ray_flags[S]; - ray_flags.store_to(_ray_flags, simd_mem_aligned); + ray_flags.store_to(_ray_flags, vector_aligned); for (int ri = 0; ri < S; ri++) { if (!ray_masks[ri]) { @@ -3673,16 +3673,16 @@ bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const } } - inter.prim_index = simd_ivec{inter_prim_index, simd_mem_aligned}; - inter.obj_index = simd_ivec{inter_obj_index, simd_mem_aligned}; - inter.t = simd_fvec{inter_t, simd_mem_aligned}; - inter.u = 
simd_fvec{inter_u, simd_mem_aligned}; - inter.v = simd_fvec{inter_v, simd_mem_aligned}; + inter.prim_index = ivec{inter_prim_index, vector_aligned}; + inter.obj_index = ivec{inter_obj_index, vector_aligned}; + inter.t = fvec{inter_t, vector_aligned}; + inter.u = fvec{inter_u, vector_aligned}; + inter.v = fvec{inter_v, vector_aligned}; // resolve primitive index indirection - simd_ivec prim_index = (ray_mask & inter.prim_index); + ivec prim_index = (ray_mask & inter.prim_index); - const simd_ivec is_backfacing = (prim_index < 0); + const ivec is_backfacing = (prim_index < 0); where(is_backfacing, prim_index) = -prim_index - 1; where(ray_mask, inter.prim_index) = gather(reinterpret_cast(tri_indices), prim_index); @@ -3692,17 +3692,17 @@ bool Ray::NS::Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const } template -Ray::NS::simd_ivec -Ray::NS::Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, +Ray::NS::ivec +Ray::NS::Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], int ray_type, + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, const uint32_t *tri_indices, hit_data_t &inter) { const int ray_vismask = (1u << ray_type); - simd_ivec solid_hit_mask = {0}; + ivec solid_hit_mask = {0}; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(ro, rd, inv_d, inv_d_o); TraversalStateStack_Multi st; @@ -3717,12 +3717,12 @@ Ray::NS::Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fve while (stack_size) { const uint32_t cur = stack[--stack_size]; - simd_ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & st.queue[st.index].mask; + ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & 
st.queue[st.index].mask; if (mask1.all_zeros()) { continue; } - simd_ivec mask2 = and_not(mask1, st.queue[st.index].mask); + ivec mask2 = and_not(mask1, st.queue[st.index].mask); if (mask2.not_all_zeros()) { st.queue[st.num].mask = mask2; st.queue[st.num].stack_size = stack_size; @@ -3743,13 +3743,13 @@ Ray::NS::Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fve const mesh_t &m = meshes[mi.mesh_index]; - const simd_ivec bbox_mask = + const ivec bbox_mask = bbox_test_fma(inv_d, inv_d_o, inter.t, mi.bbox_min, mi.bbox_max) & st.queue[st.index].mask; if (bbox_mask.all_zeros()) { continue; } - simd_fvec _ro[3], _rd[3]; + fvec _ro[3], _rd[3]; TransformRay(ro, rd, mi.inv_xform, _ro, _rd); solid_hit_mask |= Traverse_BLAS_WithStack_AnyHit(_ro, _rd, bbox_mask, nodes, m.node_index, tris, @@ -3761,9 +3761,9 @@ Ray::NS::Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fve } // resolve primitive index indirection - simd_ivec prim_index = (ray_mask & inter.prim_index); + ivec prim_index = (ray_mask & inter.prim_index); - const simd_ivec is_backfacing = (prim_index < 0); + const ivec is_backfacing = (prim_index < 0); where(is_backfacing, prim_index) = -prim_index - 1; where(ray_mask, inter.prim_index) = gather(reinterpret_cast(tri_indices), prim_index); @@ -3773,25 +3773,25 @@ Ray::NS::Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fve } template -Ray::NS::simd_ivec Ray::NS::Traverse_TLAS_WithStack_AnyHit( - const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, const simd_ivec &ray_mask, +Ray::NS::ivec Ray::NS::Traverse_TLAS_WithStack_AnyHit( + const fvec ro[3], const fvec rd[3], int ray_type, const ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, hit_data_t &inter) { const int ray_vismask = (1u << ray_type); - simd_ivec solid_hit_mask 
= {0}; + ivec solid_hit_mask = {0}; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(ro, rd, inv_d, inv_d_o); alignas(S * 4) int ray_masks[S], inter_prim_index[S]; alignas(S * 4) float inter_t[S], inter_u[S], inter_v[S]; - ray_mask.store_to(ray_masks, simd_mem_aligned); - inter.prim_index.store_to(inter_prim_index, simd_mem_aligned); - inter.t.store_to(inter_t, simd_mem_aligned); - inter.u.store_to(inter_u, simd_mem_aligned); - inter.v.store_to(inter_v, simd_mem_aligned); + ray_mask.store_to(ray_masks, vector_aligned); + inter.prim_index.store_to(inter_prim_index, vector_aligned); + inter.t.store_to(inter_t, vector_aligned); + inter.u.store_to(inter_u, vector_aligned); + inter.v.store_to(inter_v, vector_aligned); for (int ri = 0; ri < S; ri++) { if (!ray_masks[ri]) { @@ -3910,13 +3910,13 @@ Ray::NS::simd_ivec Ray::NS::Traverse_TLAS_WithStack_AnyHit( } } - inter.prim_index = simd_ivec{inter_prim_index, simd_mem_aligned}; - inter.t = simd_fvec{inter_t, simd_mem_aligned}; - inter.u = simd_fvec{inter_u, simd_mem_aligned}; - inter.v = simd_fvec{inter_v, simd_mem_aligned}; + inter.prim_index = ivec{inter_prim_index, vector_aligned}; + inter.t = fvec{inter_t, vector_aligned}; + inter.u = fvec{inter_u, vector_aligned}; + inter.v = fvec{inter_v, vector_aligned}; // resolve primitive index indirection - const simd_ivec is_backfacing = (inter.prim_index < 0); + const ivec is_backfacing = (inter.prim_index < 0); where(is_backfacing, inter.prim_index) = -inter.prim_index - 1; inter.prim_index = gather(reinterpret_cast(tri_indices), inter.prim_index); @@ -3926,13 +3926,13 @@ Ray::NS::simd_ivec Ray::NS::Traverse_TLAS_WithStack_AnyHit( } template -bool Ray::NS::Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +bool Ray::NS::Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, 
const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter) { bool res = false; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(ro, rd, inv_d, inv_d_o); TraversalStateStack_Multi st; @@ -3947,12 +3947,12 @@ bool Ray::NS::Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const while (stack_size) { uint32_t cur = stack[--stack_size]; - simd_ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & st.queue[st.index].mask; + ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & st.queue[st.index].mask; if (mask1.all_zeros()) { continue; } - simd_ivec mask2 = and_not(mask1, st.queue[st.index].mask); + ivec mask2 = and_not(mask1, st.queue[st.index].mask); if (mask2.not_all_zeros()) { st.queue[st.num].mask = mask2; st.queue[st.num].stack_size = stack_size; @@ -4069,14 +4069,14 @@ bool Ray::NS::Traverse_BLAS_WithStack_ClosestHit(const float ro[3], const float } template -Ray::NS::simd_ivec -Ray::NS::Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, +Ray::NS::ivec +Ray::NS::Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const tri_mat_data_t *materials, const uint32_t *tri_indices, int obj_index, hit_data_t &inter) { - simd_ivec solid_hit_mask = 0; + ivec solid_hit_mask = 0; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(ro, rd, inv_d, inv_d_o); TraversalStateStack_Multi st; @@ -4091,12 +4091,12 @@ Ray::NS::Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fve while (stack_size) { const uint32_t cur = stack[--stack_size]; - const simd_ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & st.queue[st.index].mask; + const ivec mask1 = bbox_test_fma(inv_d, inv_d_o, inter.t, nodes[cur]) & st.queue[st.index].mask; 
if (mask1.all_zeros()) { continue; } - const simd_ivec mask2 = and_not(mask1, st.queue[st.index].mask); + const ivec mask2 = and_not(mask1, st.queue[st.index].mask); if (mask2.not_all_zeros()) { st.queue[st.num].mask = mask2; st.queue[st.num].stack_size = stack_size; @@ -4234,68 +4234,68 @@ int Ray::NS::Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3] } template -Ray::NS::simd_fvec Ray::NS::BRDF_PrincipledDiffuse(const simd_fvec V[3], const simd_fvec N[3], - const simd_fvec L[3], const simd_fvec H[3], - const simd_fvec &roughness) { - const simd_fvec N_dot_L = dot3(N, L); - const simd_fvec N_dot_V = dot3(N, V); +Ray::NS::fvec Ray::NS::BRDF_PrincipledDiffuse(const fvec V[3], const fvec N[3], + const fvec L[3], const fvec H[3], + const fvec &roughness) { + const fvec N_dot_L = dot3(N, L); + const fvec N_dot_V = dot3(N, V); - const simd_fvec FL = schlick_weight(N_dot_L); - const simd_fvec FV = schlick_weight(N_dot_V); + const fvec FL = schlick_weight(N_dot_L); + const fvec FV = schlick_weight(N_dot_V); - const simd_fvec L_dot_H = dot3(L, H); - const simd_fvec Fd90 = 0.5f + 2.0f * L_dot_H * L_dot_H * roughness; - const simd_fvec Fd = mix(simd_fvec{1.0f}, Fd90, FL) * mix(simd_fvec{1.0f}, Fd90, FV); + const fvec L_dot_H = dot3(L, H); + const fvec Fd90 = 0.5f + 2.0f * L_dot_H * L_dot_H * roughness; + const fvec Fd = mix(fvec{1.0f}, Fd90, FL) * mix(fvec{1.0f}, Fd90, FV); - return select(N_dot_L > 0.0f, Fd, simd_fvec{0.0f}); + return select(N_dot_L > 0.0f, Fd, fvec{0.0f}); } template -void Ray::NS::Evaluate_OrenDiffuse_BSDF(const simd_fvec V[3], const simd_fvec N[3], const simd_fvec L[3], - const simd_fvec &roughness, const simd_fvec base_color[3], - simd_fvec out_color[4]) { - const simd_fvec sigma = roughness; - const simd_fvec div = 1.0f / (PI + ((3.0f * PI - 4.0f) / 6.0f) * sigma); +void Ray::NS::Evaluate_OrenDiffuse_BSDF(const fvec V[3], const fvec N[3], const fvec L[3], + const fvec &roughness, const fvec base_color[3], + fvec out_color[4]) { + 
const fvec sigma = roughness; + const fvec div = 1.0f / (PI + ((3.0f * PI - 4.0f) / 6.0f) * sigma); - const simd_fvec a = 1.0f * div; - const simd_fvec b = sigma * div; + const fvec a = 1.0f * div; + const fvec b = sigma * div; //// - const simd_fvec nl = max(dot3(N, L), 0.0f); - const simd_fvec nv = max(dot3(N, V), 0.0f); - simd_fvec t = dot3(L, V) - nl * nv; + const fvec nl = max(dot3(N, L), 0.0f); + const fvec nv = max(dot3(N, V), 0.0f); + fvec t = dot3(L, V) - nl * nv; where(t > 0.0f, t) /= (max(nl, nv) + FLT_MIN); - const simd_fvec is = nl * (a + b * t); + const fvec is = nl * (a + b * t); UNROLLED_FOR(i, 3, { out_color[i] = is * base_color[i]; }) out_color[3] = 0.5f / PI; } template -void Ray::NS::Sample_OrenDiffuse_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec &roughness, - const simd_fvec base_color[3], const simd_fvec &rand_u, - const simd_fvec &rand_v, simd_fvec out_V[3], simd_fvec out_color[4]) { - const simd_fvec phi = 2 * PI * rand_v; - const simd_fvec cos_phi = cos(phi), sin_phi = sin(phi); - const simd_fvec dir = sqrt(1.0f - rand_u * rand_u); +void Ray::NS::Sample_OrenDiffuse_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec &roughness, + const fvec base_color[3], const fvec &rand_u, + const fvec &rand_v, fvec out_V[3], fvec out_color[4]) { + const fvec phi = 2 * PI * rand_v; + const fvec cos_phi = cos(phi), sin_phi = sin(phi); + const fvec dir = sqrt(1.0f - rand_u * rand_u); - const simd_fvec V[3] = {dir * cos_phi, dir * sin_phi, rand_u}; // in tangent-space + const fvec V[3] = {dir * cos_phi, dir * sin_phi, rand_u}; // in tangent-space world_from_tangent(T, B, N, V, out_V); - const simd_fvec neg_I[3] = {-I[0], -I[1], -I[2]}; + const fvec neg_I[3] = {-I[0], -I[1], -I[2]}; Evaluate_OrenDiffuse_BSDF(neg_I, N, out_V, roughness, base_color, out_color); } template -void Ray::NS::Evaluate_PrincipledDiffuse_BSDF(const simd_fvec V[3], const simd_fvec N[3], 
const simd_fvec L[3], - const simd_fvec &roughness, const simd_fvec base_color[3], - const simd_fvec sheen_color[3], const bool uniform_sampling, - simd_fvec out_color[4]) { - simd_fvec weight, pdf; +void Ray::NS::Evaluate_PrincipledDiffuse_BSDF(const fvec V[3], const fvec N[3], const fvec L[3], + const fvec &roughness, const fvec base_color[3], + const fvec sheen_color[3], const bool uniform_sampling, + fvec out_color[4]) { + fvec weight, pdf; if (uniform_sampling) { weight = 2 * dot3(N, L); pdf = 0.5f / PI; @@ -4304,40 +4304,40 @@ void Ray::NS::Evaluate_PrincipledDiffuse_BSDF(const simd_fvec V[3], const sim pdf = dot3(N, L) / PI; } - simd_fvec H[3] = {L[0] + V[0], L[1] + V[1], L[2] + V[2]}; + fvec H[3] = {L[0] + V[0], L[1] + V[1], L[2] + V[2]}; safe_normalize(H); - const simd_fvec dot_VH = dot3(V, H); + const fvec dot_VH = dot3(V, H); UNROLLED_FOR(i, 3, { where(dot_VH < 0.0f, H[i]) = -H[i]; }) weight *= BRDF_PrincipledDiffuse(V, N, L, H, roughness); UNROLLED_FOR(i, 3, { out_color[i] = base_color[i] * weight; }) - const simd_fvec FH = PI * schlick_weight(dot3(L, H)); + const fvec FH = PI * schlick_weight(dot3(L, H)); UNROLLED_FOR(i, 3, { out_color[i] += FH * sheen_color[i]; }) out_color[3] = pdf; } template -void Ray::NS::Sample_PrincipledDiffuse_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec &roughness, - const simd_fvec base_color[3], const simd_fvec sheen_color[3], - const bool uniform_sampling, const simd_fvec rand[2], - simd_fvec out_V[3], simd_fvec out_color[4]) { - const simd_fvec phi = 2 * PI * rand[1]; - const simd_fvec cos_phi = cos(phi), sin_phi = sin(phi); +void Ray::NS::Sample_PrincipledDiffuse_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec &roughness, + const fvec base_color[3], const fvec sheen_color[3], + const bool uniform_sampling, const fvec rand[2], + fvec out_V[3], fvec out_color[4]) { + const fvec phi = 2 * PI * rand[1]; + const fvec cos_phi 
= cos(phi), sin_phi = sin(phi); - simd_fvec V[3]; + fvec V[3]; if (uniform_sampling) { - const simd_fvec dir = sqrt(1.0f - rand[0] * rand[0]); + const fvec dir = sqrt(1.0f - rand[0] * rand[0]); // in tangent-space V[0] = dir * cos_phi; V[1] = dir * sin_phi; V[2] = rand[0]; } else { - const simd_fvec dir = sqrt(rand[0]); - const simd_fvec k = sqrt(1.0f - rand[0]); + const fvec dir = sqrt(rand[0]); + const fvec k = sqrt(1.0f - rand[0]); // in tangent-space V[0] = dir * cos_phi; @@ -4347,70 +4347,70 @@ void Ray::NS::Sample_PrincipledDiffuse_BSDF(const simd_fvec T[3], const simd_ world_from_tangent(T, B, N, V, out_V); - const simd_fvec neg_I[3] = {-I[0], -I[1], -I[2]}; + const fvec neg_I[3] = {-I[0], -I[1], -I[2]}; Evaluate_PrincipledDiffuse_BSDF(neg_I, N, out_V, roughness, base_color, sheen_color, uniform_sampling, out_color); } template -void Ray::NS::Evaluate_GGXSpecular_BSDF(const simd_fvec view_dir_ts[3], const simd_fvec sampled_normal_ts[3], - const simd_fvec reflected_dir_ts[3], const simd_fvec alpha[2], - const simd_fvec &spec_ior, const simd_fvec &spec_F0, - const simd_fvec spec_col[3], const simd_fvec spec_col_90[3], - simd_fvec out_color[4]) { - const simd_fvec D = D_GGX(sampled_normal_ts, alpha[0], alpha[1]); +void Ray::NS::Evaluate_GGXSpecular_BSDF(const fvec view_dir_ts[3], const fvec sampled_normal_ts[3], + const fvec reflected_dir_ts[3], const fvec alpha[2], + const fvec &spec_ior, const fvec &spec_F0, + const fvec spec_col[3], const fvec spec_col_90[3], + fvec out_color[4]) { + const fvec D = D_GGX(sampled_normal_ts, alpha[0], alpha[1]); - const simd_fvec G = G1(view_dir_ts, alpha[0], alpha[1]) * G1(reflected_dir_ts, alpha[0], alpha[1]); + const fvec G = G1(view_dir_ts, alpha[0], alpha[1]) * G1(reflected_dir_ts, alpha[0], alpha[1]); - const simd_fvec FH = + const fvec FH = (fresnel_dielectric_cos(dot3(view_dir_ts, sampled_normal_ts), spec_ior) - spec_F0) / (1.0f - spec_F0); - simd_fvec F[3]; + fvec F[3]; UNROLLED_FOR(i, 3, { F[i] = mix(spec_col[i], 
spec_col_90[i], FH); }) - const simd_fvec denom = 4.0f * abs(view_dir_ts[2] * reflected_dir_ts[2]); - UNROLLED_FOR(i, 3, { F[i] = select(denom != 0.0f, F[i] * safe_div_pos(D * G, denom), simd_fvec{0.0f}); }) + const fvec denom = 4.0f * abs(view_dir_ts[2] * reflected_dir_ts[2]); + UNROLLED_FOR(i, 3, { F[i] = select(denom != 0.0f, F[i] * safe_div_pos(D * G, denom), fvec{0.0f}); }) - const simd_fvec pdf = GGX_VNDF_Reflection_Bounded_PDF(D, view_dir_ts, alpha); + const fvec pdf = GGX_VNDF_Reflection_Bounded_PDF(D, view_dir_ts, alpha); UNROLLED_FOR(i, 3, { out_color[i] = F[i] * max(reflected_dir_ts[2], 0.0f); }) out_color[3] = pdf; } template -void Ray::NS::Sample_GGXSpecular_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec alpha[2], - const simd_fvec &spec_ior, const simd_fvec &spec_F0, - const simd_fvec spec_col[3], const simd_fvec spec_col_90[3], - const simd_fvec rand[2], simd_fvec out_V[3], simd_fvec out_color[4]) { - const simd_ivec is_mirror = simd_cast(alpha[0] * alpha[1] < 1e-7f); +void Ray::NS::Sample_GGXSpecular_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec alpha[2], + const fvec &spec_ior, const fvec &spec_F0, + const fvec spec_col[3], const fvec spec_col_90[3], + const fvec rand[2], fvec out_V[3], fvec out_color[4]) { + const ivec is_mirror = simd_cast(alpha[0] * alpha[1] < 1e-7f); if (is_mirror.not_all_zeros()) { reflect(I, N, dot3(N, I), out_V); - const simd_fvec FH = (fresnel_dielectric_cos(dot3(out_V, N), spec_ior) - spec_F0) / (1.0f - spec_F0); + const fvec FH = (fresnel_dielectric_cos(dot3(out_V, N), spec_ior) - spec_F0) / (1.0f - spec_F0); UNROLLED_FOR(i, 3, { out_color[i] = mix(spec_col[i], spec_col_90[i], FH) * 1e6f; }) out_color[3] = 1e6f; } - const simd_ivec is_glossy = ~is_mirror; + const ivec is_glossy = ~is_mirror; if (is_glossy.all_zeros()) { return; } - const simd_fvec nI[3] = {-I[0], -I[1], -I[2]}; + const fvec nI[3] = {-I[0], -I[1], 
-I[2]}; - simd_fvec view_dir_ts[3]; + fvec view_dir_ts[3]; tangent_from_world(T, B, N, nI, view_dir_ts); safe_normalize(view_dir_ts); - simd_fvec sampled_normal_ts[3]; + fvec sampled_normal_ts[3]; SampleGGX_VNDF_Bounded(view_dir_ts, alpha, rand, sampled_normal_ts); - const simd_fvec dot_N_V = -dot3(sampled_normal_ts, view_dir_ts); - simd_fvec reflected_dir_ts[3]; - const simd_fvec _view_dir_ts[3] = {-view_dir_ts[0], -view_dir_ts[1], -view_dir_ts[2]}; + const fvec dot_N_V = -dot3(sampled_normal_ts, view_dir_ts); + fvec reflected_dir_ts[3]; + const fvec _view_dir_ts[3] = {-view_dir_ts[0], -view_dir_ts[1], -view_dir_ts[2]}; reflect(_view_dir_ts, sampled_normal_ts, dot_N_V, reflected_dir_ts); safe_normalize(reflected_dir_ts); - simd_fvec glossy_V[3], glossy_F[4]; + fvec glossy_V[3], glossy_F[4]; world_from_tangent(T, B, N, reflected_dir_ts, glossy_V); Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, reflected_dir_ts, alpha, spec_ior, spec_F0, spec_col, spec_col_90, glossy_F); @@ -4420,79 +4420,79 @@ void Ray::NS::Sample_GGXSpecular_BSDF(const simd_fvec T[3], const simd_fvec -void Ray::NS::Evaluate_GGXRefraction_BSDF(const simd_fvec view_dir_ts[3], const simd_fvec sampled_normal_ts[3], - const simd_fvec refr_dir_ts[3], const simd_fvec alpha[2], - const simd_fvec &eta, const simd_fvec refr_col[3], - simd_fvec out_color[4]) { - const simd_fvec D = D_GGX(sampled_normal_ts, alpha[0], alpha[1]); +void Ray::NS::Evaluate_GGXRefraction_BSDF(const fvec view_dir_ts[3], const fvec sampled_normal_ts[3], + const fvec refr_dir_ts[3], const fvec alpha[2], + const fvec &eta, const fvec refr_col[3], + fvec out_color[4]) { + const fvec D = D_GGX(sampled_normal_ts, alpha[0], alpha[1]); - const simd_fvec G1o = G1(refr_dir_ts, alpha[0], alpha[1]), G1i = G1(view_dir_ts, alpha[0], alpha[1]); + const fvec G1o = G1(refr_dir_ts, alpha[0], alpha[1]), G1i = G1(view_dir_ts, alpha[0], alpha[1]); - const simd_fvec denom = dot3(refr_dir_ts, sampled_normal_ts) + dot3(view_dir_ts, 
sampled_normal_ts) * eta; - const simd_fvec jacobian = safe_div_pos(max(-dot3(refr_dir_ts, sampled_normal_ts), 0.0f), denom * denom); + const fvec denom = dot3(refr_dir_ts, sampled_normal_ts) + dot3(view_dir_ts, sampled_normal_ts) * eta; + const fvec jacobian = safe_div_pos(max(-dot3(refr_dir_ts, sampled_normal_ts), 0.0f), denom * denom); - simd_fvec F = safe_div(D * G1i * G1o * max(dot3(view_dir_ts, sampled_normal_ts), 0.0f) * jacobian, + fvec F = safe_div(D * G1i * G1o * max(dot3(view_dir_ts, sampled_normal_ts), 0.0f) * jacobian, (/*-refr_dir_ts[2] */ view_dir_ts[2])); - const simd_fvec pdf = + const fvec pdf = safe_div(D * G1o * max(dot3(view_dir_ts, sampled_normal_ts), 0.0f) * jacobian, view_dir_ts[2]); // const float pdf = D * fmaxf(sampled_normal_ts[2], 0.0f) * jacobian; // const float pdf = safe_div(D * sampled_normal_ts[2] * fmaxf(-dot3(refr_dir_ts, sampled_normal_ts), 0.0f), denom); - const simd_fvec is_valid = + const fvec is_valid = (refr_dir_ts[2] < 0.0f) & (view_dir_ts[2] > 0.0f) & (alpha[0] >= 1e-7f) & (alpha[1] >= 1e-7f); - UNROLLED_FOR(i, 3, { out_color[i] = select(is_valid, F * refr_col[i], simd_fvec{0.0f}); }) - out_color[3] = select(is_valid, pdf, simd_fvec{0.0f}); + UNROLLED_FOR(i, 3, { out_color[i] = select(is_valid, F * refr_col[i], fvec{0.0f}); }) + out_color[3] = select(is_valid, pdf, fvec{0.0f}); } template -void Ray::NS::Sample_GGXRefraction_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec alpha[2], const simd_fvec &eta, - const simd_fvec refr_col[3], const simd_fvec rand[2], - simd_fvec out_V[4], simd_fvec out_color[4]) { - const simd_ivec is_mirror = simd_cast(alpha[0] * alpha[1] < 1e-7f); +void Ray::NS::Sample_GGXRefraction_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec alpha[2], const fvec &eta, + const fvec refr_col[3], const fvec rand[2], + fvec out_V[4], fvec out_color[4]) { + const ivec is_mirror = simd_cast(alpha[0] * alpha[1] < 
1e-7f); if (is_mirror.not_all_zeros()) { - const simd_fvec cosi = -dot3(I, N); - const simd_fvec cost2 = 1.0f - eta * eta * (1.0f - cosi * cosi); + const fvec cosi = -dot3(I, N); + const fvec cost2 = 1.0f - eta * eta * (1.0f - cosi * cosi); - const simd_fvec m = eta * cosi - safe_sqrt(cost2); + const fvec m = eta * cosi - safe_sqrt(cost2); UNROLLED_FOR(i, 3, { out_V[i] = eta * I[i] + m * N[i]; }) safe_normalize(out_V); out_V[3] = m; - UNROLLED_FOR(i, 3, { out_color[i] = select(cost2 >= 0.0f, refr_col[i] * 1e6f, simd_fvec{0.0f}); }) - out_color[3] = select(cost2 >= 0.0f, simd_fvec{1e6f}, simd_fvec{0.0f}); + UNROLLED_FOR(i, 3, { out_color[i] = select(cost2 >= 0.0f, refr_col[i] * 1e6f, fvec{0.0f}); }) + out_color[3] = select(cost2 >= 0.0f, fvec{1e6f}, fvec{0.0f}); } - const simd_ivec is_glossy = ~is_mirror; + const ivec is_glossy = ~is_mirror; if (is_glossy.all_zeros()) { return; } - const simd_fvec neg_I[3] = {-I[0], -I[1], -I[2]}; + const fvec neg_I[3] = {-I[0], -I[1], -I[2]}; - simd_fvec view_dir_ts[3]; + fvec view_dir_ts[3]; tangent_from_world(T, B, N, neg_I, view_dir_ts); safe_normalize(view_dir_ts); - simd_fvec sampled_normal_ts[3]; + fvec sampled_normal_ts[3]; SampleGGX_VNDF(view_dir_ts, alpha, rand, sampled_normal_ts); - const simd_fvec cosi = dot3(view_dir_ts, sampled_normal_ts); - const simd_fvec cost2 = 1.0f - eta * eta * (1.0f - cosi * cosi); + const fvec cosi = dot3(view_dir_ts, sampled_normal_ts); + const fvec cost2 = 1.0f - eta * eta * (1.0f - cosi * cosi); UNROLLED_FOR(i, 4, { where(is_glossy, out_color[i]) = 0.0f; }) - const simd_ivec cost2_positive = simd_cast(cost2 >= 0.0f); + const ivec cost2_positive = simd_cast(cost2 >= 0.0f); if ((is_glossy & cost2_positive).not_all_zeros()) { - const simd_fvec m = eta * cosi - safe_sqrt(cost2); - simd_fvec refr_dir_ts[3]; + const fvec m = eta * cosi - safe_sqrt(cost2); + fvec refr_dir_ts[3]; UNROLLED_FOR(i, 3, { refr_dir_ts[i] = -eta * view_dir_ts[i] + m * sampled_normal_ts[i]; }) safe_normalize(refr_dir_ts); - 
simd_fvec glossy_V[3], glossy_F[4]; + fvec glossy_V[3], glossy_F[4]; world_from_tangent(T, B, N, refr_dir_ts, glossy_V); Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, refr_dir_ts, alpha, eta, refr_col, glossy_F); @@ -4502,75 +4502,75 @@ void Ray::NS::Sample_GGXRefraction_BSDF(const simd_fvec T[3], const simd_fvec } template -void Ray::NS::Evaluate_PrincipledClearcoat_BSDF(const simd_fvec view_dir_ts[3], - const simd_fvec sampled_normal_ts[3], - const simd_fvec reflected_dir_ts[3], - const simd_fvec &clearcoat_roughness2, - const simd_fvec &clearcoat_ior, const simd_fvec &clearcoat_F0, - simd_fvec out_color[4]) { - const simd_fvec D = D_GTR1(sampled_normal_ts[2], simd_fvec{clearcoat_roughness2}); +void Ray::NS::Evaluate_PrincipledClearcoat_BSDF(const fvec view_dir_ts[3], + const fvec sampled_normal_ts[3], + const fvec reflected_dir_ts[3], + const fvec &clearcoat_roughness2, + const fvec &clearcoat_ior, const fvec &clearcoat_F0, + fvec out_color[4]) { + const fvec D = D_GTR1(sampled_normal_ts[2], fvec{clearcoat_roughness2}); // Always assume roughness of 0.25 for clearcoat - const simd_fvec clearcoat_alpha[2] = {0.25f * 0.25f, 0.25f * 0.25f}; - const simd_fvec G = G1(view_dir_ts, clearcoat_alpha[0], clearcoat_alpha[1]) * + const fvec clearcoat_alpha[2] = {0.25f * 0.25f, 0.25f * 0.25f}; + const fvec G = G1(view_dir_ts, clearcoat_alpha[0], clearcoat_alpha[1]) * G1(reflected_dir_ts, clearcoat_alpha[0], clearcoat_alpha[1]); - const simd_fvec FH = + const fvec FH = (fresnel_dielectric_cos(dot3(reflected_dir_ts, sampled_normal_ts), clearcoat_ior) - clearcoat_F0) / (1.0f - clearcoat_F0); - simd_fvec F = mix(simd_fvec{0.04f}, simd_fvec{1.0f}, FH); + fvec F = mix(fvec{0.04f}, fvec{1.0f}, FH); - const simd_fvec denom = 4.0f * abs(view_dir_ts[2]) * abs(reflected_dir_ts[2]); - F = select(denom != 0.0f, safe_div_pos(F * D * G, denom), simd_fvec{0.0f}); + const fvec denom = 4.0f * abs(view_dir_ts[2]) * abs(reflected_dir_ts[2]); + F = select(denom != 0.0f, 
safe_div_pos(F * D * G, denom), fvec{0.0f}); F *= max(reflected_dir_ts[2], 0.0f); - const simd_fvec pdf = GGX_VNDF_Reflection_Bounded_PDF(D, view_dir_ts, clearcoat_alpha); + const fvec pdf = GGX_VNDF_Reflection_Bounded_PDF(D, view_dir_ts, clearcoat_alpha); UNROLLED_FOR(i, 3, { out_color[i] = F; }) out_color[3] = pdf; } template -void Ray::NS::Sample_PrincipledClearcoat_BSDF(const simd_fvec T[3], const simd_fvec B[3], const simd_fvec N[3], - const simd_fvec I[3], const simd_fvec &clearcoat_roughness2, - const simd_fvec &clearcoat_ior, const simd_fvec &clearcoat_F0, - const simd_fvec rand[2], simd_fvec out_V[3], - simd_fvec out_color[4]) { - const simd_ivec is_mirror = simd_cast(sqr(clearcoat_roughness2) < 1e-7f); +void Ray::NS::Sample_PrincipledClearcoat_BSDF(const fvec T[3], const fvec B[3], const fvec N[3], + const fvec I[3], const fvec &clearcoat_roughness2, + const fvec &clearcoat_ior, const fvec &clearcoat_F0, + const fvec rand[2], fvec out_V[3], + fvec out_color[4]) { + const ivec is_mirror = simd_cast(sqr(clearcoat_roughness2) < 1e-7f); if (is_mirror.not_all_zeros()) { reflect(I, N, dot3(N, I), out_V); - const simd_fvec FH = + const fvec FH = (fresnel_dielectric_cos(dot3(out_V, N), clearcoat_ior) - clearcoat_F0) / (1.0f - clearcoat_F0); - const simd_fvec F = mix(simd_fvec{0.04f}, simd_fvec{1.0f}, FH); + const fvec F = mix(fvec{0.04f}, fvec{1.0f}, FH); UNROLLED_FOR(i, 3, { out_color[i] = F * 1e6f; }) out_color[3] = 1e6f; } - const simd_ivec is_glossy = ~is_mirror; + const ivec is_glossy = ~is_mirror; if (is_glossy.all_zeros()) { return; } - const simd_fvec neg_I[3] = {-I[0], -I[1], -I[2]}; + const fvec neg_I[3] = {-I[0], -I[1], -I[2]}; - simd_fvec view_dir_ts[3]; + fvec view_dir_ts[3]; tangent_from_world(T, B, N, neg_I, view_dir_ts); safe_normalize(view_dir_ts); // NOTE: GTR1 distribution is not used for sampling because Cycles does it this way (???!) 
- simd_fvec sampled_normal_ts[3], alpha[2] = {clearcoat_roughness2, clearcoat_roughness2}; + fvec sampled_normal_ts[3], alpha[2] = {clearcoat_roughness2, clearcoat_roughness2}; SampleGGX_VNDF_Bounded(view_dir_ts, alpha, rand, sampled_normal_ts); - const simd_fvec dot_N_V = -dot3(sampled_normal_ts, view_dir_ts); - simd_fvec reflected_dir_ts[3]; - const simd_fvec _view_dir_ts[3] = {-view_dir_ts[0], -view_dir_ts[1], -view_dir_ts[2]}; + const fvec dot_N_V = -dot3(sampled_normal_ts, view_dir_ts); + fvec reflected_dir_ts[3]; + const fvec _view_dir_ts[3] = {-view_dir_ts[0], -view_dir_ts[1], -view_dir_ts[2]}; reflect(_view_dir_ts, sampled_normal_ts, dot_N_V, reflected_dir_ts); safe_normalize(reflected_dir_ts); world_from_tangent(T, B, N, reflected_dir_ts, out_V); - simd_fvec glossy_F[4]; + fvec glossy_F[4]; Evaluate_PrincipledClearcoat_BSDF(view_dir_ts, sampled_normal_ts, reflected_dir_ts, clearcoat_roughness2, clearcoat_ior, clearcoat_F0, glossy_F); @@ -4578,44 +4578,44 @@ void Ray::NS::Sample_PrincipledClearcoat_BSDF(const simd_fvec T[3], const sim } template -Ray::NS::simd_fvec Ray::NS::Evaluate_EnvQTree(const float y_rotation, const simd_fvec4 *const *qtree_mips, - const int qtree_levels, const simd_fvec L[3]) { +Ray::NS::fvec Ray::NS::Evaluate_EnvQTree(const float y_rotation, const fvec4 *const *qtree_mips, + const int qtree_levels, const fvec L[3]) { int res = 2; int lod = qtree_levels - 1; - simd_fvec p[2]; + fvec p[2]; DirToCanonical(L, -y_rotation, p); - simd_fvec factor = 1.0f; + fvec factor = 1.0f; while (lod >= 0) { - const simd_ivec x = clamp(simd_ivec(p[0] * float(res)), 0, res - 1); - const simd_ivec y = clamp(simd_ivec(p[1] * float(res)), 0, res - 1); + const ivec x = clamp(ivec(p[0] * float(res)), 0, res - 1); + const ivec y = clamp(ivec(p[1] * float(res)), 0, res - 1); - simd_ivec index = 0; + ivec index = 0; index |= (x & 1) << 0; index |= (y & 1) << 1; - const simd_ivec qx = x / 2; - const simd_ivec qy = y / 2; + const ivec qx = x / 2; + const ivec qy = 
y / 2; - simd_fvec quad[4]; + fvec quad[4]; UNROLLED_FOR_S(i, S, { - const simd_fvec4 q = qtree_mips[lod][qy.template get() * res / 2 + qx.template get()]; + const fvec4 q = qtree_mips[lod][qy.template get() * res / 2 + qx.template get()]; quad[0].template set(q.template get<0>()); quad[1].template set(q.template get<1>()); quad[2].template set(q.template get<2>()); quad[3].template set(q.template get<3>()); }) - const simd_fvec total = quad[0] + quad[1] + quad[2] + quad[3]; + const fvec total = quad[0] + quad[1] + quad[2] + quad[3]; - const simd_ivec mask = simd_cast(total > 0.0f); + const ivec mask = simd_cast(total > 0.0f); if (mask.all_zeros()) { break; } where(mask, factor) *= - 4.0f * gather(value_ptr(quad[0]), index * S + simd_ivec(ascending_counter, simd_mem_aligned)) / total; + 4.0f * gather(value_ptr(quad[0]), index * S + ivec(ascending_counter, vector_aligned)) / total; --lod; res *= 2; @@ -4625,25 +4625,25 @@ Ray::NS::simd_fvec Ray::NS::Evaluate_EnvQTree(const float y_rotation, const s } template -void Ray::NS::Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_mips, int qtree_levels, - const simd_fvec &rand, const simd_fvec &rx, const simd_fvec &ry, - simd_fvec out_V[4]) { +void Ray::NS::Sample_EnvQTree(float y_rotation, const fvec4 *const *qtree_mips, int qtree_levels, + const fvec &rand, const fvec &rx, const fvec &ry, + fvec out_V[4]) { int res = 2; float step = 1.0f / float(res); - simd_fvec sample = rand; + fvec sample = rand; int lod = qtree_levels - 1; - simd_fvec origin[2] = {{0.0f}, {0.0f}}; - simd_fvec factor = 1.0f; + fvec origin[2] = {{0.0f}, {0.0f}}; + fvec factor = 1.0f; while (lod >= 0) { - const simd_ivec qx = simd_ivec(origin[0] * float(res)) / 2; - const simd_ivec qy = simd_ivec(origin[1] * float(res)) / 2; + const ivec qx = ivec(origin[0] * float(res)) / 2; + const ivec qy = ivec(origin[1] * float(res)) / 2; - simd_fvec quad[4]; + fvec quad[4]; UNROLLED_FOR_S(i, S, { - const simd_fvec4 q = qtree_mips[lod][qy.template 
get() * res / 2 + qx.template get()]; + const fvec4 q = qtree_mips[lod][qy.template get() * res / 2 + qx.template get()]; quad[0].template set(q.template get<0>()); quad[1].template set(q.template get<1>()); @@ -4651,22 +4651,22 @@ void Ray::NS::Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_m quad[3].template set(q.template get<3>()); }) - const simd_fvec top_left = quad[0]; - const simd_fvec top_right = quad[1]; - simd_fvec partial = top_left + quad[2]; - const simd_fvec total = partial + top_right + quad[3]; + const fvec top_left = quad[0]; + const fvec top_right = quad[1]; + fvec partial = top_left + quad[2]; + const fvec total = partial + top_right + quad[3]; - const simd_ivec mask = simd_cast(total > 0.0f); + const ivec mask = simd_cast(total > 0.0f); if (mask.all_zeros()) { break; } - simd_fvec boundary = partial / total; + fvec boundary = partial / total; - simd_ivec index = 0; + ivec index = 0; { // left or right decision - const simd_ivec left_mask = simd_cast(sample < boundary); + const ivec left_mask = simd_cast(sample < boundary); where(left_mask, sample) /= boundary; where(left_mask, boundary) = top_left / partial; @@ -4679,7 +4679,7 @@ void Ray::NS::Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_m } { // bottom or up decision - const simd_ivec bottom_mask = simd_cast(sample < boundary); + const ivec bottom_mask = simd_cast(sample < boundary); where(bottom_mask, sample) /= boundary; @@ -4689,7 +4689,7 @@ void Ray::NS::Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_m } where(mask, factor) *= - 4.0f * gather(value_ptr(quad[0]), index * S + simd_ivec(ascending_counter, simd_mem_aligned)) / total; + 4.0f * gather(value_ptr(quad[0]), index * S + ivec(ascending_counter, vector_aligned)) / total; --lod; res *= 2; @@ -4704,8 +4704,8 @@ void Ray::NS::Sample_EnvQTree(float y_rotation, const simd_fvec4 *const *qtree_m } template -void Ray::NS::TransformRay(const simd_fvec ro[3], const simd_fvec rd[3], const 
float *xform, - simd_fvec out_ro[3], simd_fvec out_rd[3]) { +void Ray::NS::TransformRay(const fvec ro[3], const fvec rd[3], const float *xform, + fvec out_ro[3], fvec out_rd[3]) { out_ro[0] = ro[0] * xform[0] + ro[1] * xform[4] + ro[2] * xform[8] + xform[12]; out_ro[1] = ro[0] * xform[1] + ro[1] * xform[5] + ro[2] * xform[9] + xform[13]; out_ro[2] = ro[0] * xform[2] + ro[1] * xform[6] + ro[2] * xform[10] + xform[14]; @@ -4725,75 +4725,75 @@ void Ray::NS::TransformRay(const float ro[3], const float rd[3], const float *xf out_rd[2] = rd[0] * xform[2] + rd[1] * xform[6] + rd[2] * xform[10]; } -template void Ray::NS::TransformPoint(const simd_fvec p[3], const float *xform, simd_fvec out_p[3]) { +template void Ray::NS::TransformPoint(const fvec p[3], const float *xform, fvec out_p[3]) { out_p[0] = xform[0] * p[0] + xform[4] * p[1] + xform[8] * p[2] + xform[12]; out_p[1] = xform[1] * p[0] + xform[5] * p[1] + xform[9] * p[2] + xform[13]; out_p[2] = xform[2] * p[0] + xform[6] * p[1] + xform[10] * p[2] + xform[14]; } -template void Ray::NS::TransformPoint(const simd_fvec xform[16], simd_fvec p[3]) { - const simd_fvec temp0 = xform[0] * p[0] + xform[4] * p[1] + xform[8] * p[2] + xform[12]; - const simd_fvec temp1 = xform[1] * p[0] + xform[5] * p[1] + xform[9] * p[2] + xform[13]; - const simd_fvec temp2 = xform[2] * p[0] + xform[6] * p[1] + xform[10] * p[2] + xform[14]; +template void Ray::NS::TransformPoint(const fvec xform[16], fvec p[3]) { + const fvec temp0 = xform[0] * p[0] + xform[4] * p[1] + xform[8] * p[2] + xform[12]; + const fvec temp1 = xform[1] * p[0] + xform[5] * p[1] + xform[9] * p[2] + xform[13]; + const fvec temp2 = xform[2] * p[0] + xform[6] * p[1] + xform[10] * p[2] + xform[14]; p[0] = temp0; p[1] = temp1; p[2] = temp2; } -template void Ray::NS::TransformDirection(const simd_fvec xform[16], simd_fvec p[3]) { - const simd_fvec temp0 = xform[0] * p[0] + xform[4] * p[1] + xform[8] * p[2]; - const simd_fvec temp1 = xform[1] * p[0] + xform[5] * p[1] + xform[9] * 
p[2]; - const simd_fvec temp2 = xform[2] * p[0] + xform[6] * p[1] + xform[10] * p[2]; +template void Ray::NS::TransformDirection(const fvec xform[16], fvec p[3]) { + const fvec temp0 = xform[0] * p[0] + xform[4] * p[1] + xform[8] * p[2]; + const fvec temp1 = xform[1] * p[0] + xform[5] * p[1] + xform[9] * p[2]; + const fvec temp2 = xform[2] * p[0] + xform[6] * p[1] + xform[10] * p[2]; p[0] = temp0; p[1] = temp1; p[2] = temp2; } -template void Ray::NS::TransformNormal(const simd_fvec n[3], const float *inv_xform, simd_fvec out_n[3]) { +template void Ray::NS::TransformNormal(const fvec n[3], const float *inv_xform, fvec out_n[3]) { out_n[0] = n[0] * inv_xform[0] + n[1] * inv_xform[1] + n[2] * inv_xform[2]; out_n[1] = n[0] * inv_xform[4] + n[1] * inv_xform[5] + n[2] * inv_xform[6]; out_n[2] = n[0] * inv_xform[8] + n[1] * inv_xform[9] + n[2] * inv_xform[10]; } template -void Ray::NS::TransformNormal(const simd_fvec n[3], const simd_fvec inv_xform[16], simd_fvec out_n[3]) { +void Ray::NS::TransformNormal(const fvec n[3], const fvec inv_xform[16], fvec out_n[3]) { out_n[0] = n[0] * inv_xform[0] + n[1] * inv_xform[1] + n[2] * inv_xform[2]; out_n[1] = n[0] * inv_xform[4] + n[1] * inv_xform[5] + n[2] * inv_xform[6]; out_n[2] = n[0] * inv_xform[8] + n[1] * inv_xform[9] + n[2] * inv_xform[10]; } -template void Ray::NS::TransformNormal(const simd_fvec inv_xform[16], simd_fvec inout_n[3]) { - simd_fvec temp0 = inout_n[0] * inv_xform[0] + inout_n[1] * inv_xform[1] + inout_n[2] * inv_xform[2]; - simd_fvec temp1 = inout_n[0] * inv_xform[4] + inout_n[1] * inv_xform[5] + inout_n[2] * inv_xform[6]; - simd_fvec temp2 = inout_n[0] * inv_xform[8] + inout_n[1] * inv_xform[9] + inout_n[2] * inv_xform[10]; +template void Ray::NS::TransformNormal(const fvec inv_xform[16], fvec inout_n[3]) { + fvec temp0 = inout_n[0] * inv_xform[0] + inout_n[1] * inv_xform[1] + inout_n[2] * inv_xform[2]; + fvec temp1 = inout_n[0] * inv_xform[4] + inout_n[1] * inv_xform[5] + inout_n[2] * inv_xform[6]; + fvec 
temp2 = inout_n[0] * inv_xform[8] + inout_n[1] * inv_xform[9] + inout_n[2] * inv_xform[10]; inout_n[0] = temp0; inout_n[1] = temp1; inout_n[2] = temp2; } -template void Ray::NS::CanonicalToDir(const simd_fvec p[2], float y_rotation, simd_fvec out_d[3]) { - const simd_fvec cos_theta = 2 * p[0] - 1; - simd_fvec phi = 2 * PI * p[1] + y_rotation; +template void Ray::NS::CanonicalToDir(const fvec p[2], float y_rotation, fvec out_d[3]) { + const fvec cos_theta = 2 * p[0] - 1; + fvec phi = 2 * PI * p[1] + y_rotation; where(phi < 0.0f, phi) += 2 * PI; where(phi > 2 * PI, phi) -= 2 * PI; - const simd_fvec sin_theta = sqrt(1 - cos_theta * cos_theta); + const fvec sin_theta = sqrt(1 - cos_theta * cos_theta); - const simd_fvec sin_phi = sin(phi); - const simd_fvec cos_phi = cos(phi); + const fvec sin_phi = sin(phi); + const fvec cos_phi = cos(phi); out_d[0] = sin_theta * cos_phi; out_d[1] = cos_theta; out_d[2] = -sin_theta * sin_phi; } -template void Ray::NS::DirToCanonical(const simd_fvec d[3], float y_rotation, simd_fvec out_p[2]) { - const simd_fvec cos_theta = clamp(d[1], -1.0f, 1.0f); +template void Ray::NS::DirToCanonical(const fvec d[3], float y_rotation, fvec out_p[2]) { + const fvec cos_theta = clamp(d[1], -1.0f, 1.0f); - simd_fvec phi; + fvec phi; UNROLLED_FOR_S(i, S, { phi.template set(-atan2f(d[2].template get(), d[0].template get())); }) phi += y_rotation; @@ -4805,19 +4805,19 @@ template void Ray::NS::DirToCanonical(const simd_fvec d[3], float y_r } template -void Ray::NS::rotate_around_axis(const simd_fvec p[3], const simd_fvec axis[3], const simd_fvec &angle, - simd_fvec out_p[3]) { - const simd_fvec costheta = cos(angle), sintheta = sin(angle); +void Ray::NS::rotate_around_axis(const fvec p[3], const fvec axis[3], const fvec &angle, + fvec out_p[3]) { + const fvec costheta = cos(angle), sintheta = sin(angle); - const simd_fvec temp0 = ((costheta + (1.0f - costheta) * axis[0] * axis[0]) * p[0]) + + const fvec temp0 = ((costheta + (1.0f - costheta) * axis[0] * 
axis[0]) * p[0]) + (((1.0f - costheta) * axis[0] * axis[1] - axis[2] * sintheta) * p[1]) + (((1.0f - costheta) * axis[0] * axis[2] + axis[1] * sintheta) * p[2]); - const simd_fvec temp1 = (((1.0f - costheta) * axis[0] * axis[1] + axis[2] * sintheta) * p[0]) + + const fvec temp1 = (((1.0f - costheta) * axis[0] * axis[1] + axis[2] * sintheta) * p[0]) + ((costheta + (1.0f - costheta) * axis[1] * axis[1]) * p[1]) + (((1.0f - costheta) * axis[1] * axis[2] - axis[0] * sintheta) * p[2]); - const simd_fvec temp2 = (((1.0f - costheta) * axis[0] * axis[2] - axis[1] * sintheta) * p[0]) + + const fvec temp2 = (((1.0f - costheta) * axis[0] * axis[2] - axis[1] * sintheta) * p[0]) + (((1.0f - costheta) * axis[1] * axis[2] + axis[0] * sintheta) * p[1]) + ((costheta + (1.0f - costheta) * axis[2] * axis[2]) * p[2]); @@ -4828,10 +4828,10 @@ void Ray::NS::rotate_around_axis(const simd_fvec p[3], const simd_fvec axi template void Ray::NS::SampleNearest(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, const simd_ivec &mask, - simd_fvec out_rgba[4]) { + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]) { const Cpu::TexStorageBase &storage = *textures[index >> 28]; - auto _lod = (simd_ivec)lod; + auto _lod = (ivec)lod; where(_lod > MAX_MIP_LEVEL, _lod) = MAX_MIP_LEVEL; @@ -4851,13 +4851,13 @@ void Ray::NS::SampleNearest(const Cpu::TexStorageBase *const textures[], const u template void Ray::NS::SampleBilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, const simd_fvec rand[2], - const simd_ivec &mask, simd_fvec out_rgba[4]) { + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]) { const Cpu::TexStorageBase &storage = *textures[index >> 28]; const int tex = int(index & 0x00ffffff); - simd_fvec _uvs[2]; + fvec _uvs[2]; _uvs[0] = fract(uvs[0]); _uvs[1] = fract(uvs[1]); @@ -4891,9 +4891,9 @@ 
void Ray::NS::SampleBilinear(const Cpu::TexStorageBase *const textures[], const out_rgba[3].set(i, p00.v[3]); } #else // USE_STOCH_TEXTURE_FILTERING - const simd_fvec k[2] = {fract(_uvs[0]), fract(_uvs[1])}; + const fvec k[2] = {fract(_uvs[0]), fract(_uvs[1])}; - simd_fvec p0[4] = {}, p1[4] = {}; + fvec p0[4] = {}, p1[4] = {}; for (int i = 0; i < S; ++i) { if (!mask[i]) { @@ -4925,24 +4925,24 @@ void Ray::NS::SampleBilinear(const Cpu::TexStorageBase *const textures[], const template void Ray::NS::SampleTrilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, const simd_fvec rand[2], - const simd_ivec &mask, simd_fvec out_rgba[4]) { - simd_fvec col1[4]; - SampleBilinear(textures, index, uvs, (simd_ivec)floor(lod), rand, mask, col1); - simd_fvec col2[4]; - SampleBilinear(textures, index, uvs, (simd_ivec)ceil(lod), rand, mask, col2); - - const simd_fvec k = fract(lod); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]) { + fvec col1[4]; + SampleBilinear(textures, index, uvs, (ivec)floor(lod), rand, mask, col1); + fvec col2[4]; + SampleBilinear(textures, index, uvs, (ivec)ceil(lod), rand, mask, col2); + + const fvec k = fract(lod); UNROLLED_FOR(i, 4, { out_rgba[i] = col1[i] * (1.0f - k) + col2[i] * k; }) } template -void Ray::NS::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint32_t index, const simd_fvec dir[3], - const float y_rotation, const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]) { - const simd_fvec y = clamp(dir[1], -1.0f, 1.0f); +void Ray::NS::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint32_t index, const fvec dir[3], + const float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]) { + const fvec y = clamp(dir[1], -1.0f, 1.0f); - simd_fvec theta = 0.0f, phi = 0.0f; + fvec theta = 0.0f, phi = 0.0f; UNROLLED_FOR_S(i, S, { if (mask.template get()) { theta.template 
set(acosf(y.template get()) / PI); @@ -4952,19 +4952,19 @@ void Ray::NS::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint3 where(phi < 0.0f, phi) += 2 * PI; where(phi > 2 * PI, phi) -= 2 * PI; - const simd_fvec u = 0.5f * phi / PI; + const fvec u = 0.5f * phi / PI; const int tex = int(index & 0x00ffffff); float sz[2]; storage.GetFRes(tex, 0, sz); - simd_fvec uvs[2] = {clamp(u * sz[0], 0.0f, sz[0] - 1.0f), clamp(theta * sz[1], 0.0f, sz[1] - 1.0f)}; + fvec uvs[2] = {clamp(u * sz[0], 0.0f, sz[0] - 1.0f), clamp(theta * sz[1], 0.0f, sz[1] - 1.0f)}; #if USE_STOCH_TEXTURE_FILTERING uvs[0] += rand[0] - 0.5f; uvs[1] += rand[1] - 0.5f; - const simd_ivec iuvs[2] = {simd_ivec(uvs[0]), simd_ivec(uvs[1])}; + const ivec iuvs[2] = {ivec(uvs[0]), ivec(uvs[1])}; for (int i = 0; i < S; i++) { if (!mask[i]) { @@ -4979,9 +4979,9 @@ void Ray::NS::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint3 out_rgb[2].set(i, to_norm_float(p00.v[2]) * f); } #else // USE_STOCH_TEXTURE_FILTERING - const simd_fvec k[2] = {fract(uvs[0]), fract(uvs[1])}; + const fvec k[2] = {fract(uvs[0]), fract(uvs[1])}; - simd_fvec _p00[3] = {}, _p01[3] = {}, _p10[3] = {}, _p11[3] = {}; + fvec _p00[3] = {}, _p01[3] = {}, _p10[3] = {}, _p11[3] = {}; for (int i = 0; i < S; i++) { if (!mask[i]) { @@ -5014,9 +5014,9 @@ void Ray::NS::SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, const uint3 _p11[2].set(i, to_norm_float(p11.v[2]) * f); } - const simd_fvec p0X[3] = {_p01[0] * k[0] + _p00[0] * (1 - k[0]), _p01[1] * k[0] + _p00[1] * (1 - k[0]), + const fvec p0X[3] = {_p01[0] * k[0] + _p00[0] * (1 - k[0]), _p01[1] * k[0] + _p00[1] * (1 - k[0]), _p01[2] * k[0] + _p00[2] * (1 - k[0])}; - const simd_fvec p1X[3] = {_p11[0] * k[0] + _p10[0] * (1 - k[0]), _p11[1] * k[0] + _p10[1] * (1 - k[0]), + const fvec p1X[3] = {_p11[0] * k[0] + _p10[0] * (1 - k[0]), _p11[1] * k[0] + _p10[1] * (1 - k[0]), _p11[2] * k[0] + _p10[2] * (1 - k[0])}; out_rgb[0] = p1X[0] * k[1] + p0X[0] * (1.0f - k[1]); @@ 
-5030,18 +5030,18 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const const uint32_t rand_seq[], const uint32_t rand_seed, const int iteration, const scene_data_t &sc, const uint32_t root_index, const Cpu::TexStorageBase *const textures[], hit_data_t &inter) { - simd_fvec ro[3] = {r.o[0], r.o[1], r.o[2]}; + fvec ro[3] = {r.o[0], r.o[1], r.o[2]}; - const simd_uvec ray_flags = simd_uvec(1 << get_ray_type(r.depth)); + const uvec ray_flags = uvec(1 << get_ray_type(r.depth)); - const simd_uvec px_hash = hash(r.xy); - const simd_uvec rand_hash = hash_combine(px_hash, rand_seed); + const uvec px_hash = hash(r.xy); + const uvec rand_hash = hash_combine(px_hash, rand_seed); - auto rand_dim = simd_uvec(RAND_DIM_BASE_COUNT + get_total_depth(r.depth) * RAND_DIM_BOUNCE_COUNT); + auto rand_dim = uvec(RAND_DIM_BASE_COUNT + get_total_depth(r.depth) * RAND_DIM_BOUNCE_COUNT); - simd_ivec keep_going = r.mask; + ivec keep_going = r.mask; while (keep_going.not_all_zeros()) { - const simd_fvec t_val = inter.t; + const fvec t_val = inter.t; if (sc.wnodes) { NS::Traverse_TLAS_WithStack_ClosestHit(ro, r.d, ray_flags, keep_going, sc.wnodes, root_index, @@ -5058,17 +5058,17 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const break; } - simd_ivec tri_index = inter.prim_index; - const simd_ivec is_backfacing = (tri_index < 0); + ivec tri_index = inter.prim_index; + const ivec is_backfacing = (tri_index < 0); where(is_backfacing, tri_index) = -tri_index - 1; - simd_ivec mat_index = gather(reinterpret_cast(sc.tri_materials), tri_index); + ivec mat_index = gather(reinterpret_cast(sc.tri_materials), tri_index); where(~is_backfacing, mat_index) = mat_index & 0xffff; // use front material index where(is_backfacing, mat_index) = mat_index >> 16; // use back material index where(~keep_going, mat_index) = 0xffff; - const simd_ivec solid_hit = (mat_index & MATERIAL_SOLID_BIT) != 0; + const ivec solid_hit = (mat_index & MATERIAL_SOLID_BIT) != 0; 
keep_going &= ~solid_hit; if (keep_going.all_zeros()) { break; @@ -5076,35 +5076,35 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const mat_index &= MATERIAL_INDEX_BITS; - const simd_fvec w = 1.0f - inter.u - inter.v; + const fvec w = 1.0f - inter.u - inter.v; - const simd_ivec vtx_indices[3] = {gather(reinterpret_cast(sc.vtx_indices + 0), tri_index * 3), + const ivec vtx_indices[3] = {gather(reinterpret_cast(sc.vtx_indices + 0), tri_index * 3), gather(reinterpret_cast(sc.vtx_indices + 1), tri_index * 3), gather(reinterpret_cast(sc.vtx_indices + 2), tri_index * 3)}; - simd_fvec uvs[2]; + fvec uvs[2]; { // Fetch vertex uvs const float *vtx_uvs = &sc.vertices[0].t[0]; const int VtxUVsStride = sizeof(vertex_t) / sizeof(float); UNROLLED_FOR(i, 2, { - const simd_fvec temp1 = gather(vtx_uvs + i, vtx_indices[0] * VtxUVsStride); - const simd_fvec temp2 = gather(vtx_uvs + i, vtx_indices[1] * VtxUVsStride); - const simd_fvec temp3 = gather(vtx_uvs + i, vtx_indices[2] * VtxUVsStride); + const fvec temp1 = gather(vtx_uvs + i, vtx_indices[0] * VtxUVsStride); + const fvec temp2 = gather(vtx_uvs + i, vtx_indices[1] * VtxUVsStride); + const fvec temp3 = gather(vtx_uvs + i, vtx_indices[2] * VtxUVsStride); uvs[i] = temp1 * w + temp2 * inter.u + temp3 * inter.v; }) } - simd_fvec mix_term_rand[2]; + fvec mix_term_rand[2]; get_scrambled_2d_rand(rand_dim + unsigned(RAND_DIM_BSDF_PICK), rand_hash, iteration - 1, rand_seq, mix_term_rand); - simd_fvec tex_rand[2]; + fvec tex_rand[2]; get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq, tex_rand); { // resolve material - simd_ivec ray_queue[S]; + ivec ray_queue[S]; int index = 0, num = 1; ray_queue[0] = keep_going; @@ -5113,8 +5113,8 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const const int mask = ray_queue[index].movemask(); uint32_t first_mi = mat_index[GetFirstBit(mask)]; - simd_ivec same_mi = (mat_index == first_mi); - simd_ivec diff_mi = 
and_not(same_mi, ray_queue[index]); + ivec same_mi = (mat_index == first_mi); + ivec diff_mi = and_not(same_mi, ray_queue[index]); if (diff_mi.not_all_zeros()) { ray_queue[num++] = diff_mi; @@ -5126,11 +5126,11 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const const material_t *mat = &sc.materials[first_mi]; while (mat->type == eShadingNode::Mix) { - simd_fvec _mix_val = 1.0f; + fvec _mix_val = 1.0f; const uint32_t first_t = mat->textures[BASE_TEXTURE]; if (first_t != 0xffffffff) { - simd_fvec mix[4] = {}; + fvec mix[4] = {}; SampleBilinear(textures, first_t, uvs, {0}, tex_rand, same_mi, mix); if (first_t & TEX_YCOCG_BIT) { YCoCg_to_RGB(mix, mix); @@ -5164,7 +5164,7 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const } } - const simd_ivec _same_mi = (mat_index == first_mi); + const ivec _same_mi = (mat_index == first_mi); diff_mi = and_not(_same_mi, same_mi); same_mi = _same_mi; @@ -5184,16 +5184,16 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const } #if USE_PATH_TERMINATION - const simd_ivec can_terminate_path = get_transp_depth(r.depth) > min_transp_depth; + const ivec can_terminate_path = get_transp_depth(r.depth) > min_transp_depth; #else - const simd_ivec can_terminate_path = 0; + const ivec can_terminate_path = 0; #endif - const simd_fvec lum = max(r.c[0], max(r.c[1], r.c[2])); - const simd_fvec &p = mix_term_rand[1]; - simd_fvec q = 0.0f; + const fvec lum = max(r.c[0], max(r.c[1], r.c[2])); + const fvec &p = mix_term_rand[1]; + fvec q = 0.0f; where(can_terminate_path, q) = max(0.05f, 1.0f - lum); - const simd_ivec _terminate = + const ivec _terminate = simd_cast(p < q) | simd_cast(lum == 0.0f) | (get_transp_depth(r.depth) + 1 >= max_transp_depth); UNROLLED_FOR(i, 3, { @@ -5208,14 +5208,14 @@ void Ray::NS::IntersectScene(ray_data_t &r, const int min_transp_depth, const } } - const simd_fvec t = inter.t + HIT_BIAS; + const fvec t = inter.t + HIT_BIAS; UNROLLED_FOR(i, 3, 
{ where(keep_going, ro[i]) += r.d[i] * t; }) // discard current intersection where(keep_going, inter.v) = -1.0f; where(keep_going, inter.t) = t_val - inter.t; - where(keep_going, r.depth) += pack_depth(simd_ivec{0}, simd_ivec{0}, simd_ivec{0}, simd_ivec{1}); + where(keep_going, r.depth) += pack_depth(ivec{0}, ivec{0}, ivec{0}, ivec{1}); rand_dim += RAND_DIM_BOUNCE_COUNT; } @@ -5249,24 +5249,24 @@ void Ray::NS::TraceShadowRays(Span> rays, int max_transp_d for (int i = 0; i < rays.size(); ++i) { const shadow_ray_t &sh_r = rays[i]; - simd_fvec rc[3]; + fvec rc[3]; IntersectScene(sh_r, max_transp_depth, sc, root_index, rand_seq, rand_seed, iteration, textures, rc); if (sc.blocker_lights_count) { - const simd_fvec k = IntersectAreaLights(sh_r, sc.lights, sc.light_wnodes); + const fvec k = IntersectAreaLights(sh_r, sc.lights, sc.light_wnodes); UNROLLED_FOR(j, 3, { rc[j] *= k; }) } - const simd_fvec sum = rc[0] + rc[1] + rc[2]; + const fvec sum = rc[0] + rc[1] + rc[2]; UNROLLED_FOR(j, 3, { where(sum > limit, rc[j]) = safe_div_pos(rc[j] * limit, sum); }) - const simd_uvec x = sh_r.xy >> 16, y = sh_r.xy & 0x0000FFFF; + const uvec x = sh_r.xy >> 16, y = sh_r.xy & 0x0000FFFF; // TODO: match layouts! 
UNROLLED_FOR_S(i, S, { if (sh_r.mask.template get()) { auto old_val = - simd_fvec4(out_color[y.template get() * img_w + x.template get()].v, simd_mem_aligned); - old_val += simd_fvec4(rc[0].template get(), rc[1].template get(), rc[2].template get(), 0.0f); - old_val.store_to(out_color[y.template get() * img_w + x.template get()].v, simd_mem_aligned); + fvec4(out_color[y.template get() * img_w + x.template get()].v, vector_aligned); + old_val += fvec4(rc[0].template get(), rc[1].template get(), rc[2].template get(), 0.0f); + old_val.store_to(out_color[y.template get() * img_w + x.template get()].v, vector_aligned); } }) } @@ -5275,23 +5275,23 @@ void Ray::NS::TraceShadowRays(Span> rays, int max_transp_d template void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_depth, const scene_data_t &sc, const uint32_t node_index, const uint32_t rand_seq[], const uint32_t rand_seed, - const int iteration, const Cpu::TexStorageBase *const textures[], simd_fvec rc[3]) { - simd_fvec ro[3] = {r.o[0], r.o[1], r.o[2]}; + const int iteration, const Cpu::TexStorageBase *const textures[], fvec rc[3]) { + fvec ro[3] = {r.o[0], r.o[1], r.o[2]}; UNROLLED_FOR(i, 3, { rc[i] = r.c[i]; }) - simd_fvec dist = select(r.dist >= 0.0f, r.dist, simd_fvec{MAX_DIST}); - simd_ivec depth = get_transp_depth(r.depth); + fvec dist = select(r.dist >= 0.0f, r.dist, fvec{MAX_DIST}); + ivec depth = get_transp_depth(r.depth); - const simd_uvec px_hash = hash(r.xy); - const simd_uvec rand_hash = hash_combine(px_hash, rand_seed); + const uvec px_hash = hash(r.xy); + const uvec rand_hash = hash_combine(px_hash, rand_seed); - auto rand_dim = simd_uvec(RAND_DIM_BASE_COUNT + get_total_depth(r.depth) * RAND_DIM_BOUNCE_COUNT); + auto rand_dim = uvec(RAND_DIM_BASE_COUNT + get_total_depth(r.depth) * RAND_DIM_BOUNCE_COUNT); - simd_ivec keep_going = simd_cast(dist > HIT_EPS) & r.mask; + ivec keep_going = simd_cast(dist > HIT_EPS) & r.mask; while (keep_going.not_all_zeros()) { hit_data_t inter; inter.t 
= dist; - simd_ivec solid_hit; + ivec solid_hit; if (sc.wnodes) { solid_hit = Traverse_TLAS_WithStack_AnyHit(ro, r.d, RAY_TYPE_SHADOW, keep_going, sc.wnodes, node_index, sc.mesh_instances, sc.mi_indices, sc.meshes, sc.mtris, @@ -5302,7 +5302,7 @@ void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_dept sc.tri_materials, sc.tri_indices, inter); } - const simd_ivec terminate_mask = solid_hit | (depth > max_transp_depth); + const ivec terminate_mask = solid_hit | (depth > max_transp_depth); UNROLLED_FOR(i, 3, { where(terminate_mask, rc[i]) = 0.0f; }) keep_going &= simd_cast(inter.v >= 0.0f) & ~terminate_mask; @@ -5310,43 +5310,43 @@ void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_dept break; } - const simd_fvec w = 1.0f - inter.u - inter.v; + const fvec w = 1.0f - inter.u - inter.v; - simd_ivec tri_index = inter.prim_index; - const simd_ivec is_backfacing = (tri_index < 0); + ivec tri_index = inter.prim_index; + const ivec is_backfacing = (tri_index < 0); where(is_backfacing, tri_index) = -tri_index - 1; - const simd_ivec vtx_indices[3] = {gather(reinterpret_cast(sc.vtx_indices + 0), tri_index * 3), + const ivec vtx_indices[3] = {gather(reinterpret_cast(sc.vtx_indices + 0), tri_index * 3), gather(reinterpret_cast(sc.vtx_indices + 1), tri_index * 3), gather(reinterpret_cast(sc.vtx_indices + 2), tri_index * 3)}; - simd_fvec sh_uvs[2]; + fvec sh_uvs[2]; { // Fetch vertex uvs const float *vtx_uvs = &sc.vertices[0].t[0]; const int VtxUVsStride = sizeof(vertex_t) / sizeof(float); UNROLLED_FOR(i, 2, { - const simd_fvec temp1 = gather(vtx_uvs + i, vtx_indices[0] * VtxUVsStride); - const simd_fvec temp2 = gather(vtx_uvs + i, vtx_indices[1] * VtxUVsStride); - const simd_fvec temp3 = gather(vtx_uvs + i, vtx_indices[2] * VtxUVsStride); + const fvec temp1 = gather(vtx_uvs + i, vtx_indices[0] * VtxUVsStride); + const fvec temp2 = gather(vtx_uvs + i, vtx_indices[1] * VtxUVsStride); + const fvec temp3 = gather(vtx_uvs + i, vtx_indices[2] 
* VtxUVsStride); sh_uvs[i] = temp1 * w + temp2 * inter.u + temp3 * inter.v; }) } - simd_fvec tex_rand[2]; + fvec tex_rand[2]; get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq, tex_rand); - simd_ivec mat_index = gather(reinterpret_cast(sc.tri_materials), tri_index) & - simd_ivec((MATERIAL_INDEX_BITS << 16) | MATERIAL_INDEX_BITS); + ivec mat_index = gather(reinterpret_cast(sc.tri_materials), tri_index) & + ivec((MATERIAL_INDEX_BITS << 16) | MATERIAL_INDEX_BITS); where(~is_backfacing, mat_index) = mat_index & 0xffff; // use front material index where(is_backfacing, mat_index) = mat_index >> 16; // use back material index where(inter.v < 0.0f, mat_index) = 0xffff; { // resolve material - simd_ivec ray_queue[S]; + ivec ray_queue[S]; int index = 0, num = 1; ray_queue[0] = simd_cast(inter.v >= 0.0f); @@ -5355,8 +5355,8 @@ void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_dept const int mask = ray_queue[index].movemask(); const uint32_t first_mi = mat_index[GetFirstBit(mask)]; - simd_ivec same_mi = (mat_index == first_mi); - simd_ivec diff_mi = and_not(same_mi, ray_queue[index]); + ivec same_mi = (mat_index == first_mi); + ivec diff_mi = and_not(same_mi, ray_queue[index]); if (diff_mi.not_all_zeros()) { ray_queue[num] = diff_mi; @@ -5366,24 +5366,24 @@ void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_dept if (first_mi != 0xffff) { struct { uint32_t index; - simd_fvec weight; + fvec weight; } stack[16]; int stack_size = 0; stack[stack_size++] = {first_mi, 1.0f}; - simd_fvec throughput[3] = {}; + fvec throughput[3] = {}; while (stack_size--) { const material_t *mat = &sc.materials[stack[stack_size].index]; - const simd_fvec weight = stack[stack_size].weight; + const fvec weight = stack[stack_size].weight; // resolve mix material if (mat->type == eShadingNode::Mix) { - simd_fvec mix_val = mat->strength; + fvec mix_val = mat->strength; const uint32_t first_t = mat->textures[BASE_TEXTURE]; if 
(first_t != 0xffffffff) { - simd_fvec mix[4] = {}; + fvec mix[4] = {}; SampleBilinear(textures, first_t, sh_uvs, {0}, tex_rand, same_mi, mix); if (first_t & TEX_YCOCG_BIT) { YCoCg_to_RGB(mix, mix); @@ -5408,7 +5408,7 @@ void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_dept } } - simd_fvec t = inter.t + HIT_BIAS; + fvec t = inter.t + HIT_BIAS; UNROLLED_FOR(i, 3, { ro[i] += r.d[i] * t; }) dist -= t; @@ -5423,20 +5423,20 @@ void Ray::NS::IntersectScene(const shadow_ray_t &r, const int max_transp_dept // Pick point on any light source for evaluation template -void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3], const simd_fvec B[3], - const simd_fvec N[3], const scene_data_t &sc, - const Cpu::TexStorageBase *const textures[], const simd_fvec &rand_pick_light, - const simd_fvec rand_light_uv[2], const simd_fvec rand_tex_uv[2], - simd_ivec ray_mask, light_sample_t &ls) { - simd_fvec ri = rand_pick_light; +void Ray::NS::SampleLightSource(const fvec P[3], const fvec T[3], const fvec B[3], + const fvec N[3], const scene_data_t &sc, + const Cpu::TexStorageBase *const textures[], const fvec &rand_pick_light, + const fvec rand_light_uv[2], const fvec rand_tex_uv[2], + ivec ray_mask, light_sample_t &ls) { + fvec ri = rand_pick_light; #if USE_HIERARCHICAL_NEE - simd_ivec light_index = -1; - simd_fvec factor = 1.0f; + ivec light_index = -1; + fvec factor = 1.0f; { // Traverse light tree structure struct { - simd_ivec mask; + ivec mask; uint32_t i; } queue[S]; @@ -5447,10 +5447,10 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] while (index != num) { uint32_t i = queue[index].i; while (!is_leaf_node(sc.light_wnodes[i])) { - simd_fvec importance[8]; + fvec importance[8]; calc_lnode_importance(sc.light_wnodes[i], P, importance); - simd_fvec total_importance = importance[0]; + fvec total_importance = importance[0]; UNROLLED_FOR(j, 7, { total_importance += importance[j + 1]; }) queue[index].mask &= 
simd_cast(total_importance > 0.0f); @@ -5459,22 +5459,22 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] break; } - simd_fvec factors[8]; + fvec factors[8]; UNROLLED_FOR(j, 8, { factors[j] = safe_div_pos(importance[j], total_importance); }) - simd_fvec factors_cdf[9] = {}; + fvec factors_cdf[9] = {}; UNROLLED_FOR(j, 8, { factors_cdf[j + 1] = factors_cdf[j] + factors[j]; }) // make sure cdf ends with 1.0 UNROLLED_FOR(j, 8, { where(factors_cdf[j + 1] == factors_cdf[8], factors_cdf[j + 1]) = 1.01f; }) - simd_ivec next = 0; + ivec next = 0; UNROLLED_FOR(j, 8, { where(factors_cdf[j + 1] <= ri, next) += 1; }) assert((next >= 8).all_zeros()); const int first_next = next[GetFirstBit(queue[index].mask.movemask())]; - const simd_ivec same_next = (next == first_next); - const simd_ivec diff_next = and_not(same_next, queue[index].mask); + const ivec same_next = (next == first_next); + const ivec diff_next = and_not(same_next, queue[index].mask); if (diff_next.not_all_zeros()) { queue[index].mask &= same_next; @@ -5500,13 +5500,13 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] } factor = 1.0f / factor; #else - simd_ivec light_index = min(simd_ivec{ri * float(sc.li_indices.size())}, int(sc.li_indices.size() - 1)); - ri = ri * float(sc.li_indices.size()) - simd_fvec(light_index); + ivec light_index = min(ivec{ri * float(sc.li_indices.size())}, int(sc.li_indices.size() - 1)); + ri = ri * float(sc.li_indices.size()) - fvec(light_index); light_index = gather(reinterpret_cast(sc.li_indices.data()), light_index); - const simd_fvec factor = float(sc.li_indices.size()); + const fvec factor = float(sc.li_indices.size()); #endif - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = ray_mask; int index = 0, num = 1; @@ -5514,8 +5514,8 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] const long mask = ray_queue[index].movemask(); const uint32_t first_li = light_index[GetFirstBit(mask)]; - const 
simd_ivec same_li = (light_index == first_li); - const simd_ivec diff_li = and_not(same_li, ray_queue[index]); + const ivec same_li = (light_index == first_li); + const ivec diff_li = and_not(same_li, ray_queue[index]); if (diff_li.not_all_zeros()) { ray_queue[index] &= same_li; @@ -5528,24 +5528,24 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] where(ray_queue[index], ls.cast_shadow) = l.cast_shadow ? -1 : 0; if (l.type == LIGHT_TYPE_SPHERE) { - const simd_fvec r1 = rand_light_uv[0], r2 = rand_light_uv[1]; + const fvec r1 = rand_light_uv[0], r2 = rand_light_uv[1]; const float *center = l.sph.pos; - const simd_fvec surface_to_center[3] = {center[0] - P[0], center[1] - P[1], center[2] - P[2]}; - simd_fvec sampled_dir[3]; + const fvec surface_to_center[3] = {center[0] - P[0], center[1] - P[1], center[2] - P[2]}; + fvec sampled_dir[3]; map_to_cone(r1, r2, surface_to_center, l.sph.radius, sampled_dir); - const simd_fvec disk_dist = normalize(sampled_dir); + const fvec disk_dist = normalize(sampled_dir); if (l.sph.radius > 0.0f) { - const simd_fvec ls_dist = sphere_intersection(center, l.sph.radius, P, sampled_dir); + const fvec ls_dist = sphere_intersection(center, l.sph.radius, P, sampled_dir); - const simd_fvec light_surf_pos[3] = { + const fvec light_surf_pos[3] = { P[0] + sampled_dir[0] * ls_dist, P[1] + sampled_dir[1] * ls_dist, P[2] + sampled_dir[2] * ls_dist}; - simd_fvec light_forward[3] = {light_surf_pos[0] - center[0], light_surf_pos[1] - center[1], + fvec light_forward[3] = {light_surf_pos[0] - center[0], light_surf_pos[1] - center[1], light_surf_pos[2] - center[2]}; normalize(light_forward); - simd_fvec lp_biased[3]; + fvec lp_biased[3]; offset_ray(light_surf_pos, light_forward, lp_biased); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.lp[i]) = lp_biased[i]; }) @@ -5563,17 +5563,17 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] } if (l.sph.spot > 0.0f) { - simd_fvec _dot = + fvec _dot = 
min(-ls.L[0] * l.sph.dir[0] - ls.L[1] * l.sph.dir[1] - ls.L[2] * l.sph.dir[2], 1.0f); - simd_ivec mask = simd_cast(_dot > 0.0f); + ivec mask = simd_cast(_dot > 0.0f); if (mask.not_all_zeros()) { - simd_fvec _angle = 0.0f; + fvec _angle = 0.0f; UNROLLED_FOR_S(i, S, { if (mask.template get()) { _angle.template set(acosf(_dot.template get())); } }) - const simd_fvec k = saturate((l.sph.spot - _angle) / l.sph.blend); + const fvec k = saturate((l.sph.spot - _angle) / l.sph.blend); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.col[i]) *= k; }) } UNROLLED_FOR(i, 3, { where(~mask & ray_queue[index], ls.col[i]) = 0.0f; }) @@ -5586,14 +5586,14 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] if (l.dir.angle != 0.0f) { const float radius = tanf(l.dir.angle); - simd_fvec V[3]; + fvec V[3]; map_to_cone(rand_light_uv[0], rand_light_uv[1], ls.L, radius, V); safe_normalize(V); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.L[i]) = V[i]; }) where(ray_queue[index], ls.area) = PI * radius * radius; - const simd_fvec cos_theta = dot3(ls.L, l.dir.dir); + const fvec cos_theta = dot3(ls.L, l.dir.dir); where(ray_queue[index], ls.pdf) = safe_div_pos(1.0f, ls.area * cos_theta); } UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.lp[i]) = P[i] + ls.L[i]; }) @@ -5602,29 +5602,29 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] where(ray_queue[index], ls.area) = 0.0f; } } else if (l.type == LIGHT_TYPE_RECT) { - simd_fvec lp[3]; + fvec lp[3]; #if USE_SPHERICAL_AREA_LIGHT_SAMPLING - const simd_fvec vp[3] = {l.rect.pos[0], l.rect.pos[1], l.rect.pos[2]}, + const fvec vp[3] = {l.rect.pos[0], l.rect.pos[1], l.rect.pos[2]}, vu[3] = {l.rect.u[0], l.rect.u[1], l.rect.u[2]}, vv[3] = {l.rect.v[0], l.rect.v[1], l.rect.v[2]}; - simd_fvec pdf = SampleSphericalRectangle(P, vp, vu, vv, rand_light_uv, lp); - const simd_ivec invalid_pdf = ray_queue[index] & simd_cast(pdf <= 0.0f); + fvec pdf = SampleSphericalRectangle(P, vp, vu, vv, rand_light_uv, lp); 
+ const ivec invalid_pdf = ray_queue[index] & simd_cast(pdf <= 0.0f); if (invalid_pdf.not_all_zeros()) #endif { - const simd_fvec r1 = rand_light_uv[0] - 0.5f, r2 = rand_light_uv[1] - 0.5f; + const fvec r1 = rand_light_uv[0] - 0.5f, r2 = rand_light_uv[1] - 0.5f; UNROLLED_FOR(i, 3, { lp[i] = l.rect.pos[i] + l.rect.u[i] * r1 + l.rect.v[i] * r2; }) } - simd_fvec to_light[3]; + fvec to_light[3]; UNROLLED_FOR(i, 3, { to_light[i] = lp[i] - P[i]; }) - const simd_fvec ls_dist = normalize(to_light); + const fvec ls_dist = normalize(to_light); float light_forward[3]; cross(l.rect.u, l.rect.v, light_forward); normalize(light_forward); - simd_fvec lp_biased[3], _light_forward[3] = {light_forward[0], light_forward[1], light_forward[2]}; + fvec lp_biased[3], _light_forward[3] = {light_forward[0], light_forward[1], light_forward[2]}; offset_ray(lp, _light_forward, lp_biased); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.lp[i]) = lp_biased[i]; @@ -5633,7 +5633,7 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] where(ray_queue[index], ls.area) = l.rect.area; - const simd_fvec cos_theta = + const fvec cos_theta = -ls.L[0] * light_forward[0] - ls.L[1] * light_forward[1] - ls.L[2] * light_forward[2]; where(invalid_pdf, pdf) = safe_div_pos(ls_dist * ls_dist, ls.area * cos_theta); where(cos_theta <= 0.0f, pdf) = 0.0f; @@ -5644,9 +5644,9 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] } if (l.sky_portal != 0) { - simd_fvec env_col[3] = {sc.env.env_col[0], sc.env.env_col[1], sc.env.env_col[2]}; + fvec env_col[3] = {sc.env.env_col[0], sc.env.env_col[1], sc.env.env_col[2]}; if (sc.env.env_map != 0xffffffff) { - simd_fvec tex_col[3]; + fvec tex_col[3]; SampleLatlong_RGBE(*static_cast(textures[0]), sc.env.env_map, ls.L, sc.env.env_map_rotation, rand_tex_uv, ray_queue[index], tex_col); UNROLLED_FOR(i, 3, { env_col[i] *= tex_col[i]; }) @@ -5655,10 +5655,10 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] 
where(ray_queue[index], ls.from_env) = -1; } } else if (l.type == LIGHT_TYPE_DISK) { - simd_fvec offset[2] = {2.0f * rand_light_uv[0] - 1.0f, 2.0f * rand_light_uv[1] - 1.0f}; - const simd_ivec mask = simd_cast(offset[0] != 0.0f & offset[1] != 0.0f); + fvec offset[2] = {2.0f * rand_light_uv[0] - 1.0f, 2.0f * rand_light_uv[1] - 1.0f}; + const ivec mask = simd_cast(offset[0] != 0.0f & offset[1] != 0.0f); if (mask.not_all_zeros()) { - simd_fvec theta = 0.5f * PI - 0.25f * PI * safe_div(offset[0], offset[1]), r = offset[1]; + fvec theta = 0.5f * PI - 0.25f * PI * safe_div(offset[0], offset[1]), r = offset[1]; where(abs(offset[0]) > abs(offset[1]), r) = offset[0]; where(abs(offset[0]) > abs(offset[1]), theta) = 0.25f * PI * safe_div(offset[1], offset[0]); @@ -5667,13 +5667,13 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] where(mask, offset[1]) = 0.5f * r * sin(theta); } - const simd_fvec lp[3] = {l.disk.pos[0] + l.disk.u[0] * offset[0] + l.disk.v[0] * offset[1], + const fvec lp[3] = {l.disk.pos[0] + l.disk.u[0] * offset[0] + l.disk.v[0] * offset[1], l.disk.pos[1] + l.disk.u[1] * offset[0] + l.disk.v[1] * offset[1], l.disk.pos[2] + l.disk.u[2] * offset[0] + l.disk.v[2] * offset[1]}; - simd_fvec to_light[3]; + fvec to_light[3]; UNROLLED_FOR(i, 3, { to_light[i] = lp[i] - P[i]; }) - const simd_fvec ls_dist = normalize(to_light); + const fvec ls_dist = normalize(to_light); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.L[i]) = to_light[i]; }) where(ray_queue[index], ls.area) = l.disk.area; @@ -5682,13 +5682,13 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] cross(l.disk.u, l.disk.v, light_forward); normalize(light_forward); - simd_fvec lp_biased[3], _light_forward[3] = {light_forward[0], light_forward[1], light_forward[2]}; + fvec lp_biased[3], _light_forward[3] = {light_forward[0], light_forward[1], light_forward[2]}; offset_ray(lp, _light_forward, lp_biased); UNROLLED_FOR(i, 3, { where(ray_queue[index], 
ls.lp[i]) = lp_biased[i]; }) - const simd_fvec cos_theta = + const fvec cos_theta = -ls.L[0] * light_forward[0] - ls.L[1] * light_forward[1] - ls.L[2] * light_forward[2]; - simd_fvec pdf = safe_div_pos(ls_dist * ls_dist, ls.area * cos_theta); + fvec pdf = safe_div_pos(ls_dist * ls_dist, ls.area * cos_theta); where(cos_theta <= 0.0f, pdf) = 0.0f; where(ray_queue[index], ls.pdf) = pdf; @@ -5697,9 +5697,9 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] } if (l.sky_portal != 0) { - simd_fvec env_col[3] = {sc.env.env_col[0], sc.env.env_col[1], sc.env.env_col[2]}; + fvec env_col[3] = {sc.env.env_col[0], sc.env.env_col[1], sc.env.env_col[2]}; if (sc.env.env_map != 0xffffffff) { - simd_fvec tex_col[3]; + fvec tex_col[3]; SampleLatlong_RGBE(*static_cast(textures[0]), sc.env.env_map, ls.L, sc.env.env_map_rotation, rand_tex_uv, ray_queue[index], tex_col); UNROLLED_FOR(i, 3, { env_col[i] *= tex_col[i]; }) @@ -5708,43 +5708,43 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] where(ray_queue[index], ls.from_env) = -1; } } else if (l.type == LIGHT_TYPE_LINE) { - simd_fvec center_to_surface[3]; + fvec center_to_surface[3]; UNROLLED_FOR(i, 3, { center_to_surface[i] = P[i] - l.line.pos[i]; }) const float *light_dir = l.line.v; - simd_fvec light_u[3] = {center_to_surface[1] * light_dir[2] - center_to_surface[2] * light_dir[1], + fvec light_u[3] = {center_to_surface[1] * light_dir[2] - center_to_surface[2] * light_dir[1], center_to_surface[2] * light_dir[0] - center_to_surface[0] * light_dir[2], center_to_surface[0] * light_dir[1] - center_to_surface[1] * light_dir[0]}; normalize(light_u); - const simd_fvec light_v[3] = {light_u[1] * light_dir[2] - light_u[2] * light_dir[1], + const fvec light_v[3] = {light_u[1] * light_dir[2] - light_u[2] * light_dir[1], light_u[2] * light_dir[0] - light_u[0] * light_dir[2], light_u[0] * light_dir[1] - light_u[1] * light_dir[0]}; - const simd_fvec phi = PI * rand_light_uv[0]; - const 
simd_fvec cos_phi = cos(phi), sin_phi = sin(phi); + const fvec phi = PI * rand_light_uv[0]; + const fvec cos_phi = cos(phi), sin_phi = sin(phi); - const simd_fvec normal[3] = {cos_phi * light_u[0] - sin_phi * light_v[0], + const fvec normal[3] = {cos_phi * light_u[0] - sin_phi * light_v[0], cos_phi * light_u[1] - sin_phi * light_v[1], cos_phi * light_u[2] - sin_phi * light_v[2]}; - const simd_fvec lp[3] = { + const fvec lp[3] = { l.line.pos[0] + normal[0] * l.line.radius + (rand_light_uv[1] - 0.5f) * light_dir[0] * l.line.height, l.line.pos[1] + normal[1] * l.line.radius + (rand_light_uv[1] - 0.5f) * light_dir[1] * l.line.height, l.line.pos[2] + normal[2] * l.line.radius + (rand_light_uv[1] - 0.5f) * light_dir[2] * l.line.height}; UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.lp[i]) = lp[i]; }) - simd_fvec to_light[3]; + fvec to_light[3]; UNROLLED_FOR(i, 3, { to_light[i] = lp[i] - P[i]; }) - const simd_fvec ls_dist = normalize(to_light); + const fvec ls_dist = normalize(to_light); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.L[i]) = to_light[i]; }) where(ray_queue[index], ls.area) = l.line.area; - const simd_fvec cos_theta = 1.0f - abs(dot3(ls.L, light_dir)); - simd_fvec pdf = safe_div_pos(ls_dist * ls_dist, ls.area * cos_theta); + const fvec cos_theta = 1.0f - abs(dot3(ls.L, light_dir)); + fvec pdf = safe_div_pos(ls_dist * ls_dist, ls.area * cos_theta); where(cos_theta == 0.0f, pdf) = 0.0f; where(ray_queue[index], ls.pdf) = pdf; @@ -5764,7 +5764,7 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] TransformPoint(v2.p, lmi.xform, p2); TransformPoint(v3.p, lmi.xform, p3); - const simd_fvec vp1[3] = {p1[0], p1[1], p1[2]}, vp2[3] = {p2[0], p2[1], p2[2]}, + const fvec vp1[3] = {p1[0], p1[1], p1[2]}, vp2[3] = {p2[0], p2[1], p2[2]}, vp3[3] = {p3[0], p3[1], p3[2]}; const float e1[3] = {p2[0] - p1[0], p2[1] - p1[1], p2[2] - p1[2]}, @@ -5778,26 +5778,26 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] 
where(ray_queue[index], ls.area) = 0.5f * light_fwd_len; UNROLLED_FOR(i, 3, { light_forward[i] /= light_fwd_len; }) - simd_fvec lp[3] = {}; - simd_fvec luvs[2] = {}; - simd_fvec pdf = {}; + fvec lp[3] = {}; + fvec luvs[2] = {}; + fvec pdf = {}; #if USE_SPHERICAL_AREA_LIGHT_SAMPLING // Spherical triangle sampling - simd_fvec dir[3]; + fvec dir[3]; pdf = SampleSphericalTriangle(P, vp1, vp2, vp3, rand_light_uv, dir); - const simd_ivec pdf_positive = ray_queue[index] & simd_cast(pdf > 0.0f); + const ivec pdf_positive = ray_queue[index] & simd_cast(pdf > 0.0f); if (pdf_positive.not_all_zeros()) { // find u, v, t of intersection point - simd_fvec pvec[3]; + fvec pvec[3]; cross(dir, e2, pvec); - simd_fvec tvec[3]; + fvec tvec[3]; UNROLLED_FOR(i, 3, { tvec[i] = P[i] - p1[i]; }) - simd_fvec qvec[3]; + fvec qvec[3]; cross(tvec, e1, qvec); - const simd_fvec inv_det = safe_div(simd_fvec{1.0f}, dot3(e1, pvec)); - const simd_fvec tri_u = dot3(tvec, pvec) * inv_det, tri_v = dot3(dir, qvec) * inv_det; + const fvec inv_det = safe_div(fvec{1.0f}, dot3(e1, pvec)); + const fvec tri_u = dot3(tvec, pvec) * inv_det, tri_v = dot3(dir, qvec) * inv_det; UNROLLED_FOR(i, 3, { where(pdf_positive, lp[i]) = (1.0f - tri_u - tri_v) * p1[i] + tri_u * p2[i] + tri_v * p3[i]; @@ -5808,13 +5808,13 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] UNROLLED_FOR(i, 3, { where(pdf_positive, ls.L[i]) = dir[i]; }) } - const simd_ivec pdf_negative = ray_queue[index] & ~pdf_positive; + const ivec pdf_negative = ray_queue[index] & ~pdf_positive; #else // USE_SPHERICAL_AREA_LIGHT_SAMPLING - const simd_ivec pdf_negative = -1; + const ivec pdf_negative = -1; #endif // USE_SPHERICAL_AREA_LIGHT_SAMPLING if (pdf_negative.not_all_zeros()) { // Simple area sampling - const simd_fvec r1 = sqrt(rand_light_uv[0]), r2 = rand_light_uv[1]; + const fvec r1 = sqrt(rand_light_uv[0]), r2 = rand_light_uv[1]; UNROLLED_FOR(i, 2, { where(pdf_negative, luvs[i]) = v1.t[i] * (1.0f - r1) + r1 * (v2.t[i] * 
(1.0f - r2) + v3.t[i] * r2); @@ -5823,29 +5823,29 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] where(pdf_negative, lp[i]) = p1[i] * (1.0f - r1) + r1 * (p2[i] * (1.0f - r2) + p3[i] * r2); }) - simd_fvec to_light[3] = {lp[0] - P[0], lp[1] - P[1], lp[2] - P[2]}; - const simd_fvec ls_dist = normalize(to_light); + fvec to_light[3] = {lp[0] - P[0], lp[1] - P[1], lp[2] - P[2]}; + const fvec ls_dist = normalize(to_light); UNROLLED_FOR(i, 3, { where(pdf_negative, ls.L[i]) = to_light[i]; }) - const simd_fvec cos_theta = -dot3(ls.L, light_forward); + const fvec cos_theta = -dot3(ls.L, light_forward); where(pdf_negative, pdf) = safe_div_pos(ls_dist * ls_dist, ls.area * cos_theta); } - simd_fvec cos_theta = -dot3(ls.L, light_forward); + fvec cos_theta = -dot3(ls.L, light_forward); - simd_fvec lp_biased[3], vlight_forward[3] = {light_forward[0], light_forward[1], light_forward[2]}; + fvec lp_biased[3], vlight_forward[3] = {light_forward[0], light_forward[1], light_forward[2]}; UNROLLED_FOR(i, 3, { where(cos_theta < 0.0f, vlight_forward[i]) = -vlight_forward[i]; }) offset_ray(lp, vlight_forward, lp_biased); UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.lp[i]) = lp_biased[i]; }) if (l.doublesided) { cos_theta = abs(cos_theta); } - simd_ivec accept = simd_cast(cos_theta > 0.0f) & ray_queue[index]; + ivec accept = simd_cast(cos_theta > 0.0f) & ray_queue[index]; if (accept.not_all_zeros()) { where(accept, ls.pdf) = pdf; if (l.tri.tex_index != 0xffffffff) { - simd_fvec tex_col[4] = {}; - SampleBilinear(textures, l.tri.tex_index, luvs, simd_ivec{0}, rand_tex_uv, accept, tex_col); + fvec tex_col[4] = {}; + SampleBilinear(textures, l.tri.tex_index, luvs, ivec{0}, rand_tex_uv, accept, tex_col); if (l.tri.tex_index & TEX_YCOCG_BIT) { YCoCg_to_RGB(tex_col, tex_col); } @@ -5856,27 +5856,27 @@ void Ray::NS::SampleLightSource(const simd_fvec P[3], const simd_fvec T[3] } } } else if (l.type == LIGHT_TYPE_ENV) { - simd_fvec dir_and_pdf[4]; + fvec 
dir_and_pdf[4]; if (sc.env.qtree_levels) { // Sample environment using quadtree - const auto *qtree_mips = reinterpret_cast(sc.env.qtree_mips); + const auto *qtree_mips = reinterpret_cast(sc.env.qtree_mips); Sample_EnvQTree(sc.env.env_map_rotation, qtree_mips, sc.env.qtree_levels, ri, rand_light_uv[0], rand_light_uv[1], dir_and_pdf); } else { // Sample environment as hemishpere - const simd_fvec phi = 2 * PI * rand_light_uv[1]; - const simd_fvec cos_phi = cos(phi), sin_phi = sin(phi); - const simd_fvec dir = sqrt(1.0f - rand_light_uv[0] * rand_light_uv[0]); + const fvec phi = 2 * PI * rand_light_uv[1]; + const fvec cos_phi = cos(phi), sin_phi = sin(phi); + const fvec dir = sqrt(1.0f - rand_light_uv[0] * rand_light_uv[0]); - const simd_fvec V[3] = {dir * cos_phi, dir * sin_phi, rand_light_uv[0]}; // in tangent-space + const fvec V[3] = {dir * cos_phi, dir * sin_phi, rand_light_uv[0]}; // in tangent-space world_from_tangent(T, B, N, V, dir_and_pdf); dir_and_pdf[3] = 0.5f / PI; } UNROLLED_FOR(i, 3, { where(ray_queue[index], ls.L[i]) = dir_and_pdf[i]; }) - simd_fvec tex_col[3] = {1.0f, 1.0f, 1.0f}; + fvec tex_col[3] = {1.0f, 1.0f, 1.0f}; if (sc.env.env_map != 0xffffffff) { SampleLatlong_RGBE(*static_cast(textures[0]), sc.env.env_map, ls.L, sc.env.env_map_rotation, rand_tex_uv, ray_queue[index], tex_col); @@ -5901,13 +5901,13 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li Span nodes, hit_data_t &inout_inter) { const int SS = S <= 8 ? 
S : 8; - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(r.o, r.d, inv_d, inv_d_o); alignas(S * 4) int ray_masks[S]; alignas(S * 4) float inter_t[S]; - r.mask.store_to(ray_masks, simd_mem_aligned); - inout_inter.t.store_to(inter_t, simd_mem_aligned); + r.mask.store_to(ray_masks, vector_aligned); + inout_inter.t.store_to(inter_t, vector_aligned); for (int ri = 0; ri < S; ri++) { if (!ray_masks[ri]) { @@ -5935,10 +5935,10 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li long mask = bbox_test_oct(_inv_d, _inv_d_o, inter_t[ri], nodes[cur.index].bbox_min, nodes[cur.index].bbox_max, res_dist); if (mask) { - simd_fvec importance[8 / SS]; + fvec importance[8 / SS]; calc_lnode_importance(nodes[cur.index], _ro, value_ptr(importance[0])); - simd_fvec total_importance_v = 0.0f; + fvec total_importance_v = 0.0f; UNROLLED_FOR_S(i, 8 / SS, { total_importance_v += importance[i]; }) const float total_importance = hsum(total_importance_v); assert(total_importance > 0.0f); @@ -5946,7 +5946,7 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li alignas(32) float factors[8]; UNROLLED_FOR_S(i, 8 / SS, { importance[i] /= total_importance; - importance[i].store_to(&factors[i * SS], simd_mem_aligned); + importance[i].store_to(&factors[i * SS], vector_aligned); }) long i = GetFirstBit(mask); @@ -6016,35 +6016,35 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li } // Portal lights affect only missed rays // TODO: actually process multiple rays - simd_ivec ray_mask = 0; + ivec ray_mask = 0; ray_mask.set(ri, -1); - ray_mask &= ~(simd_ivec{l.sky_portal ? -1 : 0} & simd_cast(inout_inter.v >= 0.0f)); + ray_mask &= ~(ivec{l.sky_portal ? -1 : 0} & simd_cast(inout_inter.v >= 0.0f)); if (ray_mask.all_zeros()) { continue; } - const simd_fvec no_shadow = simd_cast(l.cast_shadow ? simd_ivec{0} : simd_ivec{-1}); + const fvec no_shadow = simd_cast(l.cast_shadow ? 
ivec{0} : ivec{-1}); if (l.type == LIGHT_TYPE_SPHERE) { - const simd_fvec op[3] = {l.sph.pos[0] - r.o[0], l.sph.pos[1] - r.o[1], l.sph.pos[2] - r.o[2]}; - const simd_fvec b = dot3(op, r.d); - simd_fvec det = b * b - dot3(op, op) + l.sph.radius * l.sph.radius; + const fvec op[3] = {l.sph.pos[0] - r.o[0], l.sph.pos[1] - r.o[1], l.sph.pos[2] - r.o[2]}; + const fvec b = dot3(op, r.d); + fvec det = b * b - dot3(op, op) + l.sph.radius * l.sph.radius; - simd_ivec imask = simd_cast(det >= 0.0f) & ray_mask; + ivec imask = simd_cast(det >= 0.0f) & ray_mask; if (imask.not_all_zeros()) { det = safe_sqrt(det); - const simd_fvec t1 = b - det, t2 = b + det; + const fvec t1 = b - det, t2 = b + det; - simd_fvec mask1 = (t1 > HIT_EPS) & ((t1 < inout_inter.t) | no_shadow) & simd_cast(imask); - const simd_fvec mask2 = + fvec mask1 = (t1 > HIT_EPS) & ((t1 < inout_inter.t) | no_shadow) & simd_cast(imask); + const fvec mask2 = (t2 > HIT_EPS) & ((t2 < inout_inter.t) | no_shadow) & simd_cast(imask) & ~mask1; if (l.sph.spot > 0.0f) { - const simd_fvec _dot = + const fvec _dot = min(-r.d[0] * l.sph.dir[0] - r.d[1] * l.sph.dir[1] - r.d[2] * l.sph.dir[2], 1.0f); mask1 &= (_dot > 0.0f); - const simd_ivec imask1 = simd_cast(mask1); + const ivec imask1 = simd_cast(mask1); if (imask1.not_all_zeros()) { - simd_fvec _angle = 0.0f; + fvec _angle = 0.0f; UNROLLED_FOR_S(i, S, { if (imask1.template get()) { _angle.template set(acosf(_dot.template get())); @@ -6055,21 +6055,21 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li } where(mask1 | mask2, inout_inter.v) = 0.0f; - where(mask1 | mask2, inout_inter.obj_index) = -simd_ivec(light_index) - 1; + where(mask1 | mask2, inout_inter.obj_index) = -ivec(light_index) - 1; where(mask1, inout_inter.t) = t1; where(mask2, inout_inter.t) = t2; where(mask1 | mask2, inout_inter.u) = cur.factor; - inout_inter.t.store_to(inter_t, simd_mem_aligned); + inout_inter.t.store_to(inter_t, vector_aligned); } } else if (l.type == LIGHT_TYPE_DIR) { - const 
simd_fvec cos_theta = dot3(r.d, l.dir.dir); - const simd_ivec imask = simd_cast(cos_theta > cosf(l.dir.angle)) & ray_mask & + const fvec cos_theta = dot3(r.d, l.dir.dir); + const ivec imask = simd_cast(cos_theta > cosf(l.dir.angle)) & ray_mask & (simd_cast(inout_inter.v < 0.0f) | simd_cast(no_shadow)); where(imask, inout_inter.v) = 0.0f; - where(imask, inout_inter.obj_index) = -simd_ivec(light_index) - 1; + where(imask, inout_inter.obj_index) = -ivec(light_index) - 1; where(imask, inout_inter.t) = safe_div_pos(1.0f, cos_theta); where(imask, inout_inter.u) = cur.factor; - inout_inter.t.store_to(inter_t, simd_mem_aligned); + inout_inter.t.store_to(inter_t, vector_aligned); } else if (l.type == LIGHT_TYPE_RECT) { float light_fwd[3]; cross(l.rect.u, l.rect.v, light_fwd); @@ -6077,30 +6077,30 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li const float plane_dist = dot3(light_fwd, l.rect.pos); - const simd_fvec cos_theta = dot3(r.d, light_fwd); - const simd_fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); + const fvec cos_theta = dot3(r.d, light_fwd); + const fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); - const simd_ivec imask = + const ivec imask = simd_cast((cos_theta < 0.0f) & (t > HIT_EPS) & ((t < inout_inter.t) | no_shadow)) & ray_mask; if (imask.not_all_zeros()) { const float dot_u = dot3(l.rect.u, l.rect.u); const float dot_v = dot3(l.rect.v, l.rect.v); - const simd_fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), + const fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), fmadd(r.d[2], t, r.o[2])}; - const simd_fvec vi[3] = {p[0] - l.rect.pos[0], p[1] - l.rect.pos[1], p[2] - l.rect.pos[2]}; + const fvec vi[3] = {p[0] - l.rect.pos[0], p[1] - l.rect.pos[1], p[2] - l.rect.pos[2]}; - const simd_fvec a1 = dot3(l.rect.u, vi) / dot_u; - const simd_fvec a2 = dot3(l.rect.v, vi) / dot_v; + const fvec a1 = dot3(l.rect.u, vi) / dot_u; + const fvec a2 = dot3(l.rect.v, vi) / dot_v; - 
const simd_fvec final_mask = + const fvec final_mask = (a1 >= -0.5f & a1 <= 0.5f) & (a2 >= -0.5f & a2 <= 0.5f) & simd_cast(imask); where(final_mask, inout_inter.v) = 0.0f; - where(final_mask, inout_inter.obj_index) = -simd_ivec(light_index) - 1; + where(final_mask, inout_inter.obj_index) = -ivec(light_index) - 1; where(final_mask, inout_inter.t) = t; where(final_mask, inout_inter.u) = cur.factor; - inout_inter.t.store_to(inter_t, simd_mem_aligned); + inout_inter.t.store_to(inter_t, vector_aligned); } } else if (l.type == LIGHT_TYPE_DISK) { float light_fwd[3]; @@ -6109,29 +6109,29 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li const float plane_dist = dot3(light_fwd, l.disk.pos); - const simd_fvec cos_theta = dot3(r.d, light_fwd); - const simd_fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); + const fvec cos_theta = dot3(r.d, light_fwd); + const fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); - const simd_ivec imask = + const ivec imask = simd_cast((cos_theta < 0.0f) & (t > HIT_EPS) & ((t < inout_inter.t) | no_shadow)) & ray_mask; if (imask.not_all_zeros()) { const float dot_u = dot3(l.disk.u, l.disk.u); const float dot_v = dot3(l.disk.v, l.disk.v); - const simd_fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), + const fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), fmadd(r.d[2], t, r.o[2])}; - const simd_fvec vi[3] = {p[0] - l.disk.pos[0], p[1] - l.disk.pos[1], p[2] - l.disk.pos[2]}; + const fvec vi[3] = {p[0] - l.disk.pos[0], p[1] - l.disk.pos[1], p[2] - l.disk.pos[2]}; - const simd_fvec a1 = dot3(l.disk.u, vi) / dot_u; - const simd_fvec a2 = dot3(l.disk.v, vi) / dot_v; + const fvec a1 = dot3(l.disk.u, vi) / dot_u; + const fvec a2 = dot3(l.disk.v, vi) / dot_v; - const simd_fvec final_mask = (sqrt(a1 * a1 + a2 * a2) <= 0.5f) & simd_cast(imask); + const fvec final_mask = (sqrt(a1 * a1 + a2 * a2) <= 0.5f) & simd_cast(imask); where(final_mask, inout_inter.v) = 0.0f; - 
where(final_mask, inout_inter.obj_index) = -simd_ivec(light_index) - 1; + where(final_mask, inout_inter.obj_index) = -ivec(light_index) - 1; where(final_mask, inout_inter.t) = t; where(final_mask, inout_inter.u) = cur.factor; - inout_inter.t.store_to(inter_t, simd_mem_aligned); + inout_inter.t.store_to(inter_t, vector_aligned); } } else if (l.type == LIGHT_TYPE_LINE) { const float *light_dir = l.line.v; @@ -6139,33 +6139,33 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li float light_v[3]; cross(l.line.u, light_dir, light_v); - simd_fvec _ro[3] = {r.o[0] - l.line.pos[0], r.o[1] - l.line.pos[1], r.o[2] - l.line.pos[2]}; - const simd_fvec ro[3] = {dot3(_ro, light_dir), dot3(_ro, l.line.u), dot3(_ro, light_v)}; - const simd_fvec rd[3] = {dot3(r.d, light_dir), dot3(r.d, l.line.u), dot3(r.d, light_v)}; + fvec _ro[3] = {r.o[0] - l.line.pos[0], r.o[1] - l.line.pos[1], r.o[2] - l.line.pos[2]}; + const fvec ro[3] = {dot3(_ro, light_dir), dot3(_ro, l.line.u), dot3(_ro, light_v)}; + const fvec rd[3] = {dot3(r.d, light_dir), dot3(r.d, l.line.u), dot3(r.d, light_v)}; - const simd_fvec A = rd[2] * rd[2] + rd[1] * rd[1]; - const simd_fvec B = 2.0f * (rd[2] * ro[2] + rd[1] * ro[1]); - const simd_fvec C = ro[2] * ro[2] + ro[1] * ro[1] - l.line.radius * l.line.radius; + const fvec A = rd[2] * rd[2] + rd[1] * rd[1]; + const fvec B = 2.0f * (rd[2] * ro[2] + rd[1] * ro[1]); + const fvec C = ro[2] * ro[2] + ro[1] * ro[1] - l.line.radius * l.line.radius; - simd_fvec t0, t1; - simd_ivec imask = quadratic(A, B, C, t0, t1); + fvec t0, t1; + ivec imask = quadratic(A, B, C, t0, t1); imask &= simd_cast(t0 > HIT_EPS) & simd_cast(t1 > HIT_EPS); - const simd_fvec t = min(t0, t1); - const simd_fvec p[3] = {fmadd(rd[0], t, ro[0]), fmadd(rd[1], t, ro[1]), fmadd(rd[2], t, ro[2])}; + const fvec t = min(t0, t1); + const fvec p[3] = {fmadd(rd[0], t, ro[0]), fmadd(rd[1], t, ro[1]), fmadd(rd[2], t, ro[2])}; imask &= simd_cast(abs(p[0]) < 0.5f * l.line.height) & simd_cast((t < 
inout_inter.t) | no_shadow) & ray_mask; where(imask, inout_inter.v) = 0.0f; - where(imask, inout_inter.obj_index) = -simd_ivec(light_index) - 1; + where(imask, inout_inter.obj_index) = -ivec(light_index) - 1; where(imask, inout_inter.t) = t; where(imask, inout_inter.u) = cur.factor; - inout_inter.t.store_to(inter_t, simd_mem_aligned); + inout_inter.t.store_to(inter_t, vector_aligned); } else if (l.type == LIGHT_TYPE_ENV) { // NOTE: mask remains empty where(simd_cast(inout_inter.v < 0.0f) & ray_mask, inout_inter.obj_index) = - -simd_ivec(light_index) - 1; + -ivec(light_index) - 1; where(simd_cast(inout_inter.v < 0.0f) & ray_mask, inout_inter.u) = cur.factor; } } @@ -6174,21 +6174,21 @@ void Ray::NS::IntersectAreaLights(const ray_data_t &r, Span li } template -Ray::NS::simd_fvec Ray::NS::IntersectAreaLights(const shadow_ray_t &r, Span lights, +Ray::NS::fvec Ray::NS::IntersectAreaLights(const shadow_ray_t &r, Span lights, Span nodes) { - simd_fvec inv_d[3], inv_d_o[3]; + fvec inv_d[3], inv_d_o[3]; comp_aux_inv_values(r.o, r.d, inv_d, inv_d_o); - const simd_fvec rdist = abs(r.dist); - const simd_ivec env_ray = simd_cast(r.dist < 0.0f); - simd_fvec ret = 1.0f; + const fvec rdist = abs(r.dist); + const ivec env_ray = simd_cast(r.dist < 0.0f); + fvec ret = 1.0f; - simd_ivec ray_mask = r.mask; + ivec ray_mask = r.mask; alignas(S * 4) int ray_masks[S]; alignas(S * 4) float inter_t[S]; - ray_mask.store_to(ray_masks, simd_mem_aligned); - rdist.store_to(inter_t, simd_mem_aligned); + ray_mask.store_to(ray_masks, vector_aligned); + rdist.store_to(inter_t, vector_aligned); for (int ri = 0; ri < S; ri++) { if (!ray_masks[ri]) { @@ -6277,7 +6277,7 @@ Ray::NS::simd_fvec Ray::NS::IntersectAreaLights(const shadow_ray_t &r, Spa if (!l.blocking) { continue; } - const simd_ivec portal_mask = l.sky_portal ? env_ray : -1; + const ivec portal_mask = l.sky_portal ? 
env_ray : -1; if (l.type == LIGHT_TYPE_RECT) { float light_fwd[3]; cross(l.rect.u, l.rect.v, light_fwd); @@ -6285,28 +6285,28 @@ Ray::NS::simd_fvec Ray::NS::IntersectAreaLights(const shadow_ray_t &r, Spa const float plane_dist = dot3(light_fwd, l.rect.pos); - const simd_fvec cos_theta = dot3(r.d, light_fwd); - const simd_fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); + const fvec cos_theta = dot3(r.d, light_fwd); + const fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); - const simd_ivec imask = + const ivec imask = simd_cast((cos_theta < 0.0f) & (t > HIT_EPS) & (t < rdist)) & portal_mask & ray_mask; if (imask.not_all_zeros()) { const float dot_u = dot3(l.rect.u, l.rect.u); const float dot_v = dot3(l.rect.v, l.rect.v); - const simd_fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), + const fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), fmadd(r.d[2], t, r.o[2])}; - const simd_fvec vi[3] = {p[0] - l.rect.pos[0], p[1] - l.rect.pos[1], p[2] - l.rect.pos[2]}; + const fvec vi[3] = {p[0] - l.rect.pos[0], p[1] - l.rect.pos[1], p[2] - l.rect.pos[2]}; - const simd_fvec a1 = dot3(l.rect.u, vi) / dot_u; - const simd_fvec a2 = dot3(l.rect.v, vi) / dot_v; + const fvec a1 = dot3(l.rect.u, vi) / dot_u; + const fvec a2 = dot3(l.rect.v, vi) / dot_v; - const simd_fvec final_mask = + const fvec final_mask = (a1 >= -0.5f & a1 <= 0.5f) & (a2 >= -0.5f & a2 <= 0.5f) & simd_cast(imask); ray_mask &= ~simd_cast(final_mask); where(final_mask, ret) = 0.0f; - ray_mask.store_to(ray_masks, simd_mem_aligned); + ray_mask.store_to(ray_masks, vector_aligned); } } else if (l.type == LIGHT_TYPE_DISK) { float light_fwd[3]; @@ -6315,27 +6315,27 @@ Ray::NS::simd_fvec Ray::NS::IntersectAreaLights(const shadow_ray_t &r, Spa const float plane_dist = dot3(light_fwd, l.disk.pos); - const simd_fvec cos_theta = dot3(r.d, light_fwd); - const simd_fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); + const fvec cos_theta = 
dot3(r.d, light_fwd); + const fvec t = safe_div_neg(plane_dist - dot3(light_fwd, r.o), cos_theta); - const simd_ivec imask = + const ivec imask = simd_cast((cos_theta < 0.0f) & (t > HIT_EPS) & (t < rdist)) & portal_mask & ray_mask; if (imask.not_all_zeros()) { const float dot_u = dot3(l.disk.u, l.disk.u); const float dot_v = dot3(l.disk.v, l.disk.v); - const simd_fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), + const fvec p[3] = {fmadd(r.d[0], t, r.o[0]), fmadd(r.d[1], t, r.o[1]), fmadd(r.d[2], t, r.o[2])}; - const simd_fvec vi[3] = {p[0] - l.disk.pos[0], p[1] - l.disk.pos[1], p[2] - l.disk.pos[2]}; + const fvec vi[3] = {p[0] - l.disk.pos[0], p[1] - l.disk.pos[1], p[2] - l.disk.pos[2]}; - const simd_fvec a1 = dot3(l.disk.u, vi) / dot_u; - const simd_fvec a2 = dot3(l.disk.v, vi) / dot_v; + const fvec a1 = dot3(l.disk.u, vi) / dot_u; + const fvec a2 = dot3(l.disk.v, vi) / dot_v; - const simd_fvec final_mask = (sqrt(a1 * a1 + a2 * a2) <= 0.5f) & simd_cast(imask); + const fvec final_mask = (sqrt(a1 * a1 + a2 * a2) <= 0.5f) & simd_cast(imask); ray_mask &= ~simd_cast(final_mask); where(final_mask, ret) = 0.0f; - ray_mask.store_to(ray_masks, simd_mem_aligned); + ray_mask.store_to(ray_masks, vector_aligned); } } } @@ -6346,12 +6346,12 @@ Ray::NS::simd_fvec Ray::NS::IntersectAreaLights(const shadow_ray_t &r, Spa } template -Ray::NS::simd_fvec Ray::NS::EvalTriLightFactor(const simd_fvec P[3], const simd_fvec ro[3], - const simd_ivec &mask, const simd_ivec &tri_index, +Ray::NS::fvec Ray::NS::EvalTriLightFactor(const fvec P[3], const fvec ro[3], + const ivec &mask, const ivec &tri_index, Span lights, Span nodes) { const int SS = S <= 8 ? 
S : 8; - simd_fvec ret = 1.0f; + fvec ret = 1.0f; for (int ri = 0; ri < S; ri++) { if (!mask[ri]) { @@ -6408,12 +6408,12 @@ Ray::NS::simd_fvec Ray::NS::EvalTriLightFactor(const simd_fvec P[3], const } template -void Ray::NS::Evaluate_EnvColor(const ray_data_t &ray, const simd_ivec &mask, const environment_t &env, - const Cpu::TexStorageRGBA &tex_storage, const simd_fvec &pdf_factor, - const simd_fvec rand[2], simd_fvec env_col[4]) { +void Ray::NS::Evaluate_EnvColor(const ray_data_t &ray, const ivec &mask, const environment_t &env, + const Cpu::TexStorageRGBA &tex_storage, const fvec &pdf_factor, + const fvec rand[2], fvec env_col[4]) { const uint32_t env_map = env.env_map; const float env_map_rotation = env.env_map_rotation; - const simd_ivec env_map_mask = is_indirect(ray.depth); + const ivec env_map_mask = is_indirect(ray.depth); if ((mask & env_map_mask).not_all_zeros()) { UNROLLED_FOR(i, 3, { env_col[i] = 1.0f; }) @@ -6421,23 +6421,23 @@ void Ray::NS::Evaluate_EnvColor(const ray_data_t &ray, const simd_ivec &ma SampleLatlong_RGBE(tex_storage, env_map, ray.d, env_map_rotation, rand, (mask & env_map_mask), env_col); } #if USE_NEE - const simd_ivec mis_mask = - simd_ivec((env.light_index != 0xffffffff) ? -1 : 0) & simd_cast(pdf_factor >= 0.0f) & env_map_mask; + const ivec mis_mask = + ivec((env.light_index != 0xffffffff) ? 
-1 : 0) & simd_cast(pdf_factor >= 0.0f) & env_map_mask; if (mis_mask.not_all_zeros()) { if (env.qtree_levels) { - const auto *qtree_mips = reinterpret_cast(env.qtree_mips); + const auto *qtree_mips = reinterpret_cast(env.qtree_mips); - const simd_fvec light_pdf = + const fvec light_pdf = safe_div_pos(Evaluate_EnvQTree(env_map_rotation, qtree_mips, env.qtree_levels, ray.d), pdf_factor); - const simd_fvec bsdf_pdf = ray.pdf; + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { where(mis_mask, env_col[i]) *= mis_weight; }) } else { - const simd_fvec light_pdf = safe_div_pos(0.5f, PI * pdf_factor); - const simd_fvec bsdf_pdf = ray.pdf; + const fvec light_pdf = safe_div_pos(0.5f, PI * pdf_factor); + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { where(mis_mask, env_col[i]) *= mis_weight; }) } } @@ -6447,10 +6447,10 @@ void Ray::NS::Evaluate_EnvColor(const ray_data_t &ray, const simd_ivec &ma const uint32_t back_map = env.back_map; const float back_map_rotation = env.back_map_rotation; - const simd_ivec back_map_mask = ~env_map_mask; + const ivec back_map_mask = ~env_map_mask; if (back_map != 0xffffffff && (mask & back_map_mask).not_all_zeros()) { - simd_fvec back_col[3] = {}; + fvec back_col[3] = {}; SampleLatlong_RGBE(tex_storage, back_map, ray.d, back_map_rotation, rand, (mask & back_map_mask), back_col); UNROLLED_FOR(i, 3, { where(back_map_mask, env_col[i]) = back_col[i]; }) } @@ -6458,17 +6458,17 @@ void Ray::NS::Evaluate_EnvColor(const ray_data_t &ray, const simd_ivec &ma } template -void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t &ray, const simd_ivec &mask, +void Ray::NS::Evaluate_LightColor(const fvec P[3], const ray_data_t &ray, const ivec &mask, const hit_data_t 
&inter, const environment_t &env, Span lights, const uint32_t lights_count, const Cpu::TexStorageRGBA &tex_storage, - const simd_fvec rand[2], simd_fvec light_col[3]) { + const fvec rand[2], fvec light_col[3]) { #if USE_HIERARCHICAL_NEE - const simd_fvec pdf_factor = safe_div_pos(1.0f, inter.u); + const fvec pdf_factor = safe_div_pos(1.0f, inter.u); #else const float pdf_factor = float(lights_count); #endif - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = mask; int index = 0, num = 1; @@ -6476,8 +6476,8 @@ void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t & const int mask = ray_queue[index].movemask(); const uint32_t first_li = inter.obj_index[GetFirstBit(mask)]; - const simd_ivec same_li = (inter.obj_index == first_li); - const simd_ivec diff_li = and_not(same_li, ray_queue[index]); + const ivec same_li = (inter.obj_index == first_li); + const ivec diff_li = and_not(same_li, ray_queue[index]); if (diff_li.not_all_zeros()) { ray_queue[index] &= same_li; @@ -6486,11 +6486,11 @@ void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t & const light_t &l = lights[-int(first_li) - 1]; - simd_fvec lcol[3] = {l.col[0], l.col[1], l.col[2]}; + fvec lcol[3] = {l.col[0], l.col[1], l.col[2]}; if (l.sky_portal) { UNROLLED_FOR(i, 3, { lcol[i] *= env.env_col[i]; }) if (env.env_map != 0xffffffff) { - simd_fvec tex_col[3]; + fvec tex_col[3]; SampleLatlong_RGBE(tex_storage, env.env_map, ray.d, env.env_map_rotation, rand, ray_queue[index], tex_col); UNROLLED_FOR(i, 3, { lcol[i] *= tex_col[i]; }) @@ -6498,23 +6498,23 @@ void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t & } if (l.type == LIGHT_TYPE_SPHERE) { - simd_fvec disk_normal[3] = {ray.o[0] - l.sph.pos[0], ray.o[1] - l.sph.pos[1], ray.o[2] - l.sph.pos[2]}; + fvec disk_normal[3] = {ray.o[0] - l.sph.pos[0], ray.o[1] - l.sph.pos[1], ray.o[2] - l.sph.pos[2]}; normalize(disk_normal); - const simd_fvec disk_dist = dot3(ray.o, disk_normal) - dot3(l.sph.pos, 
disk_normal); + const fvec disk_dist = dot3(ray.o, disk_normal) - dot3(l.sph.pos, disk_normal); - const simd_fvec light_pdf = + const fvec light_pdf = safe_div(disk_dist * disk_dist, PI * l.sph.radius * l.sph.radius * pdf_factor); - const simd_fvec bsdf_pdf = ray.pdf; + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { lcol[i] *= mis_weight; }) if (l.sph.spot > 0.0f && l.sph.blend > 0.0f) { - const simd_fvec _dot = + const fvec _dot = -(ray.d[0] * l.sph.dir[0] + ray.d[1] * l.sph.dir[1] + ray.d[2] * l.sph.dir[2]); assert((ray_queue[index] & simd_cast(_dot <= 0.0f)).all_zeros()); - simd_fvec _angle = 0.0f; + fvec _angle = 0.0f; UNROLLED_FOR_S(i, S, { if (ray_queue[index].template get()) { _angle.template set(acosf(_dot.template get())); @@ -6522,7 +6522,7 @@ void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t & }) assert((ray_queue[index] & simd_cast(_angle > l.sph.spot)).all_zeros()); if (l.sph.blend > 0.0f) { - const simd_fvec spot_weight = saturate((l.sph.spot - _angle) / l.sph.blend); + const fvec spot_weight = saturate((l.sph.spot - _angle) / l.sph.blend); UNROLLED_FOR(i, 3, { lcol[i] *= spot_weight; }) } } @@ -6530,53 +6530,53 @@ void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t & const float radius = tanf(l.dir.angle); const float light_area = PI * radius * radius; - const simd_fvec cos_theta = dot3(ray.d, l.dir.dir); + const fvec cos_theta = dot3(ray.d, l.dir.dir); - const simd_fvec light_pdf = safe_div(simd_fvec{1.0f}, light_area * cos_theta * pdf_factor); - const simd_fvec bsdf_pdf = ray.pdf; + const fvec light_pdf = safe_div(fvec{1.0f}, light_area * cos_theta * pdf_factor); + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { lcol[i] *= mis_weight; 
}) } else if (l.type == LIGHT_TYPE_RECT) { float light_fwd[3]; cross(l.rect.u, l.rect.v, light_fwd); normalize(light_fwd); - const simd_fvec cos_theta = dot3(ray.d, light_fwd); + const fvec cos_theta = dot3(ray.d, light_fwd); - simd_fvec light_pdf = 0.0f; + fvec light_pdf = 0.0f; #if USE_SPHERICAL_AREA_LIGHT_SAMPLING - const simd_fvec vp[3] = {l.rect.pos[0], l.rect.pos[1], l.rect.pos[2]}, + const fvec vp[3] = {l.rect.pos[0], l.rect.pos[1], l.rect.pos[2]}, vu[3] = {l.rect.u[0], l.rect.u[1], l.rect.u[2]}, vv[3] = {l.rect.v[0], l.rect.v[1], l.rect.v[2]}; light_pdf = SampleSphericalRectangle(ray.o, vp, vu, vv, nullptr, nullptr) / pdf_factor; #endif where(light_pdf == 0.0f, light_pdf) = safe_div(inter.t * inter.t, l.rect.area * cos_theta * pdf_factor); - const simd_fvec bsdf_pdf = ray.pdf; + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { lcol[i] *= mis_weight; }) } else if (l.type == LIGHT_TYPE_DISK) { float light_fwd[3]; cross(l.disk.u, l.disk.v, light_fwd); normalize(light_fwd); - const simd_fvec cos_theta = dot3(ray.d, light_fwd); + const fvec cos_theta = dot3(ray.d, light_fwd); - const simd_fvec light_pdf = safe_div(inter.t * inter.t, l.disk.area * cos_theta * pdf_factor); - const simd_fvec bsdf_pdf = ray.pdf; + const fvec light_pdf = safe_div(inter.t * inter.t, l.disk.area * cos_theta * pdf_factor); + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { lcol[i] *= mis_weight; }) } else if (l.type == LIGHT_TYPE_LINE) { const float *light_dir = l.line.v; - const simd_fvec cos_theta = 1.0f - abs(dot3(ray.d, light_dir)); + const fvec cos_theta = 1.0f - abs(dot3(ray.d, light_dir)); - const simd_fvec light_pdf = safe_div(inter.t * inter.t, l.line.area * cos_theta * pdf_factor); - const simd_fvec bsdf_pdf = 
ray.pdf; + const fvec light_pdf = safe_div(inter.t * inter.t, l.line.area * cos_theta * pdf_factor); + const fvec bsdf_pdf = ray.pdf; - const simd_fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); + const fvec mis_weight = power_heuristic(bsdf_pdf, light_pdf); UNROLLED_FOR(i, 3, { lcol[i] *= mis_weight; }) } @@ -6589,26 +6589,26 @@ void Ray::NS::Evaluate_LightColor(const simd_fvec P[3], const ray_data_t & }; template -Ray::NS::simd_ivec Ray::NS::Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, - const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec &mix_weight, const simd_ivec &mis_mask, - simd_fvec out_col[3], shadow_ray_t &sh_r) { - const simd_fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; +Ray::NS::ivec Ray::NS::Evaluate_DiffuseNode(const light_sample_t &ls, const ray_data_t &ray, + const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec &mix_weight, const ivec &mis_mask, + fvec out_col[3], shadow_ray_t &sh_r) { + const fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; - simd_fvec diff_col[4]; + fvec diff_col[4]; Evaluate_OrenDiffuse_BSDF(nI, surf.N, ls.L, roughness, base_color, diff_col); - const simd_fvec &bsdf_pdf = diff_col[3]; + const fvec &bsdf_pdf = diff_col[3]; - const simd_fvec mis_weight = - select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), simd_fvec{1.0f}); + const fvec mis_weight = + select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), fvec{1.0f}); - simd_fvec P_biased[3]; + fvec P_biased[3]; offset_ray(surf.P, surf.plane_N, P_biased); UNROLLED_FOR(i, 3, { where(mask, sh_r.o[i]) = P_biased[i]; }) UNROLLED_FOR(i, 3, { - const simd_fvec temp = ls.col[i] * diff_col[i] * safe_div_pos(mix_weight * mis_weight, ls.pdf); + const fvec temp = ls.col[i] * diff_col[i] * safe_div_pos(mix_weight * mis_weight, ls.pdf); where(mask, sh_r.c[i]) = ray.c[i] * temp; 
where(mask & ~ls.cast_shadow, out_col[i]) += temp; }) @@ -6617,18 +6617,18 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_DiffuseNode(const light_sample_t &ls, } template -void Ray::NS::Sample_DiffuseNode(const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec &rand_u, const simd_fvec &rand_v, const simd_fvec &mix_weight, +void Ray::NS::Sample_DiffuseNode(const ray_data_t &ray, const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec &rand_u, const fvec &rand_v, const fvec &mix_weight, ray_data_t &new_ray) { - simd_fvec V[3], F[4]; + fvec V[3], F[4]; Sample_OrenDiffuse_BSDF(surf.T, surf.B, surf.N, ray.d, roughness, base_color, rand_u, rand_v, V, F); where(mask, new_ray.depth) = pack_ray_type(RAY_TYPE_DIFFUSE); where(mask, new_ray.depth) |= - mask_ray_depth(ray.depth) + pack_depth(simd_ivec{1}, simd_ivec{0}, simd_ivec{0}, simd_ivec{0}); + mask_ray_depth(ray.depth) + pack_depth(ivec{1}, ivec{0}, ivec{0}, ivec{0}); - simd_fvec P_biased[3]; + fvec P_biased[3]; offset_ray(surf.P, surf.plane_N, P_biased); UNROLLED_FOR(i, 3, { @@ -6640,38 +6640,38 @@ void Ray::NS::Sample_DiffuseNode(const ray_data_t &ray, const simd_ivec &m } template -Ray::NS::simd_ivec -Ray::NS::Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, simd_ivec mask, - const surface_t &surf, const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec ®ularize_alpha, const simd_fvec &spec_ior, - const simd_fvec &spec_F0, const simd_fvec &mix_weight, const simd_ivec &mis_mask, - simd_fvec out_col[3], shadow_ray_t &sh_r) { - const simd_fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; - simd_fvec H[3] = {ls.L[0] - ray.d[0], ls.L[1] - ray.d[1], ls.L[2] - ray.d[2]}; +Ray::NS::ivec +Ray::NS::Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &ray, ivec mask, + const surface_t &surf, const fvec base_color[3], const fvec &roughness, + const 
fvec ®ularize_alpha, const fvec &spec_ior, + const fvec &spec_F0, const fvec &mix_weight, const ivec &mis_mask, + fvec out_col[3], shadow_ray_t &sh_r) { + const fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; + fvec H[3] = {ls.L[0] - ray.d[0], ls.L[1] - ray.d[1], ls.L[2] - ray.d[2]}; safe_normalize(H); - simd_fvec view_dir_ts[3], light_dir_ts[3], sampled_normal_ts[3]; + fvec view_dir_ts[3], light_dir_ts[3], sampled_normal_ts[3]; tangent_from_world(surf.T, surf.B, surf.N, nI, view_dir_ts); tangent_from_world(surf.T, surf.B, surf.N, ls.L, light_dir_ts); tangent_from_world(surf.T, surf.B, surf.N, H, sampled_normal_ts); - simd_fvec spec_col[4], alpha[2]; - calc_alpha(roughness, simd_fvec{0.0f}, regularize_alpha, alpha); + fvec spec_col[4], alpha[2]; + calc_alpha(roughness, fvec{0.0f}, regularize_alpha, alpha); mask &= simd_cast(alpha[0] * alpha[1] >= 1e-7f); - Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, alpha, simd_fvec{spec_ior}, - simd_fvec{spec_F0}, base_color, base_color, spec_col); - const simd_fvec &bsdf_pdf = spec_col[3]; + Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, alpha, fvec{spec_ior}, + fvec{spec_F0}, base_color, base_color, spec_col); + const fvec &bsdf_pdf = spec_col[3]; - const simd_fvec mis_weight = - select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), simd_fvec{1.0f}); + const fvec mis_weight = + select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), fvec{1.0f}); - simd_fvec P_biased[3]; + fvec P_biased[3]; offset_ray(surf.P, surf.plane_N, P_biased); UNROLLED_FOR(i, 3, { where(mask, sh_r.o[i]) = P_biased[i]; }) UNROLLED_FOR(i, 3, { - const simd_fvec temp = ls.col[i] * spec_col[i] * safe_div_pos(mix_weight * mis_weight, ls.pdf); + const fvec temp = ls.col[i] * spec_col[i] * safe_div_pos(mix_weight * mis_weight, ls.pdf); where(mask, sh_r.c[i]) = ray.c[i] * temp; where(mask & ~ls.cast_shadow, out_col[i]) += temp; }) @@ -6680,23 +6680,23 @@ 
Ray::NS::Evaluate_GlossyNode(const light_sample_t &ls, const ray_data_t &r } template -void Ray::NS::Sample_GlossyNode(const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec ®ularize_alpha, const simd_fvec &spec_ior, - const simd_fvec &spec_F0, const simd_fvec rand[2], const simd_fvec &mix_weight, +void Ray::NS::Sample_GlossyNode(const ray_data_t &ray, const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec ®ularize_alpha, const fvec &spec_ior, + const fvec &spec_F0, const fvec rand[2], const fvec &mix_weight, ray_data_t &new_ray) { - simd_fvec alpha[2]; - calc_alpha(roughness, simd_fvec{0.0f}, regularize_alpha, alpha); + fvec alpha[2]; + calc_alpha(roughness, fvec{0.0f}, regularize_alpha, alpha); - simd_fvec V[3], F[4]; + fvec V[3], F[4]; Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, ray.d, alpha, spec_ior, spec_F0, base_color, base_color, rand, V, F); where(mask, new_ray.depth) = pack_ray_type(RAY_TYPE_SPECULAR); where(mask, new_ray.depth) |= - mask_ray_depth(ray.depth) + pack_depth(simd_ivec{0}, simd_ivec{1}, simd_ivec{0}, simd_ivec{0}); + mask_ray_depth(ray.depth) + pack_depth(ivec{0}, ivec{1}, ivec{0}, ivec{0}); - simd_fvec P_biased[3]; + fvec P_biased[3]; offset_ray(surf.P, surf.plane_N, P_biased); UNROLLED_FOR(i, 3, { @@ -6708,37 +6708,37 @@ void Ray::NS::Sample_GlossyNode(const ray_data_t &ray, const simd_ivec &ma } template -Ray::NS::simd_ivec Ray::NS::Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, - const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec ®ularize_alpha, const simd_fvec &eta, - const simd_fvec &mix_weight, const simd_ivec &mis_mask, - simd_fvec out_col[3], shadow_ray_t &sh_r) { - const simd_fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; - simd_fvec H[3] = {ls.L[0] - ray.d[0] * eta, ls.L[1] - ray.d[1] * 
eta, ls.L[2] - ray.d[2] * eta}; +Ray::NS::ivec Ray::NS::Evaluate_RefractiveNode(const light_sample_t &ls, const ray_data_t &ray, + const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec ®ularize_alpha, const fvec &eta, + const fvec &mix_weight, const ivec &mis_mask, + fvec out_col[3], shadow_ray_t &sh_r) { + const fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; + fvec H[3] = {ls.L[0] - ray.d[0] * eta, ls.L[1] - ray.d[1] * eta, ls.L[2] - ray.d[2] * eta}; safe_normalize(H); - simd_fvec view_dir_ts[3], light_dir_ts[3], sampled_normal_ts[3]; + fvec view_dir_ts[3], light_dir_ts[3], sampled_normal_ts[3]; tangent_from_world(surf.T, surf.B, surf.N, nI, view_dir_ts); tangent_from_world(surf.T, surf.B, surf.N, ls.L, light_dir_ts); tangent_from_world(surf.T, surf.B, surf.N, H, sampled_normal_ts); - simd_fvec refr_col[4], alpha[2]; - calc_alpha(roughness, simd_fvec{0.0f}, regularize_alpha, alpha); - Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, alpha, simd_fvec{eta}, base_color, + fvec refr_col[4], alpha[2]; + calc_alpha(roughness, fvec{0.0f}, regularize_alpha, alpha); + Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, alpha, fvec{eta}, base_color, refr_col); - const simd_fvec &bsdf_pdf = refr_col[3]; + const fvec &bsdf_pdf = refr_col[3]; - const simd_fvec mis_weight = - select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), simd_fvec{1.0f}); + const fvec mis_weight = + select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), fvec{1.0f}); - simd_fvec P_biased[3]; - const simd_fvec _plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; + fvec P_biased[3]; + const fvec _plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; offset_ray(surf.P, _plane_N, P_biased); UNROLLED_FOR(i, 3, { where(mask, sh_r.o[i]) = P_biased[i]; }) UNROLLED_FOR(i, 3, { - const simd_fvec temp = ls.col[i] * refr_col[i] * 
safe_div_pos(mix_weight * mis_weight, ls.pdf); + const fvec temp = ls.col[i] * refr_col[i] * safe_div_pos(mix_weight * mis_weight, ls.pdf); where(mask, sh_r.c[i]) = ray.c[i] * temp; where(mask & ~ls.cast_shadow, out_col[i]) += temp; }) @@ -6747,24 +6747,24 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_RefractiveNode(const light_sample_t & } template -void Ray::NS::Sample_RefractiveNode(const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, - const simd_fvec base_color[3], const simd_fvec &roughness, - const simd_fvec ®ularize_alpha, const simd_ivec &is_backfacing, - const simd_fvec &int_ior, const simd_fvec &ext_ior, - const simd_fvec rand[2], const simd_fvec &mix_weight, +void Ray::NS::Sample_RefractiveNode(const ray_data_t &ray, const ivec &mask, const surface_t &surf, + const fvec base_color[3], const fvec &roughness, + const fvec ®ularize_alpha, const ivec &is_backfacing, + const fvec &int_ior, const fvec &ext_ior, + const fvec rand[2], const fvec &mix_weight, ray_data_t &new_ray) { - const simd_fvec eta = select(is_backfacing, (int_ior / ext_ior), (ext_ior / int_ior)); + const fvec eta = select(is_backfacing, (int_ior / ext_ior), (ext_ior / int_ior)); - simd_fvec V[4], F[4], alpha[2]; - calc_alpha(roughness, simd_fvec{0.0f}, regularize_alpha, alpha); + fvec V[4], F[4], alpha[2]; + calc_alpha(roughness, fvec{0.0f}, regularize_alpha, alpha); Sample_GGXRefraction_BSDF(surf.T, surf.B, surf.N, ray.d, alpha, eta, base_color, rand, V, F); where(mask, new_ray.depth) = pack_ray_type(RAY_TYPE_REFR); where(mask, new_ray.depth) |= - mask_ray_depth(ray.depth) + pack_depth(simd_ivec{0}, simd_ivec{0}, simd_ivec{1}, simd_ivec{0}); + mask_ray_depth(ray.depth) + pack_depth(ivec{0}, ivec{0}, ivec{1}, ivec{0}); - simd_fvec P_biased[3]; - const simd_fvec _plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; + fvec P_biased[3]; + const fvec _plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; offset_ray(surf.P, _plane_N, P_biased); 
UNROLLED_FOR(i, 3, { @@ -6780,23 +6780,23 @@ void Ray::NS::Sample_RefractiveNode(const ray_data_t &ray, const simd_ivec } template -Ray::NS::simd_ivec Ray::NS::Evaluate_PrincipledNode( - const light_sample_t &ls, const ray_data_t &ray, const simd_ivec &mask, const surface_t &surf, +Ray::NS::ivec Ray::NS::Evaluate_PrincipledNode( + const light_sample_t &ls, const ray_data_t &ray, const ivec &mask, const surface_t &surf, const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, - const clearcoat_params_t &coat, const transmission_params_t &trans, const simd_fvec &metallic, - const float transmission, const simd_fvec &N_dot_L, const simd_fvec &mix_weight, const simd_ivec &mis_mask, - const simd_fvec ®ularize_alpha, simd_fvec out_col[3], shadow_ray_t &sh_r) { - const simd_fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; + const clearcoat_params_t &coat, const transmission_params_t &trans, const fvec &metallic, + const float transmission, const fvec &N_dot_L, const fvec &mix_weight, const ivec &mis_mask, + const fvec ®ularize_alpha, fvec out_col[3], shadow_ray_t &sh_r) { + const fvec nI[3] = {-ray.d[0], -ray.d[1], -ray.d[2]}; - const simd_ivec _is_backfacing = simd_cast(N_dot_L < 0.0f); - const simd_ivec _is_frontfacing = simd_cast(N_dot_L > 0.0f); + const ivec _is_backfacing = simd_cast(N_dot_L < 0.0f); + const ivec _is_frontfacing = simd_cast(N_dot_L > 0.0f); - simd_fvec lcol[3] = {0.0f, 0.0f, 0.0f}; - simd_fvec bsdf_pdf = 0.0f; + fvec lcol[3] = {0.0f, 0.0f, 0.0f}; + fvec bsdf_pdf = 0.0f; - const simd_ivec eval_diff_lobe = simd_cast(lobe_weights.diffuse > 0.0f) & _is_frontfacing & mask; + const ivec eval_diff_lobe = simd_cast(lobe_weights.diffuse > 0.0f) & _is_frontfacing & mask; if (eval_diff_lobe.not_all_zeros()) { - simd_fvec diff_col[4]; + fvec diff_col[4]; Evaluate_PrincipledDiffuse_BSDF(nI, surf.N, ls.L, diff.roughness, diff.base_color, diff.sheen_color, false, diff_col); @@ -6807,23 +6807,23 @@ Ray::NS::simd_ivec 
Ray::NS::Evaluate_PrincipledNode( }) } - simd_fvec H[3]; + fvec H[3]; UNROLLED_FOR(i, 3, { H[i] = select(_is_frontfacing, ls.L[i] - ray.d[i], ls.L[i] - ray.d[i] * trans.eta); }) safe_normalize(H); - const simd_fvec spec_col_90[3] = {1.0f, 1.0f, 1.0f}; + const fvec spec_col_90[3] = {1.0f, 1.0f, 1.0f}; - simd_fvec view_dir_ts[3], light_dir_ts[3], sampled_normal_ts[3]; + fvec view_dir_ts[3], light_dir_ts[3], sampled_normal_ts[3]; tangent_from_world(surf.T, surf.B, surf.N, nI, view_dir_ts); tangent_from_world(surf.T, surf.B, surf.N, ls.L, light_dir_ts); tangent_from_world(surf.T, surf.B, surf.N, H, sampled_normal_ts); - simd_fvec spec_alpha[2]; + fvec spec_alpha[2]; calc_alpha(spec.roughness, spec.anisotropy, regularize_alpha, spec_alpha); - const simd_ivec eval_spec_lobe = simd_cast(lobe_weights.specular > 0.0f) & + const ivec eval_spec_lobe = simd_cast(lobe_weights.specular > 0.0f) & simd_cast(spec_alpha[0] * spec_alpha[1] >= 1e-7f) & _is_frontfacing & mask; if (eval_spec_lobe.not_all_zeros()) { - simd_fvec spec_col[4], _alpha[2] = {max(spec_alpha[0], 1e-7f), max(spec_alpha[1], 1e-7f)}; + fvec spec_col[4], _alpha[2] = {max(spec_alpha[0], 1e-7f), max(spec_alpha[1], 1e-7f)}; Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, _alpha, spec.ior, spec.F0, spec.tmp_col, spec_col_90, spec_col); @@ -6832,12 +6832,12 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_PrincipledNode( UNROLLED_FOR(i, 3, { where(eval_spec_lobe, lcol[i]) += safe_div_pos(ls.col[i] * spec_col[i], ls.pdf); }) } - simd_fvec coat_alpha[2]; - calc_alpha(coat.roughness, simd_fvec{0.0f}, regularize_alpha, coat_alpha); - const simd_ivec eval_coat_lobe = simd_cast(lobe_weights.clearcoat > 0.0f) & + fvec coat_alpha[2]; + calc_alpha(coat.roughness, fvec{0.0f}, regularize_alpha, coat_alpha); + const ivec eval_coat_lobe = simd_cast(lobe_weights.clearcoat > 0.0f) & simd_cast(coat_alpha[0] * coat_alpha[1] >= 1e-7f) & _is_frontfacing & mask; if (eval_coat_lobe.not_all_zeros()) { - simd_fvec 
clearcoat_col[4]; + fvec clearcoat_col[4]; Evaluate_PrincipledClearcoat_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, coat_alpha[0], coat.ior, coat.F0, clearcoat_col); @@ -6847,15 +6847,15 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_PrincipledNode( { where(eval_coat_lobe, lcol[i]) += safe_div_pos(0.25f * ls.col[i] * clearcoat_col[i], ls.pdf); }) } - simd_fvec refr_spec_alpha[2]; - calc_alpha(spec.roughness, simd_fvec{0.0f}, regularize_alpha, refr_spec_alpha); - const simd_ivec eval_refr_spec_lobe = + fvec refr_spec_alpha[2]; + calc_alpha(spec.roughness, fvec{0.0f}, regularize_alpha, refr_spec_alpha); + const ivec eval_refr_spec_lobe = simd_cast(trans.fresnel != 0.0f) & simd_cast(lobe_weights.refraction > 0.0f) & simd_cast(refr_spec_alpha[0] * refr_spec_alpha[1] >= 1e-7f) & _is_frontfacing & mask; if (eval_refr_spec_lobe.not_all_zeros()) { - simd_fvec spec_col[4], spec_temp_col[3] = {1.0f, 1.0f, 1.0f}; + fvec spec_col[4], spec_temp_col[3] = {1.0f, 1.0f, 1.0f}; Evaluate_GGXSpecular_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, refr_spec_alpha, - simd_fvec{1.0f} /* ior */, simd_fvec{0.0f} /* F0 */, spec_temp_col, spec_col_90, + fvec{1.0f} /* ior */, fvec{0.0f} /* F0 */, spec_temp_col, spec_col_90, spec_col); where(eval_refr_spec_lobe, bsdf_pdf) += lobe_weights.refraction * trans.fresnel * spec_col[3]; @@ -6864,13 +6864,13 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_PrincipledNode( }) } - simd_fvec refr_trans_alpha[2]; - calc_alpha(trans.roughness, simd_fvec{0.0f}, regularize_alpha, refr_trans_alpha); - const simd_ivec eval_refr_trans_lobe = + fvec refr_trans_alpha[2]; + calc_alpha(trans.roughness, fvec{0.0f}, regularize_alpha, refr_trans_alpha); + const ivec eval_refr_trans_lobe = simd_cast(trans.fresnel != 1.0f) & simd_cast(lobe_weights.refraction > 0.0f) & simd_cast(refr_trans_alpha[0] * refr_trans_alpha[1] >= 1e-7f) & _is_backfacing & mask; if (eval_refr_trans_lobe.not_all_zeros()) { - simd_fvec refr_col[4]; + fvec refr_col[4]; 
Evaluate_GGXRefraction_BSDF(view_dir_ts, sampled_normal_ts, light_dir_ts, refr_trans_alpha, trans.eta, diff.base_color, refr_col); where(eval_refr_trans_lobe, bsdf_pdf) += lobe_weights.refraction * (1.0f - trans.fresnel) * refr_col[3]; @@ -6881,16 +6881,16 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_PrincipledNode( }) } - const simd_fvec mis_weight = - select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), simd_fvec{1.0f}); + const fvec mis_weight = + select(mis_mask & simd_cast(ls.area > 0.0f), power_heuristic(ls.pdf, bsdf_pdf), fvec{1.0f}); UNROLLED_FOR(i, 3, { where(mask, lcol[i]) *= mix_weight * mis_weight; }) /// - simd_fvec P_biased[3]; + fvec P_biased[3]; offset_ray(surf.P, surf.plane_N, P_biased); - const simd_fvec neg_plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; - simd_fvec back_P_biased[3]; + const fvec neg_plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; + fvec back_P_biased[3]; offset_ray(surf.P, neg_plane_N, back_P_biased); UNROLLED_FOR(i, 3, { @@ -6906,35 +6906,35 @@ Ray::NS::simd_ivec Ray::NS::Evaluate_PrincipledNode( } template -void Ray::NS::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t &ray, const simd_ivec &mask, +void Ray::NS::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t &ray, const ivec &mask, const surface_t &surf, const lobe_weights_t &lobe_weights, const diff_params_t &diff, const spec_params_t &spec, const clearcoat_params_t &coat, const transmission_params_t &trans, - const simd_fvec &metallic, const float transmission, const simd_fvec rand[2], - simd_fvec mix_rand, const simd_fvec &mix_weight, - const simd_fvec ®ularize_alpha, simd_ivec &secondary_mask, + const fvec &metallic, const float transmission, const fvec rand[2], + fvec mix_rand, const fvec &mix_weight, + const fvec ®ularize_alpha, ivec &secondary_mask, ray_data_t &new_ray) { - const simd_ivec diff_depth = get_diff_depth(ray.depth), spec_depth = get_spec_depth(ray.depth), + 
const ivec diff_depth = get_diff_depth(ray.depth), spec_depth = get_spec_depth(ray.depth), refr_depth = get_refr_depth(ray.depth); // NOTE: transparency depth is not accounted here - const simd_ivec total_depth = diff_depth + spec_depth + refr_depth; + const ivec total_depth = diff_depth + spec_depth + refr_depth; - const simd_ivec sample_diff_lobe = (diff_depth < ps.max_diff_depth) & (total_depth < ps.max_total_depth) & + const ivec sample_diff_lobe = (diff_depth < ps.max_diff_depth) & (total_depth < ps.max_total_depth) & simd_cast(mix_rand < lobe_weights.diffuse) & mask; if (sample_diff_lobe.not_all_zeros()) { - simd_fvec V[3], F[4]; + fvec V[3], F[4]; Sample_PrincipledDiffuse_BSDF(surf.T, surf.B, surf.N, ray.d, diff.roughness, diff.base_color, diff.sheen_color, false, rand, V, F); // F[3] *= lobe_weights.diffuse; UNROLLED_FOR(i, 3, { F[i] *= (1.0f - metallic) * (1.0f - transmission); }) - simd_fvec new_p[3]; + fvec new_p[3]; offset_ray(surf.P, surf.plane_N, new_p); where(sample_diff_lobe, new_ray.depth) = pack_ray_type(RAY_TYPE_DIFFUSE); where(sample_diff_lobe, new_ray.depth) |= - mask_ray_depth(ray.depth) + pack_depth(simd_ivec{1}, simd_ivec{0}, simd_ivec{0}, simd_ivec{0}); + mask_ray_depth(ray.depth) + pack_depth(ivec{1}, ivec{0}, ivec{0}, ivec{0}); UNROLLED_FOR(i, 3, { where(sample_diff_lobe, new_ray.o[i]) = new_p[i]; @@ -6947,24 +6947,24 @@ void Ray::NS::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t< secondary_mask |= sample_diff_lobe; } - const simd_ivec sample_spec_lobe = (spec_depth < ps.max_spec_depth) & (total_depth < ps.max_total_depth) & + const ivec sample_spec_lobe = (spec_depth < ps.max_spec_depth) & (total_depth < ps.max_total_depth) & simd_cast(mix_rand >= lobe_weights.diffuse) & simd_cast(mix_rand < lobe_weights.diffuse + lobe_weights.specular) & mask; if (sample_spec_lobe.not_all_zeros()) { - const simd_fvec spec_col_90[3] = {1.0f, 1.0f, 1.0f}; + const fvec spec_col_90[3] = {1.0f, 1.0f, 1.0f}; - simd_fvec V[3], F[4], 
alpha[2]; + fvec V[3], F[4], alpha[2]; calc_alpha(spec.roughness, spec.anisotropy, regularize_alpha, alpha); Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, ray.d, alpha, spec.ior, spec.F0, spec.tmp_col, spec_col_90, rand, V, F); F[3] *= lobe_weights.specular; - simd_fvec new_p[3]; + fvec new_p[3]; offset_ray(surf.P, surf.plane_N, new_p); where(sample_spec_lobe, new_ray.depth) = pack_ray_type(RAY_TYPE_SPECULAR); where(sample_spec_lobe, new_ray.depth) |= - mask_ray_depth(ray.depth) + pack_depth(simd_ivec{0}, simd_ivec{1}, simd_ivec{0}, simd_ivec{0}); + mask_ray_depth(ray.depth) + pack_depth(ivec{0}, ivec{1}, ivec{0}, ivec{0}); UNROLLED_FOR(i, 3, { where(sample_spec_lobe, new_ray.o[i]) = new_p[i]; @@ -6978,22 +6978,22 @@ void Ray::NS::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t< secondary_mask |= sample_spec_lobe; } - const simd_ivec sample_coat_lobe = + const ivec sample_coat_lobe = (spec_depth < ps.max_spec_depth) & (total_depth < ps.max_total_depth) & simd_cast(mix_rand >= lobe_weights.diffuse + lobe_weights.specular) & simd_cast(mix_rand < lobe_weights.diffuse + lobe_weights.specular + lobe_weights.clearcoat) & mask; if (sample_coat_lobe.not_all_zeros()) { - simd_fvec V[3], F[4], alpha[2]; - calc_alpha(coat.roughness, simd_fvec{0.0f}, regularize_alpha, alpha); + fvec V[3], F[4], alpha[2]; + calc_alpha(coat.roughness, fvec{0.0f}, regularize_alpha, alpha); Sample_PrincipledClearcoat_BSDF(surf.T, surf.B, surf.N, ray.d, alpha[0], coat.ior, coat.F0, rand, V, F); F[3] *= lobe_weights.clearcoat; - simd_fvec new_p[3]; + fvec new_p[3]; offset_ray(surf.P, surf.plane_N, new_p); where(sample_spec_lobe, new_ray.depth) = pack_ray_type(RAY_TYPE_SPECULAR); where(sample_coat_lobe, new_ray.depth) |= - mask_ray_depth(ray.depth) + pack_depth(simd_ivec{0}, simd_ivec{1}, simd_ivec{0}, simd_ivec{0}); + mask_ray_depth(ray.depth) + pack_depth(ivec{0}, ivec{1}, ivec{0}, ivec{0}); UNROLLED_FOR(i, 3, { where(sample_coat_lobe, new_ray.o[i]) = new_p[i]; @@ -7006,7 
+7006,7 @@ void Ray::NS::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t< secondary_mask |= sample_coat_lobe; } - simd_ivec sample_trans_lobe = + ivec sample_trans_lobe = simd_cast(mix_rand >= lobe_weights.diffuse + lobe_weights.specular + lobe_weights.clearcoat) & (total_depth < ps.max_total_depth) & mask; @@ -7016,41 +7016,41 @@ void Ray::NS::Sample_PrincipledNode(const pass_settings_t &ps, const ray_data_t< sample_trans_lobe &= ((simd_cast(mix_rand >= trans.fresnel) & (refr_depth < ps.max_refr_depth)) | (simd_cast(mix_rand < trans.fresnel) & (spec_depth < ps.max_spec_depth))); if (sample_trans_lobe.not_all_zeros()) { - simd_fvec F[4] = {}, V[3] = {}; + fvec F[4] = {}, V[3] = {}; - const simd_ivec sample_trans_spec_lobe = simd_cast(mix_rand < trans.fresnel) & sample_trans_lobe; + const ivec sample_trans_spec_lobe = simd_cast(mix_rand < trans.fresnel) & sample_trans_lobe; if (sample_trans_spec_lobe.not_all_zeros()) { - simd_fvec _spec_tmp_col[3] = {1.0f, 1.0f, 1.0f}, alpha[2]; - calc_alpha(spec.roughness, simd_fvec{0.0f}, regularize_alpha, alpha); - Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, ray.d, alpha, simd_fvec{1.0f} /* ior */, - simd_fvec{0.0f} /* F0 */, _spec_tmp_col, _spec_tmp_col, rand, V, F); + fvec _spec_tmp_col[3] = {1.0f, 1.0f, 1.0f}, alpha[2]; + calc_alpha(spec.roughness, fvec{0.0f}, regularize_alpha, alpha); + Sample_GGXSpecular_BSDF(surf.T, surf.B, surf.N, ray.d, alpha, fvec{1.0f} /* ior */, + fvec{0.0f} /* F0 */, _spec_tmp_col, _spec_tmp_col, rand, V, F); - simd_fvec new_p[3]; + fvec new_p[3]; offset_ray(surf.P, surf.plane_N, new_p); where(sample_trans_spec_lobe, new_ray.depth) = pack_ray_type(RAY_TYPE_SPECULAR); where(sample_trans_spec_lobe, new_ray.depth) |= mask_ray_depth(ray.depth) + - pack_depth(simd_ivec{0}, simd_ivec{1}, simd_ivec{0}, simd_ivec{0}); + pack_depth(ivec{0}, ivec{1}, ivec{0}, ivec{0}); UNROLLED_FOR(i, 3, { where(sample_trans_spec_lobe, new_ray.o[i]) = new_p[i]; }) } - const simd_ivec sample_trans_refr_lobe 
= ~sample_trans_spec_lobe & sample_trans_lobe; + const ivec sample_trans_refr_lobe = ~sample_trans_spec_lobe & sample_trans_lobe; if (sample_trans_refr_lobe.not_all_zeros()) { - simd_fvec temp_F[4], temp_V[4], alpha[2]; - calc_alpha(trans.roughness, simd_fvec{0.0f}, regularize_alpha, alpha); + fvec temp_F[4], temp_V[4], alpha[2]; + calc_alpha(trans.roughness, fvec{0.0f}, regularize_alpha, alpha); Sample_GGXRefraction_BSDF(surf.T, surf.B, surf.N, ray.d, alpha, trans.eta, diff.base_color, rand, temp_V, temp_F); - const simd_fvec _plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; - simd_fvec new_p[3]; + const fvec _plane_N[3] = {-surf.plane_N[0], -surf.plane_N[1], -surf.plane_N[2]}; + fvec new_p[3]; offset_ray(surf.P, _plane_N, new_p); where(sample_trans_refr_lobe, new_ray.depth) = pack_ray_type(RAY_TYPE_REFR); where(sample_trans_refr_lobe, new_ray.depth) |= mask_ray_depth(ray.depth) + - pack_depth(simd_ivec{0}, simd_ivec{0}, simd_ivec{1}, simd_ivec{0}); + pack_depth(ivec{0}, ivec{0}, ivec{1}, ivec{0}); UNROLLED_FOR(i, 4, { where(sample_trans_refr_lobe, F[i]) = temp_F[i]; }) UNROLLED_FOR(i, 3, { @@ -7080,43 +7080,43 @@ template void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], const uint32_t rand_seq[], const uint32_t rand_seed, const int iteration, const hit_data_t &inter, const ray_data_t &ray, const scene_data_t &sc, const uint32_t node_index, - const Cpu::TexStorageBase *const textures[], simd_fvec out_rgba[4], + const Cpu::TexStorageBase *const textures[], fvec out_rgba[4], ray_data_t out_secondary_rays[], int *out_secondary_rays_count, shadow_ray_t out_shadow_rays[], int *out_shadow_rays_count, - simd_fvec out_base_color[4], simd_fvec out_depth_normals[4]) { + fvec out_base_color[4], fvec out_depth_normals[4]) { out_rgba[0] = out_rgba[1] = out_rgba[2] = {0.0f}; out_rgba[3] = {1.0f}; // used to randomize random sequence among pixels - const simd_uvec px_hash = hash(ray.xy); - const simd_uvec rand_hash = 
hash_combine(px_hash, rand_seed); + const uvec px_hash = hash(ray.xy); + const uvec rand_hash = hash_combine(px_hash, rand_seed); - const simd_ivec diff_depth = get_diff_depth(ray.depth), spec_depth = get_spec_depth(ray.depth), + const ivec diff_depth = get_diff_depth(ray.depth), spec_depth = get_spec_depth(ray.depth), refr_depth = get_refr_depth(ray.depth), transp_depth = get_transp_depth(ray.depth); // NOTE: transparency depth is not accounted here - const simd_ivec total_depth = diff_depth + spec_depth + refr_depth; + const ivec total_depth = diff_depth + spec_depth + refr_depth; // offset of the sequence - const auto rand_dim = simd_uvec(RAND_DIM_BASE_COUNT + (total_depth + transp_depth) * RAND_DIM_BOUNCE_COUNT); + const auto rand_dim = uvec(RAND_DIM_BASE_COUNT + (total_depth + transp_depth) * RAND_DIM_BOUNCE_COUNT); - simd_fvec tex_rand[2]; + fvec tex_rand[2]; get_scrambled_2d_rand(rand_dim + RAND_DIM_TEX, rand_hash, iteration - 1, rand_seq, tex_rand); - const simd_ivec ino_hit = simd_cast(inter.v < 0.0f); + const ivec ino_hit = simd_cast(inter.v < 0.0f); if (ino_hit.not_all_zeros()) { - simd_fvec env_col[4] = {{1.0f}, {1.0f}, {1.0f}, {1.0f}}; - const simd_fvec pdf_factor = select(total_depth < ps.max_total_depth, + fvec env_col[4] = {{1.0f}, {1.0f}, {1.0f}, {1.0f}}; + const fvec pdf_factor = select(total_depth < ps.max_total_depth, #if USE_HIERARCHICAL_NEE safe_div_pos(1.0f, inter.u), #else float(sc.li_indices.size()), #endif - simd_fvec{-1.0f}); + fvec{-1.0f}); Evaluate_EnvColor(ray, ino_hit, sc.env, *static_cast(textures[0]), pdf_factor, tex_rand, env_col); UNROLLED_FOR(i, 3, { env_col[i] = ray.c[i] * env_col[i]; }) - const simd_fvec sum = env_col[0] + env_col[1] + env_col[2]; + const fvec sum = env_col[0] + env_col[1] + env_col[2]; UNROLLED_FOR(i, 3, { where(sum > limits[0], env_col[i]) = safe_div_pos(env_col[i] * limits[0], sum); where(ino_hit, out_rgba[i]) = env_col[i]; @@ -7124,24 +7124,24 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const 
float limits[2], con where(ino_hit, out_rgba[3]) = env_col[3]; } - simd_ivec is_active_lane = simd_cast(inter.v >= 0.0f); + ivec is_active_lane = simd_cast(inter.v >= 0.0f); if (is_active_lane.all_zeros()) { return; } - const simd_fvec *I = ray.d; + const fvec *I = ray.d; surface_t surf; UNROLLED_FOR(i, 3, { where(inter.v >= 0.0f, surf.P[i]) = fmadd(inter.t, ray.d[i], ray.o[i]); }) - const simd_ivec is_light_hit = is_active_lane & (inter.obj_index < 0); // Area light intersection + const ivec is_light_hit = is_active_lane & (inter.obj_index < 0); // Area light intersection if (is_light_hit.not_all_zeros()) { - simd_fvec light_col[3] = {}; + fvec light_col[3] = {}; Evaluate_LightColor(surf.P, ray, is_light_hit, inter, sc.env, sc.lights, uint32_t(sc.li_indices.size()), *static_cast(textures[0]), tex_rand, light_col); UNROLLED_FOR(i, 3, { light_col[i] = ray.c[i] * light_col[i]; }) - const simd_fvec sum = light_col[0] + light_col[1] + light_col[2]; + const fvec sum = light_col[0] + light_col[1] + light_col[2]; UNROLLED_FOR(i, 3, { where(sum > limits[0], light_col[i]) = safe_div_pos(light_col[i] * limits[0], sum); where(is_light_hit, out_rgba[i]) = light_col[i]; @@ -7155,21 +7155,21 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con return; } - const simd_ivec is_backfacing = (inter.prim_index < 0); - const simd_ivec tri_index = select(is_backfacing, -inter.prim_index - 1, inter.prim_index); + const ivec is_backfacing = (inter.prim_index < 0); + const ivec tri_index = select(is_backfacing, -inter.prim_index - 1, inter.prim_index); - const simd_ivec obj_index = select(is_active_lane, inter.obj_index, simd_ivec{0}); + const ivec obj_index = select(is_active_lane, inter.obj_index, ivec{0}); - simd_ivec mat_index = gather(reinterpret_cast(sc.tri_materials), tri_index) & - simd_ivec((MATERIAL_INDEX_BITS << 16) | MATERIAL_INDEX_BITS); + ivec mat_index = gather(reinterpret_cast(sc.tri_materials), tri_index) & + ivec((MATERIAL_INDEX_BITS << 16) 
| MATERIAL_INDEX_BITS); - const simd_ivec vtx_indices[3] = {gather(reinterpret_cast(sc.vtx_indices + 0), tri_index * 3), + const ivec vtx_indices[3] = {gather(reinterpret_cast(sc.vtx_indices + 0), tri_index * 3), gather(reinterpret_cast(sc.vtx_indices + 1), tri_index * 3), gather(reinterpret_cast(sc.vtx_indices + 2), tri_index * 3)}; - const simd_fvec w = 1.0f - inter.u - inter.v; + const fvec w = 1.0f - inter.u - inter.v; - simd_fvec p1[3], p2[3], p3[3], P_ls[3]; + fvec p1[3], p2[3], p3[3], P_ls[3]; { // Fetch vertex positions const float *vtx_positions = &sc.vertices[0].p[0]; const int VtxPositionsStride = sizeof(vertex_t) / sizeof(float); @@ -7186,7 +7186,7 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con FetchVertexAttribute3(&sc.vertices[0].n[0], vtx_indices, inter.u, inter.v, w, surf.N); safe_normalize(surf.N); - simd_fvec u1[2], u2[2], u3[2]; + fvec u1[2], u2[2], u3[2]; { // Fetch vertex uvs const float *vtx_uvs = &sc.vertices[0].t[0]; const int VtxUVStride = sizeof(vertex_t) / sizeof(float); @@ -7201,20 +7201,20 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con } { // calc planar normal - simd_fvec e21[3], e31[3]; + fvec e21[3], e31[3]; UNROLLED_FOR(i, 3, { e21[i] = p2[i] - p1[i]; e31[i] = p3[i] - p1[i]; }) cross(e21, e31, surf.plane_N); } - const simd_fvec pa = normalize(surf.plane_N); + const fvec pa = normalize(surf.plane_N); FetchVertexAttribute3(&sc.vertices[0].b[0], vtx_indices, inter.u, inter.v, w, surf.B); cross(surf.B, surf.N, surf.T); { // return black for non-existing backfacing material - simd_ivec no_back_mi = (mat_index >> 16) == 0xffff; + ivec no_back_mi = (mat_index >> 16) == 0xffff; no_back_mi &= is_backfacing & is_active_lane; UNROLLED_FOR(i, 4, { where(no_back_mi, out_rgba[i]) = 0.0f; }) is_active_lane &= ~no_back_mi; @@ -7234,9 +7234,9 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con where(is_backfacing, surf.T[i]) = -surf.T[i]; }) - 
simd_fvec tangent[3] = {-P_ls[2], {0.0f}, P_ls[0]}; + fvec tangent[3] = {-P_ls[2], {0.0f}, P_ls[0]}; - simd_fvec transform[16], ro_ls[3] = {ray.o[0], ray.o[1], ray.o[2]}; + fvec transform[16], ro_ls[3] = {ray.o[0], ray.o[1], ray.o[2]}; FetchTransformAndRecalcBasis(sc.mesh_instances, obj_index, P_ls, surf.plane_N, surf.N, surf.B, surf.T, tangent, ro_ls, transform); @@ -7248,11 +7248,11 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con ////////////////////////////////// - const simd_fvec ta = abs((u2[0] - u1[0]) * (u3[1] - u1[1]) - (u3[0] - u1[0]) * (u2[1] - u1[1])); + const fvec ta = abs((u2[0] - u1[0]) * (u3[1] - u1[1]) - (u3[0] - u1[0]) * (u2[1] - u1[1])); - const simd_fvec cone_width = ray.cone_width + ray.cone_spread * inter.t; + const fvec cone_width = ray.cone_width + ray.cone_spread * inter.t; - simd_fvec lambda = 0.5f * fast_log2(ta / pa); + fvec lambda = 0.5f * fast_log2(ta / pa); lambda += fast_log2(cone_width); // lambda += 0.5 * fast_log2(tex_res.x * tex_res.y); // lambda -= fast_log2(abs(dot3(I, surf.plane_N))); @@ -7261,30 +7261,30 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con static const int MatDWORDStride = sizeof(material_t) / sizeof(float); - const simd_fvec ext_ior = peek_ior_stack(ray.ior, is_backfacing); + const fvec ext_ior = peek_ior_stack(ray.ior, is_backfacing); - simd_ivec mat_type = + ivec mat_type = gather(reinterpret_cast(&sc.materials[0].type), mat_index * sizeof(material_t) / sizeof(int)) & 0xff; - simd_fvec mix_term_rand[2]; + fvec mix_term_rand[2]; get_scrambled_2d_rand(rand_dim + unsigned(RAND_DIM_BSDF_PICK), rand_hash, iteration - 1, rand_seq, mix_term_rand); - simd_fvec mix_rand = mix_term_rand[0]; - simd_fvec mix_weight = 1.0f; + fvec mix_rand = mix_term_rand[0]; + fvec mix_weight = 1.0f; // resolve mix material - simd_ivec is_mix_mat = (mat_type == int(eShadingNode::Mix)); + ivec is_mix_mat = (mat_type == int(eShadingNode::Mix)); while 
(is_mix_mat.not_all_zeros()) { const float *mix_values = &sc.materials[0].strength; - simd_fvec mix_val = gather(mix_values, mat_index * MatDWORDStride); + fvec mix_val = gather(mix_values, mat_index * MatDWORDStride); const int *base_textures = reinterpret_cast(&sc.materials[0].textures[BASE_TEXTURE]); - const simd_ivec base_texture = gather(base_textures, mat_index * MatDWORDStride); + const ivec base_texture = gather(base_textures, mat_index * MatDWORDStride); - const simd_ivec has_texture = (base_texture != -1) & is_active_lane; + const ivec has_texture = (base_texture != -1) & is_active_lane; if (has_texture.not_all_zeros()) { - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = has_texture; int index = 0, num = 1; @@ -7292,18 +7292,18 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con const long mask = ray_queue[index].movemask(); const uint32_t first_t = base_texture[GetFirstBit(mask)]; - const simd_ivec same_t = (base_texture == first_t); - const simd_ivec diff_t = and_not(same_t, ray_queue[index]); + const ivec same_t = (base_texture == first_t); + const ivec diff_t = and_not(same_t, ray_queue[index]); if (diff_t.not_all_zeros()) { ray_queue[index] &= same_t; ray_queue[num++] = diff_t; } - const simd_fvec base_lod = get_texture_lod(textures, first_t, lambda, ray_queue[index]); + const fvec base_lod = get_texture_lod(textures, first_t, lambda, ray_queue[index]); - simd_fvec tex_color[4] = {}; - SampleBilinear(textures, first_t, surf.uvs, simd_ivec(base_lod), tex_rand, ray_queue[index], + fvec tex_color[4] = {}; + SampleBilinear(textures, first_t, surf.uvs, ivec(base_lod), tex_rand, ray_queue[index], tex_color); if (first_t & TEX_YCOCG_BIT) { YCoCg_to_RGB(tex_color, tex_color); @@ -7320,22 +7320,22 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con const float *iors = &sc.materials[0].ior; - const simd_fvec ior = gather(iors, mat_index * MatDWORDStride); + const fvec ior = 
gather(iors, mat_index * MatDWORDStride); - const simd_fvec eta = select(is_backfacing, safe_div_pos(ext_ior, ior), safe_div_pos(ior, ext_ior)); - const simd_fvec RR = select(ior != 0.0f, fresnel_dielectric_cos(dot3(I, surf.N), eta), simd_fvec{1.0f}); + const fvec eta = select(is_backfacing, safe_div_pos(ext_ior, ior), safe_div_pos(ior, ext_ior)); + const fvec RR = select(ior != 0.0f, fresnel_dielectric_cos(dot3(I, surf.N), eta), fvec{1.0f}); mix_val *= saturate(RR); - const simd_ivec use_mat1 = simd_cast(mix_rand > mix_val) & is_mix_mat; - const simd_ivec use_mat2 = ~use_mat1 & is_mix_mat; + const ivec use_mat1 = simd_cast(mix_rand > mix_val) & is_mix_mat; + const ivec use_mat2 = ~use_mat1 & is_mix_mat; const int *all_mat_flags = reinterpret_cast(&sc.materials[0].flags); - const simd_ivec is_add = (gather(all_mat_flags, mat_index * MatDWORDStride) & MAT_FLAG_MIX_ADD) != 0; + const ivec is_add = (gather(all_mat_flags, mat_index * MatDWORDStride) & MAT_FLAG_MIX_ADD) != 0; const int *all_mat_textures = reinterpret_cast(&sc.materials[0].textures[0]); - const simd_ivec mat1_index = gather(&all_mat_textures[MIX_MAT1], mat_index * MatDWORDStride); - const simd_ivec mat2_index = gather(&all_mat_textures[MIX_MAT2], mat_index * MatDWORDStride); + const ivec mat1_index = gather(&all_mat_textures[MIX_MAT1], mat_index * MatDWORDStride); + const ivec mat2_index = gather(&all_mat_textures[MIX_MAT2], mat_index * MatDWORDStride); where(is_add & use_mat1, mix_weight) = safe_div_pos(mix_weight, 1.0f - mix_val); where(use_mat1, mat_index) = mat1_index; @@ -7353,30 +7353,30 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con { // apply normal map const int *norm_textures = reinterpret_cast(&sc.materials[0].textures[NORMALS_TEXTURE]); - const simd_ivec normals_texture = gather(norm_textures, mat_index * MatDWORDStride); + const ivec normals_texture = gather(norm_textures, mat_index * MatDWORDStride); - const simd_ivec has_texture = (normals_texture != 
-1) & is_active_lane; + const ivec has_texture = (normals_texture != -1) & is_active_lane; if (has_texture.not_all_zeros()) { - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = has_texture; - simd_fvec normals_tex[4] = {{0.0f}, {1.0f}, {0.0f}, {0.0f}}; - simd_ivec reconstruct_z = 0; + fvec normals_tex[4] = {{0.0f}, {1.0f}, {0.0f}, {0.0f}}; + ivec reconstruct_z = 0; int index = 0, num = 1; while (index != num) { const long mask = ray_queue[index].movemask(); const uint32_t first_t = normals_texture[GetFirstBit(mask)]; - const simd_ivec same_t = (normals_texture == first_t); - const simd_ivec diff_t = and_not(same_t, ray_queue[index]); + const ivec same_t = (normals_texture == first_t); + const ivec diff_t = and_not(same_t, ray_queue[index]); if (diff_t.not_all_zeros()) { ray_queue[index] &= same_t; ray_queue[num++] = diff_t; } - SampleBilinear(textures, first_t, surf.uvs, simd_ivec{0}, tex_rand, ray_queue[index], normals_tex); + SampleBilinear(textures, first_t, surf.uvs, ivec{0}, tex_rand, ray_queue[index], normals_tex); if (first_t & TEX_RECONSTRUCT_Z_BIT) { reconstruct_z |= ray_queue[index]; } @@ -7391,20 +7391,20 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con safe_sqrt(1.0f - normals_tex[0] * normals_tex[0] - normals_tex[1] * normals_tex[1]); } - simd_fvec new_normal[3]; + fvec new_normal[3]; UNROLLED_FOR(i, 3, { new_normal[i] = normals_tex[0] * surf.T[i] + normals_tex[2] * surf.N[i] + normals_tex[1] * surf.B[i]; }) normalize(new_normal); const int *normalmap_strengths = reinterpret_cast(&sc.materials[0].normal_map_strength_unorm); - const simd_ivec normalmap_strength = gather(normalmap_strengths, mat_index * MatDWORDStride) & 0xffff; + const ivec normalmap_strength = gather(normalmap_strengths, mat_index * MatDWORDStride) & 0xffff; - const simd_fvec fstrength = conv_unorm_16(normalmap_strength); + const fvec fstrength = conv_unorm_16(normalmap_strength); UNROLLED_FOR(i, 3, { new_normal[i] = surf.N[i] + 
(new_normal[i] - surf.N[i]) * fstrength; }) normalize(new_normal); - const simd_fvec nI[3] = {-I[0], -I[1], -I[2]}; + const fvec nI[3] = {-I[0], -I[1], -I[2]}; EnsureValidReflection(surf.plane_N, nI, new_normal); UNROLLED_FOR(i, 3, { where(has_texture, surf.N[i]) = new_normal[i]; }) @@ -7414,13 +7414,13 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con #if 0 #else - simd_fvec tangent_rotation; + fvec tangent_rotation; { // fetch anisotropic rotations const float *tangent_rotations = &sc.materials[0].tangent_rotation; tangent_rotation = gather(tangent_rotations, mat_index * MatDWORDStride); } - const simd_ivec has_rotation = simd_cast(tangent_rotation != 0.0f); + const ivec has_rotation = simd_cast(tangent_rotation != 0.0f); if (has_rotation.not_all_zeros()) { rotate_around_axis(tangent, surf.N, tangent_rotation, tangent); } @@ -7433,29 +7433,29 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con #if USE_NEE light_sample_t ls; if (!sc.light_wnodes.empty()) { - simd_fvec rand_pick_light[2]; + fvec rand_pick_light[2]; get_scrambled_2d_rand(rand_dim + RAND_DIM_LIGHT_PICK, rand_hash, iteration - 1, rand_seq, rand_pick_light); - simd_fvec rand_light_uv[2]; + fvec rand_light_uv[2]; get_scrambled_2d_rand(rand_dim + RAND_DIM_LIGHT, rand_hash, iteration - 1, rand_seq, rand_light_uv); SampleLightSource(surf.P, surf.T, surf.B, surf.N, sc, textures, rand_pick_light[0], rand_light_uv, tex_rand, is_active_lane, ls); } - const simd_fvec N_dot_L = dot3(surf.N, ls.L); + const fvec N_dot_L = dot3(surf.N, ls.L); #endif - simd_fvec base_color[3]; + fvec base_color[3]; { // Fetch material base color const float *base_colors = &sc.materials[0].base_color[0]; UNROLLED_FOR(i, 3, { base_color[i] = gather(base_colors + i, mat_index * MatDWORDStride); }) const int *base_textures = reinterpret_cast(&sc.materials[0].textures[BASE_TEXTURE]); - const simd_ivec base_texture = gather(base_textures, mat_index * MatDWORDStride); + 
const ivec base_texture = gather(base_textures, mat_index * MatDWORDStride); - const simd_ivec has_texture = (base_texture != -1) & is_active_lane; + const ivec has_texture = (base_texture != -1) & is_active_lane; if (has_texture.not_all_zeros()) { - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = has_texture; int index = 0, num = 1; @@ -7463,18 +7463,18 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con const long mask = ray_queue[index].movemask(); const uint32_t first_t = base_texture[GetFirstBit(mask)]; - const simd_ivec same_t = (base_texture == first_t); - const simd_ivec diff_t = and_not(same_t, ray_queue[index]); + const ivec same_t = (base_texture == first_t); + const ivec diff_t = and_not(same_t, ray_queue[index]); if (diff_t.not_all_zeros()) { ray_queue[index] &= same_t; ray_queue[num++] = diff_t; } - const simd_fvec base_lod = get_texture_lod(textures, first_t, lambda, ray_queue[index]); + const fvec base_lod = get_texture_lod(textures, first_t, lambda, ray_queue[index]); - simd_fvec tex_color[4] = {}; - SampleBilinear(textures, first_t, surf.uvs, simd_ivec(base_lod), tex_rand, ray_queue[index], + fvec tex_color[4] = {}; + SampleBilinear(textures, first_t, surf.uvs, ivec(base_lod), tex_rand, ray_queue[index], tex_color); if (first_t & TEX_YCOCG_BIT) { YCoCg_to_RGB(tex_color, tex_color); @@ -7498,22 +7498,22 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con where(is_active_lane, out_depth_normals[3]) = inter.t; } - simd_fvec tint_color[3] = {{0.0f}, {0.0f}, {0.0f}}; + fvec tint_color[3] = {{0.0f}, {0.0f}, {0.0f}}; - const simd_fvec base_color_lum = lum(base_color); + const fvec base_color_lum = lum(base_color); UNROLLED_FOR(i, 3, { where(base_color_lum > 0.0f, tint_color[i]) = safe_div_pos(base_color[i], base_color_lum); }) - simd_fvec roughness; + fvec roughness; { // fetch material roughness const int *roughnesses = reinterpret_cast(&sc.materials[0].roughness_unorm); 
roughness = conv_unorm_16(gather(roughnesses, mat_index * MatDWORDStride) & 0xffff); const int *roughness_textures = reinterpret_cast(&sc.materials[0].textures[ROUGH_TEXTURE]); - const simd_ivec roughness_texture = gather(roughness_textures, mat_index * MatDWORDStride); + const ivec roughness_texture = gather(roughness_textures, mat_index * MatDWORDStride); - const simd_ivec has_texture = (roughness_texture != -1) & is_active_lane; + const ivec has_texture = (roughness_texture != -1) & is_active_lane; if (has_texture.not_all_zeros()) { - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = has_texture; int index = 0, num = 1; @@ -7521,18 +7521,18 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con const long mask = ray_queue[index].movemask(); const uint32_t first_t = roughness_texture[GetFirstBit(mask)]; - const simd_ivec same_t = (roughness_texture == first_t); - const simd_ivec diff_t = and_not(same_t, ray_queue[index]); + const ivec same_t = (roughness_texture == first_t); + const ivec diff_t = and_not(same_t, ray_queue[index]); if (diff_t.not_all_zeros()) { ray_queue[index] &= same_t; ray_queue[num++] = diff_t; } - const simd_fvec roughness_lod = get_texture_lod(textures, first_t, lambda, ray_queue[index]); + const fvec roughness_lod = get_texture_lod(textures, first_t, lambda, ray_queue[index]); - simd_fvec roughness_color[4] = {}; - SampleBilinear(textures, first_t, surf.uvs, simd_ivec(roughness_lod), tex_rand, ray_queue[index], + fvec roughness_color[4] = {}; + SampleBilinear(textures, first_t, surf.uvs, ivec(roughness_lod), tex_rand, ray_queue[index], roughness_color); if (first_t & TEX_SRGB_BIT) { srgb_to_rgb(roughness_color, roughness_color); @@ -7544,12 +7544,12 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con } } - simd_fvec col[3] = {0.0f, 0.0f, 0.0f}; + fvec col[3] = {0.0f, 0.0f, 0.0f}; - simd_fvec rand_uv[2]; + fvec rand_uv[2]; get_scrambled_2d_rand(rand_dim + RAND_DIM_BSDF, 
rand_hash, iteration - 1, rand_seq, rand_uv); - simd_ivec secondary_mask = {0}, shadow_mask = {0}; + ivec secondary_mask = {0}, shadow_mask = {0}; ray_data_t &new_ray = out_secondary_rays[*out_secondary_rays_count]; new_ray.o[0] = new_ray.o[1] = new_ray.o[2] = 0.0f; @@ -7560,29 +7560,29 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con new_ray.cone_width = cone_width; new_ray.cone_spread = ray.cone_spread; new_ray.xy = ray.xy; - new_ray.depth = simd_uvec{0u}; + new_ray.depth = uvec{0u}; shadow_ray_t &sh_r = out_shadow_rays[*out_shadow_rays_count]; sh_r = {}; sh_r.depth = ray.depth; sh_r.xy = ray.xy; - simd_fvec regularize_alpha = 0.0f; + fvec regularize_alpha = 0.0f; where(get_diff_depth(ray.depth) > 0, regularize_alpha) = ps.regularize_alpha; { // Sample materials - simd_ivec ray_queue[S]; + ivec ray_queue[S]; ray_queue[0] = is_active_lane; - simd_ivec lanes_processed = 0; + ivec lanes_processed = 0; int index = 0, num = 1; while (index != num) { const long mask = ray_queue[index].movemask(); const uint32_t first_mi = mat_index[GetFirstBit(mask)]; - const simd_ivec same_mi = (mat_index == first_mi); - const simd_ivec diff_mi = and_not(same_mi, ray_queue[index]); + const ivec same_mi = (mat_index == first_mi); + const ivec diff_mi = and_not(same_mi, ray_queue[index]); if (diff_mi.not_all_zeros()) { ray_queue[index] &= same_mi; @@ -7595,14 +7595,14 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con const material_t *mat = &sc.materials[first_mi]; if (mat->type == eShadingNode::Diffuse) { #if USE_NEE - const simd_ivec eval_light = simd_cast(ls.pdf > 0.0f) & simd_cast(N_dot_L > 0.0f) & ray_queue[index]; + const ivec eval_light = simd_cast(ls.pdf > 0.0f) & simd_cast(N_dot_L > 0.0f) & ray_queue[index]; if (eval_light.not_all_zeros()) { assert((shadow_mask & eval_light).all_zeros()); shadow_mask |= Evaluate_DiffuseNode(ls, ray, eval_light, surf, base_color, roughness, mix_weight, (total_depth < 
ps.max_total_depth), col, sh_r); } #endif - const simd_ivec gen_ray = + const ivec gen_ray = (diff_depth < ps.max_diff_depth) & (total_depth < ps.max_total_depth) & ray_queue[index]; if (gen_ray.not_all_zeros()) { Sample_DiffuseNode(ray, gen_ray, surf, base_color, roughness, rand_uv[0], rand_uv[1], mix_weight, @@ -7616,75 +7616,75 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con const float spec_F0 = fresnel_dielectric_cos(1.0f, spec_ior); #if USE_NEE - const simd_ivec eval_light = simd_cast(ls.pdf > 0.0f) & simd_cast(N_dot_L > 0.0f) & ray_queue[index]; + const ivec eval_light = simd_cast(ls.pdf > 0.0f) & simd_cast(N_dot_L > 0.0f) & ray_queue[index]; if (eval_light.not_all_zeros()) { assert((shadow_mask & eval_light).all_zeros()); shadow_mask |= Evaluate_GlossyNode(ls, ray, eval_light, surf, base_color, roughness, - regularize_alpha, simd_fvec{spec_ior}, simd_fvec{spec_F0}, + regularize_alpha, fvec{spec_ior}, fvec{spec_F0}, mix_weight, (total_depth < ps.max_total_depth), col, sh_r); }; #endif - const simd_ivec gen_ray = + const ivec gen_ray = (spec_depth < ps.max_spec_depth) & (total_depth < ps.max_total_depth) & ray_queue[index]; if (gen_ray.not_all_zeros()) { Sample_GlossyNode(ray, gen_ray, surf, base_color, roughness, regularize_alpha, - simd_fvec{spec_ior}, simd_fvec{spec_F0}, rand_uv, mix_weight, new_ray); + fvec{spec_ior}, fvec{spec_F0}, rand_uv, mix_weight, new_ray); assert((secondary_mask & gen_ray).all_zeros()); secondary_mask |= gen_ray; } } else if (mat->type == eShadingNode::Refractive) { #if USE_NEE - const simd_ivec eval_light = simd_cast(ls.pdf > 0.0f) & simd_cast(N_dot_L < 0.0f) & ray_queue[index]; + const ivec eval_light = simd_cast(ls.pdf > 0.0f) & simd_cast(N_dot_L < 0.0f) & ray_queue[index]; if (eval_light.not_all_zeros()) { assert((shadow_mask & eval_light).all_zeros()); - const simd_fvec eta = select(is_backfacing, mat->ior / ext_ior, ext_ior / mat->ior); + const fvec eta = select(is_backfacing, mat->ior / 
ext_ior, ext_ior / mat->ior); shadow_mask |= Evaluate_RefractiveNode(ls, ray, eval_light, surf, base_color, roughness, regularize_alpha, eta, mix_weight, (total_depth < ps.max_total_depth), col, sh_r); } #endif - const simd_ivec gen_ray = + const ivec gen_ray = (refr_depth < ps.max_refr_depth) & (total_depth < ps.max_total_depth) & ray_queue[index]; if (gen_ray.not_all_zeros()) { Sample_RefractiveNode(ray, gen_ray, surf, base_color, roughness, regularize_alpha, is_backfacing, - simd_fvec{mat->ior}, ext_ior, rand_uv, mix_weight, new_ray); + fvec{mat->ior}, ext_ior, rand_uv, mix_weight, new_ray); assert((secondary_mask & gen_ray).all_zeros()); secondary_mask |= gen_ray; } } else if (mat->type == eShadingNode::Emissive) { - simd_fvec mis_weight = 1.0f; + fvec mis_weight = 1.0f; #if USE_NEE if ((ray.depth & 0x00ffffff).not_all_zeros() && (mat->flags & MAT_FLAG_MULT_IMPORTANCE)) { #if USE_HIERARCHICAL_NEE - const simd_fvec pdf_factor = + const fvec pdf_factor = EvalTriLightFactor(surf.P, ray.o, ray_queue[index], tri_index, sc.lights, sc.light_wnodes); #else // USE_HIERARCHICAL_NEE const float pdf_factor = float(sc.li_indices.size()); #endif // USE_HIERARCHICAL_NEE - const simd_fvec v1[3] = {p2[0] - p1[0], p2[1] - p1[1], p2[2] - p1[2]}, + const fvec v1[3] = {p2[0] - p1[0], p2[1] - p1[1], p2[2] - p1[2]}, v2[3] = {p3[0] - p1[0], p3[1] - p1[1], p3[2] - p1[2]}; - simd_fvec light_forward[3]; + fvec light_forward[3]; cross(v1, v2, light_forward); TransformDirection(transform, light_forward); - const simd_fvec tri_area = 0.5f * normalize(light_forward); + const fvec tri_area = 0.5f * normalize(light_forward); - const simd_fvec cos_theta = abs(dot3(I, light_forward)); - const simd_ivec emissive_mask = - ray_queue[index] & simd_cast(cos_theta > 0.0f) & (simd_ivec(ray.depth & 0x00ffffff) != 0); + const fvec cos_theta = abs(dot3(I, light_forward)); + const ivec emissive_mask = + ray_queue[index] & simd_cast(cos_theta > 0.0f) & (ivec(ray.depth & 0x00ffffff) != 0); if 
(emissive_mask.not_all_zeros()) { #if USE_SPHERICAL_AREA_LIGHT_SAMPLING - simd_fvec light_pdf = + fvec light_pdf = SampleSphericalTriangle(ro_ls, p1, p2, p3, nullptr, nullptr) / pdf_factor; where(light_pdf == 0.0f, light_pdf) = safe_div_pos(inter.t * inter.t, tri_area * cos_theta * pdf_factor); #else // USE_SPHERICAL_AREA_LIGHT_SAMPLING - const simd_fvec light_pdf = + const fvec light_pdf = safe_div_pos(inter.t * inter.t, tri_area * cos_theta * pdf_factor); #endif // USE_SPHERICAL_AREA_LIGHT_SAMPLING - const simd_fvec &bsdf_pdf = ray.pdf; + const fvec &bsdf_pdf = ray.pdf; where(emissive_mask, mis_weight) = power_heuristic(bsdf_pdf, light_pdf); } @@ -7694,23 +7694,23 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con where(ray_queue[index], col[i]) += mix_weight * mis_weight * mat->strength * base_color[i]; }) } else if (mat->type == eShadingNode::Principled) { - simd_fvec metallic = unpack_unorm_16(mat->metallic_unorm); + fvec metallic = unpack_unorm_16(mat->metallic_unorm); if (mat->textures[METALLIC_TEXTURE] != 0xffffffff) { const uint32_t metallic_tex = mat->textures[METALLIC_TEXTURE]; - const simd_fvec metallic_lod = get_texture_lod(textures, metallic_tex, lambda, ray_queue[index]); - simd_fvec metallic_color[4] = {}; - SampleBilinear(textures, metallic_tex, surf.uvs, simd_ivec(metallic_lod), tex_rand, + const fvec metallic_lod = get_texture_lod(textures, metallic_tex, lambda, ray_queue[index]); + fvec metallic_color[4] = {}; + SampleBilinear(textures, metallic_tex, surf.uvs, ivec(metallic_lod), tex_rand, ray_queue[index], metallic_color); metallic *= metallic_color[0]; } - simd_fvec specular = unpack_unorm_16(mat->specular_unorm); + fvec specular = unpack_unorm_16(mat->specular_unorm); if (mat->textures[SPECULAR_TEXTURE] != 0xffffffff) { const uint32_t specular_tex = mat->textures[SPECULAR_TEXTURE]; - const simd_fvec specular_lod = get_texture_lod(textures, specular_tex, lambda, ray_queue[index]); - simd_fvec specular_color[4] 
= {}; - SampleBilinear(textures, specular_tex, surf.uvs, simd_ivec(specular_lod), tex_rand, + const fvec specular_lod = get_texture_lod(textures, specular_tex, lambda, ray_queue[index]); + fvec specular_color[4] = {}; + SampleBilinear(textures, specular_tex, surf.uvs, ivec(specular_lod), tex_rand, ray_queue[index], specular_color); if (specular_tex & TEX_SRGB_BIT) { srgb_to_rgb(specular_color, specular_color); @@ -7727,23 +7727,23 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con diff_params_t diff; UNROLLED_FOR(i, 3, { diff.base_color[i] = base_color[i]; }) UNROLLED_FOR(i, 3, - { diff.sheen_color[i] = sheen * mix(simd_fvec{1.0f}, tint_color[i], sheen_tint); }) + { diff.sheen_color[i] = sheen * mix(fvec{1.0f}, tint_color[i], sheen_tint); }) diff.roughness = roughness; spec_params_t spec; UNROLLED_FOR(i, 3, { - spec.tmp_col[i] = mix(simd_fvec{1.0f}, tint_color[i], unpack_unorm_16(mat->specular_tint_unorm)); + spec.tmp_col[i] = mix(fvec{1.0f}, tint_color[i], unpack_unorm_16(mat->specular_tint_unorm)); spec.tmp_col[i] = mix(specular * 0.08f * spec.tmp_col[i], base_color[i], metallic); }) spec.roughness = roughness; spec.ior = (2.0f / (1.0f - sqrt(0.08f * specular))) - 1.0f; - spec.F0 = fresnel_dielectric_cos(simd_fvec{1.0f}, spec.ior); + spec.F0 = fresnel_dielectric_cos(fvec{1.0f}, spec.ior); spec.anisotropy = unpack_unorm_16(mat->anisotropic_unorm); clearcoat_params_t coat; coat.roughness = clearcoat_roughness; coat.ior = (2.0f / (1.0f - sqrtf(0.08f * clearcoat))) - 1.0f; - coat.F0 = fresnel_dielectric_cos(simd_fvec{1.0f}, coat.ior); + coat.F0 = fresnel_dielectric_cos(fvec{1.0f}, coat.ior); transmission_params_t trans; trans.roughness = @@ -7754,20 +7754,20 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con trans.backfacing = is_backfacing; // Approximation of FH (using shading normal) - const simd_fvec FN = + const fvec FN = (fresnel_dielectric_cos(dot3(I, surf.N), spec.ior) - spec.F0) / (1.0f - 
spec.F0); - simd_fvec approx_spec_col[3]; - UNROLLED_FOR(i, 3, { approx_spec_col[i] = mix(spec.tmp_col[i], simd_fvec{1.0f}, FN); }) + fvec approx_spec_col[3]; + UNROLLED_FOR(i, 3, { approx_spec_col[i] = mix(spec.tmp_col[i], fvec{1.0f}, FN); }) - const simd_fvec spec_color_lum = lum(approx_spec_col); + const fvec spec_color_lum = lum(approx_spec_col); lobe_weights_t lobe_weights; - get_lobe_weights(mix(base_color_lum, simd_fvec{1.0f}, sheen), spec_color_lum, specular, metallic, + get_lobe_weights(mix(base_color_lum, fvec{1.0f}, sheen), spec_color_lum, specular, metallic, transmission, clearcoat, lobe_weights); #if USE_NEE - const simd_ivec eval_light = simd_cast(ls.pdf > 0.0f) & ray_queue[index]; + const ivec eval_light = simd_cast(ls.pdf > 0.0f) & ray_queue[index]; if (eval_light.not_all_zeros()) { assert((shadow_mask & eval_light).all_zeros()); shadow_mask |= Evaluate_PrincipledNode( @@ -7787,14 +7787,14 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con } #if USE_PATH_TERMINATION - const simd_ivec can_terminate_path = total_depth > int(ps.min_total_depth); + const ivec can_terminate_path = total_depth > int(ps.min_total_depth); #else - const simd_ivec can_terminate_path = 0; + const ivec can_terminate_path = 0; #endif - const simd_fvec lum = max(new_ray.c[0], max(new_ray.c[1], new_ray.c[2])); - const simd_fvec &p = mix_term_rand[1]; - const simd_fvec q = select(can_terminate_path, max(0.05f, 1.0f - lum), simd_fvec{0.0f}); + const fvec lum = max(new_ray.c[0], max(new_ray.c[1], new_ray.c[2])); + const fvec &p = mix_term_rand[1]; + const fvec q = select(can_terminate_path, max(0.05f, 1.0f - lum), fvec{0.0f}); secondary_mask &= simd_cast(p >= q) & simd_cast(lum > 0.0f) & simd_cast(new_ray.pdf > 0.0f); if (secondary_mask.not_all_zeros()) { @@ -7813,7 +7813,7 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con #if USE_NEE if (shadow_mask.not_all_zeros()) { // actual ray direction accouning for bias from 
both ends - simd_fvec to_light[3]; + fvec to_light[3]; UNROLLED_FOR(i, 3, { to_light[i] = ls.lp[i] - sh_r.o[i]; }) sh_r.dist = normalize(to_light); UNROLLED_FOR(i, 3, { where(shadow_mask, sh_r.d[i]) = to_light[i]; }) @@ -7828,7 +7828,7 @@ void Ray::NS::ShadeSurface(const pass_settings_t &ps, const float limits[2], con UNROLLED_FOR(i, 3, { where(is_active_lane, col[i]) = ray.c[i] * col[i]; }) - const simd_fvec sum = col[0] + col[1] + col[2]; + const fvec sum = col[0] + col[1] + col[2]; UNROLLED_FOR(i, 3, { where(sum > limits[1], col[i]) = safe_div_pos(col[i] * limits[1], sum); where(is_active_lane, out_rgba[i]) = col[i]; @@ -7850,12 +7850,12 @@ void Ray::NS::ShadePrimary(const pass_settings_t &ps, Span> const ray_data_t &r = rays[i]; const hit_data_t &inter = inters[i]; - simd_fvec col[4] = {}, base_color[3] = {}, depth_normal[4] = {}; + fvec col[4] = {}, base_color[3] = {}, depth_normal[4] = {}; ShadeSurface(ps, limits, rand_seq, rand_seed, iteration, inter, r, sc, node_index, textures, col, out_secondary_rays, out_secondary_rays_count, out_shadow_rays, out_shadow_rays_count, base_color, depth_normal); - const simd_uvec x = r.xy >> 16, y = r.xy & 0x0000FFFF; + const uvec x = r.xy >> 16, y = r.xy & 0x0000FFFF; // TODO: match layouts! 
UNROLLED_FOR_S(j, S, { @@ -7864,24 +7864,23 @@ void Ray::NS::ShadePrimary(const pass_settings_t &ps, Span> out_color[y.template get() * img_w + x.template get()].v[k] = col[k].template get(); }) { // base color - auto old_val = simd_fvec4(out_base_color[y.template get() * img_w + x.template get()].v, - simd_mem_aligned); - old_val += (simd_fvec4{base_color[0].template get(), base_color[1].template get(), + auto old_val = fvec4(out_base_color[y.template get() * img_w + x.template get()].v, vector_aligned); + old_val += (fvec4{base_color[0].template get(), base_color[1].template get(), base_color[2].template get(), 0.0f} - old_val) * mix_factor; old_val.store_to(out_base_color[y.template get() * img_w + x.template get()].v, - simd_mem_aligned); + vector_aligned); } { // depth-normals - auto old_val = simd_fvec4(out_depth_normals[y.template get() * img_w + x.template get()].v, - simd_mem_aligned); - old_val += (simd_fvec4{depth_normal[0].template get(), depth_normal[1].template get(), + auto old_val = fvec4(out_depth_normals[y.template get() * img_w + x.template get()].v, + vector_aligned); + old_val += (fvec4{depth_normal[0].template get(), depth_normal[1].template get(), depth_normal[2].template get(), depth_normal[3].template get()} - old_val) * mix_factor; old_val.store_to(out_depth_normals[y.template get() * img_w + x.template get()].v, - simd_mem_aligned); + vector_aligned); } } }) @@ -7901,21 +7900,21 @@ void Ray::NS::ShadeSecondary(const pass_settings_t &ps, const float clamp_direct const ray_data_t &r = rays[i]; const hit_data_t &inter = inters[i]; - simd_fvec col[4] = {0.0f}; + fvec col[4] = {0.0f}; Ray::NS::ShadeSurface(ps, limits, rand_seq, rand_seed, iteration, inter, r, sc, node_index, textures, col, out_secondary_rays, out_secondary_rays_count, out_shadow_rays, out_shadow_rays_count, - (simd_fvec *)nullptr, (simd_fvec *)nullptr); + (fvec *)nullptr, (fvec *)nullptr); - const simd_uvec x = r.xy >> 16, y = r.xy & 0x0000FFFF; + const uvec x = r.xy >> 16, y 
= r.xy & 0x0000FFFF; // TODO: match layouts! UNROLLED_FOR_S(j, S, { if (r.mask.template get()) { auto old_val = - simd_fvec4(out_color[y.template get() * img_w + x.template get()].v, simd_mem_aligned); + fvec4(out_color[y.template get() * img_w + x.template get()].v, vector_aligned); old_val += - simd_fvec4(col[0].template get(), col[1].template get(), col[2].template get(), 0.0f); - old_val.store_to(out_color[y.template get() * img_w + x.template get()].v, simd_mem_aligned); + fvec4(col[0].template get(), col[1].template get(), col[2].template get(), 0.0f); + old_val.store_to(out_color[y.template get() * img_w + x.template get()].v, vector_aligned); } }) } diff --git a/internal/CoreVK.h b/internal/CoreVK.h index 7dd108f61..713c71baf 100644 --- a/internal/CoreVK.h +++ b/internal/CoreVK.h @@ -20,7 +20,7 @@ #pragma message("Ray::Ref::simd_vec will not use SIMD") #endif -#include "simd/simd_vec.h" +#include "simd/simd.h" #undef USE_SSE2 #undef USE_NEON diff --git a/internal/RastState.h b/internal/RastState.h index 3a6c3a580..f7708bcd8 100644 --- a/internal/RastState.h +++ b/internal/RastState.h @@ -139,10 +139,10 @@ struct RastState { DepthBias depth_bias; // mutable, because they are part of dynamic state - /*mutable simd_ivec4 viewport; + /*mutable ivec4 viewport; mutable struct { bool enabled = false; - simd_ivec4 rect; + ivec4 rect; } scissor;*/ }; diff --git a/internal/RendererAVX.cpp b/internal/RendererAVX.cpp index 12d70b829..f6570afa2 100644 --- a/internal/RendererAVX.cpp +++ b/internal/RendererAVX.cpp @@ -12,40 +12,36 @@ namespace Ray { namespace Avx { template int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, + ivec 
*hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const bvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const tri_accel_t *tris, const uint32_t *tri_indices, - hit_data_t &inter); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const mtri_accel_t *mtris, const uint32_t *tri_indices, - hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, const 
mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const tri_accel_t *tris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const mtri_accel_t *mtris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); @@ -53,11 +49,12 @@ template bool Traverse_BLAS_WithStack_ClosestHit(const float ro[3], cons uint32_t node_index, const mtri_accel_t *mtris, const uint32_t *tri_indices, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); -template simd_ivec -Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t 
*tri_indices, int obj_index, hit_data_t &inter); +template ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, + uint32_t node_index, const tri_accel_t *tris, + const tri_mat_data_t *materials, + const uint32_t *tri_indices, int obj_index, + hit_data_t &inter); template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const wbvh_node_t *nodes, uint32_t node_index, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, @@ -65,20 +62,17 @@ template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const flo float &inter_v); template void SampleNearest(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_ivec &mask, simd_fvec out_rgba[4]); + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]); template void SampleBilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); template void SampleTrilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); -template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, - const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); +template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec dir[3], + float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]); class SIMDPolicy : public SIMDPolicyBase { 
protected: diff --git a/internal/RendererAVX2.cpp b/internal/RendererAVX2.cpp index 090c2746b..ec9021656 100644 --- a/internal/RendererAVX2.cpp +++ b/internal/RendererAVX2.cpp @@ -12,40 +12,36 @@ namespace Ray { namespace Avx2 { template int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, + ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const bvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const tri_accel_t *tris, const uint32_t *tri_indices, - hit_data_t &inter); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const mtri_accel_t *mtris, const uint32_t *tri_indices, - hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], 
int ray_type, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const tri_accel_t *tris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const mtri_accel_t *mtris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, 
const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); @@ -53,11 +49,12 @@ template bool Traverse_BLAS_WithStack_ClosestHit(const float ro[3], cons uint32_t node_index, const mtri_accel_t *mtris, const uint32_t *tri_indices, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); -template simd_ivec -Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, int obj_index, hit_data_t &inter); +template ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, + uint32_t node_index, const tri_accel_t *tris, + const tri_mat_data_t *materials, + const uint32_t *tri_indices, int obj_index, + hit_data_t &inter); template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const wbvh_node_t *nodes, uint32_t node_index, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, @@ -65,20 +62,17 @@ template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const flo float &inter_v); template void SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_ivec &mask, simd_fvec out_rgba[4]); + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]); template void SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); template void SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, 
- const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); -template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, - const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); +template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec dir[3], + float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]); class SIMDPolicy : public SIMDPolicyBase { protected: diff --git a/internal/RendererAVX512.cpp b/internal/RendererAVX512.cpp index 48d512210..4a2a23c17 100644 --- a/internal/RendererAVX512.cpp +++ b/internal/RendererAVX512.cpp @@ -12,40 +12,36 @@ namespace Ray { namespace Avx512 { template int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, + ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const bvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const tri_accel_t *tris, const uint32_t *tri_indices, - hit_data_t &inter); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - 
const uint32_t *mi_indices, const mesh_t *meshes, - const mtri_accel_t *mtris, const uint32_t *tri_indices, - hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, 
const mesh_t *meshes, + const tri_accel_t *tris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const mtri_accel_t *mtris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); @@ -53,11 +49,12 @@ template bool Traverse_BLAS_WithStack_ClosestHit(const float ro[3], cons uint32_t node_index, const mtri_accel_t *mtris, const uint32_t *tri_indices, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); -template simd_ivec -Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, int obj_index, hit_data_t &inter); +template ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, + uint32_t node_index, const tri_accel_t *tris, + const tri_mat_data_t *materials, + const uint32_t *tri_indices, int obj_index, + hit_data_t &inter); template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const wbvh_node_t *nodes, uint32_t node_index, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, @@ -65,20 +62,17 @@ template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const flo float &inter_v); template void SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, - 
const simd_fvec uvs[2], const simd_fvec &lod, - const simd_ivec &mask, simd_fvec out_rgba[4]); + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]); template void SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); template void SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); -template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, - const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); +template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec dir[3], + float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]); class SIMDPolicy : public SIMDPolicyBase { protected: diff --git a/internal/RendererCPU.h b/internal/RendererCPU.h index 0b77db20f..e8f8a043d 100644 --- a/internal/RendererCPU.h +++ b/internal/RendererCPU.h @@ -474,7 +474,7 @@ void Ray::Cpu::Renderer::RenderScene(const SceneBase *scene, RegionC &p.hash_values[0], &p.scan_values[0], &p.chunks[0], &p.chunks_temp[0]); #if 0 // debug hash values - static std::vector color_table; + static std::vector color_table; if (color_table.empty()) { for (int i = 0; i < 1024; i++) { color_table.emplace_back(float(rand()) / RAND_MAX, float(rand()) / RAND_MAX, float(rand()) / RAND_MAX); @@ -487,7 +487,7 @@ void Ray::Cpu::Renderer::RenderScene(const SceneBase *scene, RegionC const int x = r.id.x; const int y = r.id.y; - const simd_fvec3 &c = 
color_table[hash(p.hash_values[i]) % 1024]; + const fvec3 &c = color_table[hash(p.hash_values[i]) % 1024]; color_rgba_t col = { c[0], c[1], c[2], 1.0f }; temp_buf_.SetPixel(x, y, col); @@ -540,7 +540,7 @@ void Ray::Cpu::Renderer::RenderScene(const SceneBase *scene, RegionC tonemap_params.view_transform = cam.view_transform; tonemap_params.inv_gamma = (1.0f / cam.gamma); - Ref::simd_fvec4 exposure = std::pow(2.0f, cam.exposure); + Ref::fvec4 exposure = std::pow(2.0f, cam.exposure); exposure.set<3>(1.0f); const float variance_threshold = @@ -576,36 +576,36 @@ void Ray::Cpu::Renderer::RenderScene(const SceneBase *scene, RegionC continue; } - const auto new_val = Ref::simd_fvec4{temp_buf_[y * w_ + x].v, Ref::simd_mem_aligned} * exposure; + const auto new_val = Ref::fvec4{temp_buf_[y * w_ + x].v, Ref::vector_aligned} * exposure; // accumulate full buffer - Ref::simd_fvec4 cur_val_full = {full_buf_[y * w_ + x].v, Ref::simd_mem_aligned}; + Ref::fvec4 cur_val_full = {full_buf_[y * w_ + x].v, Ref::vector_aligned}; cur_val_full += (new_val - cur_val_full) * mix_factor; - cur_val_full.store_to(full_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + cur_val_full.store_to(full_buf_[y * w_ + x].v, Ref::vector_aligned); if (is_class_a) { // accumulate half buffer - Ref::simd_fvec4 cur_val_half = {half_buf_[y * w_ + x].v, Ref::simd_mem_aligned}; + Ref::fvec4 cur_val_half = {half_buf_[y * w_ + x].v, Ref::vector_aligned}; cur_val_half += (new_val - cur_val_half) * half_mix_factor; - cur_val_half.store_to(half_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + cur_val_half.store_to(half_buf_[y * w_ + x].v, Ref::vector_aligned); } } } for (int y = rect.y; y < rect.y + rect.h; ++y) { for (int x = rect.x; x < rect.x + rect.w; ++x) { - auto full_val = Ref::simd_fvec4{full_buf_[y * w_ + x].v, Ref::simd_mem_aligned}; - auto half_val = Ref::simd_fvec4{half_buf_[y * w_ + x].v, Ref::simd_mem_aligned}; + auto full_val = Ref::fvec4{full_buf_[y * w_ + x].v, Ref::vector_aligned}; + auto half_val = 
Ref::fvec4{half_buf_[y * w_ + x].v, Ref::vector_aligned}; // Store as denosed result until DenoiseImage method will be called - full_val.store_to(raw_filtered_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + full_val.store_to(raw_filtered_buf_[y * w_ + x].v, Ref::vector_aligned); - const Ref::simd_fvec4 tonemapped_res = Tonemap(tonemap_params, full_val); - tonemapped_res.store_to(final_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + const Ref::fvec4 tonemapped_res = Tonemap(tonemap_params, full_val); + tonemapped_res.store_to(final_buf_[y * w_ + x].v, Ref::vector_aligned); - const Ref::simd_fvec4 p1 = reversible_tonemap(2.0f * full_val - half_val); - const Ref::simd_fvec4 p2 = reversible_tonemap(half_val); + const Ref::fvec4 p1 = reversible_tonemap(2.0f * full_val - half_val); + const Ref::fvec4 p2 = reversible_tonemap(half_val); - const Ref::simd_fvec4 variance = 0.5f * (p1 - p2) * (p1 - p2); - variance.store_to(temp_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + const Ref::fvec4 variance = 0.5f * (p1 - p2) * (p1 - p2); + variance.store_to(temp_buf_[y * w_ + x].v, Ref::vector_aligned); #if DEBUG_ADAPTIVE_SAMPLING if (cam.pass_settings.variance_threshold != 0.0f && required_samples_[y * w_ + x] >= region.iteration && @@ -641,11 +641,11 @@ template void Ray::Cpu::Renderer::DenoiseImage p.feature_buf2.resize(rect_ext.w * rect_ext.h); #define FETCH_FINAL_BUF(_x, _y) \ - Ref::simd_fvec4(full_buf_[std::min(std::max(_y, 0), h_ - 1) * w_ + std::min(std::max(_x, 0), w_ - 1)].v, \ - Ref::simd_mem_aligned) + Ref::fvec4(full_buf_[std::min(std::max(_y, 0), h_ - 1) * w_ + std::min(std::max(_x, 0), w_ - 1)].v, \ + Ref::vector_aligned) #define FETCH_VARIANCE(_x, _y) \ - Ref::simd_fvec4(temp_buf_[std::min(std::max(_y, 0), h_ - 1) * w_ + std::min(std::max(_x, 0), w_ - 1)].v, \ - Ref::simd_mem_aligned) + Ref::fvec4(temp_buf_[std::min(std::max(_y, 0), h_ - 1) * w_ + std::min(std::max(_x, 0), w_ - 1)].v, \ + Ref::vector_aligned) static const float GaussWeights[] = {0.2270270270f, 
0.1945945946f, 0.1216216216f, 0.0540540541f, 0.0162162162f}; @@ -653,19 +653,19 @@ template void Ray::Cpu::Renderer::DenoiseImage const int yy = rect_ext.y + y; for (int x = 0; x < rect_ext.w; ++x) { const int xx = rect_ext.x + x; - const Ref::simd_fvec4 center_col = reversible_tonemap(FETCH_FINAL_BUF(xx, yy)); - center_col.store_to(p.temp_final_buf[y * rect_ext.w + x].v, Ref::simd_mem_aligned); + const Ref::fvec4 center_col = reversible_tonemap(FETCH_FINAL_BUF(xx, yy)); + center_col.store_to(p.temp_final_buf[y * rect_ext.w + x].v, Ref::vector_aligned); - const Ref::simd_fvec4 center_val = FETCH_VARIANCE(xx, yy); + const Ref::fvec4 center_val = FETCH_VARIANCE(xx, yy); - Ref::simd_fvec4 res = center_val * GaussWeights[0]; + Ref::fvec4 res = center_val * GaussWeights[0]; UNROLLED_FOR(i, 4, { res += FETCH_VARIANCE(xx - i + 1, yy) * GaussWeights[i + 1]; res += FETCH_VARIANCE(xx + i + 1, yy) * GaussWeights[i + 1]; }) res = max(res, center_val); - res.store_to(p.variance_buf[y * rect_ext.w + x].v, Ref::simd_mem_aligned); + res.store_to(p.variance_buf[y * rect_ext.w + x].v, Ref::vector_aligned); } } @@ -679,18 +679,18 @@ template void Ray::Cpu::Renderer::DenoiseImage for (int y = 4; y < rect_ext.h - 4; ++y) { for (int x = 4; x < rect_ext.w - 4; ++x) { - const Ref::simd_fvec4 center_val = {p.variance_buf[(y + 0) * rect_ext.w + x].v, Ref::simd_mem_aligned}; + const Ref::fvec4 center_val = {p.variance_buf[(y + 0) * rect_ext.w + x].v, Ref::vector_aligned}; - Ref::simd_fvec4 res = center_val * GaussWeights[0]; + Ref::fvec4 res = center_val * GaussWeights[0]; UNROLLED_FOR(i, 4, { - res += Ref::simd_fvec4(p.variance_buf[(y - i + 1) * rect_ext.w + x].v, Ref::simd_mem_aligned) * + res += Ref::fvec4(p.variance_buf[(y - i + 1) * rect_ext.w + x].v, Ref::vector_aligned) * GaussWeights[i + 1]; - res += Ref::simd_fvec4(p.variance_buf[(y + i + 1) * rect_ext.w + x].v, Ref::simd_mem_aligned) * + res += Ref::fvec4(p.variance_buf[(y + i + 1) * rect_ext.w + x].v, Ref::vector_aligned) * 
GaussWeights[i + 1]; }) res = max(res, center_val); - res.store_to(p.filtered_variance_buf[y * rect_ext.w + x].v, Ref::simd_mem_aligned); + res.store_to(p.filtered_variance_buf[y * rect_ext.w + x].v, Ref::vector_aligned); p.feature_buf1[y * rect_ext.w + x] = FETCH_BASE_COLOR(rect_ext.x + x, rect_ext.y + y); p.feature_buf2[y * rect_ext.w + x] = FETCH_DEPTH_NORMALS(rect_ext.x + x, rect_ext.y + y); @@ -714,8 +714,8 @@ template void Ray::Cpu::Renderer::DenoiseImage for (int x = 0; x < rect.w; ++x) { const int xx = rect.x + x; - const Ref::simd_fvec4 variance = { - p.filtered_variance_buf[(y + EXT_RADIUS) * rect_ext.w + (x + EXT_RADIUS)].v, Ref::simd_mem_aligned}; + const Ref::fvec4 variance = { + p.filtered_variance_buf[(y + EXT_RADIUS) * rect_ext.w + (x + EXT_RADIUS)].v, Ref::vector_aligned}; if (simd_cast(variance >= variance_threshold).not_all_zeros()) { required_samples_[yy * w_ + xx] = region.iteration + 1; } @@ -734,11 +734,11 @@ template void Ray::Cpu::Renderer::DenoiseImage for (int y = rect.y; y < rect.y + rect.h; ++y) { for (int x = rect.x; x < rect.x + rect.w; ++x) { - auto col = Ref::simd_fvec4(raw_filtered_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + auto col = Ref::fvec4(raw_filtered_buf_[y * w_ + x].v, Ref::vector_aligned); col = Ref::reversible_tonemap_invert(col); - col.store_to(raw_filtered_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + col.store_to(raw_filtered_buf_[y * w_ + x].v, Ref::vector_aligned); col = Tonemap(tonemap_params, col); - col.store_to(final_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + col.store_to(final_buf_[y * w_ + x].v, Ref::vector_aligned); } } @@ -951,9 +951,9 @@ void Ray::Cpu::Renderer::DenoiseImage(const int pass, const RegionCo for (int y = r.y; y < r.y + r.h; ++y) { for (int x = r.x; x < r.x + r.w; ++x) { - auto col = Ref::simd_fvec4(raw_filtered_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + auto col = Ref::fvec4(raw_filtered_buf_[y * w_ + x].v, Ref::vector_aligned); col = Tonemap(tonemap_params, col); - 
col.store_to(final_buf_[y * w_ + x].v, Ref::simd_mem_aligned); + col.store_to(final_buf_[y * w_ + x].v, Ref::vector_aligned); } } diff --git a/internal/RendererNEON.cpp b/internal/RendererNEON.cpp index 4bc862180..9966cb12a 100644 --- a/internal/RendererNEON.cpp +++ b/internal/RendererNEON.cpp @@ -12,40 +12,36 @@ namespace Ray { namespace Neon { template int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, + ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const bvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const tri_accel_t *tris, const uint32_t *tri_indices, - hit_data_t &inter); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const mtri_accel_t *mtris, const uint32_t *tri_indices, - hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, 
hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const tri_accel_t *tris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const mtri_accel_t *mtris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t 
&inter); +template bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); @@ -53,11 +49,12 @@ template bool Traverse_BLAS_WithStack_ClosestHit(const float ro[3], cons uint32_t node_index, const mtri_accel_t *mtris, const uint32_t *tri_indices, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); -template simd_ivec -Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, int obj_index, hit_data_t &inter); +template ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, + uint32_t node_index, const tri_accel_t *tris, + const tri_mat_data_t *materials, + const uint32_t *tri_indices, int obj_index, + hit_data_t &inter); template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const wbvh_node_t *nodes, uint32_t node_index, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, @@ -65,20 +62,17 @@ template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const flo float &inter_v); template void SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_ivec &mask, simd_fvec out_rgba[4]); + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]); template void SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); template void 
SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); -template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, - const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); +template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec dir[3], + float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]); class SIMDPolicy : public SIMDPolicyBase { protected: diff --git a/internal/RendererSSE2.cpp b/internal/RendererSSE2.cpp index cfccf44e1..7677ec7d5 100644 --- a/internal/RendererSSE2.cpp +++ b/internal/RendererSSE2.cpp @@ -12,39 +12,36 @@ namespace Ray { namespace Sse2 { template int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, + ivec *hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const bvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const tri_accel_t *tris, const uint32_t *tri_indices, - hit_data_t &inter); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const 
simd_ivec &ray_mask, const wbvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const mtri_accel_t *mtris, const uint32_t *tri_indices, - hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template simd_ivec Traverse_TLAS_WithStack_AnyHit( - const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, const simd_ivec &ray_mask, - const wbvh_node_t *oct_nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + 
const bvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const tri_accel_t *tris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const wbvh_node_t *oct_nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const mtri_accel_t *mtris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); @@ -52,11 +49,12 @@ template bool Traverse_BLAS_WithStack_ClosestHit(const float ro[3], cons uint32_t node_index, const mtri_accel_t *mtris, const uint32_t *tri_indices, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); -template simd_ivec -Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, int obj_index, hit_data_t &inter); +template ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, + uint32_t node_index, const tri_accel_t *tris, + const tri_mat_data_t *materials, + const uint32_t *tri_indices, int obj_index, + hit_data_t &inter); template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const wbvh_node_t *nodes, uint32_t node_index, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, @@ -64,20 +62,17 @@ template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], 
const flo float &inter_v); template void SampleNearest(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_ivec &mask, simd_fvec out_rgba[4]); + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]); template void SampleBilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); template void SampleTrilinear(const Cpu::TexStorageBase *const textures[], uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); -template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, - const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); +template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &storage, uint32_t index, const fvec dir[3], + float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]); class SIMDPolicy : public SIMDPolicyBase { protected: diff --git a/internal/RendererSSE41.cpp b/internal/RendererSSE41.cpp index 9a1e3f423..82963bb75 100644 --- a/internal/RendererSSE41.cpp +++ b/internal/RendererSSE41.cpp @@ -12,40 +12,36 @@ namespace Ray { namespace Sse41 { template int SortRays_CPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, + ivec *hash_values, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp); template int SortRays_GPU(Span> rays, const float root_min[3], const float cell_size[3], - simd_ivec *hash_values, int *head_flags, uint32_t *scan_values, + ivec 
*hash_values, int *head_flags, uint32_t *scan_values, ray_chunk_t *chunks, ray_chunk_t *chunks_temp, uint32_t *skeleton); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const bvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const tri_accel_t *tris, const uint32_t *tri_indices, - hit_data_t &inter); -template bool Traverse_TLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_uvec &ray_flags, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, - uint32_t node_index, const mesh_instance_t *mesh_instances, - const uint32_t *mi_indices, const mesh_t *meshes, - const mtri_accel_t *mtris, const uint32_t *tri_indices, - hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template simd_ivec -Traverse_TLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], int ray_type, - const simd_ivec &ray_mask, const wbvh_node_t *nodes, uint32_t node_index, - const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, - const mesh_t *meshes, const mtri_accel_t *mtris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, hit_data_t &inter); -template bool Traverse_BLAS_WithStack_ClosestHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, const 
mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const tri_accel_t *tris, const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_TLAS_WithStack_ClosestHit( + const fvec ro[3], const fvec rd[3], const uvec &ray_flags, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, const mesh_instance_t *mesh_instances, const uint32_t *mi_indices, + const mesh_t *meshes, const mtri_accel_t *mtris, const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const bvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const tri_accel_t *tris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template ivec Traverse_TLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + int ray_type, const ivec &ray_mask, + const wbvh_node_t *nodes, uint32_t node_index, + const mesh_instance_t *mesh_instances, + const uint32_t *mi_indices, const mesh_t *meshes, + const mtri_accel_t *mtris, const tri_mat_data_t *materials, + const uint32_t *tri_indices, hit_data_t &inter); +template bool Traverse_BLAS_WithStack_ClosestHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, const tri_accel_t *tris, const uint32_t *tri_indices, int obj_index, hit_data_t &inter); @@ -54,11 +50,12 @@ template bool Traverse_BLAS_WithStack_ClosestHit(const float ro[3], cons const mtri_accel_t *mtris, const uint32_t *tri_indices, int &inter_prim_index, float &inter_t, float &inter_u, float &inter_v); -template simd_ivec -Traverse_BLAS_WithStack_AnyHit(const simd_fvec ro[3], const simd_fvec rd[3], - const simd_ivec &ray_mask, const bvh_node_t *nodes, uint32_t node_index, - const tri_accel_t *tris, const tri_mat_data_t *materials, - const uint32_t *tri_indices, int 
obj_index, hit_data_t &inter); +template ivec Traverse_BLAS_WithStack_AnyHit(const fvec ro[3], const fvec rd[3], + const ivec &ray_mask, const bvh_node_t *nodes, + uint32_t node_index, const tri_accel_t *tris, + const tri_mat_data_t *materials, + const uint32_t *tri_indices, int obj_index, + hit_data_t &inter); template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const float rd[3], const wbvh_node_t *wnodes, uint32_t node_index, const mtri_accel_t *mtris, const tri_mat_data_t *materials, const uint32_t *tri_indices, @@ -66,20 +63,17 @@ template int Traverse_BLAS_WithStack_AnyHit(const float ro[3], const flo float &inter_v); template void SampleNearest(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_ivec &mask, simd_fvec out_rgba[4]); + const fvec uvs[2], const fvec &lod, const ivec &mask, + fvec out_rgba[4]); template void SampleBilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_ivec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); + const fvec uvs[2], const ivec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); template void SampleTrilinear(const Cpu::TexStorageBase *const textures[], const uint32_t index, - const simd_fvec uvs[2], const simd_fvec &lod, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgba[4]); -template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &atlas, uint32_t index, - const simd_fvec dir[3], float y_rotation, - const simd_fvec rand[2], const simd_ivec &mask, - simd_fvec out_rgb[3]); + const fvec uvs[2], const fvec &lod, const fvec rand[2], + const ivec &mask, fvec out_rgba[4]); +template void SampleLatlong_RGBE(const Cpu::TexStorageRGBA &atlas, uint32_t index, const fvec dir[3], + float y_rotation, const fvec rand[2], const ivec &mask, + fvec out_rgb[3]); class SIMDPolicy : public SIMDPolicyBase { protected: diff --git 
a/internal/SceneCPU.cpp b/internal/SceneCPU.cpp index 759d5f607..111fd2c61 100644 --- a/internal/SceneCPU.cpp +++ b/internal/SceneCPU.cpp @@ -15,8 +15,8 @@ namespace Ray { namespace Cpu { template T clamp(T val, T min, T max) { return (val < min ? min : (val > max ? max : val)); } -Ref::simd_fvec4 cross(const Ref::simd_fvec4 &v1, const Ref::simd_fvec4 &v2) { - return Ref::simd_fvec4{v1.get<1>() * v2.get<2>() - v1.get<2>() * v2.get<1>(), +Ref::fvec4 cross(const Ref::fvec4 &v1, const Ref::fvec4 &v2) { + return Ref::fvec4{v1.get<1>() * v2.get<2>() - v1.get<2>() * v2.get<1>(), v1.get<2>() * v2.get<0>() - v1.get<0>() * v2.get<2>(), v1.get<0>() * v2.get<1>() - v1.get<1>() * v2.get<0>(), 0.0f}; } @@ -651,8 +651,8 @@ Ray::LightHandle Ray::Cpu::Scene::AddLight(const rect_light_desc_t &_l, const fl l.rect.area = _l.width * _l.height; - const Ref::simd_fvec4 uvec = _l.width * TransformDirection(Ref::simd_fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); - const Ref::simd_fvec4 vvec = _l.height * TransformDirection(Ref::simd_fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); + const Ref::fvec4 uvec = _l.width * TransformDirection(Ref::fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 vvec = _l.height * TransformDirection(Ref::fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); memcpy(l.rect.u, value_ptr(uvec), 3 * sizeof(float)); memcpy(l.rect.v, value_ptr(vvec), 3 * sizeof(float)); @@ -680,8 +680,8 @@ Ray::LightHandle Ray::Cpu::Scene::AddLight(const disk_light_desc_t &_l, const fl l.disk.area = 0.25f * PI * _l.size_x * _l.size_y; - const Ref::simd_fvec4 uvec = _l.size_x * TransformDirection(Ref::simd_fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); - const Ref::simd_fvec4 vvec = _l.size_y * TransformDirection(Ref::simd_fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); + const Ref::fvec4 uvec = _l.size_x * TransformDirection(Ref::fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 vvec = _l.size_y * TransformDirection(Ref::fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); memcpy(l.disk.u, value_ptr(uvec), 3 * sizeof(float)); 
memcpy(l.disk.v, value_ptr(vvec), 3 * sizeof(float)); @@ -709,8 +709,8 @@ Ray::LightHandle Ray::Cpu::Scene::AddLight(const line_light_desc_t &_l, const fl l.line.area = 2.0f * PI * _l.radius * _l.height; - const Ref::simd_fvec4 uvec = TransformDirection(Ref::simd_fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); - const Ref::simd_fvec4 vvec = TransformDirection(Ref::simd_fvec4{0.0f, 1.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 uvec = TransformDirection(Ref::fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 vvec = TransformDirection(Ref::fvec4{0.0f, 1.0f, 0.0f, 0.0f}, xform); memcpy(l.line.u, value_ptr(uvec), 3 * sizeof(float)); l.line.radius = _l.radius; @@ -903,8 +903,8 @@ void Ray::Cpu::Scene::RebuildTLAS_nolock() { primitives.reserve(mesh_instances_.size()); for (const mesh_instance_t &mi : mesh_instances_) { - primitives.push_back({0, 0, 0, Ref::simd_fvec4{mi.bbox_min[0], mi.bbox_min[1], mi.bbox_min[2], 0.0f}, - Ref::simd_fvec4{mi.bbox_max[0], mi.bbox_max[1], mi.bbox_max[2], 0.0f}}); + primitives.push_back({0, 0, 0, Ref::fvec4{mi.bbox_min[0], mi.bbox_min[1], mi.bbox_min[2], 0.0f}, + Ref::fvec4{mi.bbox_max[0], mi.bbox_max[1], mi.bbox_max[2], 0.0f}}); } std::vector temp_nodes; @@ -995,7 +995,7 @@ void Ray::Cpu::Scene::PrepareSkyEnvMap_nolock( void Ray::Cpu::Scene::PrepareEnvMapQTree_nolock() { const int tex = int(env_.env_map & 0x00ffffff); - Ref::simd_ivec2 size; + Ref::ivec2 size; tex_storage_rgba_.GetIRes(tex, 0, value_ptr(size)); const int lowest_dim = std::min(size[0], size[1]); @@ -1016,7 +1016,7 @@ void Ray::Cpu::Scene::PrepareEnvMapQTree_nolock() { for (int y = 0; y < size[1]; ++y) { for (int x = 0; x < size[0]; ++x) { const color_rgba8_t col_rgbe = tex_storage_rgba_.Get(tex, x, y, 0); - const Ref::simd_fvec4 col_rgb = Ref::rgbe_to_rgb(col_rgbe); + const Ref::fvec4 col_rgb = Ref::rgbe_to_rgb(col_rgbe); const float cur_lum = (col_rgb[0] + col_rgb[1] + col_rgb[2]); @@ -1025,9 +1025,9 @@ void Ray::Cpu::Scene::PrepareEnvMapQTree_nolock() { for (int ii = -1; ii 
<= 1; ++ii) { const float phi = 2.0f * PI * float(x + ii) / float(size[0]); - auto dir = Ref::simd_fvec4{sinf(theta) * cosf(phi), cosf(theta), sinf(theta) * sinf(phi), 0.0f}; + auto dir = Ref::fvec4{sinf(theta) * cosf(phi), cosf(theta), sinf(theta) * sinf(phi), 0.0f}; - Ref::simd_fvec2 q; + Ref::fvec2 q; DirToCanonical(value_ptr(dir), 0.0f, value_ptr(q)); int qx = clamp(int(cur_res * q[0]), 0, cur_res - 1); @@ -1041,7 +1041,7 @@ void Ray::Cpu::Scene::PrepareEnvMapQTree_nolock() { qy /= 2; auto &qvec = - reinterpret_cast(env_map_qtree_.mips[0][4 * (qy * cur_res / 2 + qx)]); + reinterpret_cast(env_map_qtree_.mips[0][4 * (qy * cur_res / 2 + qx)]); qvec.set(index, fmaxf(qvec[index], cur_lum)); } } @@ -1060,7 +1060,7 @@ void Ray::Cpu::Scene::PrepareEnvMapQTree_nolock() { while (cur_res > 1) { env_map_qtree_.mips.emplace_back(cur_res * cur_res, 0.0f); const auto *prev_mip = - reinterpret_cast(env_map_qtree_.mips[env_map_qtree_.mips.size() - 2].data()); + reinterpret_cast(env_map_qtree_.mips[env_map_qtree_.mips.size() - 2].data()); for (int y = 0; y < cur_res; ++y) { for (int x = 0; x < cur_res; ++x) { @@ -1091,12 +1091,12 @@ void Ray::Cpu::Scene::PrepareEnvMapQTree_nolock() { int the_last_required_lod = 0; for (int lod = int(env_map_qtree_.mips.size()) - 1; lod >= 0; --lod) { the_last_required_lod = lod; - const auto *cur_mip = reinterpret_cast(env_map_qtree_.mips[lod].data()); + const auto *cur_mip = reinterpret_cast(env_map_qtree_.mips[lod].data()); bool subdivision_required = false; for (int y = 0; y < (cur_res / 2) && !subdivision_required; ++y) { for (int x = 0; x < (cur_res / 2) && !subdivision_required; ++x) { - const Ref::simd_ivec4 mask = simd_cast(cur_mip[y * cur_res / 2 + x] > LumFractThreshold * total_lum); + const Ref::ivec4 mask = simd_cast(cur_mip[y * cur_res / 2 + x] > LumFractThreshold * total_lum); subdivision_required |= mask.not_all_zeros(); } } @@ -1137,7 +1137,7 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { primitives.reserve(lights_.size()); 
struct additional_data_t { - Ref::simd_fvec4 axis; + Ref::fvec4 axis; float flux, omega_n, omega_e; }; aligned_vector additional_data; @@ -1153,7 +1153,7 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { continue; } - Ref::simd_fvec4 bbox_min = 0.0f, bbox_max = 0.0f, axis = {0.0f, 1.0f, 0.0f, 0.0f}; + Ref::fvec4 bbox_min = 0.0f, bbox_max = 0.0f, axis = {0.0f, 1.0f, 0.0f, 0.0f}; float area = 1.0f, omega_n = 0.0f, omega_e = 0.0f; float lum = l.col[0] + l.col[1] + l.col[2]; @@ -1167,10 +1167,10 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { switch (l.type) { case LIGHT_TYPE_SPHERE: { - const auto pos = Ref::simd_fvec4{l.sph.pos[0], l.sph.pos[1], l.sph.pos[2], 0.0f}; + const auto pos = Ref::fvec4{l.sph.pos[0], l.sph.pos[1], l.sph.pos[2], 0.0f}; - bbox_min = pos - Ref::simd_fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; - bbox_max = pos + Ref::simd_fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; + bbox_min = pos - Ref::fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; + bbox_max = pos + Ref::fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; if (l.sph.area != 0.0f) { area = l.sph.area; } @@ -1178,9 +1178,9 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { omega_e = PI / 2.0f; } break; case LIGHT_TYPE_DIR: { - bbox_min = Ref::simd_fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; - bbox_max = Ref::simd_fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; - axis = Ref::simd_fvec4{l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f}; + bbox_min = Ref::fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; + bbox_max = Ref::fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; + axis = Ref::fvec4{l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f}; omega_n = 0.0f; // single normal omega_e = l.dir.angle; if (l.dir.angle != 0.0f) { @@ -1189,16 +1189,16 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { } } break; case LIGHT_TYPE_LINE: { - const auto pos = Ref::simd_fvec4{l.line.pos[0], l.line.pos[1], l.line.pos[2], 0.0f}; - auto light_u = Ref::simd_fvec4{l.line.u[0], l.line.u[1], 
l.line.u[2], 0.0f}, - light_dir = Ref::simd_fvec4{l.line.v[0], l.line.v[1], l.line.v[2], 0.0f}; - Ref::simd_fvec4 light_v = Ray::Cpu::cross(light_u, light_dir); + const auto pos = Ref::fvec4{l.line.pos[0], l.line.pos[1], l.line.pos[2], 0.0f}; + auto light_u = Ref::fvec4{l.line.u[0], l.line.u[1], l.line.u[2], 0.0f}, + light_dir = Ref::fvec4{l.line.v[0], l.line.v[1], l.line.v[2], 0.0f}; + Ref::fvec4 light_v = Ray::Cpu::cross(light_u, light_dir); light_u *= l.line.radius; light_v *= l.line.radius; light_dir *= 0.5f * l.line.height; - const Ref::simd_fvec4 p0 = pos + light_dir + light_u + light_v, p1 = pos + light_dir + light_u - light_v, + const Ref::fvec4 p0 = pos + light_dir + light_u + light_v, p1 = pos + light_dir + light_u - light_v, p2 = pos + light_dir - light_u + light_v, p3 = pos + light_dir - light_u - light_v, p4 = pos - light_dir + light_u + light_v, p5 = pos - light_dir + light_u - light_v, p6 = pos - light_dir - light_u + light_v, p7 = pos - light_dir - light_u - light_v; @@ -1210,11 +1210,11 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { omega_e = PI / 2.0f; } break; case LIGHT_TYPE_RECT: { - const auto pos = Ref::simd_fvec4{l.rect.pos[0], l.rect.pos[1], l.rect.pos[2], 0.0f}; - const auto u = 0.5f * Ref::simd_fvec4{l.rect.u[0], l.rect.u[1], l.rect.u[2], 0.0f}; - const auto v = 0.5f * Ref::simd_fvec4{l.rect.v[0], l.rect.v[1], l.rect.v[2], 0.0f}; + const auto pos = Ref::fvec4{l.rect.pos[0], l.rect.pos[1], l.rect.pos[2], 0.0f}; + const auto u = 0.5f * Ref::fvec4{l.rect.u[0], l.rect.u[1], l.rect.u[2], 0.0f}; + const auto v = 0.5f * Ref::fvec4{l.rect.v[0], l.rect.v[1], l.rect.v[2], 0.0f}; - const Ref::simd_fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; + const Ref::fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; bbox_min = min(min(p0, p1), min(p2, p3)); bbox_max = max(max(p0, p1), max(p2, p3)); area = l.rect.area; @@ -1224,11 +1224,11 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { 
omega_e = PI / 2.0f; } break; case LIGHT_TYPE_DISK: { - const auto pos = Ref::simd_fvec4{l.disk.pos[0], l.disk.pos[1], l.disk.pos[2], 0.0f}; - const auto u = 0.5f * Ref::simd_fvec4{l.disk.u[0], l.disk.u[1], l.disk.u[2], 0.0f}; - const auto v = 0.5f * Ref::simd_fvec4{l.disk.v[0], l.disk.v[1], l.disk.v[2], 0.0f}; + const auto pos = Ref::fvec4{l.disk.pos[0], l.disk.pos[1], l.disk.pos[2], 0.0f}; + const auto u = 0.5f * Ref::fvec4{l.disk.u[0], l.disk.u[1], l.disk.u[2], 0.0f}; + const auto v = 0.5f * Ref::fvec4{l.disk.v[0], l.disk.v[1], l.disk.v[2], 0.0f}; - const Ref::simd_fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; + const Ref::fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; bbox_min = min(min(p0, p1), min(p2, p3)); bbox_max = max(max(p0, p1), max(p2, p3)); area = l.disk.area; @@ -1245,9 +1245,9 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { const vertex_t &v2 = vertices_[vtx_indices_[ltri_index * 3 + 1]]; const vertex_t &v3 = vertices_[vtx_indices_[ltri_index * 3 + 2]]; - auto p1 = Ref::simd_fvec4(v1.p[0], v1.p[1], v1.p[2], 0.0f), - p2 = Ref::simd_fvec4(v2.p[0], v2.p[1], v2.p[2], 0.0f), - p3 = Ref::simd_fvec4(v3.p[0], v3.p[1], v3.p[2], 0.0f); + auto p1 = Ref::fvec4(v1.p[0], v1.p[1], v1.p[2], 0.0f), + p2 = Ref::fvec4(v2.p[0], v2.p[1], v2.p[2], 0.0f), + p3 = Ref::fvec4(v3.p[0], v3.p[1], v3.p[2], 0.0f); p1 = TransformPoint(p1, lmi.xform); p2 = TransformPoint(p2, lmi.xform); @@ -1256,7 +1256,7 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { bbox_min = min(p1, min(p2, p3)); bbox_max = max(p1, max(p2, p3)); - Ref::simd_fvec4 light_forward = Ray::Cpu::cross(p2 - p1, p3 - p1); + Ref::fvec4 light_forward = Ray::Cpu::cross(p2 - p1, p3 - p1); area = 0.5f * length(light_forward); axis = normalize(light_forward); @@ -1265,8 +1265,8 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { } break; case LIGHT_TYPE_ENV: { lum = (lum / 3.0f) * env_map_qtree_.medium_lum; - bbox_min = Ref::simd_fvec4{-MAX_DIST, 
-MAX_DIST, -MAX_DIST, 0.0f}; - bbox_max = Ref::simd_fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; + bbox_min = Ref::fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; + bbox_max = Ref::fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; omega_n = PI; // normals in all directions omega_e = PI / 2.0f; } break; @@ -1351,7 +1351,7 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { memcpy(light_nodes_[parent].axis, light_nodes_[n].axis, 3 * sizeof(float)); light_nodes_[parent].omega_n = light_nodes_[n].omega_n; } else { - auto axis1 = Ref::simd_fvec4{light_nodes_[parent].axis}, axis2 = Ref::simd_fvec4{light_nodes_[n].axis}; + auto axis1 = Ref::fvec4{light_nodes_[parent].axis}, axis2 = Ref::fvec4{light_nodes_[n].axis}; axis1.set<3>(0.0f); axis2.set<3>(0.0f); @@ -1362,7 +1362,7 @@ void Ray::Cpu::Scene::RebuildLightTree_nolock() { if (axis_length != 0.0f) { axis1 /= axis_length; } else { - axis1 = Ref::simd_fvec4{0.0f, 1.0f, 0.0f, 0.0f}; + axis1 = Ref::fvec4{0.0f, 1.0f, 0.0f, 0.0f}; } memcpy(light_nodes_[parent].axis, value_ptr(axis1), 3 * sizeof(float)); diff --git a/internal/SceneCommon.cpp b/internal/SceneCommon.cpp index 534b9698d..0ff35bdc8 100644 --- a/internal/SceneCommon.cpp +++ b/internal/SceneCommon.cpp @@ -4,16 +4,16 @@ #include "Core.h" namespace Ray { -Ref::simd_fvec4 rgb_to_rgbe(const Ref::simd_fvec4 &rgb) { +Ref::fvec4 rgb_to_rgbe(const Ref::fvec4 &rgb) { float max_component = fmaxf(fmaxf(rgb.get<0>(), rgb.get<1>()), rgb.get<2>()); if (max_component < 1e-32) { - return Ref::simd_fvec4{0.0f}; + return Ref::fvec4{0.0f}; } int exponent; const float factor = frexpf(max_component, &exponent) * 256.0f / max_component; - return Ref::simd_fvec4{rgb.get<0>() * factor, rgb.get<1>() * factor, rgb.get<2>() * factor, float(exponent + 128)}; + return Ref::fvec4{rgb.get<0>() * factor, rgb.get<1>() * factor, rgb.get<2>() * factor, float(exponent + 128)}; } } // namespace Ray @@ -190,19 +190,19 @@ void Ray::SceneCommon::UpdateSkyTransmittanceLUT(const atmosphere_params_t ¶ for (int x = 0; 
x < TRANSMITTANCE_LUT_W; ++x) { const float u = float(x) / TRANSMITTANCE_LUT_W; - const Ref::simd_fvec2 uv = {u, v}; + const Ref::fvec2 uv = {u, v}; float view_height, view_zenith_cos_angle; UvToLutTransmittanceParams(params, uv, view_height, view_zenith_cos_angle); - const Ref::simd_fvec4 world_pos = {0.0f, view_height - params.planet_radius, 0.0f, 0.0f}; - const Ref::simd_fvec4 world_dir = {0.0f, view_zenith_cos_angle, + const Ref::fvec4 world_pos = {0.0f, view_height - params.planet_radius, 0.0f, 0.0f}; + const Ref::fvec4 world_dir = {0.0f, view_zenith_cos_angle, -sqrtf(1.0f - view_zenith_cos_angle * view_zenith_cos_angle), 0.0f}; - const Ref::simd_fvec4 optical_depthlight = IntegrateOpticalDepth(params, world_pos, world_dir); - const Ref::simd_fvec4 transmittance = exp(-optical_depthlight); + const Ref::fvec4 optical_depthlight = IntegrateOpticalDepth(params, world_pos, world_dir); + const Ref::fvec4 transmittance = exp(-optical_depthlight); - transmittance.store_to(&sky_transmittance_lut_[4 * (y * TRANSMITTANCE_LUT_W + x)], Ref::simd_mem_aligned); + transmittance.store_to(&sky_transmittance_lut_[4 * (y * TRANSMITTANCE_LUT_W + x)], Ref::vector_aligned); } } } @@ -225,20 +225,20 @@ void Ray::SceneCommon::UpdateMultiscatterLUT(const atmosphere_params_t ¶ms) for (int i = 0; i < MULTISCATTER_LUT_RES; ++i) { const float x = (i + 0.5f) / MULTISCATTER_LUT_RES; - const Ref::simd_fvec2 uv = {from_sub_uvs_to_unit(x, MULTISCATTER_LUT_RES), + const Ref::fvec2 uv = {from_sub_uvs_to_unit(x, MULTISCATTER_LUT_RES), from_sub_uvs_to_unit(y, MULTISCATTER_LUT_RES)}; const float cos_sun_zenith_angle = uv.get<0>() * 2.0f - 1.0f; - const Ref::simd_fvec4 sun_dir = { + const Ref::fvec4 sun_dir = { 0.0f, cos_sun_zenith_angle, -sqrtf(saturate(1.0f - cos_sun_zenith_angle * cos_sun_zenith_angle)), 0.0f}; const float view_height = saturate(uv.get<1>() + PlanetRadiusOffset) * (params.atmosphere_height - PlanetRadiusOffset); - const Ref::simd_fvec4 world_pos = {0.0f, view_height, 0.0f, 0.0f}; 
- Ref::simd_fvec4 world_dir = {0.0f, 1.0f, 0.0f, 0.0f}; + const Ref::fvec4 world_pos = {0.0f, view_height, 0.0f, 0.0f}; + Ref::fvec4 world_dir = {0.0f, 1.0f, 0.0f, 0.0f}; - std::pair total_res = {}; + std::pair total_res = {}; for (int rj = 0; rj < RaysCountSqrt; ++rj) { const float rv = (rj + 0.5f) / RaysCountSqrt; @@ -255,8 +255,8 @@ void Ray::SceneCommon::UpdateMultiscatterLUT(const atmosphere_params_t ¶ms) world_dir.set<1>(cos_phi); world_dir.set<2>(-sin_theta * sin_phi); - Ref::simd_fvec4 transmittance = 1.0f; - const std::pair res = + Ref::fvec4 transmittance = 1.0f; + const std::pair res = IntegrateScatteringMain(_params, world_pos, world_dir, MAX_DIST, sun_dir, {}, 1.0f, sky_transmittance_lut_, {}, 0.0f, 32, transmittance); @@ -268,16 +268,16 @@ void Ray::SceneCommon::UpdateMultiscatterLUT(const atmosphere_params_t ¶ms) total_res.first *= SphereSolidAngle / (RaysCountSqrt * RaysCountSqrt); total_res.second *= SphereSolidAngle / (RaysCountSqrt * RaysCountSqrt); - const Ref::simd_fvec4 in_scattered_luminance = total_res.first * IsotropicPhase; - const Ref::simd_fvec4 multi_scat_as_1 = total_res.second * IsotropicPhase; + const Ref::fvec4 in_scattered_luminance = total_res.first * IsotropicPhase; + const Ref::fvec4 multi_scat_as_1 = total_res.second * IsotropicPhase; // For a serie, sum_{n=0}^{n=+inf} = 1 + r + r^2 + r^3 + ... 
+ r^n = 1 / (1.0 - r), see // https://en.wikipedia.org/wiki/Geometric_series - const Ref::simd_fvec4 r = multi_scat_as_1; - const Ref::simd_fvec4 sum_of_all_multiscattering_events_contribution = 1.0f / (1.0f - r); - const Ref::simd_fvec4 L = in_scattered_luminance * sum_of_all_multiscattering_events_contribution; + const Ref::fvec4 r = multi_scat_as_1; + const Ref::fvec4 sum_of_all_multiscattering_events_contribution = 1.0f / (1.0f - r); + const Ref::fvec4 L = in_scattered_luminance * sum_of_all_multiscattering_events_contribution; - L.store_to(&sky_multiscatter_lut_[4 * (j * MULTISCATTER_LUT_RES + i)], Ref::simd_mem_aligned); + L.store_to(&sky_multiscatter_lut_[4 * (j * MULTISCATTER_LUT_RES + i)], Ref::vector_aligned); } } } @@ -297,32 +297,32 @@ Ray::SceneCommon::CalcSkyEnvTexture(const atmosphere_params_t ¶ms, const int const uint32_t px_hash = Ref::hash(y * res[0] + x); const float phi = 2.0f * PI * (x + 0.5f) / float(res[0]); - auto ray_dir = Ref::simd_fvec4{sinf(theta) * cosf(phi), cosf(theta), sinf(theta) * sinf(phi), 0.0f}; + auto ray_dir = Ref::fvec4{sinf(theta) * cosf(phi), cosf(theta), sinf(theta) * sinf(phi), 0.0f}; - Ref::simd_fvec4 color = 0.0f; + Ref::fvec4 color = 0.0f; // Evaluate light sources if (!dir_lights.empty()) { for (const uint32_t li_index : dir_lights) { const light_t &l = lights[li_index]; - const Ref::simd_fvec4 light_dir = {l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f}; - Ref::simd_fvec4 light_col = {l.col[0], l.col[1], l.col[2], 0.0f}; + const Ref::fvec4 light_dir = {l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f}; + Ref::fvec4 light_col = {l.col[0], l.col[1], l.col[2], 0.0f}; if (l.dir.angle != 0.0f) { const float radius = tanf(l.dir.angle); light_col *= (PI * radius * radius); } - color += IntegrateScattering(params, Ref::simd_fvec4{0.0f, params.viewpoint_height, 0.0f, 0.0f}, + color += IntegrateScattering(params, Ref::fvec4{0.0f, params.viewpoint_height, 0.0f, 0.0f}, ray_dir, MAX_DIST, light_dir, l.dir.angle, light_col, 
sky_transmittance_lut_, sky_multiscatter_lut_, px_hash); } } else if (params.stars_brightness > 0.0f) { // Use fake lightsource (to light up the moon) - const Ref::simd_fvec4 light_dir = {0.0f, -1.0f, 0.0f, 0.0f}, + const Ref::fvec4 light_dir = {0.0f, -1.0f, 0.0f, 0.0f}, light_col = {144809.866891f, 129443.618266f, 127098.894121f, 0.0f}; - color += IntegrateScattering(params, Ref::simd_fvec4{0.0f, params.viewpoint_height, 0.0f, 0.0f}, + color += IntegrateScattering(params, Ref::fvec4{0.0f, params.viewpoint_height, 0.0f, 0.0f}, ray_dir, MAX_DIST, light_dir, 0.0f, light_col, sky_transmittance_lut_, sky_multiscatter_lut_, px_hash); } diff --git a/internal/SceneGPU.h b/internal/SceneGPU.h index 9dedcfcf5..a53bd9a91 100644 --- a/internal/SceneGPU.h +++ b/internal/SceneGPU.h @@ -19,8 +19,8 @@ template force_inline T clamp(const T &val, const T &min_val, const T return std::min(std::max(val, min_val), max_val); } -inline Ref::simd_fvec4 cross(const Ref::simd_fvec4 &v1, const Ref::simd_fvec4 &v2) { - return Ref::simd_fvec4{v1.get<1>() * v2.get<2>() - v1.get<2>() * v2.get<1>(), +inline Ref::fvec4 cross(const Ref::fvec4 &v1, const Ref::fvec4 &v2) { + return Ref::fvec4{v1.get<1>() * v2.get<2>() - v1.get<2>() * v2.get<1>(), v1.get<2>() * v2.get<0>() - v1.get<0>() * v2.get<2>(), v1.get<0>() * v2.get<1>() - v1.get<1>() * v2.get<0>(), 0.0f}; } @@ -72,7 +72,7 @@ class Scene : public SceneCommon { struct { int res = -1; float medium_lum = 0.0f; - SmallVector, 16> mips; + SmallVector, 16> mips; Texture2D tex; } env_map_qtree_; @@ -910,11 +910,11 @@ inline Ray::MeshHandle Ray::NS::Scene::AddMesh(const mesh_desc_t &_m) { s.allow_spatial_splits = _m.allow_spatial_splits; s.use_fast_bvh_build = _m.use_fast_bvh_build; - simd_fvec4 bbox_min{FLT_MAX}, bbox_max{-FLT_MAX}; + fvec4 bbox_min{FLT_MAX}, bbox_max{-FLT_MAX}; if (use_hwrt_) { for (int j = 0; j < int(_m.vtx_indices.size()); j += 3) { - simd_fvec4 p[3]; + fvec4 p[3]; const uint32_t i0 = _m.vtx_indices[j + 0], i1 = _m.vtx_indices[j + 
1], i2 = _m.vtx_indices[j + 2]; @@ -1206,8 +1206,8 @@ inline Ray::LightHandle Ray::NS::Scene::AddLight(const rect_light_desc_t &_l, co l.rect.area = _l.width * _l.height; - const Ref::simd_fvec4 uvec = _l.width * TransformDirection(Ref::simd_fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); - const Ref::simd_fvec4 vvec = _l.height * TransformDirection(Ref::simd_fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); + const Ref::fvec4 uvec = _l.width * TransformDirection(Ref::fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 vvec = _l.height * TransformDirection(Ref::fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); memcpy(l.rect.u, value_ptr(uvec), 3 * sizeof(float)); memcpy(l.rect.v, value_ptr(vvec), 3 * sizeof(float)); @@ -1235,8 +1235,8 @@ inline Ray::LightHandle Ray::NS::Scene::AddLight(const disk_light_desc_t &_l, co l.disk.area = 0.25f * PI * _l.size_x * _l.size_y; - const Ref::simd_fvec4 uvec = _l.size_x * TransformDirection(Ref::simd_fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); - const Ref::simd_fvec4 vvec = _l.size_y * TransformDirection(Ref::simd_fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); + const Ref::fvec4 uvec = _l.size_x * TransformDirection(Ref::fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 vvec = _l.size_y * TransformDirection(Ref::fvec4{0.0f, 0.0f, 1.0f, 0.0f}, xform); memcpy(l.disk.u, value_ptr(uvec), 3 * sizeof(float)); memcpy(l.disk.v, value_ptr(vvec), 3 * sizeof(float)); @@ -1264,8 +1264,8 @@ inline Ray::LightHandle Ray::NS::Scene::AddLight(const line_light_desc_t &_l, co l.line.area = 2.0f * PI * _l.radius * _l.height; - const Ref::simd_fvec4 uvec = TransformDirection(Ref::simd_fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); - const Ref::simd_fvec4 vvec = TransformDirection(Ref::simd_fvec4{0.0f, 1.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 uvec = TransformDirection(Ref::fvec4{1.0f, 0.0f, 0.0f, 0.0f}, xform); + const Ref::fvec4 vvec = TransformDirection(Ref::fvec4{0.0f, 1.0f, 0.0f, 0.0f}, xform); memcpy(l.line.u, value_ptr(uvec), 3 * sizeof(float)); l.line.radius = _l.radius; @@ 
-1492,8 +1492,8 @@ inline void Ray::NS::Scene::Rebuild_SWRT_TLAS_nolock() { primitives.reserve(mi_count); for (auto it = mesh_instances_.cbegin(); it != mesh_instances_.cend(); ++it) { - primitives.push_back({0, 0, 0, Ref::simd_fvec4{it->bbox_min[0], it->bbox_min[1], it->bbox_min[2], 0.0f}, - Ref::simd_fvec4{it->bbox_max[0], it->bbox_max[1], it->bbox_max[2], 0.0f}}); + primitives.push_back({0, 0, 0, Ref::fvec4{it->bbox_min[0], it->bbox_min[1], it->bbox_min[2], 0.0f}, + Ref::fvec4{it->bbox_max[0], it->bbox_max[1], it->bbox_max[2], 0.0f}}); } std::vector bvh_nodes; @@ -1580,7 +1580,7 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { const int tex = int(env_.env_map & 0x00ffffff); Buffer temp_stage_buf; - simd_ivec2 size; + ivec2 size; int pitch = 0; if (use_bindless_) { @@ -1644,7 +1644,7 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { for (int y = 0; y < size[1]; ++y) { for (int x = 0; x < size[0]; ++x) { const uint8_t *col_rgbe = &rgbe_data[4 * (y * pitch + x)]; - simd_fvec4 col_rgb; + fvec4 col_rgb; rgbe_to_rgb(col_rgbe, value_ptr(col_rgb)); const float cur_lum = (col_rgb[0] + col_rgb[1] + col_rgb[2]); @@ -1653,10 +1653,10 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { const float theta = PI * float(y + jj) / float(size[1]); for (int ii = -1; ii <= 1; ++ii) { const float phi = 2.0f * PI * float(x + ii) / float(size[0]); - auto dir = simd_fvec4{std::sin(theta) * std::cos(phi), std::cos(theta), + auto dir = fvec4{std::sin(theta) * std::cos(phi), std::cos(theta), std::sin(theta) * std::sin(phi), 0.0f}; - simd_fvec2 q; + fvec2 q; DirToCanonical(value_ptr(dir), 0.0f, value_ptr(q)); int qx = clamp(int(cur_res * q[0]), 0, cur_res - 1); @@ -1669,14 +1669,14 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { qx /= 2; qy /= 2; - simd_fvec4 &qvec = env_map_qtree_.mips[0][qy * cur_res / 2 + qx]; + fvec4 &qvec = env_map_qtree_.mips[0][qy * cur_res / 2 + qx]; qvec.set(index, std::max(qvec[index], cur_lum)); } } } } - for (const 
simd_fvec4 &v : env_map_qtree_.mips[0]) { + for (const fvec4 &v : env_map_qtree_.mips[0]) { total_lum += (v[0] + v[1] + v[2] + v[3]); } @@ -1726,7 +1726,7 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { bool subdivision_required = false; for (int y = 0; y < (cur_res / 2) && !subdivision_required; ++y) { for (int x = 0; x < (cur_res / 2) && !subdivision_required; ++x) { - const simd_ivec4 mask = simd_cast(cur_mip[y * cur_res / 2 + x] > LumFractThreshold * total_lum); + const ivec4 mask = simd_cast(cur_mip[y * cur_res / 2 + x] > LumFractThreshold * total_lum); subdivision_required |= mask.not_all_zeros(); } } @@ -1766,7 +1766,7 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { int req_size = 0, mip_offsets[16] = {}; for (int i = 0; i < env_.qtree_levels; ++i) { mip_offsets[i] = req_size; - req_size += 4096 * int((env_map_qtree_.mips[i].size() * sizeof(simd_fvec4) + 4096 - 1) / 4096); + req_size += 4096 * int((env_map_qtree_.mips[i].size() * sizeof(fvec4) + 4096 - 1) / 4096); } temp_stage_buf = Buffer("Temp upload buf", ctx_, eBufType::Upload, req_size); @@ -1778,8 +1778,8 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { int j = mip_offsets[i]; for (int y = 0; y < res; ++y) { - memcpy(&stage_data[j], &env_map_qtree_.mips[i][y * res], res * sizeof(simd_fvec4)); - j += round_up(res * sizeof(simd_fvec4), TextureDataPitchAlignment); + memcpy(&stage_data[j], &env_map_qtree_.mips[i][y * res], res * sizeof(fvec4)); + j += round_up(res * sizeof(fvec4), TextureDataPitchAlignment); } } temp_stage_buf.Unmap(); @@ -1797,7 +1797,7 @@ inline void Ray::NS::Scene::PrepareEnvMapQTree_nolock() { for (int i = 0; i < env_.qtree_levels; ++i) { env_map_qtree_.tex.SetSubImage(i, 0, 0, (env_map_qtree_.res >> i) / 2, (env_map_qtree_.res >> i) / 2, eTexFormat::RawRGBA32F, temp_stage_buf, cmd_buf, mip_offsets[i], - int(env_map_qtree_.mips[i].size() * sizeof(simd_fvec4))); + int(env_map_qtree_.mips[i].size() * sizeof(fvec4))); } 
EndSingleTimeCommands(ctx_->api(), ctx_->device(), ctx_->graphics_queue(), cmd_buf, ctx_->temp_command_pool()); @@ -1812,7 +1812,7 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { primitives.reserve(lights_.size()); struct additional_data_t { - Ref::simd_fvec4 axis; + Ref::fvec4 axis; float flux, omega_n, omega_e; }; aligned_vector additional_data; @@ -1830,7 +1830,7 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { continue; } - Ref::simd_fvec4 bbox_min = 0.0f, bbox_max = 0.0f, axis = {0.0f, 1.0f, 0.0f, 0.0f}; + Ref::fvec4 bbox_min = 0.0f, bbox_max = 0.0f, axis = {0.0f, 1.0f, 0.0f, 0.0f}; float area = 1.0f, omega_n = 0.0f, omega_e = 0.0f; float lum = l.col[0] + l.col[1] + l.col[2]; @@ -1844,10 +1844,10 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { switch (l.type) { case LIGHT_TYPE_SPHERE: { - const auto pos = Ref::simd_fvec4{l.sph.pos[0], l.sph.pos[1], l.sph.pos[2], 0.0f}; + const auto pos = Ref::fvec4{l.sph.pos[0], l.sph.pos[1], l.sph.pos[2], 0.0f}; - bbox_min = pos - Ref::simd_fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; - bbox_max = pos + Ref::simd_fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; + bbox_min = pos - Ref::fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; + bbox_max = pos + Ref::fvec4{l.sph.radius, l.sph.radius, l.sph.radius, 0.0f}; if (l.sph.area != 0.0f) { area = l.sph.area; } @@ -1855,9 +1855,9 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { omega_e = PI / 2.0f; } break; case LIGHT_TYPE_DIR: { - bbox_min = Ref::simd_fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; - bbox_max = Ref::simd_fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; - axis = Ref::simd_fvec4{l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f}; + bbox_min = Ref::fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; + bbox_max = Ref::fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; + axis = Ref::fvec4{l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f}; omega_n = 0.0f; // single normal omega_e = l.dir.angle; if (l.dir.angle != 0.0f) { @@ -1866,16 
+1866,16 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { } } break; case LIGHT_TYPE_LINE: { - const auto pos = Ref::simd_fvec4{l.line.pos[0], l.line.pos[1], l.line.pos[2], 0.0f}; - auto light_u = Ref::simd_fvec4{l.line.u[0], l.line.u[1], l.line.u[2], 0.0f}, - light_dir = Ref::simd_fvec4{l.line.v[0], l.line.v[1], l.line.v[2], 0.0f}; - Ref::simd_fvec4 light_v = NS::cross(light_u, light_dir); + const auto pos = Ref::fvec4{l.line.pos[0], l.line.pos[1], l.line.pos[2], 0.0f}; + auto light_u = Ref::fvec4{l.line.u[0], l.line.u[1], l.line.u[2], 0.0f}, + light_dir = Ref::fvec4{l.line.v[0], l.line.v[1], l.line.v[2], 0.0f}; + Ref::fvec4 light_v = NS::cross(light_u, light_dir); light_u *= l.line.radius; light_v *= l.line.radius; light_dir *= 0.5f * l.line.height; - const Ref::simd_fvec4 p0 = pos + light_dir + light_u + light_v, p1 = pos + light_dir + light_u - light_v, + const Ref::fvec4 p0 = pos + light_dir + light_u + light_v, p1 = pos + light_dir + light_u - light_v, p2 = pos + light_dir - light_u + light_v, p3 = pos + light_dir - light_u - light_v, p4 = pos - light_dir + light_u + light_v, p5 = pos - light_dir + light_u - light_v, p6 = pos - light_dir - light_u + light_v, p7 = pos - light_dir - light_u - light_v; @@ -1887,11 +1887,11 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { omega_e = PI / 2.0f; } break; case LIGHT_TYPE_RECT: { - const auto pos = Ref::simd_fvec4{l.rect.pos[0], l.rect.pos[1], l.rect.pos[2], 0.0f}; - const auto u = 0.5f * Ref::simd_fvec4{l.rect.u[0], l.rect.u[1], l.rect.u[2], 0.0f}; - const auto v = 0.5f * Ref::simd_fvec4{l.rect.v[0], l.rect.v[1], l.rect.v[2], 0.0f}; + const auto pos = Ref::fvec4{l.rect.pos[0], l.rect.pos[1], l.rect.pos[2], 0.0f}; + const auto u = 0.5f * Ref::fvec4{l.rect.u[0], l.rect.u[1], l.rect.u[2], 0.0f}; + const auto v = 0.5f * Ref::fvec4{l.rect.v[0], l.rect.v[1], l.rect.v[2], 0.0f}; - const Ref::simd_fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; + const Ref::fvec4 p0 = pos + u 
+ v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; bbox_min = min(min(p0, p1), min(p2, p3)); bbox_max = max(max(p0, p1), max(p2, p3)); area = l.rect.area; @@ -1901,11 +1901,11 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { omega_e = PI / 2.0f; } break; case LIGHT_TYPE_DISK: { - const auto pos = Ref::simd_fvec4{l.disk.pos[0], l.disk.pos[1], l.disk.pos[2], 0.0f}; - const auto u = 0.5f * Ref::simd_fvec4{l.disk.u[0], l.disk.u[1], l.disk.u[2], 0.0f}; - const auto v = 0.5f * Ref::simd_fvec4{l.disk.v[0], l.disk.v[1], l.disk.v[2], 0.0f}; + const auto pos = Ref::fvec4{l.disk.pos[0], l.disk.pos[1], l.disk.pos[2], 0.0f}; + const auto u = 0.5f * Ref::fvec4{l.disk.u[0], l.disk.u[1], l.disk.u[2], 0.0f}; + const auto v = 0.5f * Ref::fvec4{l.disk.v[0], l.disk.v[1], l.disk.v[2], 0.0f}; - const Ref::simd_fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; + const Ref::fvec4 p0 = pos + u + v, p1 = pos + u - v, p2 = pos - u + v, p3 = pos - u - v; bbox_min = min(min(p0, p1), min(p2, p3)); bbox_max = max(max(p0, p1), max(p2, p3)); area = l.disk.area; @@ -1922,9 +1922,9 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { const vertex_t &v2 = vertices_[vtx_indices_[ltri_index * 3 + 1]]; const vertex_t &v3 = vertices_[vtx_indices_[ltri_index * 3 + 2]]; - auto p1 = Ref::simd_fvec4(v1.p[0], v1.p[1], v1.p[2], 0.0f), - p2 = Ref::simd_fvec4(v2.p[0], v2.p[1], v2.p[2], 0.0f), - p3 = Ref::simd_fvec4(v3.p[0], v3.p[1], v3.p[2], 0.0f); + auto p1 = Ref::fvec4(v1.p[0], v1.p[1], v1.p[2], 0.0f), + p2 = Ref::fvec4(v2.p[0], v2.p[1], v2.p[2], 0.0f), + p3 = Ref::fvec4(v3.p[0], v3.p[1], v3.p[2], 0.0f); p1 = TransformPoint(p1, lmi.xform); p2 = TransformPoint(p2, lmi.xform); @@ -1933,7 +1933,7 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { bbox_min = min(p1, min(p2, p3)); bbox_max = max(p1, max(p2, p3)); - Ref::simd_fvec4 light_forward = NS::cross(p2 - p1, p3 - p1); + Ref::fvec4 light_forward = NS::cross(p2 - p1, p3 - p1); area = 0.5f * 
length(light_forward); axis = normalize(light_forward); @@ -1942,8 +1942,8 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { } break; case LIGHT_TYPE_ENV: { lum = (lum / 3.0f) * env_map_qtree_.medium_lum; - bbox_min = Ref::simd_fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; - bbox_max = Ref::simd_fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; + bbox_min = Ref::fvec4{-MAX_DIST, -MAX_DIST, -MAX_DIST, 0.0f}; + bbox_max = Ref::fvec4{MAX_DIST, MAX_DIST, MAX_DIST, 0.0f}; omega_n = PI; // normals in all directions omega_e = PI / 2.0f; } break; @@ -2029,7 +2029,7 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { memcpy(temp_lnodes[parent].axis, temp_lnodes[n].axis, 3 * sizeof(float)); temp_lnodes[parent].omega_n = temp_lnodes[n].omega_n; } else { - auto axis1 = Ref::simd_fvec4{temp_lnodes[parent].axis}, axis2 = Ref::simd_fvec4{temp_lnodes[n].axis}; + auto axis1 = Ref::fvec4{temp_lnodes[parent].axis}, axis2 = Ref::fvec4{temp_lnodes[n].axis}; axis1.set<3>(0.0f); axis2.set<3>(0.0f); @@ -2040,7 +2040,7 @@ inline void Ray::NS::Scene::RebuildLightTree_nolock() { if (axis_length != 0.0f) { axis1 /= axis_length; } else { - axis1 = Ref::simd_fvec4{0.0f, 1.0f, 0.0f, 0.0f}; + axis1 = Ref::fvec4{0.0f, 1.0f, 0.0f, 0.0f}; } memcpy(temp_lnodes[parent].axis, value_ptr(axis1), 3 * sizeof(float)); diff --git a/internal/TextureUtils.cpp b/internal/TextureUtils.cpp index d2feeaab6..7b8d53ab8 100644 --- a/internal/TextureUtils.cpp +++ b/internal/TextureUtils.cpp @@ -261,17 +261,17 @@ std::unique_ptr Ray::ConvertRGB32F_to_RGBE(const float image_data[], for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - Ref::simd_fvec4 val; + Ref::fvec4 val; if (channels == 3) { - val = Ref::simd_fvec4{image_data[3 * (y * w + x) + 0], image_data[3 * (y * w + x) + 1], + val = Ref::fvec4{image_data[3 * (y * w + x) + 0], image_data[3 * (y * w + x) + 1], image_data[3 * (y * w + x) + 2], 0.0f}; } else if (channels == 4) { - val = Ref::simd_fvec4{image_data[4 * (y * w + x) + 0], image_data[4 * 
(y * w + x) + 1], + val = Ref::fvec4{image_data[4 * (y * w + x) + 0], image_data[4 * (y * w + x) + 1], image_data[4 * (y * w + x) + 2], 0.0f}; } - auto exp = Ref::simd_fvec4{std::log2(val[0]), std::log2(val[1]), std::log2(val[2]), 0.0f}; + auto exp = Ref::fvec4{std::log2(val[0]), std::log2(val[1]), std::log2(val[2]), 0.0f}; for (int i = 0; i < 3; i++) { exp.set(i, std::ceil(exp[i])); if (exp[i] < -128.0f) { @@ -284,7 +284,7 @@ std::unique_ptr Ray::ConvertRGB32F_to_RGBE(const float image_data[], const float common_exp = std::max(exp[0], std::max(exp[1], exp[2])); const float range = std::exp2(common_exp); - Ref::simd_fvec4 mantissa = val / range; + Ref::fvec4 mantissa = val / range; for (int i = 0; i < 3; i++) { if (mantissa[i] < 0.0f) { mantissa.set(i, 0.0f); @@ -293,7 +293,7 @@ std::unique_ptr Ray::ConvertRGB32F_to_RGBE(const float image_data[], } } - const auto res = Ref::simd_fvec4{mantissa[0], mantissa[1], mantissa[2], common_exp + 128.0f}; + const auto res = Ref::fvec4{mantissa[0], mantissa[1], mantissa[2], common_exp + 128.0f}; u8_data[(y * w + x) * 4 + 0] = (uint8_t)_CLAMP(int(res[0] * 255), 0, 255); u8_data[(y * w + x) * 4 + 1] = (uint8_t)_CLAMP(int(res[1] * 255), 0, 255); @@ -1813,29 +1813,29 @@ template int Ray::Preprocess_BCn<4>(const uint8_t in_data[], const int tiles_w, void Ray::ComputeTangentBasis(size_t vtx_offset, size_t vtx_start, std::vector &vertices, Span new_vtx_indices, Span indices) { - auto cross = [](const Ref::simd_fvec3 &v1, const Ref::simd_fvec3 &v2) -> Ref::simd_fvec3 { - return Ref::simd_fvec3{v1[1] * v2[2] - v1[2] * v2[1], v1[2] * v2[0] - v1[0] * v2[2], + auto cross = [](const Ref::fvec3 &v1, const Ref::fvec3 &v2) -> Ref::fvec3 { + return Ref::fvec3{v1[1] * v2[2] - v1[2] * v2[1], v1[2] * v2[0] - v1[0] * v2[2], v1[0] * v2[1] - v1[1] * v2[0]}; }; std::vector> twin_verts(vertices.size(), {0, 0, 0}); - aligned_vector binormals(vertices.size()); + aligned_vector binormals(vertices.size()); for (int i = 0; i < indices.size(); i += 3) { 
vertex_t *v0 = &vertices[indices[i + 0]]; vertex_t *v1 = &vertices[indices[i + 1]]; vertex_t *v2 = &vertices[indices[i + 2]]; - Ref::simd_fvec3 &b0 = binormals[indices[i + 0]]; - Ref::simd_fvec3 &b1 = binormals[indices[i + 1]]; - Ref::simd_fvec3 &b2 = binormals[indices[i + 2]]; + Ref::fvec3 &b0 = binormals[indices[i + 0]]; + Ref::fvec3 &b1 = binormals[indices[i + 1]]; + Ref::fvec3 &b2 = binormals[indices[i + 2]]; - const Ref::simd_fvec3 dp1 = Ref::simd_fvec3(v1->p) - Ref::simd_fvec3(v0->p); - const Ref::simd_fvec3 dp2 = Ref::simd_fvec3(v2->p) - Ref::simd_fvec3(v0->p); + const Ref::fvec3 dp1 = Ref::fvec3(v1->p) - Ref::fvec3(v0->p); + const Ref::fvec3 dp2 = Ref::fvec3(v2->p) - Ref::fvec3(v0->p); - const Ref::simd_fvec2 dt1 = Ref::simd_fvec2(v1->t) - Ref::simd_fvec2(v0->t); - const Ref::simd_fvec2 dt2 = Ref::simd_fvec2(v2->t) - Ref::simd_fvec2(v0->t); + const Ref::fvec2 dt1 = Ref::fvec2(v1->t) - Ref::fvec2(v0->t); + const Ref::fvec2 dt2 = Ref::fvec2(v2->t) - Ref::fvec2(v0->t); - Ref::simd_fvec3 tangent, binormal; + Ref::fvec3 tangent, binormal; const float det = std::abs(dt1[0] * dt2[1] - dt1[1] * dt2[0]); if (det > FLT_EPS) { @@ -1843,21 +1843,21 @@ void Ray::ComputeTangentBasis(size_t vtx_offset, size_t vtx_start, std::vector FLT_EPS) { - binormal = normalize(cross(Ref::simd_fvec3(plane_N), tangent)); - tangent = normalize(cross(Ref::simd_fvec3(plane_N), binormal)); + binormal = normalize(cross(Ref::fvec3(plane_N), tangent)); + tangent = normalize(cross(Ref::fvec3(plane_N), binormal)); } else { binormal = {0.0f}; tangent = {0.0f}; @@ -1941,8 +1941,8 @@ void Ray::ComputeTangentBasis(size_t vtx_offset, size_t vtx_start, std::vector FLT_EPS || std::abs(v.b[1]) > FLT_EPS || std::abs(v.b[2]) > FLT_EPS) { - const auto tangent = Ref::simd_fvec3{v.b}; - Ref::simd_fvec3 binormal = cross(Ref::simd_fvec3(v.n), tangent); + const auto tangent = Ref::fvec3{v.b}; + Ref::fvec3 binormal = cross(Ref::fvec3(v.n), tangent); const float l = length(binormal); if (l > FLT_EPS) { binormal 
/= l; diff --git a/internal/simd/simd_vec.h b/internal/simd/simd.h similarity index 56% rename from internal/simd/simd_vec.h rename to internal/simd/simd.h index 7796d0480..a132e72fe 100644 --- a/internal/simd/simd_vec.h +++ b/internal/simd/simd.h @@ -110,25 +110,25 @@ namespace Ray { namespace NS { -enum simd_mem_aligned_tag { simd_mem_aligned }; +enum vector_aligned_tag { vector_aligned }; -template class simd_vec { +template class fixed_size_simd { T comp_[S]; - friend class simd_vec; - friend class simd_vec; - friend class simd_vec; + friend class fixed_size_simd; + friend class fixed_size_simd; + friend class fixed_size_simd; public: - simd_vec() = default; - simd_vec(T f) { + fixed_size_simd() = default; + fixed_size_simd(T f) { UNROLLED_FOR_S(i, S, { comp_[i] = f; }) } template - force_inline simd_vec(typename std::enable_if::type head, Tail... tail) + force_inline fixed_size_simd(typename std::enable_if::type head, Tail... tail) : comp_{head, T(tail)...} {} - force_inline explicit simd_vec(const T *f) { memcpy(&comp_, f, S * sizeof(T)); } - force_inline simd_vec(const T *_f, simd_mem_aligned_tag) { + force_inline explicit fixed_size_simd(const T *f) { memcpy(&comp_, f, S * sizeof(T)); } + force_inline fixed_size_simd(const T *_f, vector_aligned_tag) { const auto *f = (const T *)assume_aligned(_f, sizeof(T)); memcpy(&comp_[0], f, S * sizeof(T)); } @@ -140,27 +140,27 @@ template class simd_vec { template force_inline void set(const T f) { comp_[i] = f; } force_inline void set(const int i, const T f) { comp_[i] = f; } - simd_vec &operator+=(const simd_vec &rhs) { + fixed_size_simd &operator+=(const fixed_size_simd &rhs) { UNROLLED_FOR_S(i, S, { comp_[i] += rhs.comp_[i]; }) return *this; } - simd_vec &operator-=(const simd_vec &rhs) { + fixed_size_simd &operator-=(const fixed_size_simd &rhs) { UNROLLED_FOR_S(i, S, { comp_[i] -= rhs.comp_[i]; }) return *this; } - simd_vec &operator*=(const simd_vec &rhs) { + fixed_size_simd &operator*=(const fixed_size_simd 
&rhs) { UNROLLED_FOR_S(i, S, { comp_[i] *= rhs.comp_[i]; }) return *this; } - simd_vec &operator/=(const simd_vec &rhs) { + fixed_size_simd &operator/=(const fixed_size_simd &rhs) { UNROLLED_FOR_S(i, S, { comp_[i] /= rhs.comp_[i]; }) return *this; } - simd_vec &operator|=(const simd_vec &rhs) { + fixed_size_simd &operator|=(const fixed_size_simd &rhs) { const auto *src2 = reinterpret_cast(&rhs.comp_[0]); auto *dst = reinterpret_cast(&comp_[0]); @@ -172,97 +172,97 @@ template class simd_vec { return *this; } - simd_vec &operator^=(const simd_vec &rhs) { + fixed_size_simd &operator^=(const fixed_size_simd &rhs) { UNROLLED_FOR_S(i, S, { comp_[i] ^= rhs.comp_[i]; }) return *this; } - simd_vec operator-() const { - simd_vec temp; + fixed_size_simd operator-() const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = -comp_[i]; }) return temp; } - simd_vec operator==(const simd_vec &rhs) const { - simd_vec temp; + fixed_size_simd operator==(const fixed_size_simd &rhs) const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = comp_[i] == rhs.comp_[i] ? -1 : 0; }) - static_assert(sizeof(simd_vec) == sizeof(simd_vec), "!"); + static_assert(sizeof(fixed_size_simd) == sizeof(fixed_size_simd), "!"); - simd_vec ret; - memcpy(&ret, &temp, sizeof(simd_vec)); + fixed_size_simd ret; + memcpy(&ret, &temp, sizeof(fixed_size_simd)); return ret; } - simd_vec operator!=(const simd_vec &rhs) const { - simd_vec temp; + fixed_size_simd operator!=(const fixed_size_simd &rhs) const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = comp_[i] != rhs.comp_[i] ? 
-1 : 0; }) - static_assert(sizeof(simd_vec) == sizeof(simd_vec), "!"); + static_assert(sizeof(fixed_size_simd) == sizeof(fixed_size_simd), "!"); - simd_vec ret; - memcpy(&ret, &temp, sizeof(simd_vec)); + fixed_size_simd ret; + memcpy(&ret, &temp, sizeof(fixed_size_simd)); return ret; } - simd_vec operator<(const simd_vec &rhs) const { - simd_vec temp; + fixed_size_simd operator<(const fixed_size_simd &rhs) const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = comp_[i] < rhs.comp_[i] ? -1 : 0; }) - static_assert(sizeof(simd_vec) == sizeof(simd_vec), "!"); + static_assert(sizeof(fixed_size_simd) == sizeof(fixed_size_simd), "!"); - simd_vec ret; - memcpy(&ret, &temp, sizeof(simd_vec)); + fixed_size_simd ret; + memcpy(&ret, &temp, sizeof(fixed_size_simd)); return ret; } - simd_vec operator<=(const simd_vec &rhs) const { - simd_vec temp; + fixed_size_simd operator<=(const fixed_size_simd &rhs) const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = comp_[i] <= rhs.comp_[i] ? -1 : 0; }) - static_assert(sizeof(simd_vec) == sizeof(simd_vec), "!"); + static_assert(sizeof(fixed_size_simd) == sizeof(fixed_size_simd), "!"); - simd_vec ret; - memcpy(&ret, &temp, sizeof(simd_vec)); + fixed_size_simd ret; + memcpy(&ret, &temp, sizeof(fixed_size_simd)); return ret; } - simd_vec operator>(const simd_vec &rhs) const { - simd_vec temp; + fixed_size_simd operator>(const fixed_size_simd &rhs) const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = comp_[i] > rhs.comp_[i] ? 
-1 : 0; }) - static_assert(sizeof(simd_vec) == sizeof(simd_vec), "!"); + static_assert(sizeof(fixed_size_simd) == sizeof(fixed_size_simd), "!"); - simd_vec ret; - memcpy(&ret, &temp, sizeof(simd_vec)); + fixed_size_simd ret; + memcpy(&ret, &temp, sizeof(fixed_size_simd)); return ret; } - simd_vec operator>=(const simd_vec &rhs) const { - simd_vec temp; + fixed_size_simd operator>=(const fixed_size_simd &rhs) const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = comp_[i] >= rhs.comp_[i] ? -1 : 0; }) - static_assert(sizeof(simd_vec) == sizeof(simd_vec), "!"); + static_assert(sizeof(fixed_size_simd) == sizeof(fixed_size_simd), "!"); - simd_vec ret; - memcpy(&ret, &temp, sizeof(simd_vec)); + fixed_size_simd ret; + memcpy(&ret, &temp, sizeof(fixed_size_simd)); return ret; } - simd_vec &operator&=(const simd_vec &rhs) { + fixed_size_simd &operator&=(const fixed_size_simd &rhs) { UNROLLED_FOR_S(i, S, { reinterpret_cast(comp_[i]) &= reinterpret_cast(rhs.comp_[i]); }) return *this; } - simd_vec operator~() const { - simd_vec ret; + fixed_size_simd operator~() const { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { const uint32_t temp = ~reinterpret_cast(comp_[i]); ret.comp_[i] = reinterpret_cast(temp); @@ -270,32 +270,32 @@ template class simd_vec { return ret; } - explicit operator simd_vec() const { - simd_vec ret; + explicit operator fixed_size_simd() const { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.comp_[i] = int(comp_[i]); }) return ret; } - explicit operator simd_vec() const { - simd_vec ret; + explicit operator fixed_size_simd() const { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.comp_[i] = unsigned(comp_[i]); }) return ret; } - explicit operator simd_vec() const { - simd_vec ret; + explicit operator fixed_size_simd() const { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.comp_[i] = float(comp_[i]); }) return ret; } - simd_vec sqrt() const { - simd_vec temp; + fixed_size_simd sqrt() const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, 
{ temp.set(std::sqrt(comp_[i])); }) return temp; } - simd_vec log() const { - simd_vec temp; + fixed_size_simd log() const { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.set(std::log(comp_[i])); }) return temp; } @@ -320,7 +320,7 @@ template class simd_vec { force_inline void store_to(T *f) const { memcpy(f, &comp_[0], S * sizeof(T)); } - force_inline void store_to(T *_f, simd_mem_aligned_tag) const { + force_inline void store_to(T *_f, vector_aligned_tag) const { auto *f = (T *)assume_aligned(_f, sizeof(T)); memcpy(f, &comp_[0], S * sizeof(T)); } @@ -334,7 +334,7 @@ template class simd_vec { return true; } - bool all_zeros(const simd_vec &mask) const { + bool all_zeros(const fixed_size_simd &mask) const { const auto *src1 = reinterpret_cast(&comp_[0]); const auto *src2 = reinterpret_cast(&mask.comp_[0]); @@ -357,7 +357,7 @@ template class simd_vec { } // clang-format off - void blend_to(const simd_vec &mask, const simd_vec &v1) { + void blend_to(const fixed_size_simd &mask, const fixed_size_simd &v1) { UNROLLED_FOR_S(i, S, { if (mask.comp_[i] != T(0)) { comp_[i] = v1.comp_[i]; @@ -365,7 +365,7 @@ template class simd_vec { }) } - void blend_inv_to(const simd_vec &mask, const simd_vec &v1) { + void blend_inv_to(const fixed_size_simd &mask, const fixed_size_simd &v1) { UNROLLED_FOR_S(i, S, { if (mask.comp_[i] == T(0)) { comp_[i] = v1.comp_[i]; @@ -383,23 +383,23 @@ template class simd_vec { return res; } - friend simd_vec min(const simd_vec &v1, const simd_vec &v2) { - simd_vec temp; + friend fixed_size_simd min(const fixed_size_simd &v1, const fixed_size_simd &v2) { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = std::min(v1.comp_[i], v2.comp_[i]); }) return temp; } - friend simd_vec max(const simd_vec &v1, const simd_vec &v2) { - simd_vec temp; + friend fixed_size_simd max(const fixed_size_simd &v1, const fixed_size_simd &v2) { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = std::max(v1.comp_[i], v2.comp_[i]); }) return temp; } 
- static simd_vec and_not(const simd_vec &v1, const simd_vec &v2) { + static fixed_size_simd and_not(const fixed_size_simd &v1, const fixed_size_simd &v2) { const auto *src1 = reinterpret_cast(&v1.comp_[0]); const auto *src2 = reinterpret_cast(&v2.comp_[0]); - simd_vec ret; + fixed_size_simd ret; auto *dst = reinterpret_cast(&ret.comp_[0]); @@ -410,14 +410,14 @@ template class simd_vec { return ret; } - static simd_vec floor(const simd_vec &v1) { - simd_vec temp; + static fixed_size_simd floor(const fixed_size_simd &v1) { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { temp.comp_[i] = float(int(v1.comp_[i]) - (v1.comp_[i] < 0.0f)); }) return temp; } - static simd_vec ceil(const simd_vec &v1) { - simd_vec temp; + static fixed_size_simd ceil(const fixed_size_simd &v1) { + fixed_size_simd temp; UNROLLED_FOR_S(i, S, { int _v = int(v1.comp_[i]); temp.comp_[i] = float(_v + (v1.comp_[i] != _v)); @@ -426,10 +426,10 @@ template class simd_vec { } #define DEFINE_BITS_OPERATOR(OP) \ - friend simd_vec operator OP(const simd_vec &v1, const simd_vec &v2) { \ + friend fixed_size_simd operator OP(const fixed_size_simd &v1, const fixed_size_simd &v2) { \ const auto *src1 = reinterpret_cast(&v1.comp_[0]); \ const auto *src2 = reinterpret_cast(&v2.comp_[0]); \ - simd_vec ret; \ + fixed_size_simd ret; \ auto *dst = reinterpret_cast(&ret.comp_[0]); \ for (int i = 0; i < S * sizeof(T); i++) { \ dst[i] = src1[i] OP src2[i]; \ @@ -444,8 +444,8 @@ template class simd_vec { #undef DEFINE_BITS_OPERATOR #define DEFINE_ARITHMETIC_OPERATOR(OP) \ - friend simd_vec operator OP(const simd_vec &v1, const simd_vec &v2) { \ - simd_vec ret; \ + friend fixed_size_simd operator OP(const fixed_size_simd &v1, const fixed_size_simd &v2) { \ + fixed_size_simd ret; \ UNROLLED_FOR_S(i, S, { ret.comp_[i] = v1.comp_[i] OP v2.comp_[i]; }) \ return ret; \ } @@ -459,143 +459,162 @@ template class simd_vec { #undef DEFINE_ARITHMETIC_OPERATOR - friend simd_vec srai(const simd_vec &v1, int v2) { - simd_vec ret; + 
friend fixed_size_simd srai(const fixed_size_simd &v1, int v2) { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.comp_[i] = v1.comp_[i] >> v2; }) return ret; } - friend simd_vec srli(const simd_vec &v1, int v2) { - simd_vec ret; + friend fixed_size_simd srli(const fixed_size_simd &v1, int v2) { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.comp_[i] = unsigned(v1.comp_[i]) >> v2; }) return ret; } - friend T dot(const simd_vec &v1, const simd_vec &v2) { + friend T dot(const fixed_size_simd &v1, const fixed_size_simd &v2) { T ret = {0}; UNROLLED_FOR_S(i, S, { ret += v1.comp_[i] * v2.comp_[i]; }) return ret; } - friend force_inline simd_vec clamp(const simd_vec &v1, const simd_vec &_min, - const simd_vec &_max) { + friend force_inline fixed_size_simd clamp(const fixed_size_simd &v1, const fixed_size_simd &_min, + const fixed_size_simd &_max) { return min(max(v1, _min), _max); } - friend force_inline simd_vec saturate(const simd_vec &v1) { return clamp(v1, T(0), T(1)); } + friend force_inline fixed_size_simd saturate(const fixed_size_simd &v1) { + return clamp(v1, T(0), T(1)); + } - friend simd_vec pow(const simd_vec &v1, const simd_vec &v2) { - simd_vec ret; + friend fixed_size_simd pow(const fixed_size_simd &v1, const fixed_size_simd &v2) { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.comp_[i] = std::pow(v1.comp_[i], v2.comp_[i]); }) return ret; } - friend force_inline simd_vec normalize(const simd_vec &v1) { return v1 / v1.length(); } + friend force_inline fixed_size_simd normalize(const fixed_size_simd &v1) { return v1 / v1.length(); } - friend force_inline simd_vec normalize_len(const simd_vec &v1, T &out_len) { + friend force_inline fixed_size_simd normalize_len(const fixed_size_simd &v1, T &out_len) { return v1 / (out_len = v1.length()); } - friend bool is_equal(const simd_vec &v1, const simd_vec &v2) { + friend bool is_equal(const fixed_size_simd &v1, const fixed_size_simd &v2) { bool res = true; UNROLLED_FOR_S(i, S, { res = res && (v1.comp_[i] == 
v2.comp_[i]); }) return res; } - friend force_inline const T *value_ptr(const simd_vec &v1) { return &v1.comp_[0]; } - friend force_inline T *value_ptr(simd_vec &v1) { return &v1.comp_[0]; } + friend force_inline const T *value_ptr(const fixed_size_simd &v1) { return &v1.comp_[0]; } + friend force_inline T *value_ptr(fixed_size_simd &v1) { return &v1.comp_[0]; } static int size() { return S; } static bool is_native() { return false; } }; -template force_inline simd_vec and_not(const simd_vec &v1, const simd_vec &v2) { - return simd_vec::and_not(v1, v2); +template +force_inline fixed_size_simd and_not(const fixed_size_simd &v1, const fixed_size_simd &v2) { + return fixed_size_simd::and_not(v1, v2); } -template force_inline simd_vec floor(const simd_vec &v1) { - return simd_vec::floor(v1); +template force_inline fixed_size_simd floor(const fixed_size_simd &v1) { + return fixed_size_simd::floor(v1); } -template force_inline simd_vec ceil(const simd_vec &v1) { - return simd_vec::ceil(v1); +template force_inline fixed_size_simd ceil(const fixed_size_simd &v1) { + return fixed_size_simd::ceil(v1); } -template force_inline simd_vec mod(const simd_vec &v1, const simd_vec &v2) { +template +force_inline fixed_size_simd mod(const fixed_size_simd &v1, const fixed_size_simd &v2) { return v1 - v2 * floor(v1 / v2); } -template force_inline simd_vec sqrt(const simd_vec &v1) { return v1.sqrt(); } -template force_inline simd_vec log(const simd_vec &v1) { return v1.log(); } +template force_inline fixed_size_simd sqrt(const fixed_size_simd &v1) { + return v1.sqrt(); +} +template force_inline fixed_size_simd log(const fixed_size_simd &v1) { + return v1.log(); +} -template force_inline T length(const simd_vec &v1) { return v1.length(); } +template force_inline T length(const fixed_size_simd &v1) { return v1.length(); } -template force_inline T length2(const simd_vec &v1) { return v1.length2(); } +template force_inline T length2(const fixed_size_simd &v1) { return v1.length2(); } 
-template force_inline T hsum(const simd_vec &v1) { return v1.hsum(); } +template force_inline T hsum(const fixed_size_simd &v1) { return v1.hsum(); } -template force_inline simd_vec fract(const simd_vec &v1) { return v1 - floor(v1); } +template force_inline fixed_size_simd fract(const fixed_size_simd &v1) { + return v1 - floor(v1); +} -template force_inline simd_vec max(const simd_vec &v1, const simd_vec &v2) { - return simd_vec::max(v1, v2); +template +force_inline fixed_size_simd max(const fixed_size_simd &v1, const fixed_size_simd &v2) { + return fixed_size_simd::max(v1, v2); } -template force_inline simd_vec abs(const simd_vec &v) { +template force_inline fixed_size_simd abs(const fixed_size_simd &v) { // TODO: find faster implementation return max(v, -v); } -template force_inline simd_vec exp(const simd_vec &v) { - return pow(simd_vec{std::exp(1.0f)}, v); +template force_inline fixed_size_simd exp(const fixed_size_simd &v) { + return pow(fixed_size_simd{std::exp(1.0f)}, v); } template -force_inline simd_vec fmadd(const simd_vec &a, const simd_vec &b, const simd_vec &c) { +force_inline fixed_size_simd fmadd(const fixed_size_simd &a, const fixed_size_simd &b, + const fixed_size_simd &c) { return a * b + c; } template -force_inline simd_vec fmadd(const simd_vec &a, const float b, const simd_vec &c) { +force_inline fixed_size_simd fmadd(const fixed_size_simd &a, const float b, + const fixed_size_simd &c) { return a * b + c; } -template force_inline simd_vec fmadd(const float a, const simd_vec &b, const float c) { +template +force_inline fixed_size_simd fmadd(const float a, const fixed_size_simd &b, const float c) { return a * b + c; } template -force_inline simd_vec fmsub(const simd_vec &a, const simd_vec &b, const simd_vec &c) { +force_inline fixed_size_simd fmsub(const fixed_size_simd &a, const fixed_size_simd &b, + const fixed_size_simd &c) { return a * b - c; } template -force_inline simd_vec fmsub(const simd_vec &a, const float b, const simd_vec &c) { 
+force_inline fixed_size_simd fmsub(const fixed_size_simd &a, const float b, + const fixed_size_simd &c) { return a * b - c; } -template force_inline simd_vec fmsub(const float a, const simd_vec &b, const float c) { +template +force_inline fixed_size_simd fmsub(const float a, const fixed_size_simd &b, const float c) { return a * b - c; } -template force_inline simd_vec mix(const simd_vec &v1, const simd_vec &v2, T k) { +template +force_inline fixed_size_simd mix(const fixed_size_simd &v1, const fixed_size_simd &v2, T k) { return (1.0f - k) * v1 + k * v2; } template -force_inline simd_vec mix(const simd_vec &v1, const simd_vec &v2, simd_vec k) { - return (simd_vec{1} - k) * v1 + k * v2; +force_inline fixed_size_simd mix(const fixed_size_simd &v1, const fixed_size_simd &v2, + fixed_size_simd k) { + return (fixed_size_simd{1} - k) * v1 + k * v2; } -template simd_vec gather(const T *base_addr, const simd_vec &vindex) { - simd_vec res; +template fixed_size_simd gather(const T *base_addr, const fixed_size_simd &vindex) { + fixed_size_simd res; UNROLLED_FOR_S(i, S, { res.template set(base_addr[vindex.template get()]); }); return res; } template -simd_vec gather(const simd_vec &src, const T *base_addr, const simd_vec &mask, - const simd_vec &vindex) { - simd_vec res = src; +fixed_size_simd gather(const fixed_size_simd &src, const T *base_addr, const fixed_size_simd &mask, + const fixed_size_simd &vindex) { + fixed_size_simd res = src; UNROLLED_FOR_S(i, S, { if (mask.template get()) { res.template set(base_addr[vindex.template get()]); @@ -604,16 +623,18 @@ simd_vec gather(const simd_vec &src, const T *base_addr, const simd_ return res; } -template void scatter(T *base_addr, const simd_vec &vindex, const simd_vec &v) { +template +void scatter(T *base_addr, const fixed_size_simd &vindex, const fixed_size_simd &v) { UNROLLED_FOR_S(i, S, { base_addr[vindex.template get()] = v.template get(); }); } -template void scatter(T *base_addr, const simd_vec &vindex, const T v) { 
+template void scatter(T *base_addr, const fixed_size_simd &vindex, const T v) { UNROLLED_FOR_S(i, S, { base_addr[vindex.template get()] = v; }); } template -void scatter(T *base_addr, const simd_vec &mask, const simd_vec &vindex, const simd_vec &v) { +void scatter(T *base_addr, const fixed_size_simd &mask, const fixed_size_simd &vindex, + const fixed_size_simd &v) { UNROLLED_FOR_S(i, S, { if (mask.template get()) { base_addr[vindex.template get()] = v.template get(); @@ -622,7 +643,7 @@ void scatter(T *base_addr, const simd_vec &mask, const simd_vec } template -void scatter(T *base_addr, const simd_vec &mask, const simd_vec &vindex, const T v) { +void scatter(T *base_addr, const fixed_size_simd &mask, const fixed_size_simd &vindex, const T v) { UNROLLED_FOR_S(i, S, { if (mask.template get()) { base_addr[vindex.template get()] = v; @@ -630,73 +651,75 @@ void scatter(T *base_addr, const simd_vec &mask, const simd_vec }); } -template simd_vec inclusive_scan(const simd_vec &vec) { - simd_vec res = vec; +template fixed_size_simd inclusive_scan(const fixed_size_simd &vec) { + fixed_size_simd res = vec; UNROLLED_FOR_S(i, S - 1, { res.template set(res.template get() + res.template get()); }); return res; } -template class simd_comp_where_helper { - const simd_vec &mask_; - simd_vec &comp_; +template class simd_where_expression { + const fixed_size_simd &mask_; + fixed_size_simd &comp_; public: - force_inline simd_comp_where_helper(const simd_vec &mask, simd_vec &vec) - : mask_(reinterpret_cast &>(mask)), comp_(vec) {} - - force_inline void operator=(const simd_vec &vec) { comp_.blend_to(mask_, vec); } - force_inline void operator+=(const simd_vec &vec) { comp_.blend_to(mask_, comp_ + vec); } - force_inline void operator-=(const simd_vec &vec) { comp_.blend_to(mask_, comp_ - vec); } - force_inline void operator*=(const simd_vec &vec) { comp_.blend_to(mask_, comp_ * vec); } - force_inline void operator/=(const simd_vec &vec) { comp_.blend_to(mask_, comp_ / vec); } - 
force_inline void operator|=(const simd_vec &vec) { comp_.blend_to(mask_, comp_ | vec); } - force_inline void operator&=(const simd_vec &vec) { comp_.blend_to(mask_, comp_ & vec); } + force_inline simd_where_expression(const fixed_size_simd &mask, fixed_size_simd &vec) + : mask_(reinterpret_cast &>(mask)), comp_(vec) {} + + force_inline void operator=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, vec); } + force_inline void operator+=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, comp_ + vec); } + force_inline void operator-=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, comp_ - vec); } + force_inline void operator*=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, comp_ * vec); } + force_inline void operator/=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, comp_ / vec); } + force_inline void operator|=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, comp_ | vec); } + force_inline void operator&=(const fixed_size_simd &vec) && { comp_.blend_to(mask_, comp_ & vec); } }; -template class simd_comp_where_inv_helper { - const simd_vec &mask_; - simd_vec &comp_; +template class simd_where_inv_expression { + const fixed_size_simd &mask_; + fixed_size_simd &comp_; public: - force_inline simd_comp_where_inv_helper(const simd_vec &mask, simd_vec &vec) - : mask_(reinterpret_cast &>(mask)), comp_(vec) {} - - force_inline void operator=(const simd_vec &vec) { comp_.blend_inv_to(mask_, vec); } - force_inline void operator+=(const simd_vec &vec) { comp_.blend_inv_to(mask_, comp_ + vec); } - force_inline void operator-=(const simd_vec &vec) { comp_.blend_inv_to(mask_, comp_ - vec); } - force_inline void operator*=(const simd_vec &vec) { comp_.blend_inv_to(mask_, comp_ * vec); } - force_inline void operator/=(const simd_vec &vec) { comp_.blend_inv_to(mask_, comp_ / vec); } - force_inline void operator|=(const simd_vec &vec) { comp_.blend_inv_to(mask_, comp_ | vec); } - force_inline void operator&=(const simd_vec &vec) { 
comp_.blend_inv_to(mask_, comp_ & vec); } + force_inline simd_where_inv_expression(const fixed_size_simd &mask, fixed_size_simd &vec) + : mask_(reinterpret_cast &>(mask)), comp_(vec) {} + + force_inline void operator=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, vec); } + force_inline void operator+=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, comp_ + vec); } + force_inline void operator-=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, comp_ - vec); } + force_inline void operator*=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, comp_ * vec); } + force_inline void operator/=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, comp_ / vec); } + force_inline void operator|=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, comp_ | vec); } + force_inline void operator&=(const fixed_size_simd &vec) && { comp_.blend_inv_to(mask_, comp_ & vec); } }; template -force_inline simd_comp_where_helper where(const simd_vec &mask, simd_vec &vec) { +force_inline simd_where_expression where(const fixed_size_simd &mask, fixed_size_simd &vec) { return {mask, vec}; } template -force_inline simd_comp_where_inv_helper where_not(const simd_vec &mask, simd_vec &vec) { +force_inline simd_where_inv_expression where_not(const fixed_size_simd &mask, + fixed_size_simd &vec) { return {mask, vec}; } template -inline simd_vec select(const simd_vec &mask, const simd_vec &vec1, const simd_vec &vec2) { - simd_vec ret; +inline fixed_size_simd select(const fixed_size_simd &mask, const fixed_size_simd &vec1, + const fixed_size_simd &vec2) { + fixed_size_simd ret; UNROLLED_FOR_S(i, S, { ret.template set(mask.template get() ? 
vec1.template get() : vec2.template get()); }); return ret; } -template force_inline simd_vec simd_cast(const simd_vec &vec) { - simd_vec ret; - memcpy(&ret, &vec, sizeof(simd_vec)); +template force_inline fixed_size_simd simd_cast(const fixed_size_simd &vec) { + fixed_size_simd ret; + memcpy(&ret, &vec, sizeof(fixed_size_simd)); return ret; } -template force_inline const simd_vec simd_cast(const simd_vec &vec) { - simd_vec ret; - memcpy(&ret, &vec, sizeof(simd_vec)); +template force_inline const fixed_size_simd simd_cast(const fixed_size_simd &vec) { + fixed_size_simd ret; + memcpy(&ret, &vec, sizeof(fixed_size_simd)); return ret; } @@ -704,44 +727,44 @@ template force_inline const simd_vec simd_cast(const simd_vec< } // namespace Ray #if defined(USE_SSE2) || defined(USE_SSE41) -#include "simd_vec_sse.h" +#include "simd_sse.h" #elif defined(USE_AVX) || defined(USE_AVX2) -#include "simd_vec_avx.h" +#include "simd_avx.h" #elif defined(USE_AVX512) -#include "simd_vec_avx512.h" +#include "simd_avx512.h" #elif defined(USE_NEON) -#include "simd_vec_neon.h" +#include "simd_neon.h" #endif namespace Ray { namespace NS { -template using simd_fvec = simd_vec; -using simd_fvec2 = simd_fvec<2>; -using simd_fvec3 = simd_fvec<3>; -using simd_fvec4 = simd_fvec<4>; -using simd_fvec8 = simd_fvec<8>; -using simd_fvec16 = simd_fvec<16>; - -template using simd_ivec = simd_vec; -using simd_ivec2 = simd_ivec<2>; -using simd_ivec3 = simd_ivec<3>; -using simd_ivec4 = simd_ivec<4>; -using simd_ivec8 = simd_ivec<8>; -using simd_ivec16 = simd_ivec<16>; - -template using simd_uvec = simd_vec; -using simd_uvec2 = simd_uvec<2>; -using simd_uvec3 = simd_uvec<3>; -using simd_uvec4 = simd_uvec<4>; -using simd_uvec8 = simd_uvec<8>; -using simd_uvec16 = simd_uvec<16>; - -template using simd_dvec = simd_vec; -using simd_dvec2 = simd_dvec<2>; -using simd_dvec3 = simd_dvec<3>; -using simd_dvec4 = simd_dvec<4>; -using simd_dvec8 = simd_dvec<8>; -using simd_dvec16 = simd_dvec<16>; +template using fvec = 
fixed_size_simd; +using fvec2 = fvec<2>; +using fvec3 = fvec<3>; +using fvec4 = fvec<4>; +using fvec8 = fvec<8>; +using fvec16 = fvec<16>; + +template using ivec = fixed_size_simd; +using ivec2 = ivec<2>; +using ivec3 = ivec<3>; +using ivec4 = ivec<4>; +using ivec8 = ivec<8>; +using ivec16 = ivec<16>; + +template using uvec = fixed_size_simd; +using uvec2 = uvec<2>; +using uvec3 = uvec<3>; +using uvec4 = uvec<4>; +using uvec8 = uvec<8>; +using uvec16 = uvec<16>; + +template using dvec = fixed_size_simd; +using dvec2 = dvec<2>; +using dvec3 = dvec<3>; +using dvec4 = dvec<4>; +using dvec8 = dvec<8>; +using dvec16 = dvec<16>; } // namespace NS } // namespace Ray diff --git a/internal/simd/simd_avx.h b/internal/simd/simd_avx.h new file mode 100644 index 000000000..ac4df8312 --- /dev/null +++ b/internal/simd/simd_avx.h @@ -0,0 +1,1406 @@ +// #pragma once + +#include "simd_sse.h" + +#include + +#if defined(__GNUC__) || defined(__clang__) +#define _mm256_test_all_zeros(mask, val) _mm256_testz_si256((mask), (val)) +#endif + +#ifndef NDEBUG +#define validate_mask(m) __assert_valid_mask(m) +#else +#define validate_mask(m) ((void)m) +#endif + +#if defined(USE_AVX2) || defined(USE_AVX512) +#define USE_FMA +#endif + +#pragma warning(push) +#pragma warning(disable : 4752) + +#if defined(USE_AVX2) || defined(USE_AVX512) +#define avx2_inline force_inline +#else +#define avx2_inline inline +#endif + +namespace Ray { +namespace NS { + +template <> force_inline __m256 _mm_cast(__m256i x) { return _mm256_castsi256_ps(x); } +template <> force_inline __m256i _mm_cast(__m256 x) { return _mm256_castps_si256(x); } + +template <> class fixed_size_simd; +template <> class fixed_size_simd; + +template <> class fixed_size_simd { + union { + __m256 vec_; + float comp_[8]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const float f) { vec_ = _mm256_set1_ps(f); } + force_inline 
fixed_size_simd(const float f1, const float f2, const float f3, const float f4, const float f5, + const float f6, const float f7, const float f8) { + vec_ = _mm256_setr_ps(f1, f2, f3, f4, f5, f6, f7, f8); + } + force_inline explicit fixed_size_simd(const float *f) { vec_ = _mm256_loadu_ps(f); } + force_inline fixed_size_simd(const float *f, vector_aligned_tag) { vec_ = _mm256_load_ps(f); } + + force_inline float operator[](const int i) const { return comp_[i]; } + force_inline float operator[](const long i) const { return operator[](int(i)); } + + template force_inline float get() const { return comp_[i & 7]; } + template force_inline void set(const float v) { comp_[i & 7] = v; } + force_inline void set(const int i, const float v) { comp_[i] = v; } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm256_add_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm256_sub_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + vec_ = _mm256_mul_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + vec_ = _mm256_div_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm256_or_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm256_and_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd operator~() const; + force_inline fixed_size_simd operator-() const; + force_inline explicit vectorcall operator fixed_size_simd() const; + force_inline explicit vectorcall operator fixed_size_simd() const; + + force_inline fixed_size_simd sqrt() const; + force_inline fixed_size_simd log() const; + + force_inline float length() const { 
return sqrtf(length2()); } + + float length2() const { + float ret = 0; + UNROLLED_FOR(i, 8, { ret += comp_[i] * comp_[i]; }) + return ret; + } + + force_inline float hsum() const { +#if 1 + __m256 temp = _mm256_hadd_ps(vec_, vec_); + temp = _mm256_hadd_ps(temp, temp); + + __m256 ret = _mm256_permute2f128_ps(temp, temp, 1); + ret = _mm256_add_ps(ret, temp); + + return _mm256_cvtss_f32(ret); +#else + // ( x3+x7, x2+x6, x1+x5, x0+x4 ) + const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(vec_, 1), _mm256_castps256_ps128(vec_)); + // ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) + const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); + // ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) + const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); + // Conversion to float is a no-op on x86-64 + return _mm_cvtss_f32(x32); +#endif + } + +#if defined(USE_AVX2) || defined(USE_AVX512) + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm256_add_ps(v1.vec_, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256(v1.vec_), 4))); + v1.vec_ = _mm256_add_ps(v1.vec_, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256(v1.vec_), 8))); + + __m256 temp = _mm256_shuffle_ps(v1.vec_, v1.vec_, _MM_SHUFFLE(3, 3, 3, 3)); + temp = _mm256_permute2f128_ps(_mm256_setzero_ps(), temp, 0x20); + + v1.vec_ = _mm256_add_ps(v1.vec_, temp); + + return v1; + } +#endif + + force_inline void store_to(float *f) const { _mm256_storeu_ps(f, vec_); } + force_inline void store_to(float *f, vector_aligned_tag) const { _mm256_store_ps(f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm256_blendv_ps(vec_, v1.vec_, mask.vec_); + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm256_blendv_ps(v1.vec_, vec_, mask.vec_); + } + + friend force_inline fixed_size_simd vectorcall 
min(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall max(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd vectorcall and_not(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall floor(fixed_size_simd v1); + friend force_inline fixed_size_simd vectorcall ceil(fixed_size_simd v1); + + friend force_inline fixed_size_simd vectorcall operator&(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator|(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator^(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator+(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator-(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator*(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator/(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd vectorcall operator<(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator<=(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator>(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator>=(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator==(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator!=(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd + vectorcall clamp(fixed_size_simd v1, fixed_size_simd min, fixed_size_simd max); + // friend force_inline fixed_size_simd vectorcall clamp(fixed_size_simd v1, float min, float + // max); + friend force_inline fixed_size_simd 
vectorcall saturate(fixed_size_simd v1) { + return clamp(v1, 0.0f, 1.0f); + } + friend force_inline fixed_size_simd vectorcall pow(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd vectorcall normalize(fixed_size_simd v1); + friend force_inline fixed_size_simd vectorcall normalize_len(fixed_size_simd v1, + float &out_len); + +#ifdef USE_FMA + friend force_inline fixed_size_simd + vectorcall fmadd(fixed_size_simd a, fixed_size_simd b, fixed_size_simd c); + friend force_inline fixed_size_simd + vectorcall fmsub(fixed_size_simd a, fixed_size_simd b, fixed_size_simd c); +#endif // USE_FMA + +#if defined(USE_AVX2) || defined(USE_AVX512) + friend force_inline fixed_size_simd vectorcall gather(const float *base_addr, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, + const float *base_addr, + fixed_size_simd mask, + fixed_size_simd vindex); +#endif + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 8, { + const float val = mask.get(); + assert(reinterpret_cast(val) == 0 || + reinterpret_cast(val) == 0xffffffff); + }) + } +#endif + + friend force_inline const float *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline float *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } + + static int size() { return 8; } + static bool is_native() { return true; } +}; + +template <> class fixed_size_simd { + 
union { + __m256i vec_; + int comp_[8]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const int f) { vec_ = _mm256_set1_epi32(f); } + force_inline fixed_size_simd(const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, + const int i7, const int i8) { + vec_ = _mm256_setr_epi32(i1, i2, i3, i4, i5, i6, i7, i8); + } + force_inline explicit fixed_size_simd(const int *f) { vec_ = _mm256_loadu_si256((const __m256i *)f); } + force_inline fixed_size_simd(const int *f, vector_aligned_tag) { vec_ = _mm256_load_si256((const __m256i *)f); } + + force_inline int operator[](const int i) const { return comp_[i]; } + force_inline int operator[](const long i) const { return operator[](int(i)); } + + template force_inline int get() const { return _mm256_extract_epi32(vec_, i & 7); } + template force_inline void set(const int v) { vec_ = _mm256_insert_epi32(vec_, v, i & 7); } + force_inline void set(const int i, const int v) { comp_[i] = v; } + + avx2_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_add_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] += rhs.comp_[i]; }) +#endif + return *this; + } + + force_inline fixed_size_simd &vectorcall operator+=(const int rhs) { + return operator+=(fixed_size_simd{rhs}); + } + + avx2_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_sub_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] -= rhs.comp_[i]; }) +#endif + return *this; + } + + avx2_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_mullo_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] *= rhs.comp_[i]; }) +#endif + return *this; + } + + fixed_size_simd 
&vectorcall operator/=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 8, { comp_[i] /= rhs.comp_[i]; }) + return *this; + } + + avx2_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_or_si256(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] |= rhs.comp_[i]; }) +#endif + return *this; + } + + avx2_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_xor_si256(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] ^= rhs.comp_[i]; }) +#endif + return *this; + } + + avx2_inline fixed_size_simd vectorcall operator-() const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_sub_epi32(_mm256_setzero_si256(), vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = -comp_[i]; }) +#endif + return ret; + } + + avx2_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_cmpeq_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] == rhs.comp_[i]) ? -1 : 0; }) +#endif + return ret; + } + + avx2_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_andnot_si256(_mm256_cmpeq_epi32(vec_, rhs.vec_), _mm256_set1_epi32(~0)); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] != rhs.comp_[i]) ? 
-1 : 0; }) +#endif + return ret; + } + + avx2_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_and_si256(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] &= rhs.comp_[i]; }) +#endif + return *this; + } + + force_inline explicit vectorcall operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm256_cvtepi32_ps(vec_); + return ret; + } + + force_inline explicit vectorcall operator fixed_size_simd() const; + + avx2_inline int hsum() const { +#if defined(USE_AVX2) || defined(USE_AVX512) + __m256i temp = _mm256_hadd_epi32(vec_, vec_); + temp = _mm256_hadd_epi32(temp, temp); + + __m256i ret = _mm256_permute2f128_si256(temp, temp, 1); + ret = _mm256_add_epi32(ret, temp); + + return _mm256_cvtsi256_si32(ret); +#else + int ret = comp_[0]; + UNROLLED_FOR(i, 7, { ret += comp_[i + 1]; }) + return ret; +#endif + } + + force_inline void store_to(int *f) const { _mm256_storeu_si256((__m256i *)f, vec_); } + force_inline void store_to(int *f, vector_aligned_tag) const { _mm256_store_si256((__m256i *)f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(vec_), _mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(mask.vec_))); + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(vec_), _mm256_castsi256_ps(mask.vec_))); + } + + force_inline int movemask() const { return _mm256_movemask_ps(_mm256_castsi256_ps(vec_)); } + + force_inline bool vectorcall all_zeros() const { return _mm256_test_all_zeros(vec_, vec_) != 0; } + force_inline bool vectorcall all_zeros(const fixed_size_simd mask) const { + return _mm256_test_all_zeros(vec_, mask.vec_) != 0; + } + + 
force_inline bool vectorcall not_all_zeros() const { + int res = _mm256_test_all_zeros(vec_, vec_); + return res == 0; + } + + friend avx2_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_min_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = std::min(v1.comp_[i], v2.comp_[i]); }) +#endif + return ret; + } + + avx2_inline static fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_max_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = std::max(v1.comp_[i], v2.comp_[i]); }) +#endif + return ret; + } + + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { + return max(_min, min(v1, _max)); + } + + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = 
_mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend avx2_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_add_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] + v2.comp_[i]; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_sub_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] - v2.comp_[i]; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_mullo_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) +#endif + return ret; + } + + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] / v2.comp_[i]); }) + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_cmpgt_epi32(v2.vec_, v1.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] < v2.comp_[i]) ? -1 : 0; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_cmpgt_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] > v2.comp_[i]) ? 
-1 : 0; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_andnot_si256(_mm256_cmpgt_epi32(v2.vec_, v1.vec_), _mm256_set1_epi32(-1)); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >= v2.comp_[i]) ? -1 : 0; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_srlv_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) >> unsigned(v2.comp_[i])); }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_srli_epi32(v1.vec_, v2); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) >> v2); }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_sllv_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) << unsigned(v2.comp_[i])); }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_slli_epi32(v1.vec_, v2); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) << v2); }) +#endif + return ret; + } + + avx2_inline fixed_size_simd operator~() const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_andnot_si256(vec_, _mm256_set1_epi32(~0)); +#else + 
UNROLLED_FOR(i, 8, { ret.comp_[i] = ~comp_[i]; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall srai(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_srai_epi32(v1.vec_, v2); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >> v2); }) +#endif + return ret; + } + + friend avx2_inline bool vectorcall is_equal(const fixed_size_simd v1, const fixed_size_simd v2) { +#if defined(USE_AVX2) || defined(USE_AVX512) + __m256i vcmp = _mm256_cmpeq_epi32(v1.vec_, v2.vec_); + return (_mm256_movemask_epi8(vcmp) == 0xffffffff); +#else + bool ret = true; + UNROLLED_FOR(i, 8, { ret &= (v1.comp_[i] == v2.comp_[i]); }) + return ret; +#endif + } + +#if defined(USE_AVX2) || defined(USE_AVX512) + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 4)); + v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 8)); + + __m256i temp = _mm256_shuffle_epi32(v1.vec_, _MM_SHUFFLE(3, 3, 3, 3)); + temp = _mm256_permute2x128_si256(_mm256_setzero_si256(), temp, 0x20); + + v1.vec_ = _mm256_add_epi32(v1.vec_, temp); + + return v1; + } +#endif + +#if defined(USE_AVX2) || defined(USE_AVX512) + friend force_inline fixed_size_simd vectorcall gather(const float *base_addr, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, + const float *base_addr, + fixed_size_simd mask, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(const int *base_addr, fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, const int *base_addr, + fixed_size_simd mask, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(const unsigned *base_addr, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, + 
const unsigned *base_addr, + fixed_size_simd mask, + fixed_size_simd vindex); +#endif + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 8, { + const int val = mask.get(); + assert(val == 0 || val == -1); + }) + } +#endif + + friend force_inline const int *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline int *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } + + static int size() { return 8; } + static bool is_native() { +#if defined(USE_AVX2) || defined(USE_AVX512) + return true; +#else + // mostly not native, so return false here + return false; +#endif + } +}; + +template <> class fixed_size_simd { + union { + __m256i vec_; + unsigned comp_[8]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const unsigned f) { vec_ = _mm256_set1_epi32(f); } + force_inline fixed_size_simd(const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4, + const unsigned i5, const unsigned i6, const unsigned i7, const unsigned i8) { + vec_ = _mm256_setr_epi32(i1, i2, i3, i4, i5, i6, i7, i8); + } + force_inline explicit fixed_size_simd(const unsigned *f) { vec_ = _mm256_loadu_si256((const __m256i *)f); } + force_inline fixed_size_simd(const unsigned *f, vector_aligned_tag) { + vec_ = _mm256_load_si256((const __m256i *)f); + } + + force_inline unsigned operator[](const int 
i) const { return comp_[i]; } + force_inline unsigned operator[](const long i) const { return operator[](int(i)); } + + template force_inline unsigned get() const { return _mm256_extract_epi32(vec_, i & 7); } + template force_inline void set(const unsigned v) { vec_ = _mm256_insert_epi32(vec_, v, i & 7); } + force_inline void set(const int i, const unsigned v) { comp_[i] = v; } + + avx2_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_add_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] += rhs.comp_[i]; }) +#endif + return *this; + } + + force_inline fixed_size_simd &vectorcall operator+=(const unsigned rhs) { + return operator+=(fixed_size_simd{rhs}); + } + + avx2_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_sub_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] -= rhs.comp_[i]; }) +#endif + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const unsigned rhs) { + return operator-=(fixed_size_simd{rhs}); + } + + fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 8, { comp_[i] *= rhs.comp_[i]; }) + return *this; + } + + force_inline fixed_size_simd &vectorcall operator*=(const unsigned rhs) { + return operator*=(fixed_size_simd{rhs}); + } + + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 8, { comp_[i] /= rhs.comp_[i]; }) + return *this; + } + + force_inline fixed_size_simd &vectorcall operator/=(const unsigned rhs) { + return operator/=(fixed_size_simd{rhs}); + } + + avx2_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_or_si256(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] |= rhs.comp_[i]; }) +#endif + return *this; + } + + avx2_inline fixed_size_simd &vectorcall 
operator^=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_xor_si256(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] ^= rhs.comp_[i]; }) +#endif + return *this; + } + + avx2_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_cmpeq_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] == rhs.comp_[i]) ? 0xffffffff : 0; }) +#endif + return ret; + } + + avx2_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_andnot_si256(_mm256_cmpeq_epi32(vec_, rhs.vec_), _mm256_set1_epi32(~0)); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] != rhs.comp_[i]) ? 0xffffffff : 0; }) +#endif + return ret; + } + + avx2_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { +#if defined(USE_AVX2) || defined(USE_AVX512) + vec_ = _mm256_and_si256(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 8, { comp_[i] &= rhs.comp_[i]; }) +#endif + return *this; + } + + force_inline explicit vectorcall operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm256_cvtepi32_ps(vec_); + return ret; + } + + force_inline explicit vectorcall operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = vec_; + return ret; + } + + avx2_inline unsigned hsum() const { +#if defined(USE_AVX2) || defined(USE_AVX512) + __m256i temp = _mm256_hadd_epi32(vec_, vec_); + temp = _mm256_hadd_epi32(temp, temp); + + __m256i ret = _mm256_permute2f128_si256(temp, temp, 1); + ret = _mm256_add_epi32(ret, temp); + + return _mm256_cvtsi256_si32(ret); +#else + unsigned ret = comp_[0]; + UNROLLED_FOR(i, 7, { ret += comp_[i + 1]; }) + return ret; +#endif + } + + force_inline void store_to(unsigned *f) const { _mm256_storeu_si256((__m256i *)f, vec_); } + force_inline void 
store_to(unsigned *f, vector_aligned_tag) const { _mm256_store_si256((__m256i *)f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(vec_), _mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(mask.vec_))); + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(vec_), _mm256_castsi256_ps(mask.vec_))); + } + + force_inline int movemask() const { return _mm256_movemask_ps(_mm256_castsi256_ps(vec_)); } + + force_inline bool vectorcall all_zeros() const { return _mm256_test_all_zeros(vec_, vec_) != 0; } + force_inline bool vectorcall all_zeros(const fixed_size_simd mask) const { + return _mm256_test_all_zeros(vec_, mask.vec_) != 0; + } + + force_inline bool vectorcall not_all_zeros() const { + int res = _mm256_test_all_zeros(vec_, vec_); + return res == 0; + } + + friend avx2_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_min_epu32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = std::min(v1.comp_[i], v2.comp_[i]); }) +#endif + return ret; + } + + avx2_inline static fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_max_epu32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = std::max(v1.comp_[i], v2.comp_[i]); }) +#endif + return ret; + } + + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { + return max(_min, min(v1, _max)); + } + + force_inline static fixed_size_simd 
vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); + return temp; + } + + friend avx2_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_add_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] + v2.comp_[i]; }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_sub_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] - v2.comp_[i]; }) +#endif + return ret; + } + + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) + return ret; + } + + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + 
const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] / v2.comp_[i]); }) + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_srlv_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >> v2.comp_[i]); }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_srli_epi32(v1.vec_, v2); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >> v2); }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_sllv_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] << v2.comp_[i]); }) +#endif + return ret; + } + + friend avx2_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_slli_epi32(v1.vec_, v2); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] << v2); }) +#endif + return ret; + } + + avx2_inline fixed_size_simd operator~() const { + fixed_size_simd ret; +#if defined(USE_AVX2) || defined(USE_AVX512) + ret.vec_ = _mm256_andnot_si256(vec_, _mm256_set1_epi32(~0)); +#else + UNROLLED_FOR(i, 8, { ret.comp_[i] = ~comp_[i]; }) +#endif + return ret; + } + + friend avx2_inline bool vectorcall is_equal(const fixed_size_simd v1, + const fixed_size_simd v2) { +#if defined(USE_AVX2) || defined(USE_AVX512) + __m256i vcmp = _mm256_cmpeq_epi32(v1.vec_, v2.vec_); + return (_mm256_movemask_epi8(vcmp) == 0xffffffff); +#else + bool 
ret = true; + UNROLLED_FOR(i, 8, { ret &= (v1.comp_[i] == v2.comp_[i]); }) + return ret; +#endif + } + +#if defined(USE_AVX2) || defined(USE_AVX512) + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 4)); + v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 8)); + + __m256i temp = _mm256_shuffle_epi32(v1.vec_, _MM_SHUFFLE(3, 3, 3, 3)); + temp = _mm256_permute2x128_si256(_mm256_setzero_si256(), temp, 0x20); + + v1.vec_ = _mm256_add_epi32(v1.vec_, temp); + + return v1; + } +#endif + +#if defined(USE_AVX2) || defined(USE_AVX512) + // friend force_inline fixed_size_simd vectorcall gather(const float *base_addr, fixed_size_simd + // vindex); friend force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, const + // float *base_addr, + // fixed_size_simd mask, fixed_size_simd + // vindex); + friend force_inline fixed_size_simd vectorcall gather(const unsigned *base_addr, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, + const unsigned *base_addr, + fixed_size_simd mask, + fixed_size_simd vindex); +#endif + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 8, { + const int val = mask.get(); + assert(val == 0 || val == 0xffffffff); + }) + } +#endif + + friend force_inline const unsigned *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend 
force_inline unsigned *value_ptr(fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + + static int size() { return 8; } + static bool is_native() { +#if defined(USE_AVX2) || defined(USE_AVX512) + return true; +#else + // mostly not native, so return false here + return false; +#endif + } +}; + +avx2_inline fixed_size_simd fixed_size_simd::operator~() const { +#if defined(USE_AVX2) || defined(USE_AVX512) + fixed_size_simd ret; + ret.vec_ = _mm256_castsi256_ps(_mm256_andnot_si256(_mm256_castps_si256(vec_), _mm256_set1_epi32(~0))); + return ret; +#else + alignas(32) uint32_t temp[8]; + _mm256_store_ps((float *)temp, vec_); + UNROLLED_FOR(i, 8, { temp[i] = ~temp[i]; }) + return fixed_size_simd{(const float *)temp, vector_aligned}; +#endif +} + +force_inline fixed_size_simd fixed_size_simd::operator-() const { + fixed_size_simd temp; + __m256 m = _mm256_set1_ps(-0.0f); + temp.vec_ = _mm256_xor_ps(vec_, m); + return temp; +} + +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm256_cvttps_epi32(vec_); + return ret; +} + +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm256_cvttps_epi32(vec_); + return ret; +} + +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = vec_; + return ret; +} + +force_inline fixed_size_simd fixed_size_simd::sqrt() const { + fixed_size_simd temp; + temp.vec_ = _mm256_sqrt_ps(vec_); + return temp; +} + +avx2_inline fixed_size_simd fixed_size_simd::log() const { + fixed_size_simd ret; + UNROLLED_FOR(i, 8, { ret.comp_[i] = logf(comp_[i]); }) + return ret; +} + +force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_min_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + 
temp.vec_ = _mm256_max_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd min, + const fixed_size_simd max) { + fixed_size_simd ret; + ret.vec_ = _mm256_max_ps(min.vec_, _mm256_min_ps(v1.vec_, max.vec_)); + return ret; +} + +force_inline fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_andnot_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall floor(const fixed_size_simd v1) { + fixed_size_simd temp; + temp.vec_ = _mm256_floor_ps(v1.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall ceil(const fixed_size_simd v1) { + fixed_size_simd temp; + temp.vec_ = _mm256_ceil_ps(v1.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_and_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_or_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_xor_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_add_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_sub_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_mul_ps(v1.vec_, v2.vec_); + 
return temp; +} + +force_inline fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm256_div_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_LT_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_LE_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_GT_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_GE_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_EQ_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_NEQ_OS); + return ret; +} + +inline fixed_size_simd vectorcall pow(const fixed_size_simd v1, + const fixed_size_simd v2) { + alignas(32) float comp1[8], comp2[8]; + _mm256_store_ps(comp1, v1.vec_); + _mm256_store_ps(comp2, v2.vec_); + UNROLLED_FOR(i, 8, { comp1[i] = powf(comp1[i], comp2[i]); }) + return fixed_size_simd{comp1, vector_aligned}; +} + +force_inline fixed_size_simd vectorcall normalize(const fixed_size_simd v1) { + return v1 / v1.length(); +} + +force_inline fixed_size_simd vectorcall normalize_len(const fixed_size_simd v1, float &out_len) { + 
return v1 / (out_len = v1.length()); +} + +#ifdef USE_FMA +force_inline fixed_size_simd vectorcall fmadd(const fixed_size_simd a, + const fixed_size_simd b, + const fixed_size_simd c) { + fixed_size_simd ret; + ret.vec_ = _mm256_fmadd_ps(a.vec_, b.vec_, c.vec_); + return ret; +} + +force_inline fixed_size_simd vectorcall fmsub(const fixed_size_simd a, + const fixed_size_simd b, + const fixed_size_simd c) { + fixed_size_simd ret; + ret.vec_ = _mm256_fmsub_ps(a.vec_, b.vec_, c.vec_); + return ret; +} +#endif // USE_FMA + +#if defined(USE_AVX2) || defined(USE_AVX512) +force_inline fixed_size_simd vectorcall gather(const float *base_addr, const fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm256_i32gather_ps(base_addr, vindex.vec_, sizeof(float)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, const float *base_addr, + fixed_size_simd mask, fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = + _mm256_mask_i32gather_ps(src.vec_, base_addr, vindex.vec_, _mm256_castsi256_ps(mask.vec_), sizeof(float)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(const int *base_addr, const fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm256_i32gather_epi32(base_addr, vindex.vec_, sizeof(int)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, const int *base_addr, + fixed_size_simd mask, fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm256_mask_i32gather_epi32(src.vec_, base_addr, vindex.vec_, mask.vec_, sizeof(int)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(const unsigned *base_addr, + const fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm256_i32gather_epi32(reinterpret_cast(base_addr), vindex.vec_, sizeof(int)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(fixed_size_simd src, const unsigned *base_addr, + fixed_size_simd mask, + fixed_size_simd vindex) { + 
fixed_size_simd ret; + ret.vec_ = _mm256_mask_i32gather_epi32(src.vec_, reinterpret_cast(base_addr), vindex.vec_, mask.vec_, + sizeof(unsigned)); + return ret; +} +#endif + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; + ret.vec_ = _mm256_blendv_ps(vec2.vec_, vec1.vec_, _mm_cast<__m256>(mask.vec_)); + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; + ret.vec_ = _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(vec2.vec_), _mm256_castsi256_ps(vec1.vec_), _mm_cast<__m256>(mask.vec_))); + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; + ret.vec_ = _mm256_castps_si256( + _mm256_blendv_ps(_mm256_castsi256_ps(vec2.vec_), _mm256_castsi256_ps(vec1.vec_), _mm_cast<__m256>(mask.vec_))); + return ret; +} + +} // namespace NS +} // namespace Ray + +#pragma warning(pop) + +#undef avx2_inline + +#undef validate_mask diff --git a/internal/simd/simd_avx512.h b/internal/simd/simd_avx512.h new file mode 100644 index 000000000..a4414b46e --- /dev/null +++ b/internal/simd/simd_avx512.h @@ -0,0 +1,1193 @@ +// #pragma once + +#include "simd_avx.h" + +#include + +#define _mm512_cmp_ps(a, b, c) _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, c))) + +#define _mm512_blendv_ps(a, b, m) \ + _mm512_castsi512_ps(_mm512_ternarylogic_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b), \ + _mm512_srai_epi32(_mm512_castps_si512(m), 31), 0xd8)) + +#define _mm512_movemask_epi32(a) \ + (int)_mm512_cmpneq_epi32_mask(_mm512_setzero_si512(), _mm512_and_si512(_mm512_set1_epi32(0x80000000U), a)) + +// 
https://adms-conf.org/2020-camera-ready/ADMS20_05.pdf +#define _mm512_slli_si512(x, k) _mm512_alignr_epi32(x, _mm512_setzero_si512(), 16 - k) + +#ifndef NDEBUG +#define validate_mask(m) __assert_valid_mask(m) +#else +#define validate_mask(m) ((void)m) +#endif + +#pragma warning(push) +#pragma warning(disable : 4752) + +namespace Ray { +namespace NS { + +template <> force_inline __m512 _mm_cast(__m512i x) { return _mm512_castsi512_ps(x); } +template <> force_inline __m512i _mm_cast(__m512 x) { return _mm512_castps_si512(x); } + +template <> class fixed_size_simd; +template <> class fixed_size_simd; + +template <> class fixed_size_simd { + union { + __m512 vec_; + float comp_[16]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const float f) { vec_ = _mm512_set1_ps(f); } + force_inline fixed_size_simd(const float f0, const float f1, const float f2, const float f3, const float f4, + const float f5, const float f6, const float f7, const float f8, const float f9, + const float f10, const float f11, const float f12, const float f13, const float f14, + const float f15) { + vec_ = _mm512_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15); + } + force_inline explicit fixed_size_simd(const float *f) { vec_ = _mm512_loadu_ps(f); } + force_inline fixed_size_simd(const float *f, vector_aligned_tag) { vec_ = _mm512_load_ps(f); } + + force_inline float operator[](const int i) const { + __m512 temp = _mm512_maskz_compress_ps(__mmask16(1u << i), vec_); + return _mm512_cvtss_f32(temp); + } + + force_inline float operator[](const long i) const { return operator[](int(i)); } + + template force_inline float get() const { + __m128 temp = _mm512_extractf32x4_ps(vec_, (i & 15) / 4); + const int ndx = (i & 15) % 4; + return _mm_cvtss_f32(_mm_shuffle_ps(temp, temp, _MM_SHUFFLE(ndx, ndx, ndx, ndx))); + } + template force_inline void set(const float v) { + 
// TODO: find more optimal implementation (with compile-time index) + vec_ = _mm512_mask_broadcastss_ps(vec_, __mmask16(1u << (i & 15)), _mm_set_ss(v)); + } + force_inline void set(const int i, const float v) { + vec_ = _mm512_mask_broadcastss_ps(vec_, __mmask16(1u << i), _mm_set_ss(v)); + } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm512_add_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm512_sub_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + vec_ = _mm512_mul_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + vec_ = _mm512_div_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm512_or_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm512_and_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd operator~() const; + force_inline fixed_size_simd operator-() const; + force_inline explicit operator fixed_size_simd() const; + force_inline explicit operator fixed_size_simd() const; + + force_inline fixed_size_simd sqrt() const; + force_inline fixed_size_simd log() const; + + force_inline float length() const { return sqrtf(length2()); } + + float length2() const { + float temp = 0; + UNROLLED_FOR(i, 16, { temp += comp_[i] * comp_[i]; }) + return temp; + } + + force_inline float hsum() const { return _mm512_reduce_add_ps(vec_); } + + force_inline void store_to(float *f) const { _mm512_storeu_ps(f, vec_); } + force_inline void store_to(float *f, vector_aligned_tag) const { _mm512_store_ps(f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd 
v1) { + validate_mask(mask); + //__mmask16 msk = + // _mm512_fpclass_ps_mask(mask.vec_, 0x54); // 0x54 = Negative_Finite | Negative_Infinity | Negative_Zero + // vec_ = _mm512_mask_blend_ps(msk, vec_, v1.vec_); + vec_ = _mm512_blendv_ps(vec_, v1.vec_, mask.vec_); + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); + //__mmask16 msk = + // _mm512_fpclass_ps_mask(mask.vec_, 0x54); // 0x54 = Negative_Finite | Negative_Infinity | Negative_Zero + // vec_ = _mm512_mask_blend_ps(msk, v1.vec_, vec_); + vec_ = _mm512_blendv_ps(v1.vec_, vec_, mask.vec_); + } + + friend force_inline fixed_size_simd vectorcall min(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall max(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd vectorcall and_not(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall floor(fixed_size_simd v1); + friend force_inline fixed_size_simd vectorcall ceil(fixed_size_simd v1); + + friend force_inline fixed_size_simd vectorcall operator&(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator|(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator^(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator+(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator-(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator*(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator/(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd vectorcall operator<(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator<=(fixed_size_simd v1, + fixed_size_simd 
v2); + friend force_inline fixed_size_simd vectorcall operator>(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator>=(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator==(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall operator!=(fixed_size_simd v1, + fixed_size_simd v2); + + friend force_inline fixed_size_simd + vectorcall clamp(fixed_size_simd v1, fixed_size_simd min, fixed_size_simd max); + // friend force_inline fixed_size_simd vectorcall clamp(fixed_size_simd v1, float min, float + // max); + friend force_inline fixed_size_simd vectorcall saturate(const fixed_size_simd v1) { + return clamp(v1, 0.0f, 1.0f); + } + friend force_inline fixed_size_simd vectorcall pow(fixed_size_simd v1, + fixed_size_simd v2); + friend force_inline fixed_size_simd vectorcall normalize(fixed_size_simd v1); + friend force_inline fixed_size_simd vectorcall normalize_len(fixed_size_simd v1, + float &out_len); + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1); + + friend force_inline fixed_size_simd + vectorcall fmadd(fixed_size_simd a, fixed_size_simd b, fixed_size_simd c); + friend force_inline fixed_size_simd + vectorcall fmsub(fixed_size_simd a, fixed_size_simd b, fixed_size_simd c); + + friend force_inline fixed_size_simd vectorcall gather(const float *base_addr, + fixed_size_simd vindex); + + friend force_inline void vectorcall scatter(float *base_addr, fixed_size_simd vindex, + fixed_size_simd v); + friend force_inline void vectorcall scatter(float *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, fixed_size_simd v); + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + 
const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 16, { + const float val = mask.get(); + assert(reinterpret_cast(val) == 0 || + reinterpret_cast(val) == 0xffffffff); + }) + } +#endif + + friend force_inline const float *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline float *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } + + static int size() { return 16; } + static bool is_native() { return true; } +}; + +template <> class fixed_size_simd { + union { + __m512i vec_; + int comp_[16]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const int f) { vec_ = _mm512_set1_epi32(f); } + force_inline fixed_size_simd(const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, + const int i6, const int i7, const int i8, const int i9, const int i10, const int i11, + const int i12, const int i13, const int i14, const int i15) { + vec_ = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); + } + force_inline explicit fixed_size_simd(const int *f) { vec_ = _mm512_loadu_si512((const __m512i *)f); } + force_inline fixed_size_simd(const int *f, vector_aligned_tag) { vec_ = _mm512_load_si512((const __m512i *)f); } + + force_inline int operator[](const int i) const { + __m512i temp = _mm512_maskz_compress_epi32(__mmask16(1u << (i & 15)), vec_); + return _mm512_cvtsi512_si32(temp); + } + + force_inline int operator[](const long i) const { return operator[](int(i)); } + + template force_inline int get() const { + __m128i temp = _mm512_extracti32x4_epi32(vec_, (i & 15) / 4); + return 
_mm_extract_epi32(temp, (i & 15) % 4); + } + template force_inline void set(const int v) { + // TODO: find more optimal implementation (with compile-time index) + vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); + } + force_inline void set(const int i, const int v) { + vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); + } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm512_add_epi32(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm512_sub_epi32(vec_, rhs.vec_); + return *this; + } + + fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 16, { comp_[i] *= rhs.comp_[i]; }) + return *this; + } + + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 16, { comp_[i] /= rhs.comp_[i]; }) + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm512_or_si512(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { + vec_ = _mm512_xor_epi32(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd operator-() const { + fixed_size_simd temp; + temp.vec_ = _mm512_sub_epi32(_mm512_setzero_si512(), vec_); + return temp; + } + + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)); + return ret; + } + + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = + _mm512_andnot_si512(_mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)), _mm512_set1_epi32(~0)); + return ret; + } + + force_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm512_and_si512(vec_, rhs.vec_); + return *this; + } 
+ + force_inline explicit operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm512_cvtepi32_ps(vec_); + return ret; + } + + force_inline explicit operator fixed_size_simd() const; + + force_inline int hsum() const { return _mm512_reduce_add_epi32(vec_); } + + force_inline void store_to(int *f) const { _mm512_storeu_si512((__m512i *)f, vec_); } + force_inline void store_to(int *f, vector_aligned_tag) const { _mm512_store_si512((__m512i *)f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm512_ternarylogic_epi32(vec_, v1.vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm512_ternarylogic_epi32(v1.vec_, vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); + } + + force_inline int movemask() const { return _mm512_movemask_epi32(vec_); } + + force_inline bool vectorcall all_zeros() const { + return _mm512_cmpeq_epi32_mask(vec_, _mm512_setzero_si512()) == 0xFFFF; + } + + force_inline bool vectorcall all_zeros(const fixed_size_simd mask) const { + return _mm512_cmpeq_epi32_mask(_mm512_and_si512(vec_, mask.vec_), _mm512_setzero_si512()) == 0xFFFF; + } + + force_inline bool not_all_zeros() const { return !all_zeros(); } + + friend force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_min_epi32(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_max_epi32(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { + return max(_min, min(v1, _max)); + } + + force_inline static 
fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_andnot_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_and_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_or_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_xor_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_add_epi32(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_sub_epi32(v1.vec_, v2.vec_); + return temp; + } + + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) + return ret; + } + + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd v1, + const 
fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_movm_epi32(_mm512_cmpgt_epi32_mask(v2.vec_, v1.vec_)); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_movm_epi32(_mm512_cmpgt_epi32_mask(v1.vec_, v2.vec_)); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_movm_epi32(_mm512_cmpge_epi32_mask(v1.vec_, v2.vec_)); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_srlv_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const int v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_srli_epi32(v1.vec_, v2); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_sllv_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const int v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_slli_epi32(v1.vec_, v2); + return ret; + } + + force_inline fixed_size_simd operator~() const { + fixed_size_simd ret; + ret.vec_ = _mm512_andnot_si512(vec_, _mm512_set1_epi32(~0)); + return ret; + } + + friend force_inline fixed_size_simd vectorcall srai(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_srai_epi32(v1.vec_, v2); + return ret; + } + + friend force_inline bool vectorcall is_equal(const fixed_size_simd v1, const fixed_size_simd v2) { + return _mm512_cmpeq_epi32_mask(v1.vec_, v2.vec_) == 0xFFFF; + } + + friend fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1); + + 
friend force_inline fixed_size_simd vectorcall gather(const float *base_addr, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(const int *base_addr, + fixed_size_simd vindex); + friend force_inline fixed_size_simd vectorcall gather(const unsigned *base_addr, + fixed_size_simd vindex); + + friend force_inline void vectorcall scatter(float *base_addr, fixed_size_simd vindex, + fixed_size_simd v); + friend force_inline void vectorcall scatter(float *base_addr, fixed_size_simd vindex, const float v) { + scatter(base_addr, vindex, fixed_size_simd{v}); + } + friend force_inline void vectorcall scatter(float *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, fixed_size_simd v); + friend force_inline void vectorcall scatter(float *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, const float v) { + scatter(base_addr, mask, vindex, fixed_size_simd{v}); + } + friend force_inline void vectorcall scatter(int *base_addr, fixed_size_simd vindex, + fixed_size_simd v); + friend force_inline void vectorcall scatter(int *base_addr, fixed_size_simd vindex, const int v) { + scatter(base_addr, vindex, fixed_size_simd{v}); + } + friend force_inline void vectorcall scatter(int *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, fixed_size_simd v); + friend force_inline void vectorcall scatter(int *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, const int v) { + scatter(base_addr, mask, vindex, fixed_size_simd{v}); + } + friend force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd vindex, + fixed_size_simd v); + friend force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, fixed_size_simd v); + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const 
fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 16, { + const int val = mask.get(); + assert(val == 0 || val == -1); + }) + } +#endif + + friend force_inline const int *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline int *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } + + static int size() { return 16; } + static bool is_native() { return true; } +}; + +template <> class fixed_size_simd { + union { + __m512i vec_; + unsigned comp_[16]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const unsigned f) { vec_ = _mm512_set1_epi32(f); } + force_inline fixed_size_simd(const unsigned i0, const unsigned i1, const unsigned i2, const unsigned i3, + const unsigned i4, const unsigned i5, const unsigned i6, const unsigned i7, + const unsigned i8, const unsigned i9, const unsigned i10, const unsigned i11, + const unsigned i12, const unsigned i13, const unsigned i14, const unsigned i15) { + vec_ = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); + } + force_inline explicit fixed_size_simd(const unsigned *f) { vec_ = _mm512_loadu_si512((const __m512i *)f); } + force_inline fixed_size_simd(const unsigned *f, vector_aligned_tag) { + vec_ = _mm512_load_si512((const __m512i *)f); + } + + force_inline unsigned operator[](const int i) const { + __m512i temp = _mm512_maskz_compress_epi32(__mmask16(1u << (i & 15)), vec_); + return _mm512_cvtsi512_si32(temp); + } + + force_inline unsigned operator[](const long i) const { return operator[](int(i)); } + + template force_inline unsigned get() const { + 
__m128i temp = _mm512_extracti32x4_epi32(vec_, (i & 15) / 4); + return _mm_extract_epi32(temp, (i & 15) % 4); + } + template force_inline void set(const unsigned v) { + // TODO: find more optimal implementation (with compile-time index) + vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); + } + force_inline void set(const int i, const unsigned v) { + vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); + } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm512_add_epi32(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm512_sub_epi32(vec_, rhs.vec_); + return *this; + } + + fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 16, { comp_[i] *= rhs.comp_[i]; }) + return *this; + } + + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 16, { comp_[i] /= rhs.comp_[i]; }) + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm512_or_si512(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { + vec_ = _mm512_xor_epi32(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)); + return ret; + } + + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = + _mm512_andnot_si512(_mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)), _mm512_set1_epi32(~0)); + return ret; + } + + force_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm512_and_si512(vec_, rhs.vec_); + return *this; + } + + force_inline explicit operator fixed_size_simd() const { + 
fixed_size_simd ret; + ret.vec_ = _mm512_cvtepu32_ps(vec_); + return ret; + } + + force_inline explicit operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = vec_; + return ret; + } + + force_inline unsigned hsum() const { return _mm512_reduce_add_epi32(vec_); } + + force_inline void store_to(unsigned *f) const { _mm512_storeu_si512((__m512i *)f, vec_); } + force_inline void store_to(unsigned *f, vector_aligned_tag) const { _mm512_store_si512((__m512i *)f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm512_ternarylogic_epi32(vec_, v1.vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); + vec_ = _mm512_ternarylogic_epi32(v1.vec_, vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); + } + + force_inline int movemask() const { return _mm512_movemask_epi32(vec_); } + + force_inline bool vectorcall all_zeros() const { + return _mm512_cmpeq_epi32_mask(vec_, _mm512_setzero_si512()) == 0xFFFF; + } + + force_inline bool vectorcall all_zeros(const fixed_size_simd mask) const { + return _mm512_cmpeq_epi32_mask(_mm512_and_si512(vec_, mask.vec_), _mm512_setzero_si512()) == 0xFFFF; + } + + force_inline bool not_all_zeros() const { return !all_zeros(); } + + friend force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_min_epu32(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_max_epu32(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { + return max(_min, min(v1, _max)); + } + + force_inline 
static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_andnot_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_and_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_or_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_castps_si512(_mm512_xor_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_add_epi32(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_sub_epi32(v1.vec_, v2.vec_); + return temp; + } + + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) + return ret; + } + + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, 
+ const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_srlv_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_srli_epi32(v1.vec_, v2); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_sllv_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_slli_epi32(v1.vec_, v2); + return ret; + } + + force_inline fixed_size_simd operator~() const { + fixed_size_simd ret; + ret.vec_ = _mm512_andnot_si512(vec_, _mm512_set1_epi32(~0)); + return ret; + } + + friend force_inline bool vectorcall is_equal(const fixed_size_simd v1, + const fixed_size_simd v2) { + return _mm512_cmpeq_epi32_mask(v1.vec_, v2.vec_) == 0xFFFF; + } + + friend fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1); + + friend force_inline fixed_size_simd vectorcall gather(const unsigned *base_addr, + fixed_size_simd vindex); + + friend force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd vindex, + fixed_size_simd v); + friend force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd vindex, + const unsigned v) { + scatter(base_addr, vindex, fixed_size_simd{v}); + } + friend force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, fixed_size_simd v); + friend force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, const unsigned v) { + scatter(base_addr, mask, vindex, fixed_size_simd{v}); + } + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const 
fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 16, { + const int val = mask.get(); + assert(val == 0 || val == 0xffffffff); + }) + } +#endif + + friend force_inline const unsigned *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline unsigned *value_ptr(fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + + static int size() { return 16; } + static bool is_native() { return true; } +}; + +force_inline fixed_size_simd fixed_size_simd::operator~() const { + fixed_size_simd ret; + ret.vec_ = _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(vec_), _mm512_set1_epi32(~0))); + return ret; +} + +force_inline fixed_size_simd fixed_size_simd::operator-() const { + fixed_size_simd temp; + __m512 m = _mm512_set1_ps(-0.0f); + temp.vec_ = _mm512_xor_ps(vec_, m); + return temp; +} + +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm512_cvttps_epi32(vec_); + return ret; +} + +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm512_cvttps_epi32(vec_); + return ret; +} + +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = vec_; + return ret; +} + +force_inline fixed_size_simd fixed_size_simd::sqrt() const { + fixed_size_simd temp; + temp.vec_ = _mm512_sqrt_ps(vec_); + return temp; +} + +inline fixed_size_simd fixed_size_simd::log() const { + fixed_size_simd ret; + UNROLLED_FOR(i, 16, { ret.comp_[i] = logf(comp_[i]); }) + return ret; +} + +force_inline 
fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_min_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_max_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_andnot_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall floor(const fixed_size_simd v1) { + fixed_size_simd temp; + temp.vec_ = _mm512_floor_ps(v1.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall ceil(const fixed_size_simd v1) { + fixed_size_simd temp; + temp.vec_ = _mm512_ceil_ps(v1.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_and_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_or_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_xor_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_add_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_sub_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd 
temp; + temp.vec_ = _mm512_mul_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm512_div_ps(v1.vec_, v2.vec_); + return temp; +} + +force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_LT_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_LE_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_GT_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_GE_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_EQ_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_NEQ_OS); + return ret; +} + +force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd min, + const fixed_size_simd max) { + fixed_size_simd ret; + ret.vec_ = _mm512_max_ps(min.vec_, _mm512_min_ps(v1.vec_, max.vec_)); + return ret; +} + +inline fixed_size_simd vectorcall pow(const fixed_size_simd v1, + const fixed_size_simd v2) { + alignas(64) float comp1[16], comp2[16]; + _mm512_store_ps(comp1, v1.vec_); + _mm512_store_ps(comp2, v2.vec_); + UNROLLED_FOR(i, 16, { 
comp1[i] = powf(comp1[i], comp2[i]); }) + return fixed_size_simd{comp1, vector_aligned}; +} + +force_inline fixed_size_simd vectorcall normalize(const fixed_size_simd v1) { + return v1 / v1.length(); +} + +force_inline fixed_size_simd vectorcall normalize_len(const fixed_size_simd v1, float &out_len) { + return v1 / (out_len = v1.length()); +} + +force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 1))); + v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 2))); + v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 4))); + v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 8))); + return v1; +} + +force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 1)); + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 2)); + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 4)); + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 8)); + return v1; +} + +force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 1)); + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 2)); + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 4)); + v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 8)); + return v1; +} + +force_inline fixed_size_simd vectorcall fmadd(const fixed_size_simd a, + const fixed_size_simd b, + const fixed_size_simd c) { + fixed_size_simd ret; + ret.vec_ = _mm512_fmadd_ps(a.vec_, b.vec_, c.vec_); + return ret; +} + +force_inline fixed_size_simd vectorcall fmsub(const fixed_size_simd a, + const fixed_size_simd b, + const fixed_size_simd c) { + 
fixed_size_simd ret; + ret.vec_ = _mm512_fmsub_ps(a.vec_, b.vec_, c.vec_); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(const float *base_addr, + const fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm512_i32gather_ps(vindex.vec_, base_addr, sizeof(float)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(const int *base_addr, const fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm512_i32gather_epi32(vindex.vec_, base_addr, sizeof(int)); + return ret; +} + +force_inline fixed_size_simd vectorcall gather(const unsigned *base_addr, + const fixed_size_simd vindex) { + fixed_size_simd ret; + ret.vec_ = _mm512_i32gather_epi32(vindex.vec_, reinterpret_cast(base_addr), sizeof(unsigned)); + return ret; +} + +force_inline void vectorcall scatter(float *base_addr, fixed_size_simd vindex, fixed_size_simd v) { + _mm512_i32scatter_ps(base_addr, vindex.vec_, v.vec_, sizeof(float)); +} + +force_inline void vectorcall scatter(float *base_addr, fixed_size_simd mask, fixed_size_simd vindex, + fixed_size_simd v) { + _mm512_mask_i32scatter_ps(base_addr, mask.movemask(), vindex.vec_, v.vec_, sizeof(float)); +} + +force_inline void vectorcall scatter(int *base_addr, fixed_size_simd vindex, fixed_size_simd v) { + _mm512_i32scatter_epi32(base_addr, vindex.vec_, v.vec_, sizeof(int)); +} + +force_inline void vectorcall scatter(int *base_addr, fixed_size_simd mask, fixed_size_simd vindex, + fixed_size_simd v) { + _mm512_mask_i32scatter_epi32(base_addr, mask.movemask(), vindex.vec_, v.vec_, sizeof(int)); +} + +force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd vindex, + fixed_size_simd v) { + _mm512_i32scatter_epi32(base_addr, vindex.vec_, v.vec_, sizeof(unsigned)); +} + +force_inline void vectorcall scatter(unsigned *base_addr, fixed_size_simd mask, + fixed_size_simd vindex, fixed_size_simd v) { + _mm512_mask_i32scatter_epi32(base_addr, mask.movemask(), vindex.vec_, v.vec_, sizeof(int)); +} + 
+template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; + ret.vec_ = _mm512_blendv_ps(vec2.vec_, vec1.vec_, _mm_cast<__m512>(mask.vec_)); + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; + ret.vec_ = + _mm512_ternarylogic_epi32(vec2.vec_, vec1.vec_, _mm512_srai_epi32(_mm_cast<__m512i>(mask.vec_), 31), 0xd8); + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; + ret.vec_ = + _mm512_ternarylogic_epi32(vec2.vec_, vec1.vec_, _mm512_srai_epi32(_mm_cast<__m512i>(mask.vec_), 31), 0xd8); + return ret; +} + +} // namespace NS +} // namespace Ray + +#undef validate_mask + +#pragma warning(pop) diff --git a/internal/simd/simd_vec_neon.h b/internal/simd/simd_neon.h similarity index 54% rename from internal/simd/simd_vec_neon.h rename to internal/simd/simd_neon.h index f8e3d03d6..47e3084c7 100644 --- a/internal/simd/simd_vec_neon.h +++ b/internal/simd/simd_neon.h @@ -39,24 +39,24 @@ template <> force_inline int32x4_t _vcast(float32x4_t x) { return vreinterpretq_ template <> force_inline uint32x4_t _vcast(float32x4_t x) { return vreinterpretq_u32_f32(x); } #endif -template <> class simd_vec; -template <> class simd_vec; +template <> class fixed_size_simd; +template <> class fixed_size_simd; -template <> class simd_vec { +template <> class fixed_size_simd { float32x4_t vec_; - friend class simd_vec; - friend class simd_vec; + friend class fixed_size_simd; + friend class fixed_size_simd; public: - force_inline simd_vec() = default; - force_inline simd_vec(const float f) { vec_ = vdupq_n_f32(f); } - force_inline simd_vec(const 
float f1, const float f2, const float f3, const float f4) { + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const float f) { vec_ = vdupq_n_f32(f); } + force_inline fixed_size_simd(const float f1, const float f2, const float f3, const float f4) { alignas(16) const float init[4] = {f1, f2, f3, f4}; vec_ = vld1q_f32(init); } - force_inline simd_vec(const float *f) { vec_ = vld1q_f32(f); } - force_inline simd_vec(const float *f, simd_mem_aligned_tag) { + force_inline fixed_size_simd(const float *f) { vec_ = vld1q_f32(f); } + force_inline fixed_size_simd(const float *f, vector_aligned_tag) { const float *_f = (const float *)__builtin_assume_aligned(f, 16); vec_ = vld1q_f32(_f); } @@ -86,82 +86,82 @@ template <> class simd_vec { #endif } - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { vec_ = vaddq_f32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { vec_ = vsubq_f32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { vec_ = vmulq_f32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator/=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { vec_ = vdivq_f32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { vec_ = vorrq_u32(vreinterpretq_u32_f32(vec_), vreinterpretq_u32_f32(rhs.vec_)); return *this; } - force_inline simd_vec operator-() const { - simd_vec temp; + force_inline fixed_size_simd operator-() const { + fixed_size_simd temp; float32x4_t m = vdupq_n_f32(-0.0f); 
int32x4_t res = veorq_s32(vreinterpretq_s32_f32(vec_), vreinterpretq_s32_f32(m)); temp.vec_ = vreinterpretq_f32_s32(res); return temp; } - force_inline simd_vec vectorcall operator<(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd rhs) const { + fixed_size_simd ret; uint32x4_t res = vcltq_f32(vec_, rhs.vec_); ret.vec_ = vreinterpretq_f32_u32(res); return ret; } - force_inline simd_vec vectorcall operator<=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd rhs) const { + fixed_size_simd ret; uint32x4_t res = vcleq_f32(vec_, rhs.vec_); ret.vec_ = vreinterpretq_f32_u32(res); return ret; } - force_inline simd_vec vectorcall operator>(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd rhs) const { + fixed_size_simd ret; uint32x4_t res = vcgtq_f32(vec_, rhs.vec_); ret.vec_ = vreinterpretq_f32_u32(res); return ret; } - force_inline simd_vec vectorcall operator>=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd rhs) const { + fixed_size_simd ret; uint32x4_t res = vcgeq_f32(vec_, rhs.vec_); ret.vec_ = vreinterpretq_f32_u32(res); return ret; } - force_inline simd_vec vectorcall operator&=(const simd_vec rhs) { + force_inline fixed_size_simd vectorcall operator&=(const fixed_size_simd rhs) { vec_ = vandq_u32(vreinterpretq_u32_f32(vec_), vreinterpretq_u32_f32(rhs.vec_)); return *this; } - force_inline simd_vec operator~() const { - simd_vec ret; + force_inline fixed_size_simd operator~() const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(vec_))); return ret; } - force_inline operator simd_vec() const; - force_inline operator simd_vec() const; + force_inline operator fixed_size_simd() const; + force_inline operator fixed_size_simd() const; - simd_vec sqrt() const 
{ + fixed_size_simd sqrt() const { // This is not precise enough :( // float32x4_t recipsq = vrsqrteq_f32(vec_); // temp.vec_ = vrecpeq_f32(recipsq); @@ -169,14 +169,14 @@ template <> class simd_vec { alignas(16) float comp[4]; vst1q_f32(comp, vec_); UNROLLED_FOR(i, 4, { comp[i] = sqrtf(comp[i]); }) - return simd_vec{comp, simd_mem_aligned}; + return fixed_size_simd{comp, vector_aligned}; } - simd_vec log() const { + fixed_size_simd log() const { alignas(16) float comp[4]; vst1q_f32(comp, vec_); UNROLLED_FOR(i, 4, { comp[i] = logf(comp[i]); }) - return simd_vec{comp, simd_mem_aligned}; + return fixed_size_simd{comp, vector_aligned}; } force_inline float length() const { return sqrtf(length2()); } @@ -197,19 +197,20 @@ template <> class simd_vec { } force_inline void store_to(float *f) const { vst1q_f32(f, vec_); } - force_inline void store_to(float *f, simd_mem_aligned_tag) const { + force_inline void store_to(float *f, vector_aligned_tag) const { float *_f = (float *)__builtin_assume_aligned(f, 16); vst1q_f32(_f, vec_); } - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { validate_mask(mask); int32x4_t temp1 = vandq_s32(vreinterpretq_s32_f32(v1.vec_), vreinterpretq_s32_f32(mask.vec_)); int32x4_t temp2 = vbicq_s32(vreinterpretq_s32_f32(vec_), vreinterpretq_s32_f32(mask.vec_)); vec_ = vreinterpretq_f32_s32(vorrq_s32(temp1, temp2)); } - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { validate_mask(mask); int32x4_t temp1 = vandq_s32(vreinterpretq_s32_f32(vec_), vreinterpretq_s32_f32(mask.vec_)); int32x4_t temp2 = vbicq_s32(vreinterpretq_s32_f32(v1.vec_), vreinterpretq_s32_f32(mask.vec_)); @@ -227,176 +228,187 @@ template <> class simd_vec { return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); } 
- friend force_inline simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vminq_f32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vmaxq_f32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec _min, - const simd_vec _max) { + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { return max(_min, min(v1, _max)); } - friend force_inline simd_vec vectorcall saturate(const simd_vec v1) { + friend force_inline fixed_size_simd vectorcall saturate(const fixed_size_simd v1) { return clamp(v1, 0.0f, 1.0f); } - force_inline static simd_vec vectorcall and_not(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v2.vec_), vreinterpretq_s32_f32(v1.vec_))); return temp; } - force_inline static simd_vec vectorcall floor(const simd_vec v1) { - simd_vec temp; + force_inline static fixed_size_simd vectorcall floor(const fixed_size_simd v1) { + fixed_size_simd temp; float32x4_t t = vcvtq_f32_s32(vcvtq_s32_f32(v1.vec_)); float32x4_t r = vsubq_f32(t, vandq_s32(vcltq_f32(v1.vec_, t), vdupq_n_f32(1.0f))); temp.vec_ = r; return temp; } - force_inline static simd_vec vectorcall ceil(const simd_vec v1) { - simd_vec temp; + force_inline static fixed_size_simd vectorcall ceil(const fixed_size_simd v1) { + fixed_size_simd temp; float32x4_t 
t = vcvtq_f32_s32(vcvtq_s32_f32(v1.vec_)); float32x4_t r = vaddq_f32(t, vandq_s32(vcgtq_f32(v1.vec_, t), vdupq_n_f32(1.0f))); temp.vec_ = r; return temp; } - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v1.vec_), vreinterpretq_s32_f32(v2.vec_))); return temp; } - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(v1.vec_), vreinterpretq_s32_f32(v2.vec_))); return temp; } - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(v1.vec_), vreinterpretq_s32_f32(v2.vec_))); return temp; } - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vaddq_f32(v1.vec_, v2.vec_); return ret; } - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vsubq_f32(v1.vec_, v2.vec_); return ret; } - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; + force_inline 
fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vceqq_f32(vec_, rhs.vec_); return ret; } - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(vec_, rhs.vec_))); return ret; } - friend force_inline simd_vec vectorcall operator*(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vmulq_f32(v1.vec_, v2.vec_); return ret; } - friend force_inline simd_vec vectorcall operator/(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vdivq_f32(v1.vec_, v2.vec_); return ret; } - friend force_inline simd_vec vectorcall operator*(const simd_vec v1, const float v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const float v2) { + fixed_size_simd ret; ret.vec_ = vmulq_f32(v1.vec_, vdupq_n_f32(v2)); return ret; } - friend force_inline simd_vec vectorcall operator/(const simd_vec v1, const float v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const float v2) { + fixed_size_simd ret; ret.vec_ = vdivq_f32(v1.vec_, vdupq_n_f32(v2)); return ret; } - friend force_inline simd_vec vectorcall operator*(const float v1, const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator*(const float v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vmulq_f32(vdupq_n_f32(v1), v2.vec_); return ret; } - friend force_inline simd_vec vectorcall operator/(const float v1, const 
simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator/(const float v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vdivq_f32(vdupq_n_f32(v1), v2.vec_); return ret; } - friend force_inline float vectorcall dot(const simd_vec v1, const simd_vec v2) { + friend force_inline float vectorcall dot(const fixed_size_simd v1, const fixed_size_simd v2) { float32x4_t r1 = vmulq_f32(v1.vec_, v2.vec_); float32x2_t r2 = vadd_f32(vget_high_f32(r1), vget_low_f32(r1)); return vget_lane_f32(vpadd_f32(r2, r2), 0); } - friend simd_vec vectorcall pow(const simd_vec v1, const simd_vec v2) { + friend fixed_size_simd vectorcall pow(const fixed_size_simd v1, + const fixed_size_simd v2) { alignas(16) float comp1[4], comp2[4]; vst1q_f32(comp1, v1.vec_); vst1q_f32(comp2, v2.vec_); UNROLLED_FOR(i, 4, { comp1[i] = powf(comp1[i], comp2[i]); }) - return simd_vec{comp1, simd_mem_aligned}; + return fixed_size_simd{comp1, vector_aligned}; } - friend force_inline simd_vec vectorcall normalize(const simd_vec v1) { + friend force_inline fixed_size_simd vectorcall normalize(const fixed_size_simd v1) { return v1 / v1.length(); } - friend force_inline simd_vec vectorcall normalize_len(const simd_vec v1, float &out_len) { + friend force_inline fixed_size_simd vectorcall normalize_len(const fixed_size_simd v1, + float &out_len) { return v1 / (out_len = v1.length()); } - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { v1.vec_ = vaddq_f32(v1.vec_, vreinterpretq_f32_s32(slli<4>(vreinterpretq_s32_f32(v1.vec_)))); v1.vec_ = vaddq_f32(v1.vec_, vreinterpretq_f32_s32(slli<8>(vreinterpretq_s32_f32(v1.vec_)))); return v1; } template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, 
+ const fixed_size_simd vec2); template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); #ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { UNROLLED_FOR(i, 4, { const float val = mask.get(); assert(reinterpret_cast(val) == 0 || @@ -405,30 +417,30 @@ template <> class simd_vec { } #endif - friend force_inline const float *value_ptr(const simd_vec &v1) { + friend force_inline const float *value_ptr(const fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } - friend force_inline float *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } + friend force_inline float *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } static int size() { return 4; } static bool is_native() { return true; } }; -template <> class simd_vec { +template <> class fixed_size_simd { int32x4_t vec_; - friend class simd_vec; - friend class simd_vec; + friend class fixed_size_simd; + friend class fixed_size_simd; public: - force_inline simd_vec() = default; - force_inline simd_vec(const int f) { vec_ = vdupq_n_s32(f); } - force_inline simd_vec(const int i1, const int i2, const int i3, const int i4) { + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const int f) { vec_ = vdupq_n_s32(f); } + force_inline fixed_size_simd(const int i1, const int i2, const int i3, const int i4) { alignas(16) const int init[4] = {i1, i2, i3, i4}; vec_ = vld1q_s32(init); } - 
force_inline simd_vec(const int *f) { vec_ = vld1q_s32((const int32_t *)f); } - force_inline simd_vec(const int *f, simd_mem_aligned_tag) { + force_inline fixed_size_simd(const int *f) { vec_ = vld1q_s32((const int32_t *)f); } + force_inline fixed_size_simd(const int *f, vector_aligned_tag) { const int *_f = (const int *)__builtin_assume_aligned(f, 16); vec_ = vld1q_s32((const int32_t *)_f); } @@ -458,27 +470,27 @@ template <> class simd_vec { #endif } - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { vec_ = vaddq_s32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { vec_ = vsubq_s32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { vec_ = vmulq_s32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator*=(const int rhs) { + force_inline fixed_size_simd &vectorcall operator*=(const int rhs) { vec_ = vmulq_s32(vec_, vdupq_n_s32(rhs)); return *this; } - simd_vec &vectorcall operator/=(const simd_vec rhs) { + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { vec_.n128_i32[i] /= rhs.vec_.n128_i32[i]; }) #else @@ -491,7 +503,7 @@ template <> class simd_vec { return *this; } - simd_vec &vectorcall operator/=(const int rhs) { + fixed_size_simd &vectorcall operator/=(const int rhs) { #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { vec_.n128_i32[i] /= rhs; }) #else @@ -503,76 +515,76 @@ template <> class simd_vec { return *this; } - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { vec_ = 
vorrq_s32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { vec_ = veorq_s32(vec_, rhs.vec_); return *this; } - force_inline simd_vec operator-() const { - simd_vec temp; + force_inline fixed_size_simd operator-() const { + fixed_size_simd temp; temp.vec_ = vsubq_s32(vdupq_n_s32(0), vec_); return temp; } - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vceqq_s32(vec_, rhs.vec_)); return ret; } - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(vec_, rhs.vec_))); return ret; } - force_inline simd_vec vectorcall operator<(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vcltq_s32(vec_, rhs.vec_)); return ret; } - force_inline simd_vec vectorcall operator<=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vcleq_s32(vec_, rhs.vec_)); return ret; } - force_inline simd_vec vectorcall operator>(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vcgtq_s32(vec_, rhs.vec_)); return ret; } - force_inline simd_vec vectorcall operator>=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd rhs) 
const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vcgeq_s32(vec_, rhs.vec_)); return ret; } - force_inline simd_vec vectorcall operator&=(const simd_vec rhs) { + force_inline fixed_size_simd vectorcall operator&=(const fixed_size_simd rhs) { vec_ = vandq_s32(vec_, rhs.vec_); return *this; } - force_inline simd_vec operator~() const { - simd_vec ret; + force_inline fixed_size_simd operator~() const { + fixed_size_simd ret; ret.vec_ = vmvnq_u32(vec_); return ret; } - force_inline operator simd_vec() const { - simd_vec ret; + force_inline operator fixed_size_simd() const { + fixed_size_simd ret; ret.vec_ = vcvtq_f32_s32(vec_); return ret; } - force_inline operator simd_vec() const; + force_inline operator fixed_size_simd() const; force_inline int hsum() const { alignas(16) int comp[4]; @@ -581,19 +593,19 @@ template <> class simd_vec { } force_inline void store_to(int *f) const { vst1q_s32((int32_t *)f, vec_); } - force_inline void store_to(int *f, simd_mem_aligned_tag) const { + force_inline void store_to(int *f, vector_aligned_tag) const { const int *_f = (const int *)__builtin_assume_aligned(f, 16); vst1q_s32((int32_t *)_f, vec_); } - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { validate_mask(mask); int32x4_t temp1 = vandq_s32(v1.vec_, mask.vec_); int32x4_t temp2 = vbicq_s32(vec_, mask.vec_); vec_ = vorrq_s32(temp1, temp2); } - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, const fixed_size_simd v1) { validate_mask(mask); int32x4_t temp1 = vandq_s32(vec_, mask.vec_); int32x4_t temp2 = vbicq_s32(v1.vec_, mask.vec_); @@ -623,7 +635,7 @@ template <> class simd_vec { return res == 0; } - force_inline bool vectorcall all_zeros(const simd_vec mask) const { + force_inline bool vectorcall all_zeros(const fixed_size_simd 
mask) const { int32_t res = 0; #if defined(__aarch64__) || defined(_M_ARM64) res |= vaddvq_s32(vandq_s32(vec_, mask.vec_)); @@ -638,61 +650,71 @@ template <> class simd_vec { force_inline bool not_all_zeros() const { return !all_zeros(); } - friend force_inline simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vminq_s32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vmaxq_s32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec _min, - const simd_vec _max) { + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { return max(_min, min(v1, _max)); } - force_inline static simd_vec vectorcall and_not(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vbicq_s32(v2.vec_, v1.vec_); return temp; } - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vandq_s32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vorrq_s32(v1.vec_, 
v2.vec_); return temp; } - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = veorq_s32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vaddq_s32(v1.vec_, v2.vec_); return ret; } - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vsubq_s32(v1.vec_, v2.vec_); return ret; } - friend simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_i32[i] = v1.vec_.n128_i32[i] * v2.vec_.n128_i32[i]; }) #else @@ -705,8 +727,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_i32[i] = v1.vec_.n128_i32[i] / v2.vec_.n128_i32[i]; }) #else @@ -719,8 +742,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator*(const simd_vec v1, const int v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; #if defined(_MSC_VER) 
&& !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_i32[i] = v1.vec_.n128_i32[i] * v2; }) #else @@ -732,8 +755,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator/(const simd_vec v1, const int v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_i32[i] = v1.vec_.n128_i32[i] / v2; }) #else @@ -745,8 +768,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator*(const int v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator*(const int v1, const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_i32[i] = v1 * v2.vec_.n128_i32[i]; }) #else @@ -758,8 +781,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator/(const int v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator/(const int v1, const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_i32[i] = v1 / v2.vec_.n128_i32[i]; }) #else @@ -771,8 +794,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator>>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] >> v2.vec_.n128_u32[i]; }) #else @@ -785,8 +809,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator>>(const simd_vec v1, const int v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && 
!defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] >> v2; }) #else @@ -798,8 +822,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator<<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] << v2.vec_.n128_u32[i]; }) #else @@ -812,8 +837,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator<<(const simd_vec v1, const int v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] << v2; }) #else @@ -825,13 +850,13 @@ template <> class simd_vec { return ret; } - friend force_inline simd_vec vectorcall srai(const simd_vec v1, const int v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall srai(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; ret.vec_ = vshlq_s32(v1.vec_, vdupq_n_s32(-v2)); return ret; } - friend bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { + friend bool vectorcall is_equal(const fixed_size_simd v1, const fixed_size_simd v2) { #if defined(_MSC_VER) && !defined(__clang__) bool res = true; UNROLLED_FOR(i, 4, { res &= (v1.vec_.n128_i32[i] == v2.vec_.n128_i32[i]); }) @@ -847,25 +872,27 @@ template <> class simd_vec { #endif } - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { v1.vec_ = vaddq_s32(v1.vec_, slli<4>(v1.vec_)); v1.vec_ = vaddq_s32(v1.vec_, slli<8>(v1.vec_)); return v1; } template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const 
simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); #ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { UNROLLED_FOR(i, 4, { const int val = mask.get(); assert(val == 0 || val == -1); @@ -873,30 +900,30 @@ template <> class simd_vec { } #endif - friend force_inline const int *value_ptr(const simd_vec &v1) { + friend force_inline const int *value_ptr(const fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } - friend force_inline int *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } + friend force_inline int *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } static int size() { return 4; } static bool is_native() { return true; } }; -template <> class simd_vec { +template <> class fixed_size_simd { uint32x4_t vec_; - friend class simd_vec; - friend class simd_vec; + friend class fixed_size_simd; + friend class fixed_size_simd; public: - force_inline simd_vec() = default; - force_inline simd_vec(const unsigned f) { vec_ = vdupq_n_u32(f); } - force_inline simd_vec(const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4) { + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const unsigned f) { vec_ = vdupq_n_u32(f); } + force_inline 
fixed_size_simd(const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4) { alignas(16) const unsigned init[4] = {i1, i2, i3, i4}; vec_ = vld1q_u32(init); } - force_inline simd_vec(const unsigned *f) { vec_ = vld1q_u32(f); } - force_inline simd_vec(const unsigned *f, simd_mem_aligned_tag) { + force_inline fixed_size_simd(const unsigned *f) { vec_ = vld1q_u32(f); } + force_inline fixed_size_simd(const unsigned *f, vector_aligned_tag) { const unsigned *_f = (const unsigned *)__builtin_assume_aligned(f, 16); vec_ = vld1q_u32(_f); } @@ -926,22 +953,22 @@ template <> class simd_vec { #endif } - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { vec_ = vaddq_u32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { vec_ = vsubq_u32(vec_, rhs.vec_); return *this; } - force_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { vec_ = vmulq_u32(vec_, rhs.vec_); return *this; } - simd_vec &vectorcall operator/=(const simd_vec rhs) { + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { vec_.n128_u32[i] /= rhs.vec_.n128_u32[i]; }) #else @@ -954,7 +981,7 @@ template <> class simd_vec { return *this; } - simd_vec &vectorcall operator/=(const unsigned rhs) { + fixed_size_simd &vectorcall operator/=(const unsigned rhs) { #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { vec_.n128_u32[i] /= rhs; }) #else @@ -966,71 +993,71 @@ template <> class simd_vec { return *this; } - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { vec_ = vorrq_u32(vec_, rhs.vec_); 
return *this; } - force_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { + force_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { vec_ = veorq_u32(vec_, rhs.vec_); return *this; } - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vceqq_u32(vec_, rhs.vec_); return ret; } - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vmvnq_u32(vceqq_u32(vec_, rhs.vec_)); return ret; } - force_inline simd_vec vectorcall operator<(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vcltq_u32(vec_, rhs.vec_); return ret; } - force_inline simd_vec vectorcall operator<=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vcleq_u32(vec_, rhs.vec_); return ret; } - force_inline simd_vec vectorcall operator>(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vcgtq_u32(vec_, rhs.vec_); return ret; } - force_inline simd_vec vectorcall operator>=(const simd_vec rhs) const { - simd_vec ret; + force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd rhs) const { + fixed_size_simd ret; ret.vec_ = vcgeq_u32(vec_, rhs.vec_); return ret; } - force_inline simd_vec vectorcall operator&=(const simd_vec rhs) { + force_inline fixed_size_simd vectorcall operator&=(const fixed_size_simd rhs) { vec_ = vandq_u32(vec_, rhs.vec_); return *this; } - force_inline simd_vec operator~() const { - simd_vec 
ret; + force_inline fixed_size_simd operator~() const { + fixed_size_simd ret; ret.vec_ = vmvnq_u32(vec_); return ret; } - force_inline operator simd_vec() const { - simd_vec ret; + force_inline operator fixed_size_simd() const { + fixed_size_simd ret; ret.vec_ = vcvtq_f32_u32(vec_); return ret; } - force_inline operator simd_vec() const { - simd_vec ret; + force_inline operator fixed_size_simd() const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_s32_u32(vec_); return ret; } @@ -1042,19 +1069,21 @@ template <> class simd_vec { } force_inline void store_to(unsigned *f) const { vst1q_u32(f, vec_); } - force_inline void store_to(unsigned *f, simd_mem_aligned_tag) const { + force_inline void store_to(unsigned *f, vector_aligned_tag) const { unsigned *_f = (unsigned *)__builtin_assume_aligned(f, 16); vst1q_u32(_f, vec_); } - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { + force_inline void vectorcall blend_to(const fixed_size_simd mask, + const fixed_size_simd v1) { validate_mask(mask); uint32x4_t temp1 = vandq_u32(v1.vec_, mask.vec_); uint32x4_t temp2 = vbicq_u32(vec_, mask.vec_); vec_ = vorrq_u32(temp1, temp2); } - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { validate_mask(mask); uint32x4_t temp1 = vandq_u32(vec_, mask.vec_); uint32x4_t temp2 = vbicq_u32(v1.vec_, mask.vec_); @@ -1084,7 +1113,7 @@ template <> class simd_vec { return res == 0; } - force_inline bool vectorcall all_zeros(const simd_vec mask) const { + force_inline bool vectorcall all_zeros(const fixed_size_simd mask) const { int32_t res = 0; #if defined(__aarch64__) || defined(_M_ARM64) res |= vaddvq_u32(vandq_u32(vec_, mask.vec_)); @@ -1099,70 +1128,71 @@ template <> class simd_vec { force_inline bool not_all_zeros() const { return !all_zeros(); } - friend force_inline simd_vec vectorcall min(const simd_vec v1, - const simd_vec 
v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vminq_u32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall max(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vmaxq_u32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, - const simd_vec _min, - const simd_vec _max) { + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { return max(_min, min(v1, _max)); } - force_inline static simd_vec vectorcall and_not(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vbicq_u32(v2.vec_, v1.vec_); return temp; } - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vandq_u32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = vorrq_u32(v1.vec_, v2.vec_); return temp; } - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; temp.vec_ = veorq_u32(v1.vec_, 
v2.vec_); return temp; } - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vaddq_u32(v1.vec_, v2.vec_); return ret; } - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; ret.vec_ = vsubq_u32(v1.vec_, v2.vec_); return ret; } - friend simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] * v2.vec_.n128_u32[i]; }) #else @@ -1175,8 +1205,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] / v2.vec_.n128_u32[i]; }) #else @@ -1189,8 +1220,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator*(const simd_vec v1, const unsigned v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, const unsigned v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] * v2; }) #else @@ -1202,8 +1233,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator/(const simd_vec v1, const unsigned v2) { - simd_vec ret; + friend 
fixed_size_simd vectorcall operator/(const fixed_size_simd v1, const unsigned v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] / v2; }) #else @@ -1215,8 +1246,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator*(const unsigned v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator*(const unsigned v1, const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1 * v2.vec_.n128_u32[i]; }) #else @@ -1228,8 +1259,8 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator/(const unsigned v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator/(const unsigned v1, const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1 / v2.vec_.n128_u32[i]; }) #else @@ -1241,8 +1272,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator>>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] >> v2.vec_.n128_u32[i]; }) #else @@ -1255,8 +1287,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator>>(const simd_vec v1, const unsigned v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] >> v2; }) #else @@ -1268,8 +1301,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator<<(const simd_vec v1, const 
simd_vec v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] << v2.vec_.n128_u32[i]; }) #else @@ -1282,8 +1316,9 @@ template <> class simd_vec { return ret; } - friend simd_vec vectorcall operator<<(const simd_vec v1, const unsigned v2) { - simd_vec ret; + friend fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; #if defined(_MSC_VER) && !defined(__clang__) UNROLLED_FOR(i, 4, { ret.vec_.n128_u32[i] = v1.vec_.n128_u32[i] << v2; }) #else @@ -1295,7 +1330,7 @@ template <> class simd_vec { return ret; } - friend bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { + friend bool vectorcall is_equal(const fixed_size_simd v1, const fixed_size_simd v2) { #if defined(_MSC_VER) && !defined(__clang__) bool res = true; UNROLLED_FOR(i, 4, { res &= (v1.vec_.n128_u32[i] == v2.vec_.n128_u32[i]); }) @@ -1311,25 +1346,27 @@ template <> class simd_vec { #endif } - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { v1.vec_ = vaddq_s32(v1.vec_, slli<4>(v1.vec_)); v1.vec_ = vaddq_s32(v1.vec_, slli<8>(v1.vec_)); return v1; } template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); template - friend force_inline simd_vec vectorcall 
select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); #ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { UNROLLED_FOR(i, 4, { const int val = mask.get(); assert(val == 0 || val == 0xffffffff); @@ -1337,10 +1374,10 @@ template <> class simd_vec { } #endif - friend force_inline const unsigned *value_ptr(const simd_vec &v1) { + friend force_inline const unsigned *value_ptr(const fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } - friend force_inline unsigned *value_ptr(simd_vec &v1) { + friend force_inline unsigned *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } @@ -1348,29 +1385,30 @@ template <> class simd_vec { static bool is_native() { return true; } }; -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; ret.vec_ = vcvtq_s32_f32(vec_); return ret; } -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; ret.vec_ = vcvtq_u32_f32(vec_); return ret; } -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; +force_inline fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; ret.vec_ = vreinterpretq_u32_s32(vec_); return ret; } template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { validate_mask(mask); - simd_vec ret; + fixed_size_simd ret; const int32x4_t temp1 = vandq_s32(vreinterpretq_s32_f32(vec1.vec_), _vcast(mask.vec_)); const 
int32x4_t temp2 = vbicq_s32(vreinterpretq_s32_f32(vec2.vec_), _vcast(mask.vec_)); ret.vec_ = vreinterpretq_f32_s32(vorrq_s32(temp1, temp2)); @@ -1378,10 +1416,11 @@ force_inline simd_vec vectorcall select(const simd_vec mask, con } template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { validate_mask(mask); - simd_vec ret; + fixed_size_simd ret; const int32x4_t temp1 = vandq_s32(vec1.vec_, _vcast(mask.vec_)); const int32x4_t temp2 = vbicq_s32(vec2.vec_, _vcast(mask.vec_)); ret.vec_ = vorrq_s32(temp1, temp2); @@ -1389,10 +1428,11 @@ force_inline simd_vec vectorcall select(const simd_vec mask, const } template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { validate_mask(mask); - simd_vec ret; + fixed_size_simd ret; const uint32x4_t temp1 = vandq_u32(vec1.vec_, _vcast(mask.vec_)); const uint32x4_t temp2 = vbicq_u32(vec2.vec_, _vcast(mask.vec_)); ret.vec_ = vorrq_u32(temp1, temp2); diff --git a/internal/simd/simd_sse.h b/internal/simd/simd_sse.h new file mode 100644 index 000000000..54a9de366 --- /dev/null +++ b/internal/simd/simd_sse.h @@ -0,0 +1,1090 @@ +// #pragma once + +#include + +#include +#include +#include + +#ifndef NDEBUG +#define validate_mask(m) __assert_valid_mask(m) +#else +#define validate_mask(m) ((void)m) +#endif + +namespace Ray { +namespace NS { + +template To _mm_cast(From x) { return x; } +template <> force_inline __m128 _mm_cast(__m128i x) { return _mm_castsi128_ps(x); } +template <> force_inline __m128i _mm_cast(__m128 x) { return _mm_castps_si128(x); } + +template <> class fixed_size_simd; +template <> class fixed_size_simd; + +template <> class 
fixed_size_simd { + union { + __m128 vec_; + float comp_[4]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const float f) { vec_ = _mm_set1_ps(f); } + template + force_inline fixed_size_simd(const float f1, const float f2, const float f3, const float f4) { + vec_ = _mm_setr_ps(f1, f2, f3, f4); + } + force_inline explicit fixed_size_simd(const float *f) { vec_ = _mm_loadu_ps(f); } + force_inline fixed_size_simd(const float *f, vector_aligned_tag) { vec_ = _mm_load_ps(f); } + + force_inline float operator[](const int i) const { return comp_[i]; } + + force_inline float operator[](const long i) const { return operator[](int(i)); } + + template force_inline float get() const { return comp_[i]; } + template force_inline void set(const float v) { comp_[i] = v; } + force_inline void set(const int i, const float v) { comp_[i] = v; } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm_add_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm_sub_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + vec_ = _mm_mul_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + vec_ = _mm_div_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm_or_ps(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd vectorcall operator-() const { + fixed_size_simd temp; + __m128 m = _mm_set1_ps(-0.0f); + temp.vec_ = _mm_xor_ps(vec_, m); + return temp; + } + + force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmplt_ps(vec_, rhs.vec_); + return 
ret; + } + + force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmple_ps(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpgt_ps(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpge_ps(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator~() const { + fixed_size_simd ret; + ret.vec_ = _mm_castsi128_ps(_mm_andnot_si128(_mm_castps_si128(vec_), _mm_set1_epi32(~0))); + return ret; + } + + force_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm_and_ps(vec_, rhs.vec_); + return *this; + } + + force_inline explicit vectorcall operator fixed_size_simd() const; + force_inline explicit vectorcall operator fixed_size_simd() const; + + force_inline fixed_size_simd vectorcall sqrt() const { + fixed_size_simd temp; + temp.vec_ = _mm_sqrt_ps(vec_); + return temp; + } + + fixed_size_simd vectorcall log() const { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = logf(comp_[i]); }) + return ret; + } + + float vectorcall length() const { + __m128 r1, r2; + r1 = _mm_mul_ps(vec_, vec_); + + r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(2, 3, 0, 1)); + r1 = _mm_add_ps(r1, r2); + r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(0, 1, 2, 3)); + r1 = _mm_add_ps(r1, r2); + + return _mm_cvtss_f32(_mm_sqrt_ss(r1)); + } + + float vectorcall length2() const { + __m128 r1, r2; + r1 = _mm_mul_ps(vec_, vec_); + + r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(2, 3, 0, 1)); + r1 = _mm_add_ps(r1, r2); + r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(0, 1, 2, 3)); + r1 = _mm_add_ps(r1, r2); + + return _mm_cvtss_f32(r1); + } + + force_inline float hsum() const { +#if defined(USE_SSE41) + __m128 temp = _mm_hadd_ps(vec_, vec_); + temp = 
_mm_hadd_ps(temp, temp); + return _mm_cvtss_f32(temp); +#else + return comp_[0] + comp_[1] + comp_[2] + comp_[3]; +#endif + } + + force_inline void vectorcall store_to(float *f) const { _mm_storeu_ps(f, vec_); } + force_inline void vectorcall store_to(float *f, vector_aligned_tag) const { _mm_store_ps(f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); +#if defined(USE_SSE41) + vec_ = _mm_blendv_ps(vec_, v1.vec_, mask.vec_); +#else + __m128 temp1 = _mm_and_ps(mask.vec_, v1.vec_); + __m128 temp2 = _mm_andnot_ps(mask.vec_, vec_); + vec_ = _mm_or_ps(temp1, temp2); +#endif + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); +#if defined(USE_SSE41) + vec_ = _mm_blendv_ps(v1.vec_, vec_, mask.vec_); +#else + __m128 temp1 = _mm_andnot_ps(mask.vec_, v1.vec_); + __m128 temp2 = _mm_and_ps(mask.vec_, vec_); + vec_ = _mm_or_ps(temp1, temp2); +#endif + } + + friend force_inline fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_min_ps(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_max_ps(v1.vec_, v2.vec_); + return temp; + } + + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_andnot_ps(v1.vec_, v2.vec_); + return temp; + } + + force_inline static fixed_size_simd vectorcall floor(const fixed_size_simd v1) { + fixed_size_simd temp; +#if defined(USE_SSE41) + temp.vec_ = _mm_floor_ps(v1.vec_); +#else + __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(v1.vec_)); + temp.vec_ = _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(v1.vec_, t), _mm_set1_ps(1.0f))); +#endif + return temp; + } + + force_inline static 
fixed_size_simd vectorcall ceil(const fixed_size_simd v1) { + fixed_size_simd temp; + __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(v1.vec_)); + __m128 r = _mm_add_ps(t, _mm_and_ps(_mm_cmpgt_ps(v1.vec_, t), _mm_set1_ps(1.0f))); + temp.vec_ = r; + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_and_ps(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_or_ps(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_xor_ps(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_add_ps(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_sub_ps(v1.vec_, v2.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpeq_ps(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpneq_ps(vec_, rhs.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_mul_ps(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_div_ps(v1.vec_, 
v2.vec_); + return ret; + } + + friend force_inline float vectorcall dot(const fixed_size_simd v1, const fixed_size_simd v2) { + __m128 r1, r2; + r1 = _mm_mul_ps(v1.vec_, v2.vec_); + r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(2, 3, 0, 1)); + r1 = _mm_add_ps(r1, r2); + r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(0, 1, 2, 3)); + r1 = _mm_add_ps(r1, r2); + return _mm_cvtss_f32(r1); + } + + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd min, + const fixed_size_simd max) { + fixed_size_simd ret; + ret.vec_ = _mm_max_ps(min.vec_, _mm_min_ps(v1.vec_, max.vec_)); + return ret; + } + + friend force_inline fixed_size_simd vectorcall saturate(const fixed_size_simd v1) { + return clamp(v1, 0.0f, 1.0f); + } + + friend fixed_size_simd vectorcall pow(const fixed_size_simd v1, + const fixed_size_simd v2) { + alignas(16) float comp1[4], comp2[4]; + _mm_store_ps(comp1, v1.vec_); + _mm_store_ps(comp2, v2.vec_); + UNROLLED_FOR(i, 4, { comp1[i] = powf(comp1[i], comp2[i]); }) + return fixed_size_simd{comp1, vector_aligned}; + } + + friend force_inline fixed_size_simd vectorcall normalize(const fixed_size_simd v1) { + return v1 / v1.length(); + } + + friend force_inline fixed_size_simd vectorcall normalize_len(const fixed_size_simd v1, + float &out_len) { + return v1 / (out_len = v1.length()); + } + + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm_add_ps(v1.vec_, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v1.vec_), 4))); + v1.vec_ = _mm_add_ps(v1.vec_, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v1.vec_), 8))); + return v1; + } + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline 
fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 4, { + const float val = mask.get(); + assert(reinterpret_cast(val) == 0 || + reinterpret_cast(val) == 0xffffffff); + }) + } +#endif + + friend force_inline const float *vectorcall value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline float *vectorcall value_ptr(fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + + static int size() { return 4; } + static bool is_native() { return true; } +}; + +template <> class fixed_size_simd { + union { + __m128i vec_; + int comp_[4]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const int v) { vec_ = _mm_set1_epi32(v); } + force_inline fixed_size_simd(const int i1, const int i2, const int i3, const int i4) { + vec_ = _mm_setr_epi32(i1, i2, i3, i4); + } + force_inline explicit fixed_size_simd(const int *f) { vec_ = _mm_loadu_si128((const __m128i *)f); } + force_inline fixed_size_simd(const int *f, vector_aligned_tag) { vec_ = _mm_load_si128((const __m128i *)f); } + + force_inline int operator[](const int i) const { return comp_[i]; } + force_inline int operator[](const long i) const { return operator[](int(i)); } + + template force_inline int get() const { return comp_[i]; } + template force_inline void set(const int v) { comp_[i & 3] = v; } + force_inline void set(const int i, const int v) { comp_[i] = v; } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm_add_epi32(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm_sub_epi32(vec_, rhs.vec_); + return *this; + } + + fixed_size_simd &vectorcall 
operator*=(const fixed_size_simd rhs) { +#if defined(USE_SSE41) + vec_ = _mm_mullo_epi32(vec_, rhs.vec_); +#else + UNROLLED_FOR(i, 4, { comp_[i] *= rhs.comp_[i]; }) +#endif + return *this; + } + + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 4, { comp_[i] /= rhs.comp_[i]; }) + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm_or_si128(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { + vec_ = _mm_xor_si128(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd vectorcall operator-() const { + fixed_size_simd temp; + temp.vec_ = _mm_sub_epi32(_mm_setzero_si128(), vec_); + return temp; + } + + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpeq_epi32(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_andnot_si128(_mm_cmpeq_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); + return ret; + } + + force_inline fixed_size_simd vectorcall operator<(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmplt_epi32(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator<=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_andnot_si128(_mm_cmpgt_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); + return ret; + } + + force_inline fixed_size_simd vectorcall operator>(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpgt_epi32(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator>=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_andnot_si128(_mm_cmplt_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); + return ret; + } + + force_inline fixed_size_simd 
&vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm_and_si128(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd vectorcall operator~() const { + fixed_size_simd ret; + ret.vec_ = _mm_andnot_si128(vec_, _mm_set1_epi32(~0)); + return ret; + } + + force_inline explicit vectorcall operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm_cvtepi32_ps(vec_); + return ret; + } + + force_inline explicit vectorcall operator fixed_size_simd() const; + + force_inline int hsum() const { +#if defined(USE_SSE41) + __m128i temp = _mm_hadd_epi32(vec_, vec_); + temp = _mm_hadd_epi32(temp, temp); + return _mm_cvtsi128_si32(temp); +#else + return comp_[0] + comp_[1] + comp_[2] + comp_[3]; +#endif + } + + force_inline void store_to(int *f) const { _mm_storeu_si128((__m128i *)f, vec_); } + force_inline void store_to(int *f, vector_aligned_tag) const { _mm_store_si128((__m128i *)f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); +#if defined(USE_SSE41) + vec_ = _mm_blendv_epi8(vec_, v1.vec_, mask.vec_); +#else + __m128i temp1 = _mm_and_si128(mask.vec_, v1.vec_); + __m128i temp2 = _mm_andnot_si128(mask.vec_, vec_); + vec_ = _mm_or_si128(temp1, temp2); +#endif + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, const fixed_size_simd v1) { + validate_mask(mask); +#if defined(USE_SSE41) + vec_ = _mm_blendv_epi8(v1.vec_, vec_, mask.vec_); +#else + __m128i temp1 = _mm_andnot_si128(mask.vec_, v1.vec_); + __m128i temp2 = _mm_and_si128(mask.vec_, vec_); + vec_ = _mm_or_si128(temp1, temp2); +#endif + } + + force_inline int movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(vec_)); } + + force_inline bool all_zeros() const { +#if defined(USE_SSE41) + return _mm_test_all_zeros(vec_, vec_); +#else + return _mm_movemask_epi8(_mm_cmpeq_epi32(vec_, _mm_setzero_si128())) == 0xFFFF; +#endif + } + + force_inline bool vectorcall 
all_zeros(const fixed_size_simd mask) const { +#if defined(USE_SSE41) + return _mm_test_all_zeros(vec_, mask.vec_); +#else + return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_and_si128(vec_, mask.vec_), _mm_setzero_si128())) == 0xFFFF; +#endif + } + + force_inline bool not_all_zeros() const { return !all_zeros(); } + + friend fixed_size_simd vectorcall min(const fixed_size_simd v1, const fixed_size_simd v2) { + fixed_size_simd temp; +#if defined(USE_SSE41) + temp.vec_ = _mm_min_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] < v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) +#endif + return temp; + } + + static fixed_size_simd vectorcall max(const fixed_size_simd v1, const fixed_size_simd v2) { + fixed_size_simd temp; +#if defined(USE_SSE41) + temp.vec_ = _mm_max_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] > v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) +#endif + return temp; + } + + friend force_inline fixed_size_simd vectorcall clamp(const fixed_size_simd v1, + const fixed_size_simd _min, + const fixed_size_simd _max) { + return max(_min, min(v1, _max)); + } + + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_andnot_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_and_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_or_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_xor_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline 
fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_add_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator-(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_sub_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; +#if defined(USE_SSE41) + ret.vec_ = _mm_mullo_epi32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) +#endif + return ret; + } + + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) + return ret; + } + + friend fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = int(unsigned(v1.comp_[i]) >> unsigned(v2.comp_[i])); }) + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; + ret.vec_ = _mm_srli_epi32(v1.vec_, v2); + return ret; + } + + friend fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] << v2.comp_[i]; }) + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; + ret.vec_ = _mm_slli_epi32(v1.vec_, v2); + return ret; + } + + friend force_inline fixed_size_simd vectorcall srai(const fixed_size_simd v1, const int v2) { + fixed_size_simd ret; + ret.vec_ = _mm_srai_epi32(v1.vec_, v2); + return ret; + } + + friend force_inline bool vectorcall is_equal(const fixed_size_simd v1, const 
fixed_size_simd v2) { + __m128i vcmp = _mm_cmpeq_epi32(v1.vec_, v2.vec_); + return (_mm_movemask_epi8(vcmp) == 0xffff); + } + + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 4)); + v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 8)); + return v1; + } + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 4, { + const int val = mask.get(); + assert(val == 0 || val == -1); + }) + } +#endif + + friend force_inline const int *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline int *value_ptr(fixed_size_simd &v1) { return reinterpret_cast(&v1.vec_); } + + static int size() { return 4; } + static bool is_native() { return true; } +}; + +template <> class fixed_size_simd { + union { + __m128i vec_; + unsigned comp_[4]; + }; + + friend class fixed_size_simd; + friend class fixed_size_simd; + + public: + force_inline fixed_size_simd() = default; + force_inline fixed_size_simd(const unsigned v) { vec_ = _mm_set1_epi32(v); } + force_inline fixed_size_simd(const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4) { + vec_ = _mm_setr_epi32(i1, i2, i3, i4); + } + force_inline explicit fixed_size_simd(const unsigned *f) { vec_ = _mm_loadu_si128((const __m128i *)f); } + force_inline fixed_size_simd(const unsigned *f, vector_aligned_tag) { vec_ = _mm_load_si128((const __m128i *)f); } + + 
force_inline unsigned operator[](const int i) const { return comp_[i]; } + force_inline unsigned operator[](const long i) const { return operator[](int(i)); } + + template force_inline unsigned get() const { +#if defined(USE_SSE41) + return _mm_extract_epi32(vec_, i & 3); +#else + return comp_[i]; +#endif + } + template force_inline void set(const unsigned v) { +#if defined(USE_SSE41) + vec_ = _mm_insert_epi32(vec_, v, i & 3); +#else + comp_[i] = v; +#endif + } + force_inline void set(const int i, const unsigned v) { comp_[i] = v; } + + force_inline fixed_size_simd &vectorcall operator+=(const fixed_size_simd rhs) { + vec_ = _mm_add_epi32(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator-=(const fixed_size_simd rhs) { + vec_ = _mm_sub_epi32(vec_, rhs.vec_); + return *this; + } + + fixed_size_simd &vectorcall operator*=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 4, { comp_[i] *= rhs.comp_[i]; }) + return *this; + } + + fixed_size_simd &vectorcall operator/=(const fixed_size_simd rhs) { + UNROLLED_FOR(i, 4, { comp_[i] /= rhs.comp_[i]; }) + return *this; + } + + force_inline fixed_size_simd &vectorcall operator|=(const fixed_size_simd rhs) { + vec_ = _mm_or_si128(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd &vectorcall operator^=(const fixed_size_simd rhs) { + vec_ = _mm_xor_si128(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd vectorcall operator==(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_cmpeq_epi32(vec_, rhs.vec_); + return ret; + } + + force_inline fixed_size_simd vectorcall operator!=(const fixed_size_simd rhs) const { + fixed_size_simd ret; + ret.vec_ = _mm_andnot_si128(_mm_cmpeq_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); + return ret; + } + + force_inline fixed_size_simd &vectorcall operator&=(const fixed_size_simd rhs) { + vec_ = _mm_and_si128(vec_, rhs.vec_); + return *this; + } + + force_inline fixed_size_simd vectorcall operator~() 
const { + fixed_size_simd ret; + ret.vec_ = _mm_andnot_si128(vec_, _mm_set1_epi32(~0)); + return ret; + } + + force_inline explicit vectorcall operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm_cvtepi32_ps(vec_); + return ret; + } + + force_inline explicit vectorcall operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = vec_; + return ret; + } + + force_inline unsigned hsum() const { +#if defined(USE_SSE41) + __m128i temp = _mm_hadd_epi32(vec_, vec_); + temp = _mm_hadd_epi32(temp, temp); + return _mm_cvtsi128_si32(temp); +#else + return comp_[0] + comp_[1] + comp_[2] + comp_[3]; +#endif + } + + force_inline void store_to(unsigned *f) const { _mm_storeu_si128((__m128i *)f, vec_); } + force_inline void store_to(unsigned *f, vector_aligned_tag) const { _mm_store_si128((__m128i *)f, vec_); } + + force_inline void vectorcall blend_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); +#if defined(USE_SSE41) + vec_ = _mm_blendv_epi8(vec_, v1.vec_, mask.vec_); +#else + __m128i temp1 = _mm_and_si128(mask.vec_, v1.vec_); + __m128i temp2 = _mm_andnot_si128(mask.vec_, vec_); + vec_ = _mm_or_si128(temp1, temp2); +#endif + } + + force_inline void vectorcall blend_inv_to(const fixed_size_simd mask, + const fixed_size_simd v1) { + validate_mask(mask); +#if defined(USE_SSE41) + vec_ = _mm_blendv_epi8(v1.vec_, vec_, mask.vec_); +#else + __m128i temp1 = _mm_andnot_si128(mask.vec_, v1.vec_); + __m128i temp2 = _mm_and_si128(mask.vec_, vec_); + vec_ = _mm_or_si128(temp1, temp2); +#endif + } + + force_inline int movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(vec_)); } + + force_inline bool all_zeros() const { +#if defined(USE_SSE41) + return _mm_test_all_zeros(vec_, vec_); +#else + return _mm_movemask_epi8(_mm_cmpeq_epi32(vec_, _mm_setzero_si128())) == 0xFFFF; +#endif + } + + force_inline bool vectorcall all_zeros(const fixed_size_simd mask) const { +#if defined(USE_SSE41) + return 
_mm_test_all_zeros(vec_, mask.vec_); +#else + return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_and_si128(vec_, mask.vec_), _mm_setzero_si128())) == 0xFFFF; +#endif + } + + force_inline bool not_all_zeros() const { return !all_zeros(); } + + static fixed_size_simd vectorcall min(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; +#if defined(USE_SSE41) + temp.vec_ = _mm_min_epu32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] < v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) +#endif + return temp; + } + + static fixed_size_simd vectorcall max(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; +#if defined(USE_SSE41) + temp.vec_ = _mm_max_epu32(v1.vec_, v2.vec_); +#else + UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] > v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) +#endif + return temp; + } + + force_inline static fixed_size_simd vectorcall and_not(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_andnot_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator&(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_and_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator|(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_or_si128(v1.vec_, v2.vec_); + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator^(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd temp; + temp.vec_ = _mm_xor_si128(v1.vec_, v2.vec_); + ; + return temp; + } + + friend force_inline fixed_size_simd vectorcall operator+(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_add_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator-(const 
fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + ret.vec_ = _mm_sub_epi32(v1.vec_, v2.vec_); + return ret; + } + + friend fixed_size_simd vectorcall operator*(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) + return ret; + } + + friend fixed_size_simd vectorcall operator/(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) + return ret; + } + + friend fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] >> v2.comp_[i]; }) + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator>>(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; + ret.vec_ = _mm_srli_epi32(v1.vec_, v2); + return ret; + } + + friend fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const fixed_size_simd v2) { + fixed_size_simd ret; + UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] << v2.comp_[i]; }) + return ret; + } + + friend force_inline fixed_size_simd vectorcall operator<<(const fixed_size_simd v1, + const unsigned v2) { + fixed_size_simd ret; + ret.vec_ = _mm_slli_epi32(v1.vec_, v2); + return ret; + } + + friend force_inline bool vectorcall is_equal(const fixed_size_simd v1, + const fixed_size_simd v2) { + __m128i vcmp = _mm_cmpeq_epi32(v1.vec_, v2.vec_); + return (_mm_movemask_epi8(vcmp) == 0xffff); + } + + friend force_inline fixed_size_simd vectorcall inclusive_scan(fixed_size_simd v1) { + v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 4)); + v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 8)); + return v1; + } + + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend 
force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + template + friend force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2); + +#ifndef NDEBUG + friend void vectorcall __assert_valid_mask(const fixed_size_simd mask) { + UNROLLED_FOR(i, 4, { + const unsigned val = mask.get(); + assert(val == 0 || val == 0xffffffff); + }) + } +#endif + + friend force_inline const unsigned *value_ptr(const fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + friend force_inline unsigned *value_ptr(fixed_size_simd &v1) { + return reinterpret_cast(&v1.vec_); + } + + static int size() { return 4; } + static bool is_native() { return true; } +}; + +force_inline vectorcall fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm_cvttps_epi32(vec_); + return ret; +} + +force_inline vectorcall fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = _mm_cvttps_epi32(vec_); + return ret; +} + +force_inline vectorcall fixed_size_simd::operator fixed_size_simd() const { + fixed_size_simd ret; + ret.vec_ = vec_; + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; +#if defined(USE_SSE41) + ret.vec_ = _mm_blendv_ps(vec2.vec_, vec1.vec_, _mm_cast<__m128>(mask.vec_)); +#else + const __m128 temp1 = _mm_and_ps(_mm_cast<__m128>(mask.vec_), vec1.vec_); + const __m128 temp2 = _mm_andnot_ps(_mm_cast<__m128>(mask.vec_), vec2.vec_); + ret.vec_ = _mm_or_ps(temp1, temp2); +#endif + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; +#if 
defined(USE_SSE41) + ret.vec_ = _mm_blendv_epi8(vec2.vec_, vec1.vec_, _mm_cast<__m128i>(mask.vec_)); +#else + const __m128i temp1 = _mm_and_si128(_mm_cast<__m128i>(mask.vec_), vec1.vec_); + const __m128i temp2 = _mm_andnot_si128(_mm_cast<__m128i>(mask.vec_), vec2.vec_); + ret.vec_ = _mm_or_si128(temp1, temp2); +#endif + return ret; +} + +template +force_inline fixed_size_simd vectorcall select(const fixed_size_simd mask, + const fixed_size_simd vec1, + const fixed_size_simd vec2) { + validate_mask(mask); + fixed_size_simd ret; +#if defined(USE_SSE41) + ret.vec_ = _mm_blendv_epi8(vec2.vec_, vec1.vec_, _mm_cast<__m128i>(mask.vec_)); +#else + const __m128i temp1 = _mm_and_si128(_mm_cast<__m128i>(mask.vec_), vec1.vec_); + const __m128i temp2 = _mm_andnot_si128(_mm_cast<__m128i>(mask.vec_), vec2.vec_); + ret.vec_ = _mm_or_si128(temp1, temp2); +#endif + return ret; +} + +} // namespace NS +} // namespace Ray + +#undef validate_mask diff --git a/internal/simd/simd_vec_avx.h b/internal/simd/simd_vec_avx.h deleted file mode 100644 index 2b318af18..000000000 --- a/internal/simd/simd_vec_avx.h +++ /dev/null @@ -1,1309 +0,0 @@ -// #pragma once - -#include "simd_vec_sse.h" - -#include - -#if defined(__GNUC__) || defined(__clang__) -#define _mm256_test_all_zeros(mask, val) _mm256_testz_si256((mask), (val)) -#endif - -#ifndef NDEBUG -#define validate_mask(m) __assert_valid_mask(m) -#else -#define validate_mask(m) ((void)m) -#endif - -#if defined(USE_AVX2) || defined(USE_AVX512) -#define USE_FMA -#endif - -#pragma warning(push) -#pragma warning(disable : 4752) - -#if defined(USE_AVX2) || defined(USE_AVX512) -#define avx2_inline force_inline -#else -#define avx2_inline inline -#endif - -namespace Ray { -namespace NS { - -template <> force_inline __m256 _mm_cast(__m256i x) { return _mm256_castsi256_ps(x); } -template <> force_inline __m256i _mm_cast(__m256 x) { return _mm256_castps_si256(x); } - -template <> class simd_vec; -template <> class simd_vec; - -template <> class simd_vec 
{ - union { - __m256 vec_; - float comp_[8]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const float f) { vec_ = _mm256_set1_ps(f); } - force_inline simd_vec(const float f1, const float f2, const float f3, const float f4, const float f5, - const float f6, const float f7, const float f8) { - vec_ = _mm256_setr_ps(f1, f2, f3, f4, f5, f6, f7, f8); - } - force_inline explicit simd_vec(const float *f) { vec_ = _mm256_loadu_ps(f); } - force_inline simd_vec(const float *f, simd_mem_aligned_tag) { vec_ = _mm256_load_ps(f); } - - force_inline float operator[](const int i) const { return comp_[i]; } - force_inline float operator[](const long i) const { return operator[](int(i)); } - - template force_inline float get() const { return comp_[i & 7]; } - template force_inline void set(const float v) { comp_[i & 7] = v; } - force_inline void set(const int i, const float v) { comp_[i] = v; } - - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { - vec_ = _mm256_add_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm256_sub_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { - vec_ = _mm256_mul_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator/=(const simd_vec rhs) { - vec_ = _mm256_div_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm256_or_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm256_and_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec operator~() const; - force_inline simd_vec operator-() const; - force_inline explicit vectorcall operator simd_vec() const; - force_inline explicit vectorcall operator simd_vec() const; - - force_inline simd_vec 
sqrt() const; - force_inline simd_vec log() const; - - force_inline float length() const { return sqrtf(length2()); } - - float length2() const { - float ret = 0; - UNROLLED_FOR(i, 8, { ret += comp_[i] * comp_[i]; }) - return ret; - } - - force_inline float hsum() const { -#if 1 - __m256 temp = _mm256_hadd_ps(vec_, vec_); - temp = _mm256_hadd_ps(temp, temp); - - __m256 ret = _mm256_permute2f128_ps(temp, temp, 1); - ret = _mm256_add_ps(ret, temp); - - return _mm256_cvtss_f32(ret); -#else - // ( x3+x7, x2+x6, x1+x5, x0+x4 ) - const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(vec_, 1), _mm256_castps256_ps128(vec_)); - // ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) - const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); - // ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) - const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); - // Conversion to float is a no-op on x86-64 - return _mm_cvtss_f32(x32); -#endif - } - -#if defined(USE_AVX2) || defined(USE_AVX512) - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm256_add_ps(v1.vec_, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256(v1.vec_), 4))); - v1.vec_ = _mm256_add_ps(v1.vec_, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256(v1.vec_), 8))); - - __m256 temp = _mm256_shuffle_ps(v1.vec_, v1.vec_, _MM_SHUFFLE(3, 3, 3, 3)); - temp = _mm256_permute2f128_ps(_mm256_setzero_ps(), temp, 0x20); - - v1.vec_ = _mm256_add_ps(v1.vec_, temp); - - return v1; - } -#endif - - force_inline void store_to(float *f) const { _mm256_storeu_ps(f, vec_); } - force_inline void store_to(float *f, simd_mem_aligned_tag) const { _mm256_store_ps(f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm256_blendv_ps(vec_, v1.vec_, mask.vec_); - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm256_blendv_ps(v1.vec_, vec_, mask.vec_); - } - - 
friend force_inline simd_vec vectorcall min(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall max(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall and_not(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall floor(simd_vec v1); - friend force_inline simd_vec vectorcall ceil(simd_vec v1); - - friend force_inline simd_vec vectorcall operator&(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator|(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator^(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator+(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator-(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator*(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator/(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall operator<(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator<=(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator>(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator>=(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator==(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator!=(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall clamp(simd_vec v1, simd_vec min, - simd_vec max); - // friend force_inline simd_vec vectorcall clamp(simd_vec v1, float min, float max); - friend force_inline simd_vec vectorcall saturate(simd_vec v1) { return clamp(v1, 0.0f, 1.0f); } - friend force_inline simd_vec vectorcall pow(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall normalize(simd_vec v1); - friend force_inline simd_vec vectorcall normalize_len(simd_vec v1, float &out_len); - -#ifdef USE_FMA - friend force_inline simd_vec vectorcall fmadd(simd_vec a, simd_vec b, - simd_vec c); - 
friend force_inline simd_vec vectorcall fmsub(simd_vec a, simd_vec b, - simd_vec c); -#endif // USE_FMA - -#if defined(USE_AVX2) || defined(USE_AVX512) - friend force_inline simd_vec vectorcall gather(const float *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(simd_vec src, const float *base_addr, - simd_vec mask, simd_vec vindex); -#endif - - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 8, { - const float val = mask.get(); - assert(reinterpret_cast(val) == 0 || - reinterpret_cast(val) == 0xffffffff); - }) - } -#endif - - friend force_inline const float *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline float *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } - - static int size() { return 8; } - static bool is_native() { return true; } -}; - -template <> class simd_vec { - union { - __m256i vec_; - int comp_[8]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const int f) { vec_ = _mm256_set1_epi32(f); } - force_inline simd_vec(const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, - const int i7, const int i8) { - vec_ = _mm256_setr_epi32(i1, i2, i3, i4, i5, i6, i7, i8); - } - force_inline explicit simd_vec(const int *f) { vec_ = _mm256_loadu_si256((const __m256i *)f); } - force_inline simd_vec(const int *f, simd_mem_aligned_tag) { vec_ = _mm256_load_si256((const __m256i *)f); } - - force_inline int operator[](const int i) const { return 
comp_[i]; } - force_inline int operator[](const long i) const { return operator[](int(i)); } - - template force_inline int get() const { return _mm256_extract_epi32(vec_, i & 7); } - template force_inline void set(const int v) { vec_ = _mm256_insert_epi32(vec_, v, i & 7); } - force_inline void set(const int i, const int v) { comp_[i] = v; } - - avx2_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_add_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] += rhs.comp_[i]; }) -#endif - return *this; - } - - force_inline simd_vec &vectorcall operator+=(const int rhs) { return operator+=(simd_vec{rhs}); } - - avx2_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_sub_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] -= rhs.comp_[i]; }) -#endif - return *this; - } - - avx2_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_mullo_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] *= rhs.comp_[i]; }) -#endif - return *this; - } - - simd_vec &vectorcall operator/=(const simd_vec rhs) { - UNROLLED_FOR(i, 8, { comp_[i] /= rhs.comp_[i]; }) - return *this; - } - - avx2_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_or_si256(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] |= rhs.comp_[i]; }) -#endif - return *this; - } - - avx2_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_xor_si256(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] ^= rhs.comp_[i]; }) -#endif - return *this; - } - - avx2_inline simd_vec vectorcall operator-() const { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_sub_epi32(_mm256_setzero_si256(), vec_); -#else - 
UNROLLED_FOR(i, 8, { ret.comp_[i] = -comp_[i]; }) -#endif - return ret; - } - - avx2_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_cmpeq_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] == rhs.comp_[i]) ? -1 : 0; }) -#endif - return ret; - } - - avx2_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_andnot_si256(_mm256_cmpeq_epi32(vec_, rhs.vec_), _mm256_set1_epi32(~0)); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] != rhs.comp_[i]) ? -1 : 0; }) -#endif - return ret; - } - - avx2_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_and_si256(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] &= rhs.comp_[i]; }) -#endif - return *this; - } - - force_inline explicit vectorcall operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm256_cvtepi32_ps(vec_); - return ret; - } - - force_inline explicit vectorcall operator simd_vec() const; - - avx2_inline int hsum() const { -#if defined(USE_AVX2) || defined(USE_AVX512) - __m256i temp = _mm256_hadd_epi32(vec_, vec_); - temp = _mm256_hadd_epi32(temp, temp); - - __m256i ret = _mm256_permute2f128_si256(temp, temp, 1); - ret = _mm256_add_epi32(ret, temp); - - return _mm256_cvtsi256_si32(ret); -#else - int ret = comp_[0]; - UNROLLED_FOR(i, 7, { ret += comp_[i + 1]; }) - return ret; -#endif - } - - force_inline void store_to(int *f) const { _mm256_storeu_si256((__m256i *)f, vec_); } - force_inline void store_to(int *f, simd_mem_aligned_tag) const { _mm256_store_si256((__m256i *)f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(vec_), _mm256_castsi256_ps(v1.vec_), 
_mm256_castsi256_ps(mask.vec_))); - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(vec_), _mm256_castsi256_ps(mask.vec_))); - } - - force_inline int movemask() const { return _mm256_movemask_ps(_mm256_castsi256_ps(vec_)); } - - force_inline bool vectorcall all_zeros() const { return _mm256_test_all_zeros(vec_, vec_) != 0; } - force_inline bool vectorcall all_zeros(const simd_vec mask) const { - return _mm256_test_all_zeros(vec_, mask.vec_) != 0; - } - - force_inline bool vectorcall not_all_zeros() const { - int res = _mm256_test_all_zeros(vec_, vec_); - return res == 0; - } - - friend avx2_inline simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_min_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = std::min(v1.comp_[i], v2.comp_[i]); }) -#endif - return ret; - } - - avx2_inline static simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_max_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = std::max(v1.comp_[i], v2.comp_[i]); }) -#endif - return ret; - } - - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec _min, - const simd_vec _max) { - return max(_min, min(v1, _max)); - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - 
- friend force_inline simd_vec vectorcall operator|(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend avx2_inline simd_vec vectorcall operator+(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_add_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] + v2.comp_[i]; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator-(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_sub_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] - v2.comp_[i]; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_mullo_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) -#endif - return ret; - } - - friend simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] / v2.comp_[i]); }) - return ret; - } - - friend avx2_inline simd_vec vectorcall operator<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_cmpgt_epi32(v2.vec_, v1.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] < v2.comp_[i]) ? 
-1 : 0; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_cmpgt_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] > v2.comp_[i]) ? -1 : 0; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator>=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_andnot_si256(_mm256_cmpgt_epi32(v2.vec_, v1.vec_), _mm256_set1_epi32(-1)); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >= v2.comp_[i]) ? -1 : 0; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator>>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_srlv_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) >> unsigned(v2.comp_[i])); }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator>>(const simd_vec v1, const int v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_srli_epi32(v1.vec_, v2); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) >> v2); }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator<<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_sllv_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) << unsigned(v2.comp_[i])); }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator<<(const simd_vec v1, const int v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_slli_epi32(v1.vec_, v2); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = int(unsigned(v1.comp_[i]) << v2); }) -#endif - return ret; - } - - 
avx2_inline simd_vec operator~() const { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_andnot_si256(vec_, _mm256_set1_epi32(~0)); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = ~comp_[i]; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall srai(const simd_vec v1, const int v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_srai_epi32(v1.vec_, v2); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >> v2); }) -#endif - return ret; - } - - friend avx2_inline bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { -#if defined(USE_AVX2) || defined(USE_AVX512) - __m256i vcmp = _mm256_cmpeq_epi32(v1.vec_, v2.vec_); - return (_mm256_movemask_epi8(vcmp) == 0xffffffff); -#else - bool ret = true; - UNROLLED_FOR(i, 8, { ret &= (v1.comp_[i] == v2.comp_[i]); }) - return ret; -#endif - } - -#if defined(USE_AVX2) || defined(USE_AVX512) - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 4)); - v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 8)); - - __m256i temp = _mm256_shuffle_epi32(v1.vec_, _MM_SHUFFLE(3, 3, 3, 3)); - temp = _mm256_permute2x128_si256(_mm256_setzero_si256(), temp, 0x20); - - v1.vec_ = _mm256_add_epi32(v1.vec_, temp); - - return v1; - } -#endif - -#if defined(USE_AVX2) || defined(USE_AVX512) - friend force_inline simd_vec vectorcall gather(const float *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(simd_vec src, const float *base_addr, - simd_vec mask, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(const int *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(simd_vec src, const int *base_addr, - simd_vec mask, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(const unsigned *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(simd_vec 
src, const unsigned *base_addr, - simd_vec mask, simd_vec vindex); -#endif - - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 8, { - const int val = mask.get(); - assert(val == 0 || val == -1); - }) - } -#endif - - friend force_inline const int *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline int *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } - - static int size() { return 8; } - static bool is_native() { -#if defined(USE_AVX2) || defined(USE_AVX512) - return true; -#else - // mostly not native, so return false here - return false; -#endif - } -}; - -template <> class simd_vec { - union { - __m256i vec_; - unsigned comp_[8]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const unsigned f) { vec_ = _mm256_set1_epi32(f); } - force_inline simd_vec(const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4, const unsigned i5, - const unsigned i6, const unsigned i7, const unsigned i8) { - vec_ = _mm256_setr_epi32(i1, i2, i3, i4, i5, i6, i7, i8); - } - force_inline explicit simd_vec(const unsigned *f) { vec_ = _mm256_loadu_si256((const __m256i *)f); } - force_inline simd_vec(const unsigned *f, simd_mem_aligned_tag) { vec_ = _mm256_load_si256((const __m256i *)f); } - - force_inline unsigned operator[](const int i) const { return comp_[i]; } - force_inline unsigned operator[](const long i) const { return operator[](int(i)); } - - template force_inline unsigned get() const { return 
_mm256_extract_epi32(vec_, i & 7); } - template force_inline void set(const unsigned v) { vec_ = _mm256_insert_epi32(vec_, v, i & 7); } - force_inline void set(const int i, const unsigned v) { comp_[i] = v; } - - avx2_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_add_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] += rhs.comp_[i]; }) -#endif - return *this; - } - - force_inline simd_vec &vectorcall operator+=(const unsigned rhs) { - return operator+=(simd_vec{rhs}); - } - - avx2_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_sub_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] -= rhs.comp_[i]; }) -#endif - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const unsigned rhs) { - return operator-=(simd_vec{rhs}); - } - - simd_vec &vectorcall operator*=(const simd_vec rhs) { - UNROLLED_FOR(i, 8, { comp_[i] *= rhs.comp_[i]; }) - return *this; - } - - force_inline simd_vec &vectorcall operator*=(const unsigned rhs) { - return operator*=(simd_vec{rhs}); - } - - simd_vec &vectorcall operator/=(const simd_vec rhs) { - UNROLLED_FOR(i, 8, { comp_[i] /= rhs.comp_[i]; }) - return *this; - } - - force_inline simd_vec &vectorcall operator/=(const unsigned rhs) { - return operator/=(simd_vec{rhs}); - } - - avx2_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_or_si256(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] |= rhs.comp_[i]; }) -#endif - return *this; - } - - avx2_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_xor_si256(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] ^= rhs.comp_[i]; }) -#endif - return *this; - } - - avx2_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; -#if 
defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_cmpeq_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] == rhs.comp_[i]) ? 0xffffffff : 0; }) -#endif - return ret; - } - - avx2_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_andnot_si256(_mm256_cmpeq_epi32(vec_, rhs.vec_), _mm256_set1_epi32(~0)); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (comp_[i] != rhs.comp_[i]) ? 0xffffffff : 0; }) -#endif - return ret; - } - - avx2_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { -#if defined(USE_AVX2) || defined(USE_AVX512) - vec_ = _mm256_and_si256(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 8, { comp_[i] &= rhs.comp_[i]; }) -#endif - return *this; - } - - force_inline explicit vectorcall operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm256_cvtepi32_ps(vec_); - return ret; - } - - force_inline explicit vectorcall operator simd_vec() const { - simd_vec ret; - ret.vec_ = vec_; - return ret; - } - - avx2_inline unsigned hsum() const { -#if defined(USE_AVX2) || defined(USE_AVX512) - __m256i temp = _mm256_hadd_epi32(vec_, vec_); - temp = _mm256_hadd_epi32(temp, temp); - - __m256i ret = _mm256_permute2f128_si256(temp, temp, 1); - ret = _mm256_add_epi32(ret, temp); - - return _mm256_cvtsi256_si32(ret); -#else - unsigned ret = comp_[0]; - UNROLLED_FOR(i, 7, { ret += comp_[i + 1]; }) - return ret; -#endif - } - - force_inline void store_to(unsigned *f) const { _mm256_storeu_si256((__m256i *)f, vec_); } - force_inline void store_to(unsigned *f, simd_mem_aligned_tag) const { _mm256_store_si256((__m256i *)f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(vec_), _mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(mask.vec_))); - } - - force_inline void vectorcall blend_inv_to(const simd_vec 
mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(vec_), _mm256_castsi256_ps(mask.vec_))); - } - - force_inline int movemask() const { return _mm256_movemask_ps(_mm256_castsi256_ps(vec_)); } - - force_inline bool vectorcall all_zeros() const { return _mm256_test_all_zeros(vec_, vec_) != 0; } - force_inline bool vectorcall all_zeros(const simd_vec mask) const { - return _mm256_test_all_zeros(vec_, mask.vec_) != 0; - } - - force_inline bool vectorcall not_all_zeros() const { - int res = _mm256_test_all_zeros(vec_, vec_); - return res == 0; - } - - friend avx2_inline simd_vec vectorcall min(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_min_epu32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = std::min(v1.comp_[i], v2.comp_[i]); }) -#endif - return ret; - } - - avx2_inline static simd_vec vectorcall max(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_max_epu32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = std::max(v1.comp_[i], v2.comp_[i]); }) -#endif - return ret; - } - - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, - const simd_vec _min, - const simd_vec _max) { - return max(_min, min(v1, _max)); - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, - const simd_vec 
v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(v1.vec_), _mm256_castsi256_ps(v2.vec_))); - return temp; - } - - friend avx2_inline simd_vec vectorcall operator+(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_add_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] + v2.comp_[i]; }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator-(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_sub_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] - v2.comp_[i]; }) -#endif - return ret; - } - - friend simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 8, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) - return ret; - } - - friend simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] / v2.comp_[i]); }) - return ret; - } - - friend avx2_inline simd_vec vectorcall operator>>(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_srlv_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >> v2.comp_[i]); }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator>>(const simd_vec v1, const unsigned v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_srli_epi32(v1.vec_, v2); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] >> v2); }) -#endif - return ret; - } - - friend 
avx2_inline simd_vec vectorcall operator<<(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_sllv_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] << v2.comp_[i]); }) -#endif - return ret; - } - - friend avx2_inline simd_vec vectorcall operator<<(const simd_vec v1, const unsigned v2) { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_slli_epi32(v1.vec_, v2); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = (v1.comp_[i] << v2); }) -#endif - return ret; - } - - avx2_inline simd_vec operator~() const { - simd_vec ret; -#if defined(USE_AVX2) || defined(USE_AVX512) - ret.vec_ = _mm256_andnot_si256(vec_, _mm256_set1_epi32(~0)); -#else - UNROLLED_FOR(i, 8, { ret.comp_[i] = ~comp_[i]; }) -#endif - return ret; - } - - friend avx2_inline bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { -#if defined(USE_AVX2) || defined(USE_AVX512) - __m256i vcmp = _mm256_cmpeq_epi32(v1.vec_, v2.vec_); - return (_mm256_movemask_epi8(vcmp) == 0xffffffff); -#else - bool ret = true; - UNROLLED_FOR(i, 8, { ret &= (v1.comp_[i] == v2.comp_[i]); }) - return ret; -#endif - } - -#if defined(USE_AVX2) || defined(USE_AVX512) - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 4)); - v1.vec_ = _mm256_add_epi32(v1.vec_, _mm256_slli_si256(v1.vec_, 8)); - - __m256i temp = _mm256_shuffle_epi32(v1.vec_, _MM_SHUFFLE(3, 3, 3, 3)); - temp = _mm256_permute2x128_si256(_mm256_setzero_si256(), temp, 0x20); - - v1.vec_ = _mm256_add_epi32(v1.vec_, temp); - - return v1; - } -#endif - -#if defined(USE_AVX2) || defined(USE_AVX512) - // friend force_inline simd_vec vectorcall gather(const float *base_addr, simd_vec vindex); - // friend force_inline simd_vec vectorcall gather(simd_vec src, const float *base_addr, - // simd_vec mask, simd_vec vindex); - friend force_inline simd_vec 
vectorcall gather(const unsigned *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(simd_vec src, const unsigned *base_addr, - simd_vec mask, simd_vec vindex); -#endif - - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 8, { - const int val = mask.get(); - assert(val == 0 || val == 0xffffffff); - }) - } -#endif - - friend force_inline const unsigned *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline unsigned *value_ptr(simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - - static int size() { return 8; } - static bool is_native() { -#if defined(USE_AVX2) || defined(USE_AVX512) - return true; -#else - // mostly not native, so return false here - return false; -#endif - } -}; - -avx2_inline simd_vec simd_vec::operator~() const { -#if defined(USE_AVX2) || defined(USE_AVX512) - simd_vec ret; - ret.vec_ = _mm256_castsi256_ps(_mm256_andnot_si256(_mm256_castps_si256(vec_), _mm256_set1_epi32(~0))); - return ret; -#else - alignas(32) uint32_t temp[8]; - _mm256_store_ps((float *)temp, vec_); - UNROLLED_FOR(i, 8, { temp[i] = ~temp[i]; }) - return simd_vec{(const float *)temp, simd_mem_aligned}; -#endif -} - -force_inline simd_vec simd_vec::operator-() const { - simd_vec temp; - __m256 m = _mm256_set1_ps(-0.0f); - temp.vec_ = _mm256_xor_ps(vec_, m); - return temp; -} - -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm256_cvttps_epi32(vec_); - return ret; -} - -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = 
_mm256_cvttps_epi32(vec_); - return ret; -} - -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = vec_; - return ret; -} - -force_inline simd_vec simd_vec::sqrt() const { - simd_vec temp; - temp.vec_ = _mm256_sqrt_ps(vec_); - return temp; -} - -avx2_inline simd_vec simd_vec::log() const { - simd_vec ret; - UNROLLED_FOR(i, 8, { ret.comp_[i] = logf(comp_[i]); }) - return ret; -} - -force_inline simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_min_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_max_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec min, - const simd_vec max) { - simd_vec ret; - ret.vec_ = _mm256_max_ps(min.vec_, _mm256_min_ps(v1.vec_, max.vec_)); - return ret; -} - -force_inline simd_vec vectorcall and_not(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_andnot_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall floor(const simd_vec v1) { - simd_vec temp; - temp.vec_ = _mm256_floor_ps(v1.vec_); - return temp; -} - -force_inline simd_vec vectorcall ceil(const simd_vec v1) { - simd_vec temp; - temp.vec_ = _mm256_ceil_ps(v1.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator&(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_and_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator|(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_or_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator^(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_xor_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator+(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - 
temp.vec_ = _mm256_add_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator-(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_sub_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_mul_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm256_div_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_LT_OS); - return ret; -} - -force_inline simd_vec vectorcall operator<=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_LE_OS); - return ret; -} - -force_inline simd_vec vectorcall operator>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_GT_OS); - return ret; -} - -force_inline simd_vec vectorcall operator>=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_GE_OS); - return ret; -} - -force_inline simd_vec vectorcall operator==(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_EQ_OS); - return ret; -} - -force_inline simd_vec vectorcall operator!=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm256_cmp_ps(v1.vec_, v2.vec_, _CMP_NEQ_OS); - return ret; -} - -inline simd_vec vectorcall pow(const simd_vec v1, const simd_vec v2) { - alignas(32) float comp1[8], comp2[8]; - _mm256_store_ps(comp1, v1.vec_); - _mm256_store_ps(comp2, v2.vec_); - UNROLLED_FOR(i, 8, { comp1[i] = powf(comp1[i], comp2[i]); }) - return simd_vec{comp1, simd_mem_aligned}; -} - -force_inline simd_vec vectorcall normalize(const simd_vec v1) { 
return v1 / v1.length(); } - -force_inline simd_vec vectorcall normalize_len(const simd_vec v1, float &out_len) { - return v1 / (out_len = v1.length()); -} - -#ifdef USE_FMA -force_inline simd_vec vectorcall fmadd(const simd_vec a, const simd_vec b, - const simd_vec c) { - simd_vec ret; - ret.vec_ = _mm256_fmadd_ps(a.vec_, b.vec_, c.vec_); - return ret; -} - -force_inline simd_vec vectorcall fmsub(const simd_vec a, const simd_vec b, - const simd_vec c) { - simd_vec ret; - ret.vec_ = _mm256_fmsub_ps(a.vec_, b.vec_, c.vec_); - return ret; -} -#endif // USE_FMA - -#if defined(USE_AVX2) || defined(USE_AVX512) -force_inline simd_vec vectorcall gather(const float *base_addr, const simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm256_i32gather_ps(base_addr, vindex.vec_, sizeof(float)); - return ret; -} - -force_inline simd_vec vectorcall gather(simd_vec src, const float *base_addr, simd_vec mask, - simd_vec vindex) { - simd_vec ret; - ret.vec_ = - _mm256_mask_i32gather_ps(src.vec_, base_addr, vindex.vec_, _mm256_castsi256_ps(mask.vec_), sizeof(float)); - return ret; -} - -force_inline simd_vec vectorcall gather(const int *base_addr, const simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm256_i32gather_epi32(base_addr, vindex.vec_, sizeof(int)); - return ret; -} - -force_inline simd_vec vectorcall gather(simd_vec src, const int *base_addr, simd_vec mask, - simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm256_mask_i32gather_epi32(src.vec_, base_addr, vindex.vec_, mask.vec_, sizeof(int)); - return ret; -} - -force_inline simd_vec vectorcall gather(const unsigned *base_addr, const simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm256_i32gather_epi32(reinterpret_cast(base_addr), vindex.vec_, sizeof(int)); - return ret; -} - -force_inline simd_vec vectorcall gather(simd_vec src, const unsigned *base_addr, - simd_vec mask, simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm256_mask_i32gather_epi32(src.vec_, reinterpret_cast(base_addr), vindex.vec_, mask.vec_, - 
sizeof(unsigned)); - return ret; -} -#endif - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; - ret.vec_ = _mm256_blendv_ps(vec2.vec_, vec1.vec_, _mm_cast<__m256>(mask.vec_)); - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; - ret.vec_ = _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(vec2.vec_), _mm256_castsi256_ps(vec1.vec_), _mm_cast<__m256>(mask.vec_))); - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; - ret.vec_ = _mm256_castps_si256( - _mm256_blendv_ps(_mm256_castsi256_ps(vec2.vec_), _mm256_castsi256_ps(vec1.vec_), _mm_cast<__m256>(mask.vec_))); - return ret; -} - -} // namespace NS -} // namespace Ray - -#pragma warning(pop) - -#undef avx2_inline - -#undef validate_mask diff --git a/internal/simd/simd_vec_avx512.h b/internal/simd/simd_vec_avx512.h deleted file mode 100644 index 6302d2116..000000000 --- a/internal/simd/simd_vec_avx512.h +++ /dev/null @@ -1,1110 +0,0 @@ -// #pragma once - -#include "simd_vec_avx.h" - -#include - -#define _mm512_cmp_ps(a, b, c) _mm512_castsi512_ps(_mm512_movm_epi32(_mm512_cmp_ps_mask(a, b, c))) - -#define _mm512_blendv_ps(a, b, m) \ - _mm512_castsi512_ps(_mm512_ternarylogic_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b), \ - _mm512_srai_epi32(_mm512_castps_si512(m), 31), 0xd8)) - -#define _mm512_movemask_epi32(a) \ - (int)_mm512_cmpneq_epi32_mask(_mm512_setzero_si512(), _mm512_and_si512(_mm512_set1_epi32(0x80000000U), a)) - -// https://adms-conf.org/2020-camera-ready/ADMS20_05.pdf -#define _mm512_slli_si512(x, k) _mm512_alignr_epi32(x, _mm512_setzero_si512(), 16 - k) - -#ifndef NDEBUG -#define validate_mask(m) __assert_valid_mask(m) -#else -#define 
validate_mask(m) ((void)m) -#endif - -#pragma warning(push) -#pragma warning(disable : 4752) - -namespace Ray { -namespace NS { - -template <> force_inline __m512 _mm_cast(__m512i x) { return _mm512_castsi512_ps(x); } -template <> force_inline __m512i _mm_cast(__m512 x) { return _mm512_castps_si512(x); } - -template <> class simd_vec; -template <> class simd_vec; - -template <> class simd_vec { - union { - __m512 vec_; - float comp_[16]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const float f) { vec_ = _mm512_set1_ps(f); } - force_inline simd_vec(const float f0, const float f1, const float f2, const float f3, const float f4, - const float f5, const float f6, const float f7, const float f8, const float f9, - const float f10, const float f11, const float f12, const float f13, const float f14, - const float f15) { - vec_ = _mm512_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15); - } - force_inline explicit simd_vec(const float *f) { vec_ = _mm512_loadu_ps(f); } - force_inline simd_vec(const float *f, simd_mem_aligned_tag) { vec_ = _mm512_load_ps(f); } - - force_inline float operator[](const int i) const { - __m512 temp = _mm512_maskz_compress_ps(__mmask16(1u << i), vec_); - return _mm512_cvtss_f32(temp); - } - - force_inline float operator[](const long i) const { return operator[](int(i)); } - - template force_inline float get() const { - __m128 temp = _mm512_extractf32x4_ps(vec_, (i & 15) / 4); - const int ndx = (i & 15) % 4; - return _mm_cvtss_f32(_mm_shuffle_ps(temp, temp, _MM_SHUFFLE(ndx, ndx, ndx, ndx))); - } - template force_inline void set(const float v) { - // TODO: find more optimal implementation (with compile-time index) - vec_ = _mm512_mask_broadcastss_ps(vec_, __mmask16(1u << (i & 15)), _mm_set_ss(v)); - } - force_inline void set(const int i, const float v) { - vec_ = _mm512_mask_broadcastss_ps(vec_, __mmask16(1u << i), _mm_set_ss(v)); - } 
- - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { - vec_ = _mm512_add_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm512_sub_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { - vec_ = _mm512_mul_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator/=(const simd_vec rhs) { - vec_ = _mm512_div_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm512_or_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm512_and_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec operator~() const; - force_inline simd_vec operator-() const; - force_inline explicit operator simd_vec() const; - force_inline explicit operator simd_vec() const; - - force_inline simd_vec sqrt() const; - force_inline simd_vec log() const; - - force_inline float length() const { return sqrtf(length2()); } - - float length2() const { - float temp = 0; - UNROLLED_FOR(i, 16, { temp += comp_[i] * comp_[i]; }) - return temp; - } - - force_inline float hsum() const { return _mm512_reduce_add_ps(vec_); } - - force_inline void store_to(float *f) const { _mm512_storeu_ps(f, vec_); } - force_inline void store_to(float *f, simd_mem_aligned_tag) const { _mm512_store_ps(f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - //__mmask16 msk = - // _mm512_fpclass_ps_mask(mask.vec_, 0x54); // 0x54 = Negative_Finite | Negative_Infinity | Negative_Zero - // vec_ = _mm512_mask_blend_ps(msk, vec_, v1.vec_); - vec_ = _mm512_blendv_ps(vec_, v1.vec_, mask.vec_); - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - //__mmask16 msk = - // 
_mm512_fpclass_ps_mask(mask.vec_, 0x54); // 0x54 = Negative_Finite | Negative_Infinity | Negative_Zero - // vec_ = _mm512_mask_blend_ps(msk, v1.vec_, vec_); - vec_ = _mm512_blendv_ps(v1.vec_, vec_, mask.vec_); - } - - friend force_inline simd_vec vectorcall min(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall max(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall and_not(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall floor(simd_vec v1); - friend force_inline simd_vec vectorcall ceil(simd_vec v1); - - friend force_inline simd_vec vectorcall operator&(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator|(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator^(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator+(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator-(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator*(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator/(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall operator<(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator<=(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator>(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator>=(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator==(simd_vec v1, simd_vec v2); - friend force_inline simd_vec vectorcall operator!=(simd_vec v1, simd_vec v2); - - friend force_inline simd_vec vectorcall clamp(simd_vec v1, simd_vec min, - simd_vec max); - // friend force_inline simd_vec vectorcall clamp(simd_vec v1, float min, float max); - friend force_inline simd_vec vectorcall saturate(const simd_vec v1) { - return clamp(v1, 0.0f, 1.0f); - } - friend force_inline simd_vec vectorcall pow(simd_vec v1, simd_vec v2); - friend force_inline simd_vec 
vectorcall normalize(simd_vec v1); - friend force_inline simd_vec vectorcall normalize_len(simd_vec v1, float &out_len); - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1); - - friend force_inline simd_vec vectorcall fmadd(simd_vec a, simd_vec b, - simd_vec c); - friend force_inline simd_vec vectorcall fmsub(simd_vec a, simd_vec b, - simd_vec c); - - friend force_inline simd_vec vectorcall gather(const float *base_addr, simd_vec vindex); - - friend force_inline void vectorcall scatter(float *base_addr, simd_vec vindex, simd_vec v); - friend force_inline void vectorcall scatter(float *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v); - - template - friend force_inline simd_vec - vectorcall select(const simd_vec mask, const simd_vec vec1, const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 16, { - const float val = mask.get(); - assert(reinterpret_cast(val) == 0 || - reinterpret_cast(val) == 0xffffffff); - }) - } -#endif - - friend force_inline const float *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline float *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } - - static int size() { return 16; } - static bool is_native() { return true; } -}; - -template <> class simd_vec { - union { - __m512i vec_; - int comp_[16]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const int f) { vec_ = _mm512_set1_epi32(f); } - force_inline simd_vec(const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, - const int i6, const int i7, const int i8, const int i9, const 
int i10, const int i11, - const int i12, const int i13, const int i14, const int i15) { - vec_ = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); - } - force_inline explicit simd_vec(const int *f) { vec_ = _mm512_loadu_si512((const __m512i *)f); } - force_inline simd_vec(const int *f, simd_mem_aligned_tag) { vec_ = _mm512_load_si512((const __m512i *)f); } - - force_inline int operator[](const int i) const { - __m512i temp = _mm512_maskz_compress_epi32(__mmask16(1u << (i & 15)), vec_); - return _mm512_cvtsi512_si32(temp); - } - - force_inline int operator[](const long i) const { return operator[](int(i)); } - - template force_inline int get() const { - __m128i temp = _mm512_extracti32x4_epi32(vec_, (i & 15) / 4); - return _mm_extract_epi32(temp, (i & 15) % 4); - } - template force_inline void set(const int v) { - // TODO: find more optimal implementation (with compile-time index) - vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); - } - force_inline void set(const int i, const int v) { - vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); - } - - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { - vec_ = _mm512_add_epi32(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm512_sub_epi32(vec_, rhs.vec_); - return *this; - } - - simd_vec &vectorcall operator*=(const simd_vec rhs) { - UNROLLED_FOR(i, 16, { comp_[i] *= rhs.comp_[i]; }) - return *this; - } - - simd_vec &vectorcall operator/=(const simd_vec rhs) { - UNROLLED_FOR(i, 16, { comp_[i] /= rhs.comp_[i]; }) - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm512_or_si512(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { - vec_ = _mm512_xor_epi32(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec operator-() const { - simd_vec temp; - 
temp.vec_ = _mm512_sub_epi32(_mm512_setzero_si512(), vec_); - return temp; - } - - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)); - return ret; - } - - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = - _mm512_andnot_si512(_mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)), _mm512_set1_epi32(~0)); - return ret; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm512_and_si512(vec_, rhs.vec_); - return *this; - } - - force_inline explicit operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm512_cvtepi32_ps(vec_); - return ret; - } - - force_inline explicit operator simd_vec() const; - - force_inline int hsum() const { return _mm512_reduce_add_epi32(vec_); } - - force_inline void store_to(int *f) const { _mm512_storeu_si512((__m512i *)f, vec_); } - force_inline void store_to(int *f, simd_mem_aligned_tag) const { _mm512_store_si512((__m512i *)f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm512_ternarylogic_epi32(vec_, v1.vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm512_ternarylogic_epi32(v1.vec_, vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); - } - - force_inline int movemask() const { return _mm512_movemask_epi32(vec_); } - - force_inline bool vectorcall all_zeros() const { - return _mm512_cmpeq_epi32_mask(vec_, _mm512_setzero_si512()) == 0xFFFF; - } - - force_inline bool vectorcall all_zeros(const simd_vec mask) const { - return _mm512_cmpeq_epi32_mask(_mm512_and_si512(vec_, mask.vec_), _mm512_setzero_si512()) == 0xFFFF; - } - - force_inline bool not_all_zeros() const { return !all_zeros(); } - - friend force_inline simd_vec vectorcall min(const 
simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_min_epi32(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_max_epi32(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec _min, - const simd_vec _max) { - return max(_min, min(v1, _max)); - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_andnot_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_and_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_or_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_xor_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_add_epi32(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_sub_epi32(v1.vec_, v2.vec_); - return temp; - } - - friend simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) - return ret; - } - - friend simd_vec vectorcall operator/(const 
simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) - return ret; - } - - friend force_inline simd_vec vectorcall operator<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_movm_epi32(_mm512_cmpgt_epi32_mask(v2.vec_, v1.vec_)); - return ret; - } - - friend force_inline simd_vec vectorcall operator>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_movm_epi32(_mm512_cmpgt_epi32_mask(v1.vec_, v2.vec_)); - return ret; - } - - friend force_inline simd_vec vectorcall operator>=(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_movm_epi32(_mm512_cmpge_epi32_mask(v1.vec_, v2.vec_)); - return ret; - } - - friend force_inline simd_vec vectorcall operator>>(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_srlv_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator>>(const simd_vec v1, const int v2) { - simd_vec ret; - ret.vec_ = _mm512_srli_epi32(v1.vec_, v2); - return ret; - } - - friend force_inline simd_vec vectorcall operator<<(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_sllv_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator<<(const simd_vec v1, const int v2) { - simd_vec ret; - ret.vec_ = _mm512_slli_epi32(v1.vec_, v2); - return ret; - } - - force_inline simd_vec operator~() const { - simd_vec ret; - ret.vec_ = _mm512_andnot_si512(vec_, _mm512_set1_epi32(~0)); - return ret; - } - - friend force_inline simd_vec vectorcall srai(const simd_vec v1, const int v2) { - simd_vec ret; - ret.vec_ = _mm512_srai_epi32(v1.vec_, v2); - return ret; - } - - friend force_inline bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { - return _mm512_cmpeq_epi32_mask(v1.vec_, v2.vec_) == 0xFFFF; - } - - friend simd_vec vectorcall inclusive_scan(simd_vec v1); - - friend force_inline 
simd_vec vectorcall gather(const float *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(const int *base_addr, simd_vec vindex); - friend force_inline simd_vec vectorcall gather(const unsigned *base_addr, simd_vec vindex); - - friend force_inline void vectorcall scatter(float *base_addr, simd_vec vindex, simd_vec v); - friend force_inline void vectorcall scatter(float *base_addr, simd_vec vindex, const float v) { - scatter(base_addr, vindex, simd_vec{v}); - } - friend force_inline void vectorcall scatter(float *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v); - friend force_inline void vectorcall scatter(float *base_addr, simd_vec mask, simd_vec vindex, - const float v) { - scatter(base_addr, mask, vindex, simd_vec{v}); - } - friend force_inline void vectorcall scatter(int *base_addr, simd_vec vindex, simd_vec v); - friend force_inline void vectorcall scatter(int *base_addr, simd_vec vindex, const int v) { - scatter(base_addr, vindex, simd_vec{v}); - } - friend force_inline void vectorcall scatter(int *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v); - friend force_inline void vectorcall scatter(int *base_addr, simd_vec mask, simd_vec vindex, - const int v) { - scatter(base_addr, mask, vindex, simd_vec{v}); - } - friend force_inline void vectorcall scatter(unsigned *base_addr, simd_vec vindex, - simd_vec v); - friend force_inline void vectorcall scatter(unsigned *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v); - - template - friend force_inline simd_vec - vectorcall select(const simd_vec mask, const simd_vec vec1, const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 16, { - const int val 
= mask.get(); - assert(val == 0 || val == -1); - }) - } -#endif - - friend force_inline const int *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline int *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } - - static int size() { return 16; } - static bool is_native() { return true; } -}; - -template <> class simd_vec { - union { - __m512i vec_; - unsigned comp_[16]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const unsigned f) { vec_ = _mm512_set1_epi32(f); } - force_inline simd_vec(const unsigned i0, const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4, - const unsigned i5, const unsigned i6, const unsigned i7, const unsigned i8, const unsigned i9, - const unsigned i10, const unsigned i11, const unsigned i12, const unsigned i13, - const unsigned i14, const unsigned i15) { - vec_ = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); - } - force_inline explicit simd_vec(const unsigned *f) { vec_ = _mm512_loadu_si512((const __m512i *)f); } - force_inline simd_vec(const unsigned *f, simd_mem_aligned_tag) { vec_ = _mm512_load_si512((const __m512i *)f); } - - force_inline unsigned operator[](const int i) const { - __m512i temp = _mm512_maskz_compress_epi32(__mmask16(1u << (i & 15)), vec_); - return _mm512_cvtsi512_si32(temp); - } - - force_inline unsigned operator[](const long i) const { return operator[](int(i)); } - - template force_inline unsigned get() const { - __m128i temp = _mm512_extracti32x4_epi32(vec_, (i & 15) / 4); - return _mm_extract_epi32(temp, (i & 15) % 4); - } - template force_inline void set(const unsigned v) { - // TODO: find more optimal implementation (with compile-time index) - vec_ = _mm512_mask_set1_epi32(vec_, __mmask16(1u << (i & 15)), v); - } - force_inline void set(const int i, const unsigned v) { - vec_ = _mm512_mask_set1_epi32(vec_, 
__mmask16(1u << (i & 15)), v); - } - - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { - vec_ = _mm512_add_epi32(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm512_sub_epi32(vec_, rhs.vec_); - return *this; - } - - simd_vec &vectorcall operator*=(const simd_vec rhs) { - UNROLLED_FOR(i, 16, { comp_[i] *= rhs.comp_[i]; }) - return *this; - } - - simd_vec &vectorcall operator/=(const simd_vec rhs) { - UNROLLED_FOR(i, 16, { comp_[i] /= rhs.comp_[i]; }) - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm512_or_si512(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { - vec_ = _mm512_xor_epi32(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)); - return ret; - } - - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = - _mm512_andnot_si512(_mm512_movm_epi32(_mm512_cmpeq_epi32_mask(vec_, rhs.vec_)), _mm512_set1_epi32(~0)); - return ret; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm512_and_si512(vec_, rhs.vec_); - return *this; - } - - force_inline explicit operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm512_cvtepu32_ps(vec_); - return ret; - } - - force_inline explicit operator simd_vec() const { - simd_vec ret; - ret.vec_ = vec_; - return ret; - } - - force_inline unsigned hsum() const { return _mm512_reduce_add_epi32(vec_); } - - force_inline void store_to(unsigned *f) const { _mm512_storeu_si512((__m512i *)f, vec_); } - force_inline void store_to(unsigned *f, simd_mem_aligned_tag) const { _mm512_store_si512((__m512i *)f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - 
validate_mask(mask); - vec_ = _mm512_ternarylogic_epi32(vec_, v1.vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); - vec_ = _mm512_ternarylogic_epi32(v1.vec_, vec_, _mm512_srai_epi32(mask.vec_, 31), 0xd8); - } - - force_inline int movemask() const { return _mm512_movemask_epi32(vec_); } - - force_inline bool vectorcall all_zeros() const { - return _mm512_cmpeq_epi32_mask(vec_, _mm512_setzero_si512()) == 0xFFFF; - } - - force_inline bool vectorcall all_zeros(const simd_vec mask) const { - return _mm512_cmpeq_epi32_mask(_mm512_and_si512(vec_, mask.vec_), _mm512_setzero_si512()) == 0xFFFF; - } - - force_inline bool not_all_zeros() const { return !all_zeros(); } - - friend force_inline simd_vec vectorcall min(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_min_epu32(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall max(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_max_epu32(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, - const simd_vec _min, - const simd_vec _max) { - return max(_min, min(v1, _max)); - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_andnot_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_and_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_or_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - 
return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_castps_si512(_mm512_xor_ps(_mm512_castsi512_ps(v1.vec_), _mm512_castsi512_ps(v2.vec_))); - return temp; - } - - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_add_epi32(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_sub_epi32(v1.vec_, v2.vec_); - return temp; - } - - friend simd_vec vectorcall operator*(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) - return ret; - } - - friend simd_vec vectorcall operator/(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 16, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) - return ret; - } - - friend force_inline simd_vec vectorcall operator>>(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_srlv_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator>>(const simd_vec v1, - const unsigned v2) { - simd_vec ret; - ret.vec_ = _mm512_srli_epi32(v1.vec_, v2); - return ret; - } - - friend force_inline simd_vec vectorcall operator<<(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_sllv_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator<<(const simd_vec v1, - const unsigned v2) { - simd_vec ret; - ret.vec_ = _mm512_slli_epi32(v1.vec_, v2); - return ret; - } - - force_inline simd_vec operator~() const { - simd_vec ret; - ret.vec_ = _mm512_andnot_si512(vec_, _mm512_set1_epi32(~0)); - return ret; - } - - friend force_inline bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { - return _mm512_cmpeq_epi32_mask(v1.vec_, v2.vec_) == 
0xFFFF; - } - - friend simd_vec vectorcall inclusive_scan(simd_vec v1); - - friend force_inline simd_vec vectorcall gather(const unsigned *base_addr, simd_vec vindex); - - friend force_inline void vectorcall scatter(unsigned *base_addr, simd_vec vindex, - simd_vec v); - friend force_inline void vectorcall scatter(unsigned *base_addr, simd_vec vindex, const unsigned v) { - scatter(base_addr, vindex, simd_vec{v}); - } - friend force_inline void vectorcall scatter(unsigned *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v); - friend force_inline void vectorcall scatter(unsigned *base_addr, simd_vec mask, simd_vec vindex, - const unsigned v) { - scatter(base_addr, mask, vindex, simd_vec{v}); - } - - template - friend force_inline simd_vec - vectorcall select(const simd_vec mask, const simd_vec vec1, const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 16, { - const int val = mask.get(); - assert(val == 0 || val == 0xffffffff); - }) - } -#endif - - friend force_inline const unsigned *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline unsigned *value_ptr(simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - - static int size() { return 16; } - static bool is_native() { return true; } -}; - -force_inline simd_vec simd_vec::operator~() const { - simd_vec ret; - ret.vec_ = _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(vec_), _mm512_set1_epi32(~0))); - return ret; -} - -force_inline simd_vec simd_vec::operator-() const { - simd_vec temp; - __m512 m = _mm512_set1_ps(-0.0f); - temp.vec_ = _mm512_xor_ps(vec_, m); - return temp; -} - -force_inline simd_vec::operator simd_vec() const { - 
simd_vec ret; - ret.vec_ = _mm512_cvttps_epi32(vec_); - return ret; -} - -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm512_cvttps_epi32(vec_); - return ret; -} - -force_inline simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = vec_; - return ret; -} - -force_inline simd_vec simd_vec::sqrt() const { - simd_vec temp; - temp.vec_ = _mm512_sqrt_ps(vec_); - return temp; -} - -inline simd_vec simd_vec::log() const { - simd_vec ret; - UNROLLED_FOR(i, 16, { ret.comp_[i] = logf(comp_[i]); }) - return ret; -} - -force_inline simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_min_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_max_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall and_not(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_andnot_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall floor(const simd_vec v1) { - simd_vec temp; - temp.vec_ = _mm512_floor_ps(v1.vec_); - return temp; -} - -force_inline simd_vec vectorcall ceil(const simd_vec v1) { - simd_vec temp; - temp.vec_ = _mm512_ceil_ps(v1.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator&(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_and_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator|(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_or_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator^(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_xor_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator+(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_add_ps(v1.vec_, v2.vec_); - return temp; -} - 
-force_inline simd_vec vectorcall operator-(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_sub_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_mul_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm512_div_ps(v1.vec_, v2.vec_); - return temp; -} - -force_inline simd_vec vectorcall operator<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_LT_OS); - return ret; -} - -force_inline simd_vec vectorcall operator<=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_LE_OS); - return ret; -} - -force_inline simd_vec vectorcall operator>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_GT_OS); - return ret; -} - -force_inline simd_vec vectorcall operator>=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_GE_OS); - return ret; -} - -force_inline simd_vec vectorcall operator==(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_EQ_OS); - return ret; -} - -force_inline simd_vec vectorcall operator!=(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm512_cmp_ps(v1.vec_, v2.vec_, _CMP_NEQ_OS); - return ret; -} - -force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec min, - const simd_vec max) { - simd_vec ret; - ret.vec_ = _mm512_max_ps(min.vec_, _mm512_min_ps(v1.vec_, max.vec_)); - return ret; -} - -inline simd_vec vectorcall pow(const simd_vec v1, const simd_vec v2) { - alignas(64) float comp1[16], comp2[16]; - _mm512_store_ps(comp1, v1.vec_); - _mm512_store_ps(comp2, v2.vec_); - UNROLLED_FOR(i, 16, { comp1[i] = 
powf(comp1[i], comp2[i]); }) - return simd_vec{comp1, simd_mem_aligned}; -} - -force_inline simd_vec vectorcall normalize(const simd_vec v1) { return v1 / v1.length(); } - -force_inline simd_vec vectorcall normalize_len(const simd_vec v1, float &out_len) { - return v1 / (out_len = v1.length()); -} - -force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 1))); - v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 2))); - v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 4))); - v1.vec_ = _mm512_add_ps(v1.vec_, _mm512_castsi512_ps(_mm512_slli_si512(_mm512_castps_si512(v1.vec_), 8))); - return v1; -} - -force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 1)); - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 2)); - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 4)); - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 8)); - return v1; -} - -force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 1)); - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 2)); - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 4)); - v1.vec_ = _mm512_add_epi32(v1.vec_, _mm512_slli_si512(v1.vec_, 8)); - return v1; -} - -force_inline simd_vec vectorcall fmadd(const simd_vec a, const simd_vec b, - const simd_vec c) { - simd_vec ret; - ret.vec_ = _mm512_fmadd_ps(a.vec_, b.vec_, c.vec_); - return ret; -} - -force_inline simd_vec vectorcall fmsub(const simd_vec a, const simd_vec b, - const simd_vec c) { - simd_vec ret; - ret.vec_ = _mm512_fmsub_ps(a.vec_, b.vec_, c.vec_); - return ret; -} - -force_inline simd_vec vectorcall gather(const float *base_addr, const simd_vec 
vindex) { - simd_vec ret; - ret.vec_ = _mm512_i32gather_ps(vindex.vec_, base_addr, sizeof(float)); - return ret; -} - -force_inline simd_vec vectorcall gather(const int *base_addr, const simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm512_i32gather_epi32(vindex.vec_, base_addr, sizeof(int)); - return ret; -} - -force_inline simd_vec vectorcall gather(const unsigned *base_addr, const simd_vec vindex) { - simd_vec ret; - ret.vec_ = _mm512_i32gather_epi32(vindex.vec_, reinterpret_cast(base_addr), sizeof(unsigned)); - return ret; -} - -force_inline void vectorcall scatter(float *base_addr, simd_vec vindex, simd_vec v) { - _mm512_i32scatter_ps(base_addr, vindex.vec_, v.vec_, sizeof(float)); -} - -force_inline void vectorcall scatter(float *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v) { - _mm512_mask_i32scatter_ps(base_addr, mask.movemask(), vindex.vec_, v.vec_, sizeof(float)); -} - -force_inline void vectorcall scatter(int *base_addr, simd_vec vindex, simd_vec v) { - _mm512_i32scatter_epi32(base_addr, vindex.vec_, v.vec_, sizeof(int)); -} - -force_inline void vectorcall scatter(int *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v) { - _mm512_mask_i32scatter_epi32(base_addr, mask.movemask(), vindex.vec_, v.vec_, sizeof(int)); -} - -force_inline void vectorcall scatter(unsigned *base_addr, simd_vec vindex, simd_vec v) { - _mm512_i32scatter_epi32(base_addr, vindex.vec_, v.vec_, sizeof(unsigned)); -} - -force_inline void vectorcall scatter(unsigned *base_addr, simd_vec mask, simd_vec vindex, - simd_vec v) { - _mm512_mask_i32scatter_epi32(base_addr, mask.movemask(), vindex.vec_, v.vec_, sizeof(int)); -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; - ret.vec_ = _mm512_blendv_ps(vec2.vec_, vec1.vec_, _mm_cast<__m512>(mask.vec_)); - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - 
const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; - ret.vec_ = - _mm512_ternarylogic_epi32(vec2.vec_, vec1.vec_, _mm512_srai_epi32(_mm_cast<__m512i>(mask.vec_), 31), 0xd8); - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; - ret.vec_ = - _mm512_ternarylogic_epi32(vec2.vec_, vec1.vec_, _mm512_srai_epi32(_mm_cast<__m512i>(mask.vec_), 31), 0xd8); - return ret; -} - -} // namespace NS -} // namespace Ray - -#undef validate_mask - -#pragma warning(pop) diff --git a/internal/simd/simd_vec_sse.h b/internal/simd/simd_vec_sse.h deleted file mode 100644 index 43b244660..000000000 --- a/internal/simd/simd_vec_sse.h +++ /dev/null @@ -1,1052 +0,0 @@ -// #pragma once - -#include - -#include -#include -#include - -#ifndef NDEBUG -#define validate_mask(m) __assert_valid_mask(m) -#else -#define validate_mask(m) ((void)m) -#endif - -namespace Ray { -namespace NS { - -template To _mm_cast(From x) { return x; } -template <> force_inline __m128 _mm_cast(__m128i x) { return _mm_castsi128_ps(x); } -template <> force_inline __m128i _mm_cast(__m128 x) { return _mm_castps_si128(x); } - -template <> class simd_vec; -template <> class simd_vec; - -template <> class simd_vec { - union { - __m128 vec_; - float comp_[4]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const float f) { vec_ = _mm_set1_ps(f); } - template force_inline simd_vec(const float f1, const float f2, const float f3, const float f4) { - vec_ = _mm_setr_ps(f1, f2, f3, f4); - } - force_inline explicit simd_vec(const float *f) { vec_ = _mm_loadu_ps(f); } - force_inline simd_vec(const float *f, simd_mem_aligned_tag) { vec_ = _mm_load_ps(f); } - - force_inline float operator[](const int i) const { return comp_[i]; } - - force_inline float operator[](const long i) const { return operator[](int(i)); } - - template 
force_inline float get() const { return comp_[i]; } - template force_inline void set(const float v) { comp_[i] = v; } - force_inline void set(const int i, const float v) { comp_[i] = v; } - - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { - vec_ = _mm_add_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm_sub_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator*=(const simd_vec rhs) { - vec_ = _mm_mul_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator/=(const simd_vec rhs) { - vec_ = _mm_div_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm_or_ps(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec vectorcall operator-() const { - simd_vec temp; - __m128 m = _mm_set1_ps(-0.0f); - temp.vec_ = _mm_xor_ps(vec_, m); - return temp; - } - - force_inline simd_vec vectorcall operator<(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmplt_ps(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator<=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmple_ps(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator>(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpgt_ps(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator>=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpge_ps(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator~() const { - simd_vec ret; - ret.vec_ = _mm_castsi128_ps(_mm_andnot_si128(_mm_castps_si128(vec_), _mm_set1_epi32(~0))); - return ret; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm_and_ps(vec_, rhs.vec_); - return *this; - } - - force_inline explicit vectorcall operator simd_vec() const; - force_inline explicit 
vectorcall operator simd_vec() const; - - force_inline simd_vec vectorcall sqrt() const { - simd_vec temp; - temp.vec_ = _mm_sqrt_ps(vec_); - return temp; - } - - simd_vec vectorcall log() const { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = logf(comp_[i]); }) - return ret; - } - - float vectorcall length() const { - __m128 r1, r2; - r1 = _mm_mul_ps(vec_, vec_); - - r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(2, 3, 0, 1)); - r1 = _mm_add_ps(r1, r2); - r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(0, 1, 2, 3)); - r1 = _mm_add_ps(r1, r2); - - return _mm_cvtss_f32(_mm_sqrt_ss(r1)); - } - - float vectorcall length2() const { - __m128 r1, r2; - r1 = _mm_mul_ps(vec_, vec_); - - r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(2, 3, 0, 1)); - r1 = _mm_add_ps(r1, r2); - r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(0, 1, 2, 3)); - r1 = _mm_add_ps(r1, r2); - - return _mm_cvtss_f32(r1); - } - - force_inline float hsum() const { -#if defined(USE_SSE41) - __m128 temp = _mm_hadd_ps(vec_, vec_); - temp = _mm_hadd_ps(temp, temp); - return _mm_cvtss_f32(temp); -#else - return comp_[0] + comp_[1] + comp_[2] + comp_[3]; -#endif - } - - force_inline void vectorcall store_to(float *f) const { _mm_storeu_ps(f, vec_); } - force_inline void vectorcall store_to(float *f, simd_mem_aligned_tag) const { _mm_store_ps(f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); -#if defined(USE_SSE41) - vec_ = _mm_blendv_ps(vec_, v1.vec_, mask.vec_); -#else - __m128 temp1 = _mm_and_ps(mask.vec_, v1.vec_); - __m128 temp2 = _mm_andnot_ps(mask.vec_, vec_); - vec_ = _mm_or_ps(temp1, temp2); -#endif - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); -#if defined(USE_SSE41) - vec_ = _mm_blendv_ps(v1.vec_, vec_, mask.vec_); -#else - __m128 temp1 = _mm_andnot_ps(mask.vec_, v1.vec_); - __m128 temp2 = _mm_and_ps(mask.vec_, vec_); - vec_ = _mm_or_ps(temp1, temp2); -#endif - } - - friend force_inline 
simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_min_ps(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_max_ps(v1.vec_, v2.vec_); - return temp; - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_andnot_ps(v1.vec_, v2.vec_); - return temp; - } - - force_inline static simd_vec vectorcall floor(const simd_vec v1) { - simd_vec temp; -#if defined(USE_SSE41) - temp.vec_ = _mm_floor_ps(v1.vec_); -#else - __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(v1.vec_)); - temp.vec_ = _mm_sub_ps(t, _mm_and_ps(_mm_cmplt_ps(v1.vec_, t), _mm_set1_ps(1.0f))); -#endif - return temp; - } - - force_inline static simd_vec vectorcall ceil(const simd_vec v1) { - simd_vec temp; - __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(v1.vec_)); - __m128 r = _mm_add_ps(t, _mm_and_ps(_mm_cmpgt_ps(v1.vec_, t), _mm_set1_ps(1.0f))); - temp.vec_ = r; - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_and_ps(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_or_ps(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_xor_ps(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_add_ps(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_sub_ps(v1.vec_, v2.vec_); - return ret; - } - - force_inline simd_vec vectorcall 
operator==(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpeq_ps(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpneq_ps(vec_, rhs.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator*(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_mul_ps(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator/(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_div_ps(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline float vectorcall dot(const simd_vec v1, const simd_vec v2) { - __m128 r1, r2; - r1 = _mm_mul_ps(v1.vec_, v2.vec_); - r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(2, 3, 0, 1)); - r1 = _mm_add_ps(r1, r2); - r2 = _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(0, 1, 2, 3)); - r1 = _mm_add_ps(r1, r2); - return _mm_cvtss_f32(r1); - } - - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec min, - const simd_vec max) { - simd_vec ret; - ret.vec_ = _mm_max_ps(min.vec_, _mm_min_ps(v1.vec_, max.vec_)); - return ret; - } - - friend force_inline simd_vec vectorcall saturate(const simd_vec v1) { - return clamp(v1, 0.0f, 1.0f); - } - - friend simd_vec vectorcall pow(const simd_vec v1, const simd_vec v2) { - alignas(16) float comp1[4], comp2[4]; - _mm_store_ps(comp1, v1.vec_); - _mm_store_ps(comp2, v2.vec_); - UNROLLED_FOR(i, 4, { comp1[i] = powf(comp1[i], comp2[i]); }) - return simd_vec{comp1, simd_mem_aligned}; - } - - friend force_inline simd_vec vectorcall normalize(const simd_vec v1) { - return v1 / v1.length(); - } - - friend force_inline simd_vec vectorcall normalize_len(const simd_vec v1, float &out_len) { - return v1 / (out_len = v1.length()); - } - - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm_add_ps(v1.vec_, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v1.vec_), 4))); - v1.vec_ = 
_mm_add_ps(v1.vec_, _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(v1.vec_), 8))); - return v1; - } - - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 4, { - const float val = mask.get(); - assert(reinterpret_cast(val) == 0 || - reinterpret_cast(val) == 0xffffffff); - }) - } -#endif - - friend force_inline const float *vectorcall value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline float *vectorcall value_ptr(simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - - static int size() { return 4; } - static bool is_native() { return true; } -}; - -template <> class simd_vec { - union { - __m128i vec_; - int comp_[4]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const int v) { vec_ = _mm_set1_epi32(v); } - force_inline simd_vec(const int i1, const int i2, const int i3, const int i4) { - vec_ = _mm_setr_epi32(i1, i2, i3, i4); - } - force_inline explicit simd_vec(const int *f) { vec_ = _mm_loadu_si128((const __m128i *)f); } - force_inline simd_vec(const int *f, simd_mem_aligned_tag) { vec_ = _mm_load_si128((const __m128i *)f); } - - force_inline int operator[](const int i) const { return comp_[i]; } - force_inline int operator[](const long i) const { return operator[](int(i)); } - - template force_inline int get() const { return comp_[i]; } - template force_inline void set(const int v) { comp_[i & 3] = v; } - force_inline void set(const int i, const int v) { comp_[i] = v; } - - force_inline simd_vec &vectorcall 
operator+=(const simd_vec rhs) { - vec_ = _mm_add_epi32(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm_sub_epi32(vec_, rhs.vec_); - return *this; - } - - simd_vec &vectorcall operator*=(const simd_vec rhs) { -#if defined(USE_SSE41) - vec_ = _mm_mullo_epi32(vec_, rhs.vec_); -#else - UNROLLED_FOR(i, 4, { comp_[i] *= rhs.comp_[i]; }) -#endif - return *this; - } - - simd_vec &vectorcall operator/=(const simd_vec rhs) { - UNROLLED_FOR(i, 4, { comp_[i] /= rhs.comp_[i]; }) - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm_or_si128(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { - vec_ = _mm_xor_si128(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec vectorcall operator-() const { - simd_vec temp; - temp.vec_ = _mm_sub_epi32(_mm_setzero_si128(), vec_); - return temp; - } - - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpeq_epi32(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_andnot_si128(_mm_cmpeq_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); - return ret; - } - - force_inline simd_vec vectorcall operator<(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmplt_epi32(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator<=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_andnot_si128(_mm_cmpgt_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); - return ret; - } - - force_inline simd_vec vectorcall operator>(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpgt_epi32(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator>=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_andnot_si128(_mm_cmplt_epi32(vec_, rhs.vec_), 
_mm_set1_epi32(~0)); - return ret; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm_and_si128(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec vectorcall operator~() const { - simd_vec ret; - ret.vec_ = _mm_andnot_si128(vec_, _mm_set1_epi32(~0)); - return ret; - } - - force_inline explicit vectorcall operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm_cvtepi32_ps(vec_); - return ret; - } - - force_inline explicit vectorcall operator simd_vec() const; - - force_inline int hsum() const { -#if defined(USE_SSE41) - __m128i temp = _mm_hadd_epi32(vec_, vec_); - temp = _mm_hadd_epi32(temp, temp); - return _mm_cvtsi128_si32(temp); -#else - return comp_[0] + comp_[1] + comp_[2] + comp_[3]; -#endif - } - - force_inline void store_to(int *f) const { _mm_storeu_si128((__m128i *)f, vec_); } - force_inline void store_to(int *f, simd_mem_aligned_tag) const { _mm_store_si128((__m128i *)f, vec_); } - - force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); -#if defined(USE_SSE41) - vec_ = _mm_blendv_epi8(vec_, v1.vec_, mask.vec_); -#else - __m128i temp1 = _mm_and_si128(mask.vec_, v1.vec_); - __m128i temp2 = _mm_andnot_si128(mask.vec_, vec_); - vec_ = _mm_or_si128(temp1, temp2); -#endif - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); -#if defined(USE_SSE41) - vec_ = _mm_blendv_epi8(v1.vec_, vec_, mask.vec_); -#else - __m128i temp1 = _mm_andnot_si128(mask.vec_, v1.vec_); - __m128i temp2 = _mm_and_si128(mask.vec_, vec_); - vec_ = _mm_or_si128(temp1, temp2); -#endif - } - - force_inline int movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(vec_)); } - - force_inline bool all_zeros() const { -#if defined(USE_SSE41) - return _mm_test_all_zeros(vec_, vec_); -#else - return _mm_movemask_epi8(_mm_cmpeq_epi32(vec_, _mm_setzero_si128())) == 0xFFFF; -#endif - } - - force_inline bool vectorcall all_zeros(const 
simd_vec mask) const { -#if defined(USE_SSE41) - return _mm_test_all_zeros(vec_, mask.vec_); -#else - return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_and_si128(vec_, mask.vec_), _mm_setzero_si128())) == 0xFFFF; -#endif - } - - force_inline bool not_all_zeros() const { return !all_zeros(); } - - friend simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; -#if defined(USE_SSE41) - temp.vec_ = _mm_min_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] < v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) -#endif - return temp; - } - - static simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; -#if defined(USE_SSE41) - temp.vec_ = _mm_max_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] > v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) -#endif - return temp; - } - - friend force_inline simd_vec vectorcall clamp(const simd_vec v1, const simd_vec _min, - const simd_vec _max) { - return max(_min, min(v1, _max)); - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_andnot_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_and_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_or_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_xor_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_add_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, const 
simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_sub_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; -#if defined(USE_SSE41) - ret.vec_ = _mm_mullo_epi32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) -#endif - return ret; - } - - friend simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) - return ret; - } - - friend simd_vec vectorcall operator>>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = int(unsigned(v1.comp_[i]) >> unsigned(v2.comp_[i])); }) - return ret; - } - - friend force_inline simd_vec vectorcall operator>>(const simd_vec v1, const int v2) { - simd_vec ret; - ret.vec_ = _mm_srli_epi32(v1.vec_, v2); - return ret; - } - - friend simd_vec vectorcall operator<<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] << v2.comp_[i]; }) - return ret; - } - - friend force_inline simd_vec vectorcall operator<<(const simd_vec v1, const int v2) { - simd_vec ret; - ret.vec_ = _mm_slli_epi32(v1.vec_, v2); - return ret; - } - - friend force_inline simd_vec vectorcall srai(const simd_vec v1, const int v2) { - simd_vec ret; - ret.vec_ = _mm_srai_epi32(v1.vec_, v2); - return ret; - } - - friend force_inline bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { - __m128i vcmp = _mm_cmpeq_epi32(v1.vec_, v2.vec_); - return (_mm_movemask_epi8(vcmp) == 0xffff); - } - - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 4)); - v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 8)); - return v1; - } - - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend 
force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 4, { - const int val = mask.get(); - assert(val == 0 || val == -1); - }) - } -#endif - - friend force_inline const int *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline int *value_ptr(simd_vec &v1) { return reinterpret_cast(&v1.vec_); } - - static int size() { return 4; } - static bool is_native() { return true; } -}; - -template <> class simd_vec { - union { - __m128i vec_; - unsigned comp_[4]; - }; - - friend class simd_vec; - friend class simd_vec; - - public: - force_inline simd_vec() = default; - force_inline simd_vec(const unsigned v) { vec_ = _mm_set1_epi32(v); } - force_inline simd_vec(const unsigned i1, const unsigned i2, const unsigned i3, const unsigned i4) { - vec_ = _mm_setr_epi32(i1, i2, i3, i4); - } - force_inline explicit simd_vec(const unsigned *f) { vec_ = _mm_loadu_si128((const __m128i *)f); } - force_inline simd_vec(const unsigned *f, simd_mem_aligned_tag) { vec_ = _mm_load_si128((const __m128i *)f); } - - force_inline unsigned operator[](const int i) const { return comp_[i]; } - force_inline unsigned operator[](const long i) const { return operator[](int(i)); } - - template force_inline unsigned get() const { -#if defined(USE_SSE41) - return _mm_extract_epi32(vec_, i & 3); -#else - return comp_[i]; -#endif - } - template force_inline void set(const unsigned v) { -#if defined(USE_SSE41) - vec_ = _mm_insert_epi32(vec_, v, i & 3); -#else - comp_[i] = v; -#endif - } - force_inline void set(const int i, const unsigned v) { comp_[i] = v; } - - force_inline simd_vec &vectorcall operator+=(const simd_vec rhs) { - vec_ = _mm_add_epi32(vec_, rhs.vec_); - return *this; - } - - 
force_inline simd_vec &vectorcall operator-=(const simd_vec rhs) { - vec_ = _mm_sub_epi32(vec_, rhs.vec_); - return *this; - } - - simd_vec &vectorcall operator*=(const simd_vec rhs) { - UNROLLED_FOR(i, 4, { comp_[i] *= rhs.comp_[i]; }) - return *this; - } - - simd_vec &vectorcall operator/=(const simd_vec rhs) { - UNROLLED_FOR(i, 4, { comp_[i] /= rhs.comp_[i]; }) - return *this; - } - - force_inline simd_vec &vectorcall operator|=(const simd_vec rhs) { - vec_ = _mm_or_si128(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec &vectorcall operator^=(const simd_vec rhs) { - vec_ = _mm_xor_si128(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec vectorcall operator==(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_cmpeq_epi32(vec_, rhs.vec_); - return ret; - } - - force_inline simd_vec vectorcall operator!=(const simd_vec rhs) const { - simd_vec ret; - ret.vec_ = _mm_andnot_si128(_mm_cmpeq_epi32(vec_, rhs.vec_), _mm_set1_epi32(~0)); - return ret; - } - - force_inline simd_vec &vectorcall operator&=(const simd_vec rhs) { - vec_ = _mm_and_si128(vec_, rhs.vec_); - return *this; - } - - force_inline simd_vec vectorcall operator~() const { - simd_vec ret; - ret.vec_ = _mm_andnot_si128(vec_, _mm_set1_epi32(~0)); - return ret; - } - - force_inline explicit vectorcall operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm_cvtepi32_ps(vec_); - return ret; - } - - force_inline explicit vectorcall operator simd_vec() const { - simd_vec ret; - ret.vec_ = vec_; - return ret; - } - - force_inline unsigned hsum() const { -#if defined(USE_SSE41) - __m128i temp = _mm_hadd_epi32(vec_, vec_); - temp = _mm_hadd_epi32(temp, temp); - return _mm_cvtsi128_si32(temp); -#else - return comp_[0] + comp_[1] + comp_[2] + comp_[3]; -#endif - } - - force_inline void store_to(unsigned *f) const { _mm_storeu_si128((__m128i *)f, vec_); } - force_inline void store_to(unsigned *f, simd_mem_aligned_tag) const { _mm_store_si128((__m128i *)f, vec_); } - - 
force_inline void vectorcall blend_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); -#if defined(USE_SSE41) - vec_ = _mm_blendv_epi8(vec_, v1.vec_, mask.vec_); -#else - __m128i temp1 = _mm_and_si128(mask.vec_, v1.vec_); - __m128i temp2 = _mm_andnot_si128(mask.vec_, vec_); - vec_ = _mm_or_si128(temp1, temp2); -#endif - } - - force_inline void vectorcall blend_inv_to(const simd_vec mask, const simd_vec v1) { - validate_mask(mask); -#if defined(USE_SSE41) - vec_ = _mm_blendv_epi8(v1.vec_, vec_, mask.vec_); -#else - __m128i temp1 = _mm_andnot_si128(mask.vec_, v1.vec_); - __m128i temp2 = _mm_and_si128(mask.vec_, vec_); - vec_ = _mm_or_si128(temp1, temp2); -#endif - } - - force_inline int movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(vec_)); } - - force_inline bool all_zeros() const { -#if defined(USE_SSE41) - return _mm_test_all_zeros(vec_, vec_); -#else - return _mm_movemask_epi8(_mm_cmpeq_epi32(vec_, _mm_setzero_si128())) == 0xFFFF; -#endif - } - - force_inline bool vectorcall all_zeros(const simd_vec mask) const { -#if defined(USE_SSE41) - return _mm_test_all_zeros(vec_, mask.vec_); -#else - return _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_and_si128(vec_, mask.vec_), _mm_setzero_si128())) == 0xFFFF; -#endif - } - - force_inline bool not_all_zeros() const { return !all_zeros(); } - - static simd_vec vectorcall min(const simd_vec v1, const simd_vec v2) { - simd_vec temp; -#if defined(USE_SSE41) - temp.vec_ = _mm_min_epu32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] < v2.comp_[i]) ? v1.comp_[i] : v2.comp_[i]; }) -#endif - return temp; - } - - static simd_vec vectorcall max(const simd_vec v1, const simd_vec v2) { - simd_vec temp; -#if defined(USE_SSE41) - temp.vec_ = _mm_max_epu32(v1.vec_, v2.vec_); -#else - UNROLLED_FOR(i, 4, { temp.comp_[i] = (v1.comp_[i] > v2.comp_[i]) ? 
v1.comp_[i] : v2.comp_[i]; }) -#endif - return temp; - } - - force_inline static simd_vec vectorcall and_not(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_andnot_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator&(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_and_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator|(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_or_si128(v1.vec_, v2.vec_); - return temp; - } - - friend force_inline simd_vec vectorcall operator^(const simd_vec v1, - const simd_vec v2) { - simd_vec temp; - temp.vec_ = _mm_xor_si128(v1.vec_, v2.vec_); - ; - return temp; - } - - friend force_inline simd_vec vectorcall operator+(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_add_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend force_inline simd_vec vectorcall operator-(const simd_vec v1, - const simd_vec v2) { - simd_vec ret; - ret.vec_ = _mm_sub_epi32(v1.vec_, v2.vec_); - return ret; - } - - friend simd_vec vectorcall operator*(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] * v2.comp_[i]; }) - return ret; - } - - friend simd_vec vectorcall operator/(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] / v2.comp_[i]; }) - return ret; - } - - friend simd_vec vectorcall operator>>(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] >> v2.comp_[i]; }) - return ret; - } - - friend force_inline simd_vec vectorcall operator>>(const simd_vec v1, const unsigned v2) { - simd_vec ret; - ret.vec_ = _mm_srli_epi32(v1.vec_, v2); - return ret; - } - - friend simd_vec vectorcall operator<<(const simd_vec v1, const simd_vec v2) { - simd_vec ret; - UNROLLED_FOR(i, 4, { ret.comp_[i] = v1.comp_[i] 
<< v2.comp_[i]; }) - return ret; - } - - friend force_inline simd_vec vectorcall operator<<(const simd_vec v1, const unsigned v2) { - simd_vec ret; - ret.vec_ = _mm_slli_epi32(v1.vec_, v2); - return ret; - } - - friend force_inline bool vectorcall is_equal(const simd_vec v1, const simd_vec v2) { - __m128i vcmp = _mm_cmpeq_epi32(v1.vec_, v2.vec_); - return (_mm_movemask_epi8(vcmp) == 0xffff); - } - - friend force_inline simd_vec vectorcall inclusive_scan(simd_vec v1) { - v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 4)); - v1.vec_ = _mm_add_epi32(v1.vec_, _mm_slli_si128(v1.vec_, 8)); - return v1; - } - - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2); - template - friend force_inline simd_vec vectorcall select(const simd_vec mask, - const simd_vec vec1, - const simd_vec vec2); - -#ifndef NDEBUG - friend void vectorcall __assert_valid_mask(const simd_vec mask) { - UNROLLED_FOR(i, 4, { - const unsigned val = mask.get(); - assert(val == 0 || val == 0xffffffff); - }) - } -#endif - - friend force_inline const unsigned *value_ptr(const simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - friend force_inline unsigned *value_ptr(simd_vec &v1) { - return reinterpret_cast(&v1.vec_); - } - - static int size() { return 4; } - static bool is_native() { return true; } -}; - -force_inline vectorcall simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm_cvttps_epi32(vec_); - return ret; -} - -force_inline vectorcall simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = _mm_cvttps_epi32(vec_); - return ret; -} - -force_inline vectorcall simd_vec::operator simd_vec() const { - simd_vec ret; - ret.vec_ = vec_; - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec 
vec2) { - validate_mask(mask); - simd_vec ret; -#if defined(USE_SSE41) - ret.vec_ = _mm_blendv_ps(vec2.vec_, vec1.vec_, _mm_cast<__m128>(mask.vec_)); -#else - const __m128 temp1 = _mm_and_ps(_mm_cast<__m128>(mask.vec_), vec1.vec_); - const __m128 temp2 = _mm_andnot_ps(_mm_cast<__m128>(mask.vec_), vec2.vec_); - ret.vec_ = _mm_or_ps(temp1, temp2); -#endif - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; -#if defined(USE_SSE41) - ret.vec_ = _mm_blendv_epi8(vec2.vec_, vec1.vec_, _mm_cast<__m128i>(mask.vec_)); -#else - const __m128i temp1 = _mm_and_si128(_mm_cast<__m128i>(mask.vec_), vec1.vec_); - const __m128i temp2 = _mm_andnot_si128(_mm_cast<__m128i>(mask.vec_), vec2.vec_); - ret.vec_ = _mm_or_si128(temp1, temp2); -#endif - return ret; -} - -template -force_inline simd_vec vectorcall select(const simd_vec mask, const simd_vec vec1, - const simd_vec vec2) { - validate_mask(mask); - simd_vec ret; -#if defined(USE_SSE41) - ret.vec_ = _mm_blendv_epi8(vec2.vec_, vec1.vec_, _mm_cast<__m128i>(mask.vec_)); -#else - const __m128i temp1 = _mm_and_si128(_mm_cast<__m128i>(mask.vec_), vec1.vec_); - const __m128i temp2 = _mm_andnot_si128(_mm_cast<__m128i>(mask.vec_), vec2.vec_); - ret.vec_ = _mm_or_si128(temp1, temp2); -#endif - return ret; -} - -} // namespace NS -} // namespace Ray - -#undef validate_mask diff --git a/tests/test_simd.cpp b/tests/test_simd.cpp index 4d8d72c24..7b89afdd8 100644 --- a/tests/test_simd.cpp +++ b/tests/test_simd.cpp @@ -4,7 +4,7 @@ #include "../internal/simd/detect.h" #define NS Ref2 -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void test_simd_ref() { #include "test_simd.ipp" @@ -14,7 +14,7 @@ void test_simd_ref() { #if !defined(__aarch64__) && !defined(_M_ARM) && !defined(_M_ARM64) #define NS Sse2 #define USE_SSE2 -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void 
test_simd_sse2() { #include "test_simd.ipp" @@ -31,7 +31,7 @@ void test_simd_avx512(); #define NS Neon #define USE_NEON -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void test_simd_neon() { #include "test_simd.ipp" diff --git a/tests/test_simd.ipp b/tests/test_simd.ipp index aece51ddb..d514dfdf2 100644 --- a/tests/test_simd.ipp +++ b/tests/test_simd.ipp @@ -1,9 +1,9 @@ using namespace Ray::NS; { - printf("Test simd_fvec4 (%s) | ", simd_fvec4::is_native() ? "hard" : "soft"); + printf("Test fvec4 (%s) | ", fvec4::is_native() ? "hard" : "soft"); - simd_fvec4 v1, v2 = {42.0f}, v3 = {1.0f, 2.0f, 3.0f, 4.0f}; + fvec4 v1, v2 = {42.0f}, v3 = {1.0f, 2.0f, 3.0f, 4.0f}; require(v2[0] == 42.0f); require(v2[1] == 42.0f); @@ -25,7 +25,7 @@ using namespace Ray::NS; require(v3.get<2>() == 3.0f); require(v3.get<3>() == 4.0f); - simd_fvec4 v4(v2), v5 = v3; + fvec4 v4(v2), v5 = v3; require(v4[0] == 42.0f); require(v4[1] == 42.0f); @@ -45,9 +45,9 @@ using namespace Ray::NS; require(v1[3] == 4.0f); float unaligned_array[] = {0.0f, 2.0f, 30.0f, 14.0f}; - alignas(alignof(simd_fvec4)) float aligned_array[] = {0.0f, 2.0f, 30.0f, 14.0f}; + alignas(alignof(fvec4)) float aligned_array[] = {0.0f, 2.0f, 30.0f, 14.0f}; - auto v7 = simd_fvec4{&unaligned_array[0]}, v8 = simd_fvec4{&aligned_array[0], simd_mem_aligned}; + auto v7 = fvec4{&unaligned_array[0]}, v8 = fvec4{&aligned_array[0], vector_aligned}; require(v7[0] == 0.0f); require(v7[1] == 2.0f); @@ -60,7 +60,7 @@ using namespace Ray::NS; require(v8[3] == 14.0f); v5.store_to(&unaligned_array[0]); - v1.store_to(&aligned_array[0], simd_mem_aligned); + v1.store_to(&aligned_array[0], vector_aligned); require(unaligned_array[0] == 1.0f); require(unaligned_array[1] == 2.0f); @@ -78,11 +78,11 @@ using namespace Ray::NS; v3 = v1 + v2; v4 = v1 - v2; v5 = v1 * v2; - simd_fvec4 v6 = v1 / v2; - simd_fvec4 v66 = -v1; - simd_fvec4 v666 = normalize(v1); + fvec4 v6 = v1 / v2; + fvec4 v66 = -v1; + fvec4 v666 = normalize(v1); float 
v1_len; - simd_fvec4 v6666 = normalize_len(v1, v1_len); + fvec4 v6666 = normalize_len(v1, v1_len); require(v3[0] == Approx(5)); require(v3[1] == Approx(7)); @@ -127,14 +127,14 @@ using namespace Ray::NS; require(v5[2] == Approx(4.2426)); require(v5[3] == Approx(5.2915)); - simd_fvec4 v55 = fract(v5); + fvec4 v55 = fract(v5); require(v55[0] == Approx(0)); require(v55[1] == Approx(0.1623)); require(v55[2] == Approx(0.2426)); require(v55[3] == Approx(0.2915)); - simd_fvec4 v9 = {3.0f, 6.0f, 7.0f, 6.0f}; + fvec4 v9 = {3.0f, 6.0f, 7.0f, 6.0f}; require(hsum(v9) == Approx(22.0f)); @@ -154,9 +154,9 @@ using namespace Ray::NS; static const float gather_source[] = {0, 42.0f, 0, 0, 12.0f, 0, 0, 0, 11.0f, 0, 0, 0, 0, 0, 0, 23.0f, 0, 0}; - const simd_ivec4 v12i = {-1, 2, 6, 13}; - const simd_fvec4 v12 = gather(gather_source + 2, v12i); - const simd_fvec4 v12_masked = gather(simd_fvec4{69}, gather_source + 2, simd_ivec4{-1, 0, -1, 0}, v12i); + const ivec4 v12i = {-1, 2, 6, 13}; + const fvec4 v12 = gather(gather_source + 2, v12i); + const fvec4 v12_masked = gather(fvec4{69}, gather_source + 2, ivec4{-1, 0, -1, 0}, v12i); require(v12[0] == Approx(42)); require(v12[1] == Approx(12)); @@ -173,7 +173,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec4 scatter_mask = {-1, 0, 0, -1}; + const ivec4 scatter_mask = {-1, 0, 0, -1}; float masked_scatter_destination[] = {1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0}; static const float masked_scatter_expected[] = {1, 42.0f, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, 23.0f, 4, 0}; @@ -182,8 +182,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_fvec4 v14 = {42.0f, 0, 24.0f, 0}; - simd_fvec4 v15 = {0, 12.0f, 0, 0}; + const fvec4 v14 = {42.0f, 0, 24.0f, 0}; + fvec4 v15 = {0, 12.0f, 0, 0}; v15 |= v14; @@ -192,20 +192,20 @@ using namespace Ray::NS; 
require(v15[2] == 24.0f); require(v15[3] == 0); - const simd_fvec4 v16 = {3, 1, 4, 1}; - const simd_fvec4 v17 = inclusive_scan(v16); + const fvec4 v16 = {3, 1, 4, 1}; + const fvec4 v17 = inclusive_scan(v16); require(v17[0] == 3.0f); require(v17[1] == 4.0f); require(v17[2] == 8.0f); require(v17[3] == 9.0f); - const simd_ivec4 vmask = {-1, 0, 0, -1}; + const ivec4 vmask = {-1, 0, 0, -1}; - simd_fvec4 v18 = v3; + fvec4 v18 = v3; where(vmask, v18) = v2; - const simd_fvec4 v19 = select(vmask, v2, v3); + const fvec4 v19 = select(vmask, v2, v3); require(v18.get<0>() == 4.0f); require(v18.get<1>() == 7.0f); @@ -221,9 +221,9 @@ using namespace Ray::NS; } { - printf("Test simd_ivec4 (%s) | ", simd_ivec4::is_native() ? "hard" : "soft"); + printf("Test ivec4 (%s) | ", ivec4::is_native() ? "hard" : "soft"); - simd_ivec4 v1, v2 = {42}, v3 = {1, 2, 3, 4}; + ivec4 v1, v2 = {42}, v3 = {1, 2, 3, 4}; require(v2[0] == 42); require(v2[1] == 42); @@ -245,7 +245,7 @@ using namespace Ray::NS; require(v3.get<2>() == 3); require(v3.get<3>() == 4); - simd_ivec4 v4(v2), v5 = v3; + ivec4 v4(v2), v5 = v3; require(v4[0] == 42); require(v4[1] == 42); @@ -265,9 +265,9 @@ using namespace Ray::NS; require(v1[3] == 4); int unaligned_array[] = {0, 2, 30, 14}; - alignas(alignof(simd_ivec4)) int aligned_array[] = {0, 2, 30, 14}; + alignas(alignof(ivec4)) int aligned_array[] = {0, 2, 30, 14}; - auto v7 = simd_ivec4{&unaligned_array[0]}, v8 = simd_ivec4{&aligned_array[0], simd_mem_aligned}; + auto v7 = ivec4{&unaligned_array[0]}, v8 = ivec4{&aligned_array[0], vector_aligned}; require(v7[0] == 0); require(v7[1] == 2); @@ -280,7 +280,7 @@ using namespace Ray::NS; require(v8[3] == 14); v5.store_to(&unaligned_array[0]); - v1.store_to(&aligned_array[0], simd_mem_aligned); + v1.store_to(&aligned_array[0], vector_aligned); require(unaligned_array[0] == 1); require(unaligned_array[1] == 2); @@ -298,8 +298,8 @@ using namespace Ray::NS; v3 = v1 + v2; v4 = v1 - v2; v5 = v1 * v2; - simd_ivec4 v6 = v1 / v2; - 
simd_ivec4 v66 = -v1; + ivec4 v6 = v1 / v2; + ivec4 v66 = -v1; require(v3[0] == 5); require(v3[1] == 7); @@ -331,9 +331,9 @@ using namespace Ray::NS; static const int gather_source[] = {0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0}; - const simd_ivec4 v9i = {-1, 2, 6, 13}; - const simd_ivec4 v9 = gather(gather_source + 2, v9i); - const simd_ivec4 v9_masked = gather(simd_ivec4{69}, gather_source + 2, simd_ivec4{-1, 0, -1, 0}, v9i); + const ivec4 v9i = {-1, 2, 6, 13}; + const ivec4 v9 = gather(gather_source + 2, v9i); + const ivec4 v9_masked = gather(ivec4{69}, gather_source + 2, ivec4{-1, 0, -1, 0}, v9i); require(v9[0] == 42); require(v9[1] == 12); @@ -345,7 +345,7 @@ using namespace Ray::NS; require(v9_masked[2] == 11); require(v9_masked[3] == 69); - simd_ivec4 v9_ = {3, 6, 7, 6}; + ivec4 v9_ = {3, 6, 7, 6}; require(hsum(v9_) == 22); int scatter_destination[18] = {}; @@ -353,7 +353,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec4 scatter_mask = {-1, 0, 0, -1}; + const ivec4 scatter_mask = {-1, 0, 0, -1}; int masked_scatter_destination[] = {1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0}; static const int masked_scatter_expected[] = {1, 42, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, 23, 4, 0}; @@ -362,8 +362,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_ivec4 v11 = {-1, 0, -1, 0}; - simd_ivec4 v12 = {0, -1, 0, 0}; + const ivec4 v11 = {-1, 0, -1, 0}; + ivec4 v12 = {0, -1, 0, 0}; v12 |= v11; @@ -372,8 +372,8 @@ using namespace Ray::NS; require(v12[2] == -1); require(v12[3] == 0); - const simd_ivec4 v13 = {-1, 0, -1, 0}; - simd_ivec4 v14 = {0, -1, 0, 0}; + const ivec4 v13 = {-1, 0, -1, 0}; + ivec4 v14 = {0, -1, 0, 0}; v14 &= v13; @@ -382,24 +382,24 @@ using namespace Ray::NS; require(v14[2] == 0); require(v14[3] == 0); - const simd_ivec4 v15 = {-2147483647, 1, 
-42, 42}; - const simd_ivec4 v16 = srai(v15, 31); - require((v16 != simd_ivec4{-1, 0, -1, 0}).all_zeros()); + const ivec4 v15 = {-2147483647, 1, -42, 42}; + const ivec4 v16 = srai(v15, 31); + require((v16 != ivec4{-1, 0, -1, 0}).all_zeros()); - const simd_ivec4 v17 = {3, 1, 4, 1}; - const simd_ivec4 v18 = inclusive_scan(v17); + const ivec4 v17 = {3, 1, 4, 1}; + const ivec4 v18 = inclusive_scan(v17); require(v18[0] == 3); require(v18[1] == 4); require(v18[2] == 8); require(v18[3] == 9); - const simd_uvec4 vmask = {0xffffffff, 0, 0, 0xffffffff}; + const uvec4 vmask = {0xffffffff, 0, 0, 0xffffffff}; - simd_ivec4 v19 = v3; + ivec4 v19 = v3; where(vmask, v19) = v2; - const simd_ivec4 v20 = select(vmask, v2, v3); + const ivec4 v20 = select(vmask, v2, v3); require(v19.get<0>() == 4); require(v19.get<1>() == 7); @@ -415,9 +415,9 @@ using namespace Ray::NS; } { - printf("Test simd_uvec4 (%s) | ", simd_uvec4::is_native() ? "hard" : "soft"); + printf("Test uvec4 (%s) | ", uvec4::is_native() ? "hard" : "soft"); - simd_uvec4 v1, v2 = {42}, v3 = {1, 2, 3, 4}; + uvec4 v1, v2 = {42}, v3 = {1, 2, 3, 4}; require(v2[0] == 42); require(v2[1] == 42); @@ -439,7 +439,7 @@ using namespace Ray::NS; require(v3.get<2>() == 3); require(v3.get<3>() == 4); - simd_uvec4 v4(v2), v5 = v3; + uvec4 v4(v2), v5 = v3; require(v4[0] == 42); require(v4[1] == 42); @@ -459,9 +459,9 @@ using namespace Ray::NS; require(v1[3] == 4); unsigned unaligned_array[] = {0, 2, 30, 14}; - alignas(alignof(simd_uvec4)) unsigned aligned_array[] = {0, 2, 30, 14}; + alignas(alignof(uvec4)) unsigned aligned_array[] = {0, 2, 30, 14}; - auto v7 = simd_uvec4{&unaligned_array[0]}, v8 = simd_uvec4{&aligned_array[0], simd_mem_aligned}; + auto v7 = uvec4{&unaligned_array[0]}, v8 = uvec4{&aligned_array[0], vector_aligned}; require(v7[0] == 0); require(v7[1] == 2); @@ -474,7 +474,7 @@ using namespace Ray::NS; require(v8[3] == 14); v5.store_to(&unaligned_array[0]); - v1.store_to(&aligned_array[0], simd_mem_aligned); + 
v1.store_to(&aligned_array[0], vector_aligned); require(unaligned_array[0] == 1); require(unaligned_array[1] == 2); @@ -519,9 +519,9 @@ using namespace Ray::NS; static const unsigned gather_source[] = {0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0}; - const simd_ivec4 v9i = {-1, 2, 6, 13}; - const simd_uvec4 v9 = gather(gather_source + 2, v9i); - const simd_uvec4 v9_masked = gather(simd_uvec4{69}, gather_source + 2, simd_ivec4{-1, 0, -1, 0}, v9i); + const ivec4 v9i = {-1, 2, 6, 13}; + const uvec4 v9 = gather(gather_source + 2, v9i); + const uvec4 v9_masked = gather(uvec4{69}, gather_source + 2, ivec4{-1, 0, -1, 0}, v9i); require(v9[0] == 42); require(v9[1] == 12); @@ -533,7 +533,7 @@ using namespace Ray::NS; require(v9_masked[2] == 11); require(v9_masked[3] == 69); - simd_uvec4 v9_ = {3, 6, 7, 6}; + uvec4 v9_ = {3, 6, 7, 6}; require(hsum(v9_) == 22); unsigned scatter_destination[18] = {}; @@ -541,7 +541,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec4 scatter_mask = {-1, 0, 0, -1}; + const ivec4 scatter_mask = {-1, 0, 0, -1}; unsigned masked_scatter_destination[] = {1, 0xffffffff, 2, 3, 0xffffffff, 4, 5, 6, 0xffffffff, 7, 8, 9, 1, 2, 3, 0xffffffff, 4, 0}; @@ -552,8 +552,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_uvec4 v11 = {0xffffffff, 0, 0xffffffff, 0}; - simd_uvec4 v12 = {0, 0xffffffff, 0, 0}; + const uvec4 v11 = {0xffffffff, 0, 0xffffffff, 0}; + uvec4 v12 = {0, 0xffffffff, 0, 0}; v12 |= v11; @@ -562,8 +562,8 @@ using namespace Ray::NS; require(v12[2] == 0xffffffff); require(v12[3] == 0); - const simd_uvec4 v13 = {0xffffffff, 0, 0xffffffff, 0}; - simd_uvec4 v14 = {0, 0xffffffff, 0, 0}; + const uvec4 v13 = {0xffffffff, 0, 0xffffffff, 0}; + uvec4 v14 = {0, 0xffffffff, 0, 0}; v14 &= v13; @@ -572,20 +572,20 @@ using namespace Ray::NS; require(v14[2] == 0); 
require(v14[3] == 0); - const simd_uvec4 v17 = {3, 1, 4, 1}; - const simd_uvec4 v18 = inclusive_scan(v17); + const uvec4 v17 = {3, 1, 4, 1}; + const uvec4 v18 = inclusive_scan(v17); require(v18[0] == 3); require(v18[1] == 4); require(v18[2] == 8); require(v18[3] == 9); - const simd_ivec4 vmask = {-1, 0, 0, -1}; + const ivec4 vmask = {-1, 0, 0, -1}; - simd_uvec4 v19 = v3; + uvec4 v19 = v3; where(vmask, v19) = v2; - const simd_uvec4 v20 = select(vmask, v2, v3); + const uvec4 v20 = select(vmask, v2, v3); require(v19.get<0>() == 4); require(v19.get<1>() == 7); @@ -601,9 +601,9 @@ using namespace Ray::NS; } { - printf("Test simd_fvec8 (%s) | ", simd_fvec8::is_native() ? "hard" : "soft"); + printf("Test fvec8 (%s) | ", fvec8::is_native() ? "hard" : "soft"); - simd_fvec8 v1, v2 = {42.0f}, v3 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + fvec8 v1, v2 = {42.0f}, v3 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; require(v2[0] == 42.0f); require(v2[1] == 42.0f); @@ -641,7 +641,7 @@ using namespace Ray::NS; require(v3.get<6>() == 7.0f); require(v3.get<7>() == 8.0f); - simd_fvec8 v4(v2), v5 = v3; + fvec8 v4(v2), v5 = v3; require(v4[0] == 42.0f); require(v4[1] == 42.0f); @@ -678,11 +678,11 @@ using namespace Ray::NS; v3 = v1 + v2; v4 = v1 - v2; v5 = v1 * v2; - simd_fvec8 v6 = v1 / v2; - simd_fvec8 v66 = -v1; - simd_fvec8 v666 = normalize(v1); + fvec8 v6 = v1 / v2; + fvec8 v66 = -v1; + fvec8 v666 = normalize(v1); float v1_len; - simd_fvec8 v6666 = normalize_len(v1, v1_len); + fvec8 v6666 = normalize_len(v1, v1_len); require(v3[0] == Approx(5)); require(v3[1] == Approx(7)); @@ -759,7 +759,7 @@ using namespace Ray::NS; require(v5[6] == Approx(6)); require(v5[7] == Approx(1.4142)); - simd_fvec8 v55 = fract(v5); + fvec8 v55 = fract(v5); require(v55[0] == Approx(0)); require(v55[1] == Approx(0.1623)); @@ -770,7 +770,7 @@ using namespace Ray::NS; require(v55[6] == Approx(0)); require(v55[7] == Approx(0.4142)); - simd_fvec8 v9 = {3.0f, 6.0f, 7.0f, 6.0f, 2.0f, 12.0f, 18.0f, 
0.0f}; + fvec8 v9 = {3.0f, 6.0f, 7.0f, 6.0f, 2.0f, 12.0f, 18.0f, 0.0f}; require(hsum(v9) == Approx(54.0f)); auto v10 = simd_cast(v2 < v9); @@ -798,10 +798,10 @@ using namespace Ray::NS; static const float gather_source[] = {0, 42.0f, 0, 0, 12.0f, 0, 0, 0, 11.0f, 0, 0, 0, 0, 0, 0, 23.0f, 0, 0, 0, 42.0f, 0, 0, 12.0f, 0, 0, 0, 11.0f, 0, 0, 0, 0, 0, 0, 23.0f, 0, 0}; - const simd_ivec8 v12i = {-1, 2, 6, 13, 17, 20, 24, 31}; - const simd_fvec8 v12 = gather(gather_source + 2, v12i); - const simd_fvec8 v12_masked = - gather(simd_fvec8{69}, gather_source + 2, simd_ivec8{-1, 0, -1, 0, -1, 0, -1, 0}, v12i); + const ivec8 v12i = {-1, 2, 6, 13, 17, 20, 24, 31}; + const fvec8 v12 = gather(gather_source + 2, v12i); + const fvec8 v12_masked = + gather(fvec8{69}, gather_source + 2, ivec8{-1, 0, -1, 0, -1, 0, -1, 0}, v12i); require(v12[0] == Approx(42)); require(v12[1] == Approx(12)); @@ -826,7 +826,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec8 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1}; + const ivec8 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1}; float masked_scatter_destination[] = {1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0, 1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0}; @@ -837,8 +837,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_fvec8 v14 = {42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0}; - simd_fvec8 v15 = {0, 12.0f, 0, 0, 0, 12.0f, 0, 0}; + const fvec8 v14 = {42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0}; + fvec8 v15 = {0, 12.0f, 0, 0, 0, 12.0f, 0, 0}; v15 |= v14; @@ -851,8 +851,8 @@ using namespace Ray::NS; require(v15[6] == 24.0f); require(v15[7] == 0); - const simd_fvec8 v16 = {3, 1, 4, 1, 3, 1, 4, 1}; - const simd_fvec8 v17 = inclusive_scan(v16); + const fvec8 v16 = {3, 1, 4, 1, 3, 1, 4, 1}; + const fvec8 v17 = inclusive_scan(v16); require(v17[0] == 3.0f); require(v17[1] 
== 4.0f); @@ -863,12 +863,12 @@ using namespace Ray::NS; require(v17[6] == 17.0f); require(v17[7] == 18.0f); - const simd_ivec8 vmask = {-1, 0, 0, -1, -1, 0, 0, -1}; + const ivec8 vmask = {-1, 0, 0, -1, -1, 0, 0, -1}; - simd_fvec8 v18 = v3; + fvec8 v18 = v3; where(vmask, v18) = v2; - const simd_fvec8 v19 = select(vmask, v2, v3); + const fvec8 v19 = select(vmask, v2, v3); require(v18.get<0>() == 4.0f); require(v18.get<1>() == 7.0f); @@ -892,9 +892,9 @@ using namespace Ray::NS; } { - printf("Test simd_ivec8 (%s) | ", simd_ivec8::is_native() ? "hard" : "soft"); + printf("Test ivec8 (%s) | ", ivec8::is_native() ? "hard" : "soft"); - simd_ivec8 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8}; + ivec8 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8}; require(v2[0] == 42); require(v2[1] == 42); @@ -932,7 +932,7 @@ using namespace Ray::NS; require(v3.get<6>() == 7); require(v3.get<7>() == 8); - simd_ivec8 v4(v2), v5 = v3; + ivec8 v4(v2), v5 = v3; require(v4[0] == 42); require(v4[1] == 42); @@ -969,8 +969,8 @@ using namespace Ray::NS; v3 = v1 + v2; v4 = v1 - v2; v5 = v1 * v2; - simd_ivec8 v6 = v1 / v2; - simd_ivec8 v66 = -v1; + ivec8 v6 = v1 / v2; + ivec8 v66 = -v1; require(v3[0] == 5); require(v3[1] == 7); @@ -1020,9 +1020,9 @@ using namespace Ray::NS; static const int gather_source[] = {0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0}; - const simd_ivec8 v9i = {-1, 2, 6, 13, 17, 20, 24, 31}; - const simd_ivec8 v9 = gather(gather_source + 2, v9i); - const simd_ivec8 v9_masked = gather(simd_ivec8{69}, gather_source + 2, simd_ivec8{-1, 0, -1, 0, -1, 0, -1, 0}, v9i); + const ivec8 v9i = {-1, 2, 6, 13, 17, 20, 24, 31}; + const ivec8 v9 = gather(gather_source + 2, v9i); + const ivec8 v9_masked = gather(ivec8{69}, gather_source + 2, ivec8{-1, 0, -1, 0, -1, 0, -1, 0}, v9i); require(v9[0] == 42); require(v9[1] == 12); @@ -1042,7 +1042,7 @@ using namespace Ray::NS; require(v9_masked[6] == 11); require(v9_masked[7] == 69); 
- simd_ivec8 v9_ = {3, 6, 7, 6, 2, 12, 18, 0}; + ivec8 v9_ = {3, 6, 7, 6, 2, 12, 18, 0}; require(hsum(v9_) == 54); int scatter_destination[36] = {}; @@ -1050,7 +1050,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec8 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1}; + const ivec8 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1}; int masked_scatter_destination[] = {1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0, 1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0}; @@ -1061,8 +1061,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_ivec8 v11 = {-1, 0, -1, 0, -1, 0, -1, 0}; - simd_ivec8 v12 = {0, -1, 0, 0, 0, -1, 0, 0}; + const ivec8 v11 = {-1, 0, -1, 0, -1, 0, -1, 0}; + ivec8 v12 = {0, -1, 0, 0, 0, -1, 0, 0}; v12 |= v11; @@ -1075,8 +1075,8 @@ using namespace Ray::NS; require(v12[6] == -1); require(v12[7] == 0); - const simd_ivec8 v13 = {-1, 0, -1, 0, -1, 0, -1, 0}; - simd_ivec8 v14 = {0, -1, 0, 0, 0, -1, 0, 0}; + const ivec8 v13 = {-1, 0, -1, 0, -1, 0, -1, 0}; + ivec8 v14 = {0, -1, 0, 0, 0, -1, 0, 0}; v14 &= v13; @@ -1089,12 +1089,12 @@ using namespace Ray::NS; require(v14[6] == 0); require(v14[7] == 0); - const simd_ivec8 v15 = {-2147483647, 1, -42, 42, -2147483647, 1, -42, 42}; - const simd_ivec8 v16 = srai(v15, 31); - require((v16 != simd_ivec8{-1, 0, -1, 0, -1, 0, -1, 0}).all_zeros()); + const ivec8 v15 = {-2147483647, 1, -42, 42, -2147483647, 1, -42, 42}; + const ivec8 v16 = srai(v15, 31); + require((v16 != ivec8{-1, 0, -1, 0, -1, 0, -1, 0}).all_zeros()); - const simd_ivec8 v17 = {3, 1, 4, 1, 3, 1, 4, 1}; - const simd_ivec8 v18 = inclusive_scan(v17); + const ivec8 v17 = {3, 1, 4, 1, 3, 1, 4, 1}; + const ivec8 v18 = inclusive_scan(v17); require(v18[0] == 3); require(v18[1] == 4); @@ -1105,12 +1105,12 @@ using namespace Ray::NS; require(v18[6] == 17); require(v18[7] == 18); - 
const simd_uvec8 vmask = {0xffffffff, 0, 0, 0xffffffff, 0xffffffff, 0, 0, 0xffffffff}; + const uvec8 vmask = {0xffffffff, 0, 0, 0xffffffff, 0xffffffff, 0, 0, 0xffffffff}; - simd_ivec8 v19 = v3; + ivec8 v19 = v3; where(vmask, v19) = v2; - const simd_ivec8 v20 = select(vmask, v2, v3); + const ivec8 v20 = select(vmask, v2, v3); require(v19.get<0>() == 4); require(v19.get<1>() == 7); @@ -1134,9 +1134,9 @@ using namespace Ray::NS; } { - printf("Test simd_uvec8 (%s) | ", simd_uvec8::is_native() ? "hard" : "soft"); + printf("Test uvec8 (%s) | ", uvec8::is_native() ? "hard" : "soft"); - simd_uvec8 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8}; + uvec8 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8}; require(v2[0] == 42); require(v2[1] == 42); @@ -1174,7 +1174,7 @@ using namespace Ray::NS; require(v3.get<6>() == 7); require(v3.get<7>() == 8); - simd_uvec8 v4(v2), v5 = v3; + uvec8 v4(v2), v5 = v3; require(v4[0] == 42); require(v4[1] == 42); @@ -1252,9 +1252,9 @@ using namespace Ray::NS; static const unsigned gather_source[] = {0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0}; - const simd_ivec8 v9i = {-1, 2, 6, 13, 17, 20, 24, 31}; - const simd_uvec8 v9 = gather(gather_source + 2, v9i); - const simd_uvec8 v9_masked = gather(simd_uvec8{69}, gather_source + 2, simd_ivec8{-1, 0, -1, 0, -1, 0, -1, 0}, v9i); + const ivec8 v9i = {-1, 2, 6, 13, 17, 20, 24, 31}; + const uvec8 v9 = gather(gather_source + 2, v9i); + const uvec8 v9_masked = gather(uvec8{69}, gather_source + 2, ivec8{-1, 0, -1, 0, -1, 0, -1, 0}, v9i); require(v9[0] == 42); require(v9[1] == 12); @@ -1274,7 +1274,7 @@ using namespace Ray::NS; require(v9_masked[6] == 11); require(v9_masked[7] == 69); - simd_uvec8 v9_ = {3, 6, 7, 6, 2, 12, 18, 0}; + uvec8 v9_ = {3, 6, 7, 6, 2, 12, 18, 0}; require(hsum(v9_) == 54); unsigned scatter_destination[36] = {}; @@ -1282,7 +1282,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, 
sizeof(gather_source)) == 0); - const simd_ivec8 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1}; + const ivec8 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1}; unsigned masked_scatter_destination[] = { 1, 0xffffffff, 2, 3, 0xffffffff, 4, 5, 6, 0xffffffff, 7, 8, 9, 1, 2, 3, 0xffffffff, 4, 0, @@ -1294,8 +1294,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_uvec8 v11 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0}; - simd_uvec8 v12 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; + const uvec8 v11 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0}; + uvec8 v12 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; v12 |= v11; @@ -1308,8 +1308,8 @@ using namespace Ray::NS; require(v12[6] == -1); require(v12[7] == 0); - const simd_uvec8 v13 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0}; - simd_uvec8 v14 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; + const uvec8 v13 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0}; + uvec8 v14 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; v14 &= v13; @@ -1322,8 +1322,8 @@ using namespace Ray::NS; require(v14[6] == 0); require(v14[7] == 0); - const simd_uvec8 v17 = {3, 1, 4, 1, 3, 1, 4, 1}; - const simd_uvec8 v18 = inclusive_scan(v17); + const uvec8 v17 = {3, 1, 4, 1, 3, 1, 4, 1}; + const uvec8 v18 = inclusive_scan(v17); require(v18[0] == 3); require(v18[1] == 4); @@ -1334,12 +1334,12 @@ using namespace Ray::NS; require(v18[6] == 17); require(v18[7] == 18); - const simd_ivec8 vmask = {-1, 0, 0, -1, -1, 0, 0, -1}; + const ivec8 vmask = {-1, 0, 0, -1, -1, 0, 0, -1}; - simd_uvec8 v19 = v3; + uvec8 v19 = v3; where(vmask, v19) = v2; - const simd_uvec8 v20 = select(vmask, v2, v3); + const uvec8 v20 = select(vmask, v2, v3); require(v19.get<0>() == 4); require(v19.get<1>() == 7); @@ -1365,9 +1365,9 @@ using namespace Ray::NS; ////////////////////////////////////////////////// { - printf("Test 
simd_fvec16 (%s) | ", simd_fvec16::is_native() ? "hard" : "soft"); + printf("Test fvec16 (%s) | ", fvec16::is_native() ? "hard" : "soft"); - simd_fvec16 v1, v2 = {42.0f}, v3 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, + fvec16 v1, v2 = {42.0f}, v3 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; require(v2[0] == 42.0f); @@ -1438,7 +1438,7 @@ using namespace Ray::NS; require(v3.get<14>() == 15.0f); require(v3.get<15>() == 16.0f); - simd_fvec16 v4(v2), v5 = v3; + fvec16 v4(v2), v5 = v3; require(v4[0] == 42.0f); require(v4[1] == 42.0f); @@ -1499,11 +1499,11 @@ using namespace Ray::NS; v3 = v1 + v2; v4 = v1 - v2; v5 = v1 * v2; - simd_fvec16 v6 = v1 / v2; - simd_fvec16 v66 = -v1; - simd_fvec16 v666 = normalize(v1); + fvec16 v6 = v1 / v2; + fvec16 v66 = -v1; + fvec16 v666 = normalize(v1); float v1_len; - simd_fvec16 v6666 = normalize_len(v1, v1_len); + fvec16 v6666 = normalize_len(v1, v1_len); require(v3[0] == Approx(5)); require(v3[1] == Approx(7)); @@ -1644,7 +1644,7 @@ using namespace Ray::NS; require(v5[14] == Approx(6)); require(v5[15] == Approx(1.4142)); - simd_fvec16 v55 = fract(v5); + fvec16 v55 = fract(v5); require(v55[0] == Approx(0)); require(v55[1] == Approx(0.1623)); @@ -1663,7 +1663,7 @@ using namespace Ray::NS; require(v55[14] == Approx(0)); require(v55[15] == Approx(0.4142)); - simd_fvec16 v9 = {3.0f, 6.0f, 7.0f, 6.0f, 2.0f, 12.0f, 18.0f, 0.0f, + fvec16 v9 = {3.0f, 6.0f, 7.0f, 6.0f, 2.0f, 12.0f, 18.0f, 0.0f, 3.0f, 6.0f, 7.0f, 6.0f, 2.0f, 12.0f, 18.0f, 0.0f}; require(hsum(v9) == Approx(108.0f)); @@ -1694,10 +1694,10 @@ using namespace Ray::NS; 0, 42.0f, 0, 0, 12.0f, 0, 0, 0, 11.0f, 0, 0, 0, 0, 0, 0, 23.0f, 0, 0, 0, 42.0f, 0, 0, 12.0f, 0, 0, 0, 11.0f, 0, 0, 0, 0, 0, 0, 23.0f, 0, 0}; - const simd_ivec16 v12i = {-1, 2, 6, 13, 17, 20, 24, 31, 35, 38, 42, 49, 53, 56, 60, 67}; - const simd_fvec16 v12 = gather(gather_source + 2, v12i); - const simd_fvec16 v12_masked = gather(simd_fvec16{69}, 
gather_source + 2, - simd_ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}, v12i); + const ivec16 v12i = {-1, 2, 6, 13, 17, 20, 24, 31, 35, 38, 42, 49, 53, 56, 60, 67}; + const fvec16 v12 = gather(gather_source + 2, v12i); + const fvec16 v12_masked = gather(fvec16{69}, gather_source + 2, + ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}, v12i); require(v12[0] == Approx(42)); require(v12[1] == Approx(12)); @@ -1738,7 +1738,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec16 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; + const ivec16 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; float masked_scatter_destination[] = {1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0, 1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0, @@ -1753,8 +1753,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_fvec16 v14 = {42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0}; - simd_fvec16 v15 = {0, 12.0f, 0, 0, 0, 12.0f, 0, 0, 0, 12.0f, 0, 0, 0, 12.0f, 0, 0}; + const fvec16 v14 = {42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0, 42.0f, 0, 24.0f, 0}; + fvec16 v15 = {0, 12.0f, 0, 0, 0, 12.0f, 0, 0, 0, 12.0f, 0, 0, 0, 12.0f, 0, 0}; v15 |= v14; @@ -1775,8 +1775,8 @@ using namespace Ray::NS; require(v15[14] == 24.0f); require(v15[15] == 0); - const simd_fvec16 v16 = {3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1}; - const simd_fvec16 v17 = inclusive_scan(v16); + const fvec16 v16 = {3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1}; + const fvec16 v17 = inclusive_scan(v16); require(v17[0] == 3.0f); require(v17[1] == 4.0f); @@ -1795,12 +1795,12 @@ using namespace Ray::NS; require(v17[14] == 35.0f); require(v17[15] == 36.0f); - const simd_ivec16 vmask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 
0, -1, -1, 0, 0, -1}; + const ivec16 vmask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; - simd_fvec16 v18 = v3; + fvec16 v18 = v3; where(vmask, v18) = v2; - const simd_fvec16 v19 = select(vmask, v2, v3); + const fvec16 v19 = select(vmask, v2, v3); require(v18.get<0>() == 4.0f); require(v18.get<1>() == 7.0f); @@ -1840,9 +1840,9 @@ using namespace Ray::NS; } { - printf("Test simd_ivec16 (%s) | ", simd_ivec16::is_native() ? "hard" : "soft"); + printf("Test ivec16 (%s) | ", ivec16::is_native() ? "hard" : "soft"); - simd_ivec16 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + ivec16 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; require(v2[0] == 42); require(v2[1] == 42); @@ -1912,7 +1912,7 @@ using namespace Ray::NS; require(v3.get<14>() == 15); require(v3.get<15>() == 16); - simd_ivec16 v4(v2), v5 = v3; + ivec16 v4(v2), v5 = v3; require(v4[0] == 42); require(v4[1] == 42); @@ -1973,8 +1973,8 @@ using namespace Ray::NS; v3 = v1 + v2; v4 = v1 - v2; v5 = v1 * v2; - simd_ivec16 v6 = v1 / v2; - simd_ivec16 v66 = -v1; + ivec16 v6 = v1 / v2; + ivec16 v66 = -v1; require(v3[0] == 5); require(v3[1] == 7); @@ -2066,10 +2066,10 @@ using namespace Ray::NS; 0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0}; - const simd_ivec16 v9i = {-1, 2, 6, 13, 17, 20, 24, 31, 35, 38, 42, 49, 53, 56, 60, 67}; - const simd_ivec16 v9 = gather(gather_source + 2, v9i); - const simd_ivec16 v9_masked = gather(simd_ivec16{69}, gather_source + 2, - simd_ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}, v9i); + const ivec16 v9i = {-1, 2, 6, 13, 17, 20, 24, 31, 35, 38, 42, 49, 53, 56, 60, 67}; + const ivec16 v9 = gather(gather_source + 2, v9i); + const ivec16 v9_masked = gather(ivec16{69}, gather_source + 2, + ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}, v9i); require(v9[0] == 42); require(v9[1] == 12); @@ -2105,7 +2105,7 @@ using 
namespace Ray::NS; require(v9_masked[14] == 11); require(v9_masked[15] == 69); - simd_fvec16 v9_ = {3, 6, 7, 6, 2, 12, 18, 0, 3, 6, 7, 6, 2, 12, 18, 0}; + fvec16 v9_ = {3, 6, 7, 6, 2, 12, 18, 0, 3, 6, 7, 6, 2, 12, 18, 0}; require(hsum(v9_) == 108); int scatter_destination[72] = {}; @@ -2113,7 +2113,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec16 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; + const ivec16 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; int masked_scatter_destination[] = {1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0, 1, -1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 1, 2, 3, -1, 4, 0, @@ -2128,8 +2128,8 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_ivec16 v11 = {-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}; - simd_ivec16 v12 = {0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0}; + const ivec16 v11 = {-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}; + ivec16 v12 = {0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0}; v12 |= v11; @@ -2150,8 +2150,8 @@ using namespace Ray::NS; require(v12[14] == -1); require(v12[15] == 0); - const simd_ivec16 v13 = {-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}; - simd_ivec16 v14 = {0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0}; + const ivec16 v13 = {-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}; + ivec16 v14 = {0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0}; v14 &= v13; @@ -2172,13 +2172,13 @@ using namespace Ray::NS; require(v14[14] == 0); require(v14[15] == 0); - const simd_ivec16 v15 = {-2147483647, 1, -42, 42, -2147483647, 1, -42, 42, + const ivec16 v15 = {-2147483647, 1, -42, 42, -2147483647, 1, -42, 42, -2147483647, 1, -42, 42, -2147483647, 1, -42, 42}; - const simd_ivec16 v16 = srai(v15, 31); - require((v16 != 
simd_ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}).all_zeros()); + const ivec16 v16 = srai(v15, 31); + require((v16 != ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}).all_zeros()); - const simd_ivec16 v17 = {3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1}; - const simd_ivec16 v18 = inclusive_scan(v17); + const ivec16 v17 = {3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1}; + const ivec16 v18 = inclusive_scan(v17); require(v18[0] == 3); require(v18[1] == 4); @@ -2197,13 +2197,13 @@ using namespace Ray::NS; require(v18[14] == 35); require(v18[15] == 36); - const simd_uvec16 vmask = {0xffffffff, 0, 0, 0xffffffff, 0xffffffff, 0, 0, 0xffffffff, + const uvec16 vmask = {0xffffffff, 0, 0, 0xffffffff, 0xffffffff, 0, 0, 0xffffffff, 0xffffffff, 0, 0, 0xffffffff, 0xffffffff, 0, 0, 0xffffffff}; - simd_ivec16 v19 = v3; + ivec16 v19 = v3; where(vmask, v19) = v2; - const simd_ivec16 v20 = select(vmask, v2, v3); + const ivec16 v20 = select(vmask, v2, v3); require(v19.get<0>() == 4); require(v19.get<1>() == 7); @@ -2243,9 +2243,9 @@ using namespace Ray::NS; } { - printf("Test simd_uvec16 (%s) | ", simd_uvec16::is_native() ? "hard" : "soft"); + printf("Test uvec16 (%s) | ", uvec16::is_native() ? 
"hard" : "soft"); - simd_uvec16 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + uvec16 v1, v2 = {42}, v3 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; require(v2[0] == 42); require(v2[1] == 42); @@ -2315,7 +2315,7 @@ using namespace Ray::NS; require(v3.get<14>() == 15); require(v3.get<15>() == 16); - simd_uvec16 v4(v2), v5 = v3; + uvec16 v4(v2), v5 = v3; require(v4[0] == 42); require(v4[1] == 42); @@ -2451,10 +2451,10 @@ using namespace Ray::NS; 0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 42, 0, 0, 12, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 23, 0, 0}; - const simd_ivec16 v9i = {-1, 2, 6, 13, 17, 20, 24, 31, 35, 38, 42, 49, 53, 56, 60, 67}; - const simd_uvec16 v9 = gather(gather_source + 2, v9i); - const simd_uvec16 v9_masked = gather(simd_uvec16{69}, gather_source + 2, - simd_ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}, v9i); + const ivec16 v9i = {-1, 2, 6, 13, 17, 20, 24, 31, 35, 38, 42, 49, 53, 56, 60, 67}; + const uvec16 v9 = gather(gather_source + 2, v9i); + const uvec16 v9_masked = gather(uvec16{69}, gather_source + 2, + ivec16{-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0}, v9i); require(v9[0] == 42); require(v9[1] == 12); @@ -2490,7 +2490,7 @@ using namespace Ray::NS; require(v9_masked[14] == 11); require(v9_masked[15] == 69); - simd_fvec16 v9_ = {3, 6, 7, 6, 2, 12, 18, 0, 3, 6, 7, 6, 2, 12, 18, 0}; + fvec16 v9_ = {3, 6, 7, 6, 2, 12, 18, 0, 3, 6, 7, 6, 2, 12, 18, 0}; require(hsum(v9_) == 108); unsigned scatter_destination[72] = {}; @@ -2498,7 +2498,7 @@ using namespace Ray::NS; require(memcmp(gather_source, scatter_destination, sizeof(gather_source)) == 0); - const simd_ivec16 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; + const ivec16 scatter_mask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; unsigned masked_scatter_destination[] = { 1, 0xffffffff, 2, 3, 0xffffffff, 4, 5, 6, 0xffffffff, 7, 8, 9, 1, 2, 3, 0xffffffff, 4, 0, @@ -2515,9 
+2515,9 @@ using namespace Ray::NS; require(memcmp(masked_scatter_destination, masked_scatter_expected, sizeof(masked_scatter_destination)) == 0); - const simd_uvec16 v11 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, + const uvec16 v11 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0}; - simd_uvec16 v12 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; + uvec16 v12 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; v12 |= v11; @@ -2538,9 +2538,9 @@ using namespace Ray::NS; require(v12[14] == 0xffffffff); require(v12[15] == 0); - const simd_uvec16 v13 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, + const uvec16 v13 = {0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0}; - simd_uvec16 v14 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; + uvec16 v14 = {0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0, 0, 0xffffffff, 0, 0}; v14 &= v13; @@ -2561,8 +2561,8 @@ using namespace Ray::NS; require(v14[14] == 0); require(v14[15] == 0); - const simd_uvec16 v17 = {3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1}; - const simd_uvec16 v18 = inclusive_scan(v17); + const uvec16 v17 = {3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 4, 1}; + const uvec16 v18 = inclusive_scan(v17); require(v18[0] == 3); require(v18[1] == 4); @@ -2581,12 +2581,12 @@ using namespace Ray::NS; require(v18[14] == 35); require(v18[15] == 36); - const simd_ivec16 vmask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; + const ivec16 vmask = {-1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1}; - simd_uvec16 v19 = v3; + uvec16 v19 = v3; where(vmask, v19) = v2; - const simd_uvec16 v20 = select(vmask, v2, v3); + const uvec16 v20 = select(vmask, v2, v3); require(v19.get<0>() == 4); require(v19.get<1>() == 7); 
diff --git a/tests/test_simd_avx.cpp b/tests/test_simd_avx.cpp index 7e5cb2cdd..40eed7b2c 100644 --- a/tests/test_simd_avx.cpp +++ b/tests/test_simd_avx.cpp @@ -9,7 +9,7 @@ #define NS Avx #define USE_AVX -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void test_simd_avx() { #include "test_simd.ipp" diff --git a/tests/test_simd_avx2.cpp b/tests/test_simd_avx2.cpp index 33f184362..44417aa6d 100644 --- a/tests/test_simd_avx2.cpp +++ b/tests/test_simd_avx2.cpp @@ -9,7 +9,7 @@ #define NS Avx2 #define USE_AVX2 -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void test_simd_avx2() { #include "test_simd.ipp" diff --git a/tests/test_simd_avx512.cpp b/tests/test_simd_avx512.cpp index f4d1e7271..07c878f90 100644 --- a/tests/test_simd_avx512.cpp +++ b/tests/test_simd_avx512.cpp @@ -9,7 +9,7 @@ #define NS Avx512 #define USE_AVX512 -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void test_simd_avx512() { #include "test_simd.ipp" diff --git a/tests/test_simd_sse41.cpp b/tests/test_simd_sse41.cpp index d44a38580..c5d1eaa71 100644 --- a/tests/test_simd_sse41.cpp +++ b/tests/test_simd_sse41.cpp @@ -9,7 +9,7 @@ #define NS Sse41 #define USE_SSE41 -#include "../internal/simd/simd_vec.h" +#include "../internal/simd/simd.h" void test_simd_sse41() { #include "test_simd.ipp"