diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp index c152f187c..e5ff7bf17 100644 --- a/include/common/core/memory.hpp +++ b/include/common/core/memory.hpp @@ -476,24 +476,9 @@ __XETLA_API xetla_vector xetla_load_global( Y); return ret.xetla_format(); } else if constexpr (BlockWidth * sizeof(T) < sizeof(uint32_t)) { - constexpr auto scale_factor = sizeof(uint32_t) / sizeof(T); - xetla_vector ret = xetla_load_global< - uint32_t, - BlockWidth, - BlockHeight, - NBlocks, - Transposed, - Transformed, - L1H, - L2H>( - reinterpret_cast(Ptr), - SurfaceWidth, - SurfaceHeight, - SurfacePitch, - X / scale_factor, - Y); - return ret.xetla_format().xetla_select( - X % scale_factor); + xetla_vector byte_offsets = + xetla_vector_gen(0, SurfacePitch); + return xetla_load_global(Ptr, byte_offsets); } else { return __ESIMD_ENS::lsc_load_2d< T, diff --git a/include/subgroup/tile/impl/load_xe.hpp b/include/subgroup/tile/impl/load_xe.hpp index ad0bd2604..140e61593 100644 --- a/include/subgroup/tile/impl/load_xe.hpp +++ b/include/subgroup/tile/impl/load_xe.hpp @@ -236,10 +236,10 @@ tile_load(tile_t& tile, payload_t& payload) { reg_tmp .xetla_format< native_type_t, - block_size_x / scale_factor, + ld_blk_width / scale_factor, ld_blk_height>() .xetla_select< - block_size_x / scale_factor, + ld_blk_width / scale_factor, 1, ld_blk_size_y, 1>(0, 0); @@ -297,9 +297,9 @@ tile_load(tile_t& tile, payload_t& payload) { // xetla_tdescriptor tdesc = payload_row.row(j); auto reg_blk = tile.reg.xetla_select( processed_elems + j * remained_block_elems); - // constexpr uint32_t ld_blk_height = (reg_transpose && trans) - // ? detail::getNextPowerOf2() - // : remained_ld_blk_size_y; + constexpr uint32_t ld_blk_height = (reg_transpose && trans) + ? detail::getNextPowerOf2() + : remained_ld_blk_size_y; constexpr uint32_t tmp_size = ld_blk_height * block_size_x * arr_len; xetla_vector reg_tmp; #pragma unroll @@ -311,7 +311,7 @@ tile_load(tile_t& tile, payload_t& payload) { reg_tmp.xetla_format>() = xetla_load_global< native_type_t, block_size_x / scale_factor, - ld_blk_height, + remained_ld_blk_size_y, arr_len, trans, mem_transform, @@ -325,15 +325,6 @@ tile_load(tile_t& tile, payload_t& payload) { payload.offset_x + offset_x / scale_factor, payload.offset_y + num_block_y * block_size_y + ii * remained_ld_blk_size_y); - // xetla_tload_global< - // load_dtype, - // (ld_blk_height * block_size_x * arr_len / scale_factor), - // L1, - // L2, - // trans, - // mem_transform, - // arch_tag>(tdesc); - if constexpr (reg_transpose && trans) { reg_blk.xetla_select(ii * load_elems) .xetla_format>() = diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp index c5de848dc..3480a7820 100644 --- a/include/subgroup/tile/impl/payload_xe.hpp +++ b/include/subgroup/tile/impl/payload_xe.hpp @@ -1841,20 +1841,6 @@ struct prefetch_payload_t< return channel >= 32 ? 32 : channel >= 16 ? 16 : channel >= 8 ? 8 : 1; } - static constexpr uint32_t num_channel = select_channel( - std::min(mem_transpose ? block_size_x : block_size_y, max_channel)); - - static constexpr uint32_t max_channel = - max_prefetch_vec_len / (vector_size * sizeof(prefetch_dtype)); - - static constexpr uint32_t select_channel(const uint32_t channel) { - return (channel >= load_store_attr::max_channel_num) - ? load_store_attr::max_channel_num - : channel >= 16 ? 16 - : channel >= 8 ? 8 - : 1; - } - static constexpr uint32_t num_channel = select_channel( std::min(mem_transpose ? block_size_x : block_size_y, max_channel));