diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h index 0a3ae0ec6ee..fae073e753a 100644 --- a/src/layer/x86/convolution_im2col_gemm_int8.h +++ b/src/layer/x86/convolution_im2col_gemm_int8.h @@ -76,7 +76,7 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M #endif } -static NCNN_FORCEINLINE void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) +static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) { // resolve optimal tile size from cache size const int l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char)); @@ -205,7 +205,7 @@ static NCNN_FORCEINLINE void convolution_im2col_gemm_get_optimal_tile_mnk_int8(i } } -static NCNN_FORCEINLINE void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) +static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) { const int elempack = bottom_blob.elempack; const int cstep = (int)bottom_blob.cstep; @@ -896,8 +896,6 @@ static void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& _mm_store_si128((__m128i*)pp, _p0); - // NCNN_LOGE("qwq"); - pp += 16; } } diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 65ee41b332d..8ef554fa198 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -993,7 +993,14 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con #if __SSE2__ if (opt.use_packing_layout) { +#if __AVX512F__ + out_elempack_int32 = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else out_elempack_int32 = num_output % 4 == 0 ? 4 : 1; +#endif + // out_elempack_int32 = num_output % 4 == 0 ? 4 : 1; } #endif // __SSE2__