From 48e1260a6f2033fc1f9f5b6df91f5a2ef588f16d Mon Sep 17 00:00:00 2001
From: nihui
Date: Thu, 23 Jan 2025 19:38:13 +0800
Subject: [PATCH] restrict one dim dequantize scale bias size (#5886)

---
 src/layer/arm/convolution_arm.cpp            |   11 +-
 src/layer/arm/dequantize_arm.cpp             | 1691 ++----------------
 src/layer/arm/dequantize_arm.h               |    1 -
 src/layer/arm/dequantize_arm_asimdhp.cpp     | 1402 ++-------------
 src/layer/dequantize.cpp                     |  182 +-
 src/layer/loongarch/dequantize_loongarch.cpp |  841 +--------
 src/layer/mips/dequantize_mips.cpp           |  841 +--------
 src/layer/x86/dequantize_x86.cpp             |  128 +-
 tests/test_dequantize.cpp                    |   18 +-
 9 files changed, 540 insertions(+), 4575 deletions(-)

diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index 16624574b6a..4d379f65eae 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -1376,7 +1376,16 @@ int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, con
 #if __ARM_NEON
     if (opt.use_packing_layout)
     {
-        out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+#if NCNN_ARM82
+        if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
+        {
+            out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
+        }
+        else
+#endif // NCNN_ARM82
+        {
+            out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
+        }
     }
 #endif // __ARM_NEON
 
diff --git a/src/layer/arm/dequantize_arm.cpp b/src/layer/arm/dequantize_arm.cpp
index 1df5134acd9..bf2bd17adfc 100644
--- a/src/layer/arm/dequantize_arm.cpp
+++ b/src/layer/arm/dequantize_arm.cpp
@@ -38,878 +38,159 @@ Dequantize_arm::Dequantize_arm()
 #endif
 }
 
-int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
 {
-    // assert bottom_blob.elembits() == 32
-
-#if NCNN_ARM82
-    if (support_fp16_storage && opt.use_fp16_storage)
-    {
-        if (opt.use_fp16_arithmetic)
-            return forward_fp16sa(bottom_blob, top_blob, opt);
-        else
-            return forward_fp16s(bottom_blob, top_blob, opt);
-    }
-#endif
-
-#if NCNN_BF16
-    if (opt.use_bf16_storage)
-        return forward_bf16s(bottom_blob, top_blob, opt);
-#endif
+    const int scale_data_size = scale_data.w;
+    const int bias_data_size = bias_data.w;
+    const int size = elemcount * elempack;
 
-    int dims = bottom_blob.dims;
-    int elempack = bottom_blob.elempack;
+    // NCNN_LOGE("dequantize %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack);
 
+    float scale = scale_data[0];
 #if __ARM_NEON
-    if (elempack == 8)
+    float32x4_t _scale = vdupq_n_f32(scale);
+    if (scale_data_size > 1)
     {
-        if (dims == 1)
+        if (elempack == 4)
         {
-            int w = bottom_blob.w;
-            int outw = w * 2;
-
-            top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (scale_data_size == 1)
-            {
-                float32x4_t _scale = vdupq_n_f32(scale_data[0]);
-
-                if (bias_data_size == 0)
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outw; i++)
-                    {
-                        const int* intptr = (const int*)bottom_blob + i * 4;
-                        float* ptr = (float*)top_blob + i * 4;
-
-                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-                        _v = vmulq_f32(_v, _scale);
-                        vst1q_f32(ptr, _v);
-                    }
-                }
-                else if (bias_data_size == 1)
-                {
-                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outw; i++)
-                    {
-                        const int* intptr = (const int*)bottom_blob + i * 4;
-                        float* ptr = (float*)top_blob + i * 4;
-
-                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-#if __aarch64__
-                        _v = vfmaq_f32(_bias, _v, _scale);
-#else
-                        _v = vmlaq_f32(_bias, _v, _scale);
-#endif
-                        vst1q_f32(ptr, _v);
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outw; i++)
-                    {
-                        const int* intptr = (const int*)bottom_blob + i * 4;
-                        float* ptr = (float*)top_blob + i * 4;
-
-                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
-                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-#if __aarch64__
-                        _v = vfmaq_f32(_bias, _v, _scale);
-#else
-                        _v = vmlaq_f32(_bias, _v, _scale);
-#endif
-                        vst1q_f32(ptr, _v);
-                    }
-                }
-            }
-            else
-            {
-                if (bias_data_size == 0)
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outw; i++)
-                    {
-                        const int* intptr = (const int*)bottom_blob + i * 4;
-                        float* ptr = (float*)top_blob + i * 4;
-
-                        float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4);
-                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-                        _v = vmulq_f32(_v, _scale);
-                        vst1q_f32(ptr, _v);
-                    }
-                }
-                else if (bias_data_size == 1)
-                {
-                    float32x4_t _bias = vdupq_n_f32(bias_data[0]);
-
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outw; i++)
-                    {
-                        const int* intptr = (const int*)bottom_blob + i * 4;
-                        float* ptr = (float*)top_blob + i * 4;
-
-                        float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4);
-                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-#if __aarch64__
-                        _v = vfmaq_f32(_bias, _v, _scale);
-#else
-                        _v = vmlaq_f32(_bias, _v, _scale);
-#endif
-                        vst1q_f32(ptr, _v);
-                    }
-                }
-                else
-                {
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int i = 0; i < outw; i++)
-                    {
-                        const int* intptr = (const int*)bottom_blob + i * 4;
-                        float* ptr = (float*)top_blob + i * 4;
-
-                        float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4);
-                        float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);
-                        float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-#if __aarch64__
-                        _v = vfmaq_f32(_bias, _v, _scale);
-#else
-                        _v = vmlaq_f32(_bias, _v, _scale);
-#endif
-                        vst1q_f32(ptr, _v);
-                    }
-                }
-            }
-        }
-
-        if (dims == 2)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int outh = h * 2;
-
-            top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (bias_data_size == 0)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const int* intptr = bottom_blob.row<const int>(i);
-                    float* ptr0 = top_blob.row(i * 2);
-                    float* ptr1 = top_blob.row(i * 2 + 1);
-
-                    float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8);
-                    float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4);
-
-                    for (int j = 0; j < w; j++)
-                    {
-                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
-                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
-                        _v0 = vmulq_f32(_v0, _scale0);
-                        _v1 = vmulq_f32(_v1, _scale1);
-                        vst1q_f32(ptr0, _v0);
-                        vst1q_f32(ptr1, _v1);
-
-                        intptr += 8;
-                        ptr0 += 4;
-                        ptr1 += 4;
-                    }
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < h; i++)
-                {
-                    const int* intptr = bottom_blob.row<const int>(i);
-                    float* ptr0 = top_blob.row(i * 2);
-                    float* ptr1 = top_blob.row(i * 2 + 1);
-
-                    float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8);
-                    float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4);
-                    float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8);
-                    float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4);
-
-                    for (int j = 0; j < w; j++)
-                    {
-                        float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
-                        float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
-#if __aarch64__
-                        _v0 = vfmaq_f32(_bias0, _v0, _scale0);
-                        _v1 = vfmaq_f32(_bias1, _v1, _scale1);
-#else
-                        _v0 = vmlaq_f32(_bias0, _v0, _scale0);
-                        _v1 = vmlaq_f32(_bias1, _v1, _scale1);
-#endif
-                        vst1q_f32(ptr0, _v0);
-                        vst1q_f32(ptr1, _v1);
-
-                        intptr += 8;
-                        ptr0 += 4;
-                        ptr1 += 4;
-                    }
-                }
-            }
-        }
-
-        if (dims == 3)
-        {
-            int w = bottom_blob.w;
-            int h = bottom_blob.h;
-            int channels = bottom_blob.c;
-            int size = w * h;
-            int outc = channels * 2;
-
-            top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator);
-            if (top_blob.empty())
-                return -100;
-
-            if (bias_data_size == 0)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int q = 0; q < channels; q++)
-                {
-                    const int* intptr = bottom_blob.channel(q);
-                    float* ptr0 = top_blob.channel(q * 2);
-                    float* ptr1 = top_blob.channel(q * 2 + 1);
-
-                    float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8);
-                    float32x4_t _scale1 = scale_data_size == 1 ? 
vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - - int i = 0; - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - _v2 = vmulq_f32(_v2, _scale0); - _v3 = vmulq_f32(_v3, _scale1); - vst1q_f32(ptr0, _v0); - vst1q_f32(ptr0 + 4, _v2); - vst1q_f32(ptr1, _v1); - vst1q_f32(ptr1 + 4, _v3); - - intptr += 16; - ptr0 += 8; - ptr1 += 8; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1q_f32(ptr0, _v0); - vst1q_f32(ptr1, _v1); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - int i = 0; - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - float32x4_t _v2 = vcvtq_f32_s32(vld1q_s32(intptr + 8)); - float32x4_t _v3 = vcvtq_f32_s32(vld1q_s32(intptr + 12)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - _v2 = vfmaq_f32(_bias0, _v2, _scale0); - _v3 = vfmaq_f32(_bias1, _v3, _scale1); -#else - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); - _v2 = vmlaq_f32(_bias0, _v2, _scale0); - _v3 = vmlaq_f32(_bias1, _v3, _scale1); -#endif - vst1q_f32(ptr0, _v0); - vst1q_f32(ptr0 + 4, _v2); - vst1q_f32(ptr1, _v1); - vst1q_f32(ptr1 + 4, _v3); - - intptr += 16; - ptr0 += 8; - ptr1 += 8; - } - for (; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif - vst1q_f32(ptr0, _v0); - vst1q_f32(ptr1, _v1); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + *ptr = *intptr * scale; + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __ARM_NEON + float32x4_t _bias = vdupq_n_f32(bias); + if (bias_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) + if (elempack == 4) { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - 
float* ptr = (float*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1q_f32(ptr, _v); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1q_f32(ptr, _v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1q_f32(ptr, _v); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1q_f32(ptr, _v); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1q_f32(ptr, _v); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1q_f32(ptr, _v); - } - } + _bias = vld1q_f32((const float*)bias_data); } } +#endif // __ARM_NEON - if (dims == 2) + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - float32x4_t _scale = scale_data_size == 1 ? 
vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1q_f32(ptr, _v); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); #if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); + _v = vfmaq_f32(_bias, _v, _scale); #else - _v = vmlaq_f32(_bias, _v, _scale); + _v = vmlaq_f32(_bias, _v, _scale); #endif - vst1q_f32(ptr, _v); - - intptr += 4; - ptr += 4; - } - } - } + vst1q_f32(ptr, _v); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __ARM_NEON + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - - int i = 0; - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale); - _v1 = vmulq_f32(_v1, _scale); - vst1q_f32(ptr, _v0); - vst1q_f32(ptr + 4, _v1); - - intptr += 8; - ptr += 8; - } - for (; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1q_f32(ptr, _v); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); + *ptr = *intptr * scale + bias; + intptr++; + ptr++; + } + } +} - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); +int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 - int i = 0; - for (; i + 1 < size; i += 2) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias, _v0, _scale); - _v1 = vfmaq_f32(_bias, _v1, _scale); -#else - _v0 = vmlaq_f32(_bias, _v0, _scale); - _v1 = vmlaq_f32(_bias, _v1, _scale); +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } #endif - vst1q_f32(ptr, _v0); - vst1q_f32(ptr + 4, _v1); - intptr += 8; - ptr += 8; - } - for (; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); +#if NCNN_BF16 + if (opt.use_bf16_storage) + return forward_bf16s(bottom_blob, top_blob, opt); #endif - vst1q_f32(ptr, _v); - intptr += 4; - ptr += 4; - } - } - } - } + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; - return 0; - } -#endif // __ARM_NEON + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; if (dims == 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - float* ptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int* intptr = (const int*)bottom_blob + i * elempack; + float* ptr = (float*)top_blob + i * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias_data[i]; - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i]; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + // assert scale_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; - } - } + const int size = std::min(w - i, wp) * elempack; + + dequantize(intptr, ptr, scale_data, bias_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 
0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - - int j = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1q_f32(ptr, _v); + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; - - int j = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1q_f32(ptr, _v); - - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale + bias; - } - } + dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); - int i = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - for (; i + 7 < size; i += 8) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale); - _v1 = vmulq_f32(_v1, _scale); - vst1q_f32(ptr, _v0); - vst1q_f32(ptr + 4, _v1); + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - intptr += 8; - ptr += 8; - } - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1q_f32(ptr, _v); - - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; i < size; i++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[q]; - - int i = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; i + 7 < size; i += 8) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias, _v0, _scale); - _v1 = vfmaq_f32(_bias, _v1, _scale); -#else - _v0 = vmlaq_f32(_bias, _v0, _scale); - _v1 = vmlaq_f32(_bias, _v1, _scale); -#endif - vst1q_f32(ptr, _v0); - vst1q_f32(ptr + 4, _v1); - - intptr += 8; - ptr += 8; - } - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1q_f32(ptr, _v); - - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; i < size; i++) - { - *ptr++ = *intptr++ * scale + bias; - } - } + dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack); } } @@ -917,754 +198,154 @@ int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& } #if NCNN_BF16 -int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void dequantize_bf16s(const int* intptr, unsigned short* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + const int scale_data_size = scale_data.w; + const int bias_data_size = bias_data.w; + const int size = elemcount * elempack; + // NCNN_LOGE("dequantize_bf16s %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack); + + float scale = scale_data[0]; #if __ARM_NEON - if (elempack == 8) + float32x4_t _scale = vdupq_n_f32(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int outw = w * 2; - - top_blob.create(outw, (size_t)8u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - 
const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - } + _scale = vld1q_f32((const float*)scale_data); } + } +#endif // __ARM_NEON - if (dims == 2) + if (bias_data_size == 0) + { + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int outh = h * 2; - - top_blob.create(w, outh, (size_t)8u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - unsigned short* ptr0 = top_blob.row(i * 2); - unsigned short* ptr1 = top_blob.row(i * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_u16(ptr0, float2bfloat(_v0)); - vst1_u16(ptr1, float2bfloat(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - unsigned short* ptr0 = top_blob.row(i * 2); - unsigned short* ptr1 = top_blob.row(i * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif - vst1_u16(ptr0, float2bfloat(_v0)); - vst1_u16(ptr1, float2bfloat(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale); + vst1_u16(ptr, float2bfloat(_v)); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __ARM_NEON + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = channels * 2; - - top_blob.create(w, h, outc, (size_t)8u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - unsigned short* ptr0 = top_blob.channel(q * 2); - unsigned short* ptr1 = top_blob.channel(q * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_u16(ptr0, float2bfloat(_v0)); - vst1_u16(ptr1, float2bfloat(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - unsigned short* ptr0 = top_blob.channel(q * 2); - unsigned short* ptr1 = top_blob.channel(q * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); -#if __aarch64__ - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); -#else - _v0 = vmlaq_f32(_bias0, _v0, _scale0); - _v1 = vmlaq_f32(_bias1, _v1, _scale1); -#endif - vst1_u16(ptr0, float2bfloat(_v0)); - vst1_u16(ptr1, float2bfloat(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + *ptr = float32_to_bfloat16(*intptr * scale); + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __ARM_NEON + float32x4_t _bias = vdupq_n_f32(bias); + if (bias_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) + if (elempack == 4) { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - 
unsigned short* ptr = (unsigned short*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - } - } + _bias = vld1q_f32((const float*)bias_data); } } +#endif // __ARM_NEON - if (dims == 2) + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - unsigned short* ptr = top_blob.row(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - unsigned short* ptr = top_blob.row(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); #if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); + _v = vfmaq_f32(_bias, _v, _scale); #else - _v = vmlaq_f32(_bias, _v, _scale); + _v = vmlaq_f32(_bias, _v, _scale); #endif - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } - } - } + vst1_u16(ptr, float2bfloat(_v)); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __ARM_NEON + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - unsigned short* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - unsigned short* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } - } - } + *ptr = float32_to_bfloat16(*intptr * scale + bias); + intptr++; + ptr++; } - - return 0; } -#endif // __ARM_NEON +} + +int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + const size_t out_elemsize = elempack * 2u; if (dims == 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)2u, opt.blob_allocator); + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const int* intptr = bottom_blob; - unsigned short* ptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = float32_to_bfloat16(intptr[i] * scale); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int* intptr = (const int*)bottom_blob + i * elempack; + unsigned short* ptr = (unsigned short*)top_blob + i * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = float32_to_bfloat16(intptr[i] * scale + bias); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = float32_to_bfloat16(intptr[i] * scale + bias_data[i]); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = float32_to_bfloat16(intptr[i] * scale_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + // assert scale_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = float32_to_bfloat16(intptr[i] * scale_data[i] + bias); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = float32_to_bfloat16(intptr[i] * scale_data[i] + bias_data[i]); - } - } + const int size = std::min(w - i, wp) * elempack; + + dequantize_bf16s(intptr, ptr, scale_data, bias_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)2u, opt.blob_allocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - unsigned short* ptr = top_blob.row(i); - - const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; - - int j = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; j < w; j++) - { - *ptr++ = float32_to_bfloat16(*intptr++ * scale); - } - } - } - else + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - unsigned short* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + unsigned short* ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - int j = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; j < w; j++) - { - *ptr++ = float32_to_bfloat16(*intptr++ * scale + bias); - } - } + dequantize_bf16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)2u, opt.blob_allocator); + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - unsigned short* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const int* intptr = bottom_blob.channel(q); + unsigned short* ptr = top_blob.channel(q); - int i = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_u16(ptr, float2bfloat(_v)); + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; i < size; i++) - { - *ptr++ = float32_to_bfloat16(*intptr++ * scale); - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - unsigned short* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[q]; - - int i = 0; -#if __ARM_NEON - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); -#if __aarch64__ - _v = vfmaq_f32(_bias, _v, _scale); -#else - _v = vmlaq_f32(_bias, _v, _scale); -#endif - vst1_u16(ptr, float2bfloat(_v)); - - intptr += 4; - ptr += 4; - } -#endif // __ARM_NEON - for (; i < size; i++) - { - *ptr++ = float32_to_bfloat16(*intptr++ * scale + bias); - } - } + dequantize_bf16s(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack); } } diff --git a/src/layer/arm/dequantize_arm.h b/src/layer/arm/dequantize_arm.h index 677c731db69..e9e854c8cbd 100644 --- a/src/layer/arm/dequantize_arm.h +++ b/src/layer/arm/dequantize_arm.h @@ -29,7 +29,6 @@ class Dequantize_arm : public Dequantize protected: #if NCNN_ARM82 int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; - int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif #if NCNN_BF16 int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/arm/dequantize_arm_asimdhp.cpp b/src/layer/arm/dequantize_arm_asimdhp.cpp index 3e03c8638dd..18404104c42 100644 --- a/src/layer/arm/dequantize_arm_asimdhp.cpp +++ b/src/layer/arm/dequantize_arm_asimdhp.cpp @@ -22,1374 +22,176 @@ namespace ncnn { #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -int Dequantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void dequantize_fp16s(const int* intptr, __fp16* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack) { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; - - if (elempack == 8) - { - if (dims == 1) - { - int w = bottom_blob.w; - int outw = w * 2; - - top_blob.create(outw, (size_t)8u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); + const int scale_data_size = scale_data.w; + const int bias_data_size = bias_data.w; + const int size = elemcount * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = 
(__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - } - } + // NCNN_LOGE("dequantize_fp16s %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack); - if (dims == 2) + float scale = scale_data[0]; + float32x4_t _scale0 = vdupq_n_f32(scale); + float32x4_t _scale1 = _scale0; + if (scale_data_size > 1) + { + if (elempack == 8) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int outh = h * 2; - - top_blob.create(w, outh, (size_t)8u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr0 = top_blob.row<__fp16>(i * 2); - __fp16* ptr1 = top_blob.row<__fp16>(i * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_f16(ptr0, vcvt_f16_f32(_v0)); - vst1_f16(ptr1, vcvt_f16_f32(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr0 = top_blob.row<__fp16>(i * 2); - __fp16* ptr1 = top_blob.row<__fp16>(i * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - vst1_f16(ptr0, vcvt_f16_f32(_v0)); - vst1_f16(ptr1, vcvt_f16_f32(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + _scale0 = vld1q_f32((const float*)scale_data); + _scale1 = vld1q_f32((const float*)scale_data + 4); } - - if (dims == 3) + if (elempack == 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = channels * 2; - - top_blob.create(w, h, outc, (size_t)8u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr0 = top_blob.channel(q * 2); - __fp16* ptr1 = top_blob.channel(q * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1_f16(ptr0, vcvt_f16_f32(_v0)); - vst1_f16(ptr1, vcvt_f16_f32(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr0 = top_blob.channel(q * 2); - __fp16* ptr1 = top_blob.channel(q * 2 + 1); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - vst1_f16(ptr0, vcvt_f16_f32(_v0)); - vst1_f16(ptr1, vcvt_f16_f32(_v1)); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + _scale0 = vld1q_f32((const float*)scale_data); + _scale1 = _scale0; } - - return 0; } - if (elempack == 4) + if (bias_data_size == 0) { - if (dims == 1) + int i = 0; + for (; i + 7 < size; i += 8) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - } + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); + _v0 = vmulq_f32(_v0, _scale0); + _v1 = 
vmulq_f32(_v1, _scale1); + vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); + intptr += 8; + ptr += 8; } - - if (dims == 2) + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vmulq_f32(_v, _scale0); + vst1_f16(ptr, vcvt_f16_f32(_v)); + intptr += 4; + ptr += 4; } - - if (dims == 3) + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } + *ptr = (__fp16)(*intptr * scale); + intptr++; + ptr++; } - - return 0; } - - if (dims == 1) + else { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - __fp16* ptr = top_blob; - - if (scale_data_size == 1) - { - const float scale = scale_data[0]; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale + bias); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale + bias_data[i]); - } - } - } - else + float bias = bias_data[0]; + float32x4_t _bias0 = vdupq_n_f32(bias); + float32x4_t _bias1 = _bias0; + if (bias_data_size > 1) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale_data[i]); - } - } - else if (bias_data_size == 1) + if (elempack == 8) { - const float bias = bias_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale_data[i] + bias); - } + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = vld1q_f32((const float*)bias_data + 4); } - else + if (elempack == 4) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale_data[i] + bias_data[i]); - } + _bias0 = vld1q_f32((const float*)bias_data); + _bias1 = _bias0; } } - } - - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - - int j = 0; - float32x4_t _scale = vdupq_n_f32(scale); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - intptr += 4; - ptr += 4; - } - for (; j < w; j++) - { - *ptr++ = (__fp16)(*intptr++ * scale); - } - } - } - else + int i = 0; + for (; i + 7 < size; i += 8) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; - - int j = 0; - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - for (; j < w; j++) - { - *ptr++ = (__fp16)(*intptr++ * scale + bias); - } - } + float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); + float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); + _v0 = vfmaq_f32(_bias0, _v0, _scale0); + _v1 = vfmaq_f32(_bias1, _v1, _scale1); + vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); + intptr += 8; + ptr += 8; } - } - - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + for (; i + 3 < size; i += 4) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - - int i = 0; - float32x4_t _scale = vdupq_n_f32(scale); - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - for (; i < size; i++) - { - *ptr++ = (__fp16)(*intptr++ * scale); - } - } + float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); + _v = vfmaq_f32(_bias0, _v, _scale0); + vst1_f16(ptr, vcvt_f16_f32(_v)); + intptr += 4; + ptr += 4; } - else + for (; i < size; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[q]; - - int i = 0; - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - for (; i < size; i++) - { - *ptr++ = (__fp16)(*intptr++ * scale + bias); - } - } + *ptr = (__fp16)(*intptr * scale + bias); + intptr++; + ptr++; } } - - return 0; } -int Dequantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +int Dequantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; - - if (elempack == 8) - { - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - __fp16* ptr = (__fp16*)top_blob + i * 8; - - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale); - _v1 = vmulq_f32(_v1, _scale); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - __fp16* ptr = (__fp16*)top_blob + i * 8; - - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias, _v0, _scale); - _v1 = vfmaq_f32(_bias, _v1, _scale); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - __fp16* ptr = (__fp16*)top_blob + i * 8; - - float32x4_t _bias0 = vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = vld1q_f32((const float*)bias_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias0, _v0, _scale); - _v1 = vfmaq_f32(_bias1, _v1, _scale); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - __fp16* ptr = (__fp16*)top_blob + i * 8; - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - __fp16* ptr = (__fp16*)top_blob 
+ i * 8; - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias, _v0, _scale0); - _v1 = vfmaq_f32(_bias, _v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 8; - __fp16* ptr = (__fp16*)top_blob + i * 8; - - float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - float32x4_t _bias0 = vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = vld1q_f32((const float*)bias_data + i * 8 + 4); - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - } - } - } - } - - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 8 + 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - - intptr += 8; - ptr += 8; - } - } - } - } - - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vmulq_f32(_v0, _scale0); - _v1 = vmulq_f32(_v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - - intptr += 8; - ptr += 8; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - float32x4_t _scale0 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8); - float32x4_t _scale1 = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 8 + 4); - float32x4_t _bias0 = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8); - float32x4_t _bias1 = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 8 + 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr)); - float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4)); - _v0 = vfmaq_f32(_bias0, _v0, _scale0); - _v1 = vfmaq_f32(_bias1, _v1, _scale1); - vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1))); - - intptr += 8; - ptr += 8; - } - } - } - } - - return 0; - } - - if (elempack == 4) - { - if (dims == 1) - { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - float32x4_t _scale = vdupq_n_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else if (bias_data_size == 1) - { - float32x4_t _bias = vdupq_n_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - __fp16* ptr = (__fp16*)top_blob + i * 4; - - float32x4_t _scale = vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4); - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - } - } - } - } - - if (dims == 2) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - 
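// [editor's note] The branches being deleted above and below repeated the same
// int32 -> fp16 math once per dims/elempack/bias combination. The patch routes
// all of them through the single dequantize_fp16s kernel whose body appears in
// the added lines earlier in this file's hunks. A minimal scalar sketch of that
// kernel's contract (standalone, float output instead of __fp16, and the name
// dequantize_ref is invented for illustration, not part of the patch):
#include <vector>

static void dequantize_ref(const int* intptr, float* ptr,
                           const std::vector<float>& scale,
                           const std::vector<float>& bias,
                           int elemcount, int elempack)
{
    // scale/bias carry either one broadcast value or elempack per-lane values,
    // mirroring the Mat::range(q * elempack, elempack) slices in the new code
    for (int i = 0; i < elemcount * elempack; i++)
    {
        const int lane = i % elempack;
        const float s = scale.size() > 1 ? scale[lane] : scale[0];
        const float b = bias.empty() ? 0.f : (bias.size() > 1 ? bias[lane] : bias[0]);
        ptr[i] = intptr[i] * s + b;
    }
}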
const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + i * 4); - float32x4_t _bias = bias_data_size == 1 ? vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + i * 4); - - for (int j = 0; j < w; j++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } - } - - if (dims == 3) - { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)8u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - float32x4_t _scale = scale_data_size == 1 ? vdupq_n_f32(scale_data[0]) : vld1q_f32((const float*)scale_data + q * 4); - float32x4_t _bias = bias_data_size == 1 ? 
vdupq_n_f32(bias_data[0]) : vld1q_f32((const float*)bias_data + q * 4); - - for (int i = 0; i < size; i++) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - } - } - } - - return 0; - } + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + const size_t out_elemsize = elempack * 2u; if (dims == 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)2u, opt.blob_allocator); + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - const int* intptr = bottom_blob; - __fp16* ptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int* intptr = (const int*)bottom_blob + i * elempack; + __fp16* ptr = (__fp16*)top_blob + i * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale + bias); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale + bias_data[i]); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale_data[i]); - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + // assert scale_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale_data[i] + bias); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = (__fp16)(intptr[i] * scale_data[i] + bias_data[i]); - } - } + const int size = std::min(w - i, wp) * elempack; + + dequantize_fp16s(intptr, ptr, scale_data, bias_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)2u, opt.blob_allocator); + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; - - int j = 0; - float32x4_t _scale = vdupq_n_f32(scale); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - for (; j < w; j++) - { - *ptr++ = (__fp16)(*intptr++ * scale); - } - } - } - else + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); + const int* intptr = bottom_blob.row(i); + __fp16* ptr = top_blob.row<__fp16>(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - int j = 0; - float32x4_t _scale = vdupq_n_f32(scale); - float32x4_t _bias = vdupq_n_f32(bias); - for (; j + 3 < w; j += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vfmaq_f32(_bias, _v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - for (; j < w; j++) - { - *ptr++ = (__fp16)(*intptr++ * scale + bias); - } - } + dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)2u, opt.blob_allocator); + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - - int i = 0; - float32x4_t _scale = vdupq_n_f32(scale); - for (; i + 3 < size; i += 4) - { - float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr)); - _v = vmulq_f32(_v, _scale); - vst1_f16(ptr, vcvt_f16_f32(_v)); - - intptr += 4; - ptr += 4; - } - for (; i < size; i++) - { - *ptr++ = (__fp16)(*intptr++ * scale); - } - } - } - else + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); + const int* intptr = bottom_blob.channel(q); + __fp16* ptr = top_blob.channel(q); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? 
bias_data.range(q * elempack, elempack) : bias_data;
-
-            int i = 0;
-            float32x4_t _scale = vdupq_n_f32(scale);
-            float32x4_t _bias = vdupq_n_f32(bias);
-            for (; i + 3 < size; i += 4)
-            {
-                float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
-                _v = vfmaq_f32(_bias, _v, _scale);
-                vst1_f16(ptr, vcvt_f16_f32(_v));
-
-                intptr += 4;
-                ptr += 4;
-            }
-            for (; i < size; i++)
-            {
-                *ptr++ = (__fp16)(*intptr++ * scale + bias);
-            }
-        }
+            dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
         }
     }
diff --git a/src/layer/dequantize.cpp b/src/layer/dequantize.cpp
index e0259de1bae..9adf6521011 100644
--- a/src/layer/dequantize.cpp
+++ b/src/layer/dequantize.cpp
@@ -46,170 +46,68 @@ int Dequantize::load_model(const ModelBin& mb)
     return 0;
 }
 
+static void dequantize(const int* intptr, float* ptr, float scale, float bias, int size)
+{
+    for (int i = 0; i < size; i++)
+    {
+        *ptr = *intptr * scale + bias;
+        intptr++;
+        ptr++;
+    }
+}
+
 int Dequantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-    int dims = bottom_blob.dims;
+    const int dims = bottom_blob.dims;
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int channels = bottom_blob.c;
+
+    top_blob.create_like(bottom_blob, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
 
     if (dims == 1)
     {
-        int w = bottom_blob.w;
-
-        top_blob.create(w, (size_t)4u, opt.blob_allocator);
-        if (top_blob.empty())
-            return -100;
+        // assert scale_data_size == 1
+        // assert bias_data_size == 0 || bias_data_size == 1
 
         const int* intptr = bottom_blob;
         float* ptr = top_blob;
 
-        if (scale_data_size == 1)
-        {
-            const float scale = scale_data[0];
-
-            if (bias_data_size == 0)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    ptr[i] = intptr[i] * scale;
-                }
-            }
-            else if (bias_data_size == 1)
-            {
-                const float bias = bias_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    ptr[i] = intptr[i] * scale + bias;
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    ptr[i] = intptr[i] * scale + bias_data[i];
-                }
-            }
-        }
-        else
-        {
-            if (bias_data_size == 0)
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    ptr[i] = intptr[i] * scale_data[i];
-                }
-            }
-            else if (bias_data_size == 1)
-            {
-                const float bias = bias_data[0];
-
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    ptr[i] = intptr[i] * scale_data[i] + bias;
-                }
-            }
-            else
-            {
-                #pragma omp parallel for num_threads(opt.num_threads)
-                for (int i = 0; i < w; i++)
-                {
-                    ptr[i] = intptr[i] * scale_data[i] + bias_data[i];
-                }
-            }
-        }
+        const float scale = scale_data[0];
+        const float bias = bias_data_size == 0 ? 0.f : bias_data[0];
+
+        dequantize(intptr, ptr, scale, bias, w);
     }
 
     if (dims == 2)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i = 0; i < h; i++)
+        {
+            const int* intptr = bottom_blob.row(i);
+            float* ptr = top_blob.row(i);
 
-        top_blob.create(w, h, (size_t)4u, opt.blob_allocator);
-        if (top_blob.empty())
-            return -100;
+            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
+            const float bias = bias_data_size == 0 ? 0.f : bias_data_size == 1 ? bias_data[0] : bias_data[i];
 
-        if (bias_data_size == 0)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < h; i++)
-            {
-                const int* intptr = bottom_blob.row(i);
-                float* ptr = top_blob.row(i);
-
-                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
-
-                for (int j = 0; j < w; j++)
-                {
-                    ptr[j] = intptr[j] * scale;
-                }
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int i = 0; i < h; i++)
-            {
-                const int* intptr = bottom_blob.row(i);
-                float* ptr = top_blob.row(i);
-
-                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
-                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];
-
-                for (int j = 0; j < w; j++)
-                {
-                    ptr[j] = intptr[j] * scale + bias;
-                }
-            }
+            dequantize(intptr, ptr, scale, bias, w);
         }
     }
 
     if (dims == 3)
     {
-        int w = bottom_blob.w;
-        int h = bottom_blob.h;
-        int channels = bottom_blob.c;
-        int size = w * h;
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const int* intptr = bottom_blob.channel(q);
+            float* ptr = top_blob.channel(q);
 
-        top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator);
-        if (top_blob.empty())
-            return -100;
+            const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
+            const float bias = bias_data_size == 0 ? 0.f : bias_data_size == 1 ? bias_data[0] : bias_data[q];
 
-        if (bias_data_size == 0)
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const int* intptr = bottom_blob.channel(q);
-                float* ptr = top_blob.channel(q);
-
-                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
-
-                for (int i = 0; i < size; i++)
-                {
-                    ptr[i] = intptr[i] * scale;
-                }
-            }
-        }
-        else
-        {
-            #pragma omp parallel for num_threads(opt.num_threads)
-            for (int q = 0; q < channels; q++)
-            {
-                const int* intptr = bottom_blob.channel(q);
-                float* ptr = top_blob.channel(q);
-
-                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
-                const float bias = bias_data_size == 1 ?
bias_data[0] : bias_data[q]; - - for (int i = 0; i < size; i++) - { - ptr[i] = intptr[i] * scale + bias; - } - } + dequantize(intptr, ptr, scale, bias, w * h); } } diff --git a/src/layer/loongarch/dequantize_loongarch.cpp b/src/layer/loongarch/dequantize_loongarch.cpp index 5ee9595f89f..f2553cb3fb7 100644 --- a/src/layer/loongarch/dequantize_loongarch.cpp +++ b/src/layer/loongarch/dequantize_loongarch.cpp @@ -29,806 +29,145 @@ Dequantize_loongarch::Dequantize_loongarch() #endif } -int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack) { - // assert bottom_blob.elembits() == 32 + const int scale_data_size = scale_data.w; + const int bias_data_size = bias_data.w; + const int size = elemcount * elempack; - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + // NCNN_LOGE("dequantize %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack); + float scale = scale_data[0]; #if __loongarch_sx - if (elempack == 8) + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int outw = w * 2; - - top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, 
ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } - } + _scale = (__m128)__lsx_vld((const float*)scale_data, 0); } + } +#endif // __loongarch_sx - if (dims == 2) + if (bias_data_size == 0) + { + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int outh = h * 2; - - top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr0 = top_blob.row(i * 2); - float* ptr1 = top_blob.row(i * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); - __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - __lsx_vst(_v0, ptr0, 0); - __lsx_vst(_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr0 = top_blob.row(i * 2); - float* ptr1 = top_blob.row(i * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); - __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - __lsx_vst(_v0, ptr0, 0); - __lsx_vst(_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __loongarch_sx + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = channels * 2; - - top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); - __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - _v2 = __lsx_vfmul_s(_v2, _scale0); - _v3 = __lsx_vfmul_s(_v3, _scale1); - __lsx_vst(_v0, ptr0, 0); - __lsx_vst(_v2, ptr0 + 4, 0); - __lsx_vst(_v1, ptr1, 0); - __lsx_vst(_v3, ptr1 + 4, 0); - - intptr += 16; - ptr0 += 8; - ptr1 += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale0); - _v1 = __lsx_vfmul_s(_v1, _scale1); - __lsx_vst(_v0, ptr0, 0); - __lsx_vst(_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); - - __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); - __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); - __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); - __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); - __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); - _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); - __lsx_vst(_v0, ptr0, 0); - __lsx_vst(_v2, ptr0 + 4, 0); - __lsx_vst(_v1, ptr1, 0); - __lsx_vst(_v3, ptr1 + 4, 0); - - intptr += 16; - ptr0 += 8; - ptr1 += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); - _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); - __lsx_vst(_v0, ptr0, 0); - __lsx_vst(_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + *ptr = *intptr * scale; + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __loongarch_sx + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + if (bias_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } - } - else + if (elempack == 4) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = (__m128)__lsx_vld((const 
float*)scale_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - } - } + _bias = (__m128)__lsx_vld((const float*)bias_data, 0); } } +#endif // __loongarch_sx - if (dims == 2) + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); - __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - - intptr += 4; - ptr += 4; - } - } - } + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __loongarch_sx + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - __m128 _scale = scale_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale); - _v1 = __lsx_vfmul_s(_v1, _scale); - __lsx_vst(_v0, ptr, 0); - __lsx_vst(_v1, ptr + 4, 0); - - intptr += 8; - ptr += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); - __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); - _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); - __lsx_vst(_v0, ptr, 0); - __lsx_vst(_v1, ptr + 4, 0); + *ptr = *intptr * scale + bias; + intptr++; + ptr++; + } + } +} - intptr += 8; - ptr += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); +int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 - intptr += 4; - ptr += 4; - } - } - } - } + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; - return 0; - } -#endif // __loongarch_sx + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; if (dims == 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - float* ptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int* intptr = (const int*)bottom_blob + i * elempack; + float* ptr = (float*)top_blob + i * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias_data[i]; - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i]; - } 
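// [editor's note] The new dims == 1 path above no longer parallelizes per
// element; it slices the width into one contiguous block per thread and calls
// the shared dequantize kernel once per block. A standalone sketch of just
// that partitioning arithmetic (the values of w and num_threads are made up
// for illustration):
#include <algorithm>
#include <cstdio>

int main()
{
    const int w = 10;          // hypothetical 1D blob width
    const int num_threads = 4; // hypothetical opt.num_threads
    const int wp = std::max(1, w / num_threads); // elements per block
    const int nn_w = (w + wp - 1) / wp;          // number of blocks
    for (int ii = 0; ii < nn_w; ii++)
    {
        const int i = ii * wp;
        const int size = std::min(w - i, wp); // last block may be short
        printf("block %d covers elements [%d, %d)\n", ii, i, i + size);
    }
    return 0;
}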
- } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + // assert scale_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; - } - } + const int size = std::min(w - i, wp) * elempack; + + dequantize(intptr, ptr, scale_data, bias_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - int j = 0; -#if __loongarch_sx - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); - for (; j + 3 < w; j += 4) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); - - intptr += 4; - ptr += 4; - } -#endif // __loongarch_sx - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; - - int j = 0; -#if __loongarch_sx - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); - for (; j + 3 < w; j += 4) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - - intptr += 4; - ptr += 4; - } -#endif // __loongarch_sx - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale + bias; - } - } + dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); - int i = 0; -#if __loongarch_sx - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); - for (; i + 7 < size; i += 8) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmul_s(_v0, _scale); - _v1 = __lsx_vfmul_s(_v1, _scale); - __lsx_vst(_v0, ptr, 0); - __lsx_vst(_v1, ptr + 4, 0); - - intptr += 8; - ptr += 8; - } - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmul_s(_v, _scale); - __lsx_vst(_v, ptr, 0); + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - intptr += 4; - ptr += 4; - } -#endif // __loongarch_sx - for (; i < size; i++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; - - int i = 0; -#if __loongarch_sx - __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); - __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); - for (; i + 7 < size; i += 8) - { - __builtin_prefetch(intptr + 32); - __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); - _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); - _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); - __lsx_vst(_v0, ptr, 0); - __lsx_vst(_v1, ptr + 4, 0); - - intptr += 8; - ptr += 8; - } - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 16); - __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); - _v = __lsx_vfmadd_s(_scale, _v, _bias); - __lsx_vst(_v, ptr, 0); - - intptr += 4; - ptr += 4; - } -#endif // __loongarch_sx - for (; i < size; i++) - { - *ptr++ = *intptr++ * scale + bias; - } - } + dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack); } } diff --git a/src/layer/mips/dequantize_mips.cpp b/src/layer/mips/dequantize_mips.cpp index aa11a8fe9ca..5ab3ed47e5a 100644 --- a/src/layer/mips/dequantize_mips.cpp +++ b/src/layer/mips/dequantize_mips.cpp @@ -29,806 +29,145 @@ Dequantize_mips::Dequantize_mips() #endif } -int Dequantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack) { - // assert bottom_blob.elembits() == 32 + const int scale_data_size = scale_data.w; + const int bias_data_size = bias_data.w; + const int size = elemcount * elempack; - int dims = bottom_blob.dims; - int elempack = bottom_blob.elempack; + // NCNN_LOGE("dequantize %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack); + float scale = scale_data[0]; #if __mips_msa - if (elempack == 8) + v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); + if (scale_data_size > 1) { - if (dims == 1) + if (elempack == 4) { - int w = bottom_blob.w; - int outw = w * 2; - - top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale_data[0]); - - 
if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _scale = (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _scale = (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < outw; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _scale = (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - } + _scale = (v4f32)__msa_ld_w((const float*)scale_data, 0); } + } +#endif // __mips_msa - if (dims == 2) + if (bias_data_size == 0) + { + int i = 0; +#if __mips_msa + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int outh = h * 2; - - top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr0 = top_blob.row(i * 2); - float* ptr1 = top_blob.row(i * 2 + 1); - - v4f32 _scale0 = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + i * 8, 0); - v4f32 _scale1 = scale_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - __msa_st_w((v4i32)_v0, ptr0, 0); - __msa_st_w((v4i32)_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr0 = top_blob.row(i * 2); - float* ptr1 = top_blob.row(i * 2 + 1); - - v4f32 _scale0 = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + i * 8, 0); - v4f32 _scale1 = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + i * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 8 + 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - __msa_st_w((v4i32)_v0, ptr0, 0); - __msa_st_w((v4i32)_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + __builtin_prefetch(intptr + 16); + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmul_w(_v, _scale); + __msa_st_w((v4i32)_v, ptr, 0); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __mips_msa + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - int outc = channels * 2; - - top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); - - v4f32 _scale0 = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + q * 8, 0); - v4f32 _scale1 = scale_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + q * 8 + 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - _v2 = __msa_fmul_w(_v2, _scale0); - _v3 = __msa_fmul_w(_v3, _scale1); - __msa_st_w((v4i32)_v0, ptr0, 0); - __msa_st_w((v4i32)_v2, ptr0 + 4, 0); - __msa_st_w((v4i32)_v1, ptr1, 0); - __msa_st_w((v4i32)_v3, ptr1 + 4, 0); - - intptr += 16; - ptr0 += 8; - ptr1 += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale0); - _v1 = __msa_fmul_w(_v1, _scale1); - __msa_st_w((v4i32)_v0, ptr0, 0); - __msa_st_w((v4i32)_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr0 = top_blob.channel(q * 2); - float* ptr1 = top_blob.channel(q * 2 + 1); - - v4f32 _scale0 = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + q * 8, 0); - v4f32 _scale1 = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + q * 8 + 4, 0); - v4f32 _bias0 = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8, 0); - v4f32 _bias1 = bias_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 8 + 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 64); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - v4f32 _v2 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 8, 0)); - v4f32 _v3 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 12, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - _v2 = __msa_fmadd_w(_bias0, _v2, _scale0); - _v3 = __msa_fmadd_w(_bias1, _v3, _scale1); - __msa_st_w((v4i32)_v0, ptr0, 0); - __msa_st_w((v4i32)_v2, ptr0 + 4, 0); - __msa_st_w((v4i32)_v1, ptr1, 0); - __msa_st_w((v4i32)_v3, ptr1 + 4, 0); - - intptr += 16; - ptr0 += 8; - ptr1 += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias0, _v0, _scale0); - _v1 = __msa_fmadd_w(_bias1, _v1, _scale1); - __msa_st_w((v4i32)_v0, ptr0, 0); - __msa_st_w((v4i32)_v1, ptr1, 0); - - intptr += 8; - ptr0 += 4; - ptr1 += 4; - } - } - } + *ptr = *intptr * scale; + intptr++; + ptr++; } - - return 0; } - - if (elempack == 4) + else { - if (dims == 1) + float bias = bias_data[0]; +#if __mips_msa + v4f32 _bias = (v4f32)__msa_fill_w_f32(bias); + if (bias_data_size > 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (scale_data_size == 1) - { - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale_data[0]); - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - } - else + if (elempack == 4) { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _scale = (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else if (bias_data_size == 1) - { - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const 
int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _scale = (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - const int* intptr = (const int*)bottom_blob + i * 4; - float* ptr = (float*)top_blob + i * 4; - - v4f32 _scale = (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _bias = (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - } - } + _bias = (v4f32)__msa_ld_w((const float*)bias_data, 0); } } +#endif // __mips_msa - if (dims == 2) + int i = 0; +#if __mips_msa + for (; i + 3 < size; i += 4) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - v4f32 _scale = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - v4f32 _scale = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + i * 4, 0); - v4f32 _bias = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + i * 4, 0); - - for (int j = 0; j < w; j++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - - intptr += 4; - ptr += 4; - } - } - } + __builtin_prefetch(intptr + 16); + v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); + _v = __msa_fmadd_w(_bias, _v, _scale); + __msa_st_w((v4i32)_v, ptr, 0); + intptr += 4; + ptr += 4; } - - if (dims == 3) +#endif // __mips_msa + for (; i < size; i++) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - v4f32 _scale = scale_data_size == 1 ? 
(v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + q * 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale); - _v1 = __msa_fmul_w(_v1, _scale); - __msa_st_w((v4i32)_v0, ptr, 0); - __msa_st_w((v4i32)_v1, ptr + 4, 0); - - intptr += 8; - ptr += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - - intptr += 4; - ptr += 4; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - v4f32 _scale = scale_data_size == 1 ? (v4f32)__msa_fill_w_f32(scale_data[0]) : (v4f32)__msa_ld_w((const float*)scale_data + q * 4, 0); - v4f32 _bias = bias_data_size == 1 ? (v4f32)__msa_fill_w_f32(bias_data[0]) : (v4f32)__msa_ld_w((const float*)bias_data + q * 4, 0); - - int i = 0; - for (; i + 1 < size; i += 2) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias, _v0, _scale); - _v1 = __msa_fmadd_w(_bias, _v1, _scale); - __msa_st_w((v4i32)_v0, ptr, 0); - __msa_st_w((v4i32)_v1, ptr + 4, 0); + *ptr = *intptr * scale + bias; + intptr++; + ptr++; + } + } +} - intptr += 8; - ptr += 8; - } - for (; i < size; i++) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); +int Dequantize_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 - intptr += 4; - ptr += 4; - } - } - } - } + const int dims = bottom_blob.dims; + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; - return 0; - } -#endif // __mips_msa + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; if (dims == 1) { - int w = bottom_blob.w; - - top_blob.create(w, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int* intptr = bottom_blob; - float* ptr = top_blob; + const int wp = std::max(1, w / opt.num_threads); + const int nn_w = (w + wp - 1) / wp; - if (scale_data_size == 1) + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_w; ii++) { - const float scale = scale_data[0]; + const int i = ii * wp; - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + const int* intptr = (const int*)bottom_blob + i * elempack; + float* ptr = (float*)top_blob + i * elempack; - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale + bias_data[i]; - } - } - } - else - { - if (bias_data_size == 0) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; 
i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i]; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_data[0]; + // assert scale_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias; - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; - } - } + const int size = std::min(w - i, wp) * elempack; + + dequantize(intptr, ptr, scale_data, bias_data, size, 1); } } if (dims == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - - top_blob.create(w, h, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - int j = 0; -#if __mips_msa - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); - for (; j + 3 < w; j += 4) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - - intptr += 4; - ptr += 4; - } -#endif // __mips_msa - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - const int* intptr = bottom_blob.row(i); - float* ptr = top_blob.row(i); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; - - int j = 0; -#if __mips_msa - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias); - for (; j + 3 < w; j += 4) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - - intptr += 4; - ptr += 4; - } -#endif // __mips_msa - for (; j < w; j++) - { - *ptr++ = *intptr++ * scale + bias; - } - } + dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack); } } if (dims == 3) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - int size = w * h; - - top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (bias_data_size == 0) + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); - int i = 0; -#if __mips_msa - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); - for (; i + 7 < size; i += 8) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmul_w(_v0, _scale); - _v1 = __msa_fmul_w(_v1, _scale); - __msa_st_w((v4i32)_v0, ptr, 0); - __msa_st_w((v4i32)_v1, ptr + 4, 0); - - intptr += 8; - ptr += 8; - } - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmul_w(_v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - intptr += 4; - ptr += 4; - } -#endif // __mips_msa - for (; i < size; i++) - { - *ptr++ = *intptr++ * scale; - } - } - } - else - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int* intptr = bottom_blob.channel(q); - float* ptr = top_blob.channel(q); - - const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; - const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; - - int i = 0; -#if __mips_msa - v4f32 _scale = (v4f32)__msa_fill_w_f32(scale); - v4f32 _bias = (v4f32)__msa_fill_w_f32(bias); - for (; i + 7 < size; i += 8) - { - __builtin_prefetch(intptr + 32); - v4f32 _v0 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - v4f32 _v1 = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr + 4, 0)); - _v0 = __msa_fmadd_w(_bias, _v0, _scale); - _v1 = __msa_fmadd_w(_bias, _v1, _scale); - __msa_st_w((v4i32)_v0, ptr, 0); - __msa_st_w((v4i32)_v1, ptr + 4, 0); - - intptr += 8; - ptr += 8; - } - for (; i + 3 < size; i += 4) - { - __builtin_prefetch(intptr + 16); - v4f32 _v = (v4f32)__msa_ffint_s_w(__msa_ld_w(intptr, 0)); - _v = __msa_fmadd_w(_bias, _v, _scale); - __msa_st_w((v4i32)_v, ptr, 0); - - intptr += 4; - ptr += 4; - } -#endif // __mips_msa - for (; i < size; i++) - { - *ptr++ = *intptr++ * scale + bias; - } - } + dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack); } } diff --git a/src/layer/x86/dequantize_x86.cpp b/src/layer/x86/dequantize_x86.cpp index 3cc4b1805aa..272b820da90 100644 --- a/src/layer/x86/dequantize_x86.cpp +++ b/src/layer/x86/dequantize_x86.cpp @@ -40,45 +40,27 @@ static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, con // NCNN_LOGE("dequantize %d %d %d %d", scale_data_size, bias_data_size, elemcount, elempack); - const float* scale_ptr = scale_data; - - float scale = 0.f; + float scale = scale_data[0]; #if __SSE2__ - __m128 _scale = _mm_setzero_ps(); + __m128 _scale = _mm_set1_ps(scale); #if __AVX__ - __m256 _scale_avx = _mm256_setzero_ps(); + __m256 _scale_avx = _mm256_set1_ps(scale); #if __AVX512F__ - __m512 _scale_avx512 = _mm512_setzero_ps(); + __m512 _scale_avx512 = _mm512_set1_ps(scale); #endif // __AVX512F__ #endif // __AVX__ -#endif // __SSE2__ - - if (scale_data_size == 1 || elempack == 1) + if (scale_data_size > 1) { - scale = scale_ptr[0]; -#if __SSE2__ - _scale = _mm_set1_ps(scale); -#if __AVX__ - _scale_avx = _mm256_set1_ps(scale); -#if __AVX512F__ - _scale_avx512 = _mm512_set1_ps(scale); -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ - } - else - { -#if __SSE2__ #if __AVX__ #if 
__AVX512F__ if (elempack == 16) { - _scale_avx512 = _mm512_loadu_ps(scale_ptr); + _scale_avx512 = _mm512_loadu_ps((const float*)scale_data); } #endif // __AVX512F__ if (elempack == 8) { - _scale_avx = _mm256_loadu_ps(scale_ptr); + _scale_avx = _mm256_loadu_ps((const float*)scale_data); #if __AVX512F__ _scale_avx512 = combine8x2_ps(_scale_avx, _scale_avx); #endif // __AVX512F__ @@ -86,7 +68,7 @@ static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, con #endif // __AVX__ if (elempack == 4) { - _scale = _mm_loadu_ps(scale_ptr); + _scale = _mm_loadu_ps((const float*)scale_data); #if __AVX__ _scale_avx = combine4x2_ps(_scale, _scale); #if __AVX512F__ @@ -94,8 +76,8 @@ static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, con #endif // __AVX512F__ #endif // __AVX__ } -#endif // __SSE2__ } +#endif // __SSE2__ if (bias_data_size == 0) { @@ -139,45 +121,27 @@ static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, con } else { - const float* bias_ptr = bias_data; - - float bias = 0.f; + float bias = bias_data[0]; #if __SSE2__ - __m128 _bias = _mm_setzero_ps(); + __m128 _bias = _mm_set1_ps(bias); #if __AVX__ - __m256 _bias_avx = _mm256_setzero_ps(); + __m256 _bias_avx = _mm256_set1_ps(bias); #if __AVX512F__ - __m512 _bias_avx512 = _mm512_setzero_ps(); + __m512 _bias_avx512 = _mm512_set1_ps(bias); #endif // __AVX512F__ #endif // __AVX__ -#endif // __SSE2__ - - if (bias_data_size == 1 || elempack == 1) - { - bias = bias_ptr[0]; -#if __SSE2__ - _bias = _mm_set1_ps(bias); -#if __AVX__ - _bias_avx = _mm256_set1_ps(bias); -#if __AVX512F__ - _bias_avx512 = _mm512_set1_ps(bias); -#endif // __AVX512F__ -#endif // __AVX__ -#endif // __SSE2__ - } - else + if (bias_data_size > 1) { -#if __SSE2__ #if __AVX__ #if __AVX512F__ if (elempack == 16) { - _bias_avx512 = _mm512_loadu_ps(bias_ptr); + _bias_avx512 = _mm512_loadu_ps((const float*)bias_data); } #endif // __AVX512F__ if (elempack == 8) { - _bias_avx = _mm256_loadu_ps(bias_ptr); + _bias_avx = _mm256_loadu_ps((const float*)bias_data); #if __AVX512F__ _bias_avx512 = combine8x2_ps(_bias_avx, _bias_avx); #endif // __AVX512F__ @@ -185,7 +149,7 @@ static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, con #endif // __AVX__ if (elempack == 4) { - _bias = _mm_loadu_ps(bias_ptr); + _bias = _mm_loadu_ps((const float*)bias_data); #if __AVX__ _bias_avx = combine4x2_ps(_bias, _bias); #if __AVX512F__ @@ -193,8 +157,8 @@ static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, con #endif // __AVX512F__ #endif // __AVX__ } -#endif // __SSE2__ } +#endif // __SSE2__ int i = 0; #if __SSE2__ @@ -261,62 +225,12 @@ int Dequantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& const int* intptr = (const int*)bottom_blob + i * elempack; float* ptr = (float*)top_blob + i * elempack; - const float* scale_ptr = scale_data_size > 1 ? (const float*)scale_data + i * elempack : scale_data; - const float* bias_ptr = bias_data_size > 1 ? 
(const float*)bias_data + i * elempack : bias_data; + // assert scale_data_size == 1 + // assert bias_data_size == 0 || bias_data_size == 1 const int size = std::min(w - i, wp) * elempack; - if (scale_data_size == 1) - { - const float scale = scale_ptr[0]; - if (bias_data_size == 0) - { - for (int j = 0; j < size; j++) - { - ptr[j] = intptr[j] * scale; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_ptr[0]; - for (int j = 0; j < size; j++) - { - ptr[j] = intptr[j] * scale + bias; - } - } - else - { - for (int j = 0; j < size; j++) - { - ptr[j] = intptr[j] * scale + bias_ptr[j]; - } - } - } - else - { - if (bias_data_size == 0) - { - for (int j = 0; j < size; j++) - { - ptr[j] = intptr[j] * scale_ptr[j]; - } - } - else if (bias_data_size == 1) - { - const float bias = bias_ptr[0]; - for (int j = 0; j < size; j++) - { - ptr[j] = intptr[j] * scale_ptr[j] + bias; - } - } - else - { - for (int j = 0; j < size; j++) - { - ptr[j] = intptr[j] * scale_ptr[j] + bias_ptr[j]; - } - } - } + dequantize(intptr, ptr, scale_data, bias_data, size, 1); } } diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp index 8ac6382762a..178d1acebab 100644 --- a/tests/test_dequantize.cpp +++ b/tests/test_dequantize.cpp @@ -96,30 +96,14 @@ static int test_dequantize_1() static int test_dequantize_2() { return 0 - || test_dequantize(RandomIntMat(128), 1, 128) || test_dequantize(RandomIntMat(128), 1, 1) || test_dequantize(RandomIntMat(128), 1, 0) - || test_dequantize(RandomIntMat(128), 128, 128) - || test_dequantize(RandomIntMat(128), 128, 1) - || test_dequantize(RandomIntMat(128), 128, 0) - || test_dequantize(RandomIntMat(120), 1, 120) || test_dequantize(RandomIntMat(120), 1, 1) || test_dequantize(RandomIntMat(120), 1, 0) - || test_dequantize(RandomIntMat(120), 120, 120) - || test_dequantize(RandomIntMat(120), 120, 1) - || test_dequantize(RandomIntMat(120), 120, 0) - || test_dequantize(RandomIntMat(124), 1, 124) || test_dequantize(RandomIntMat(124), 1, 1) || test_dequantize(RandomIntMat(124), 1, 0) - || test_dequantize(RandomIntMat(124), 124, 124) - || test_dequantize(RandomIntMat(124), 124, 1) - || test_dequantize(RandomIntMat(124), 124, 0) - || test_dequantize(RandomIntMat(127), 1, 127) || test_dequantize(RandomIntMat(127), 1, 1) - || test_dequantize(RandomIntMat(127), 1, 0) - || test_dequantize(RandomIntMat(127), 127, 127) - || test_dequantize(RandomIntMat(127), 127, 1) - || test_dequantize(RandomIntMat(127), 127, 0); + || test_dequantize(RandomIntMat(127), 1, 0); } int main()
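[editor's note] The trimmed test matrix keeps only scale_data_size == 1 with
bias_data_size in {0, 1} for these 1-D inputs; every removed row (e.g.
"RandomIntMat(128), 128, 128") exercised a per-element 1-D scale/bias
combination that this patch disallows, matching the "assert scale_data_size == 1"
comments in the rewritten dims == 1 paths. A hedged sketch of the invariant the
layer now assumes for 1-D blobs; the helper name is illustrative, not part of ncnn:

static bool dequantize_1d_params_ok(int scale_data_size, int bias_data_size)
{
    // after this patch: one scale for the whole 1-D blob,
    // and either no bias or a single broadcast bias
    return scale_data_size == 1 && (bias_data_size == 0 || bias_data_size == 1);
}

Per-row and per-channel scale/bias for dims >= 2 blobs remain supported via the
Mat::range() slices passed to the shared dequantize() helper.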