From 42fa774c6fb60f5302de272025bbed4f47aa7537 Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Fri, 21 Jun 2024 17:32:29 +0800
Subject: [PATCH] fix memory leak, set lower extra memory size.

---
 bestla/bestla/bestla_prologue_b.h           | 7 +++++++
 neural_speed/core/layers/ne_bestla_sycl.cpp | 3 +++
 neural_speed/models/llama/llama_utils.cpp   | 4 ++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h
index 09d93b5d0..0c78239ee 100644
--- a/bestla/bestla/bestla_prologue_b.h
+++ b/bestla/bestla/bestla_prologue_b.h
@@ -133,6 +133,12 @@ class WeightKBlockNInteger {
     transposeWeight(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
     compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr(), srcstor.mDType,
                    threading);
+    if (s8buf) {
+      utils::afree(s8buf);
+    }
+    if (s8transbuf) {
+      utils::afree(s8transbuf);
+    }
     int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
     if (srcstor.mCorrection.mScaEleSize == 4) {
       transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
@@ -141,6 +147,7 @@ class WeightKBlockNInteger {
       transposeWeight<utils::bf16>(nk_scale, srcstor.mNPad, srcstor.template SPtr<utils::bf16>(), srcstor.mNPad,
                                    dststor.template SPtr<utils::bf16>(), dststor.CStep<utils::bf16>(), threading);
     }
+  }
 
   AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
                                  utils::aligned_vector<float>* dq_buf) {
diff --git a/neural_speed/core/layers/ne_bestla_sycl.cpp b/neural_speed/core/layers/ne_bestla_sycl.cpp
index 267dc7d5c..170f5c503 100644
--- a/neural_speed/core/layers/ne_bestla_sycl.cpp
+++ b/neural_speed/core/layers/ne_bestla_sycl.cpp
@@ -138,6 +138,9 @@ void bestla_device_load_storage(void* hoststor, void* devstor, void* deviceptr,
       dstor->fromHost(transtor, (sycl::queue*)device_queue);
     }
   }
+  if (ptr) {
+    delete ptr;
+  }
 }
 
 template
diff --git a/neural_speed/models/llama/llama_utils.cpp b/neural_speed/models/llama/llama_utils.cpp
index d3ac41d54..cc636581e 100644
--- a/neural_speed/models/llama/llama_utils.cpp
+++ b/neural_speed/models/llama/llama_utils.cpp
@@ -97,8 +97,8 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback,
   int n_cpu_layer = n_layer - n_gpu_layer;
   n_cpu_layer = n_cpu_layer < 0 ? 0 : n_cpu_layer;
   fprintf(stderr, "%s: ctx size = %7.2f MB\n", __func__, ctx_size / 1024.0 / 1024.0);
-  auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
-  auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
+  auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + (50 << 20);
+  auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + (50 << 20);
   fprintf(stderr, "%s: host ctx size = %7.2f MB\n", __func__, host_size / 1024.0 / 1024.0);
 #ifdef NS_SYCL
   fprintf(stderr, "%s: device ctx size = %7.2f MB\n", __func__, device_size / 1024.0 / 1024.0);