From 42fa774c6fb60f5302de272025bbed4f47aa7537 Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Fri, 21 Jun 2024 17:32:29 +0800
Subject: [PATCH] fix memory leak, set lower extra memory size.

---
 bestla/bestla/bestla_prologue_b.h           | 7 +++++++
 neural_speed/core/layers/ne_bestla_sycl.cpp | 3 +++
 neural_speed/models/llama/llama_utils.cpp   | 4 ++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h
index 09d93b5d0..0c78239ee 100644
--- a/bestla/bestla/bestla_prologue_b.h
+++ b/bestla/bestla/bestla_prologue_b.h
@@ -133,6 +133,12 @@ class WeightKBlockNInteger {
     transposeWeight(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
     compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr(), srcstor.mDType,
                    threading);
+    if (s8buf) {
+      utils::afree(s8buf);
+    }
+    if (s8transbuf) {
+      utils::afree(s8transbuf);
+    }
     int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
     if (srcstor.mCorrection.mScaEleSize == 4) {
       transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
@@ -141,6 +147,7 @@ class WeightKBlockNInteger {
       transposeWeight<utils::bf16>(nk_scale, srcstor.mNPad, srcstor.template SPtr<utils::bf16>(), srcstor.mNPad,
                                    dststor.template SPtr<utils::bf16>(), dststor.CStep<utils::bf16>(), threading);
     }
+  }
 
   AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
                                  utils::aligned_vector<float>* dq_buf) {
diff --git a/neural_speed/core/layers/ne_bestla_sycl.cpp b/neural_speed/core/layers/ne_bestla_sycl.cpp
index 267dc7d5c..170f5c503 100644
--- a/neural_speed/core/layers/ne_bestla_sycl.cpp
+++ b/neural_speed/core/layers/ne_bestla_sycl.cpp
@@ -138,6 +138,9 @@ void bestla_device_load_storage(void* hoststor, void* devstor, void* deviceptr,
       dstor->fromHost(transtor, (sycl::queue*)device_queue);
     }
   }
+  if (ptr) {
+    delete ptr;
+  }
 }
 
 template
diff --git a/neural_speed/models/llama/llama_utils.cpp b/neural_speed/models/llama/llama_utils.cpp
index d3ac41d54..cc636581e 100644
--- a/neural_speed/models/llama/llama_utils.cpp
+++ b/neural_speed/models/llama/llama_utils.cpp
@@ -97,8 +97,8 @@ void Llama::load(model_context* ctx, model_progress_callback progress_callback,
   int n_cpu_layer = n_layer - n_gpu_layer;
   n_cpu_layer = n_cpu_layer < 0 ? 0 : n_cpu_layer;
   fprintf(stderr, "%s: ctx size = %7.2f MB\n", __func__, ctx_size / 1024.0 / 1024.0);
-  auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
-  auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + n_embd * n_vocab * sizeof(float);
+  auto host_size = (ctx_size + (50 << 20)) * n_cpu_layer / n_layer + (50 << 20);
+  auto device_size = (ctx_size + (50 << 20)) * n_gpu_layer / n_layer + (50 << 20);
   fprintf(stderr, "%s: host ctx size = %7.2f MB\n", __func__, host_size / 1024.0 / 1024.0);
 #ifdef NS_SYCL
   fprintf(stderr, "%s: device ctx size = %7.2f MB\n", __func__, device_size / 1024.0 / 1024.0);