
Commit

update

Signed-off-by: n1ck-guo <heng.guo@intel.com>
n1ck-guo committed Dec 27, 2024
1 parent 08d2888 commit 1562f39
Showing 4 changed files with 26 additions and 17 deletions.
14 changes: 6 additions & 8 deletions auto_round/mllm/autoround_mllm.py
@@ -28,7 +28,8 @@
     clear_memory
 )
 from ..autoround import AutoRound
-from .template import get_template, Template, SUPPORT_ONLY_TEXT_MODELS
+from .template import get_template, Template
+from auto_round.special_model_handler import SUPPORT_ONLY_TEXT_MODELS
 from .mllm_dataset import get_mllm_dataloader
 from ..low_cpu_mem.utils import get_layers_before_block

@@ -41,8 +42,10 @@ def _only_text_test(model, tokenizer, device, model_type):
 
     device = detect_device(device)
     text = ["only text", "test"]
+    ori_padding_side = tokenizer.padding_side
+    tokenizer.padding_side = 'left'
     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    tokenizer.padding_side = ori_padding_side
 
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -53,19 +56,14 @@ def _only_text_test(model, tokenizer, device, model_type):
         return True
     except RuntimeError as e:
         if "CUDA out of memory" in str(e):
-            logger.warning(f"we strongly recommend using additional CUDA/HPU devices, e.g. "
-                           f"set `--device '0,1'` in our cmd line usage or "
-                           f"load the model with `device_mapping=auto`, "
-                           f"for optimal performance during calibration; "
-                           f"otherwise, the process may be significantly slower.")
             model = model.to("cpu")
             inputs = inputs.to("cpu")
             try:
-                model(**input)
+                model(**inputs)
             except:
                 return False
         return False
-    except:
+    except Exception as e:
         return False
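The save-and-restore of tokenizer.padding_side above keeps the helper from leaking a global tokenizer setting. As a minimal sketch of the same pattern, assuming a Hugging Face-style tokenizer with a padding_side attribute (the context manager below is illustrative, not part of this commit), a try/finally restores the original side even if tokenization raises:

from contextlib import contextmanager

@contextmanager
def padding_side(tokenizer, side="left"):
    # Temporarily switch tokenizer.padding_side; always restore the original.
    original = tokenizer.padding_side
    tokenizer.padding_side = side
    try:
        yield tokenizer
    finally:
        tokenizer.padding_side = original

# Usage: batched prompts for decoder-only generation are typically left-padded.
# with padding_side(tokenizer, "left"):
#     inputs = tokenizer(["only text", "test"], return_tensors="pt", padding=True, truncation=True)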


9 changes: 0 additions & 9 deletions auto_round/mllm/template.py
@@ -22,15 +22,6 @@
 from .processor import BasicProcessor, PROCESSORS
 
 TEMPLATES: Dict[str, "Template"] = {}
-SUPPORT_ONLY_TEXT_MODELS = [
-    "phi3_v",
-    "cogvlm2",
-    "llava",
-    "qwen2_vl",
-    "deepseek_vl_v2",
-    "chatglm",
-    "idefics3"
-]
 
 
 def fill_content(target, **kwargs):
10 changes: 10 additions & 0 deletions auto_round/special_model_handler.py
@@ -18,6 +18,16 @@
 mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama")  # Limitations on batch_size
 skippable_cache_keys = ("past_key_value",)
 
+SUPPORT_ONLY_TEXT_MODELS = [
+    "phi3_v",
+    "cogvlm2",
+    "llava",
+    "qwen2_vl",
+    "deepseek_vl_v2",
+    "chatglm",
+    "idefics3"
+]
+
 def to_device(input, device=torch.device("cpu")):
     """Moves input data to the specified device.
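With the list now defined in auto_round.special_model_handler, both the template module and the MLLM tuning path can share one source of truth. A minimal usage sketch, assuming the caller already holds the model type key (the helper name below is illustrative, not part of this commit):

from auto_round.special_model_handler import SUPPORT_ONLY_TEXT_MODELS

def supports_text_only_calibration(model_type: str) -> bool:
    # Membership test against the shared list, e.g. "qwen2_vl" -> True, "mllama" -> False.
    return model_type in SUPPORT_ONLY_TEXT_MODELS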
10 changes: 10 additions & 0 deletions test_cuda/test_support_vlms.py
@@ -261,6 +261,16 @@ def test_cogvlm(self):
         response = response.split("<|end_of_text|>")[0]
         print(response)
         shutil.rmtree(quantized_model_path, ignore_errors=True)
 
+    def test_72b(self):
+        model_path = "/data5/models/Qwen2-VL-72B-Instruct/"
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 1 --nsamples 1 --output_dir {self.save_dir} --device {self.device}"
+        )
+        self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail")
+        quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-72B-Instruct-w4g128-auto_round")
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
+
 if __name__ == "__main__":
     unittest.main()
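One note on the exit-status check in test_72b: os.system returns a platform wait status, so res > 0 or res == -1 treats any nonzero status as a tuning failure. A hedged alternative sketch using subprocess, which exposes the exit code directly (command abbreviated; flags copied from the test above, not re-verified):

import subprocess

# shell=True keeps the "cd .. && ..." compound command from the test intact.
proc = subprocess.run(
    "cd .. && python -m auto_round --mllm "
    "--model /data5/models/Qwen2-VL-72B-Instruct/ --iter 1 --nsamples 1",
    shell=True,
)
assert proc.returncode == 0, "qwen2-72b tuning fail"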
