diff --git a/auto_round/mllm/autoround_mllm.py b/auto_round/mllm/autoround_mllm.py
index c355c058..243c1341 100644
--- a/auto_round/mllm/autoround_mllm.py
+++ b/auto_round/mllm/autoround_mllm.py
@@ -28,7 +28,8 @@
     clear_memory
 )
 from ..autoround import AutoRound
-from .template import get_template, Template, SUPPORT_ONLY_TEXT_MODELS
+from .template import get_template, Template
+from auto_round.special_model_handler import SUPPORT_ONLY_TEXT_MODELS
 from .mllm_dataset import get_mllm_dataloader
 from ..low_cpu_mem.utils import get_layers_before_block
 
@@ -41,8 +42,10 @@ def _only_text_test(model, tokenizer, device, model_type):
         device = detect_device(device)
         text = ["only text", "test"]
+        ori_padding_side = tokenizer.padding_side
         tokenizer.padding_side = 'left'
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        tokenizer.padding_side = ori_padding_side
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
@@ -53,19 +56,14 @@ def _only_text_test(model, tokenizer, device, model_type):
         return True
     except RuntimeError as e:
         if "CUDA out of memory" in str(e):
-            logger.warning(f"we strongly recommend using additional CUDA/HPU devices,e.g. "
-                           f"set `--device '0,1'` in our cmd line usage or "
-                           f"load the model with `device_mapping=auto`,"
-                           f" for optimal performance during calibration "
-                           f"Otherwise, the process may be significantly slower.")
             model = model.to("cpu")
             inputs = inputs.to("cpu")
             try:
-                model(**input)
+                model(**inputs)
             except:
                 return False
         return False
-    except:
+    except Exception as e:
         return False
 
 
diff --git a/auto_round/mllm/template.py b/auto_round/mllm/template.py
index 6c2a9f0e..08b4d9eb 100644
--- a/auto_round/mllm/template.py
+++ b/auto_round/mllm/template.py
@@ -22,15 +22,6 @@
 from .processor import BasicProcessor, PROCESSORS
 
 TEMPLATES: Dict[str, "Template"] = {}
 
-SUPPORT_ONLY_TEXT_MODELS = [
-    "phi3_v",
-    "cogvlm2",
-    "llava",
-    "qwen2_vl",
-    "deepseek_vl_v2",
-    "chatglm",
-    "idefics3"
-]
 
 def fill_content(target, **kwargs):
diff --git a/auto_round/special_model_handler.py b/auto_round/special_model_handler.py
index 36aa411a..95471c71 100644
--- a/auto_round/special_model_handler.py
+++ b/auto_round/special_model_handler.py
@@ -18,6 +18,16 @@
 mllms_with_limited_bs = ("llava", "qwen2_vl", "phi3_v", "mllama")  # Limitations on batch_size
 skippable_cache_keys = ("past_key_value",)
 
+SUPPORT_ONLY_TEXT_MODELS = [
+    "phi3_v",
+    "cogvlm2",
+    "llava",
+    "qwen2_vl",
+    "deepseek_vl_v2",
+    "chatglm",
+    "idefics3"
+]
+
 
 def to_device(input, device=torch.device("cpu")):
     """Moves input data to the specified device.
diff --git a/test_cuda/test_support_vlms.py b/test_cuda/test_support_vlms.py
index 81fc3f4a..ca854087 100644
--- a/test_cuda/test_support_vlms.py
+++ b/test_cuda/test_support_vlms.py
@@ -261,6 +261,16 @@ def test_cogvlm(self):
         response = response.split("<|end_of_text|>")[0]
         print(response)
         shutil.rmtree(quantized_model_path, ignore_errors=True)
+
+    def test_72b(self):
+        model_path = "/data5/models/Qwen2-VL-72B-Instruct/"
+        res = os.system(
+            f"cd .. && {self.python_path} -m auto_round --mllm "
+            f"--model {model_path} --iter 1 --nsamples 1 --output_dir {self.save_dir} --device {self.device}"
+        )
+        self.assertFalse(res > 0 or res == -1, msg="qwen2-72b tuning fail")
+        quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-72B-Instruct-w4g128-auto_round")
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file