Skip to content

Commit

Permalink
fix gpu ut
Browse files Browse the repository at this point in the history
Signed-off-by: n1ck-guo <heng.guo@intel.com>
  • Loading branch information
n1ck-guo committed Dec 6, 2024
1 parent ecc17be commit ee1f6fa
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 13 deletions.
2 changes: 2 additions & 0 deletions auto_round/mllm/autoround_mllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from ..utils import (
logger,
detect_device,
to_device,
to_dtype,
get_multimodal_block_names,
Expand All @@ -34,6 +35,7 @@
def _only_text_test(model, tokenizer, device):
"""Test whether the model can use text-only datasets."""
try:
device = detect_device(device)
text = ["only text", "test"]
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
Expand Down
6 changes: 5 additions & 1 deletion test_cuda/test_support_vlms.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def test_qwen2(self):
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])
shutil.rmtree(quantized_model_path, ignore_errors=True)

def test_phi3(self):
model_path = "/models/Phi-3.5-vision-instruct/"
Expand Down Expand Up @@ -129,8 +130,8 @@ def test_phi3(self):
response = processor.batch_decode(generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)[0]

print(response)
shutil.rmtree(quantized_model_path, ignore_errors=True)

def test_llava(self):
model_path = "/models/llava-v1.5-7b/"
Expand Down Expand Up @@ -166,6 +167,7 @@ class DataArgs:

output = model.generate(inputs['input_ids'].to(model.device), images=image_input.unsqueeze(0).half(), max_new_tokens=50)
print(tokenizer.batch_decode(output))
shutil.rmtree(quantized_model_path, ignore_errors=True)

def test_llama(self):
model_path = "/models/Llama-3.2-11B-Vision-Instruct/"
Expand Down Expand Up @@ -204,6 +206,7 @@ def test_llama(self):

output = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output[0]))
shutil.rmtree(quantized_model_path, ignore_errors=True)

def test_cogvlm(self):
model_path = "/models/cogvlm2-llama3-chat-19B/"
Expand Down Expand Up @@ -257,6 +260,7 @@ def test_cogvlm(self):
response = tokenizer.decode(outputs[0])
response = response.split("<|end_of_text|>")[0]
print(response)
shutil.rmtree(quantized_model_path, ignore_errors=True)

if __name__ == "__main__":
unittest.main()
76 changes: 64 additions & 12 deletions test_cuda/test_vlms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import unittest
import re
import os

sys.path.insert(0, "..")

Expand Down Expand Up @@ -38,7 +39,7 @@ def tearDownClass(self):
# assert (
# res == """<s> There is a girl who likes adventure, and she is looking for a partner to go on a treasure hunt. She has found a map that leads to a hidden treasure, but she needs a partner to help her decipher the clues and find the treasure. You""")

def inference(self, quantized_model_dir):
def qwen_inference(self, quantized_model_dir):
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
processor = AutoProcessor.from_pretrained(quantized_model_dir, trust_remote_code=True)
Expand Down Expand Up @@ -104,19 +105,68 @@ def test_vlm_tune(self):

quantized_model_path = self.save_dir
autoround.save_quantized(quantized_model_path, format='auto_round', inplace=False)
self.inference(quantized_model_path)
self.qwen_inference(quantized_model_path)
shutil.rmtree(self.save_dir, ignore_errors=True)
autoround.save_quantized(quantized_model_path, format='auto_gptq', inplace=False)
self.inference(quantized_model_path)
self.qwen_inference(quantized_model_path)
shutil.rmtree(self.save_dir, ignore_errors=True)

def phi3_infernece(self, quantized_model_dir):
    """Load the quantized Phi-3.5-vision checkpoint and run one image-description generation.

    Args:
        quantized_model_dir: Directory containing the
            ``Phi-3.5-vision-instruct-w4g128-auto_round`` sub-directory.

    The decoded model response is printed; nothing is returned.

    Raises:
        OSError: If one of the model's ``*.py`` files cannot be copied into
            the quantized model directory (e.g. the directory is missing).
    """
    import glob
    import shutil

    from transformers import AutoModelForCausalLM, AutoProcessor

    quantized_model_path = os.path.join(quantized_model_dir, "Phi-3.5-vision-instruct-w4g128-auto_round")

    # Copy the original model's Python files next to the quantized weights.
    # Presumably required because the checkpoint is loaded with
    # trust_remote_code=True below -- TODO confirm.
    # Done with glob/shutil rather than a shell `cp` so that paths containing
    # spaces or shell metacharacters work and a failed copy raises instead of
    # being silently ignored.
    for source_file in glob.glob("/models/Phi-3.5-vision-instruct/*.py"):
        shutil.copy(source_file, os.path.join(quantized_model_path, os.path.basename(source_file)))

    model = AutoModelForCausalLM.from_pretrained(
        quantized_model_path,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype="auto"
    )
    processor = AutoProcessor.from_pretrained(
        quantized_model_path,
        trust_remote_code=True,
        num_crops=4
    )

    image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
    content = "Describe this image."
    messages = [
        {"role": "user",
         "content": "<|image_1|>\n" + content},
    ]

    prompt = processor.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    image_inputs = Image.open(requests.get(image_url, stream=True).raw)
    inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)

    generation_args = {
        "max_new_tokens": 1000,
        "temperature": 0.0,
        "do_sample": False,
    }

    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args
    )

    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    print(response)

def test_quant_not_text(self):
from auto_round import AutoRoundMLLM
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

## load the model
model_name = "/models/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name = "/models/Phi-3.5-vision-instruct"
model = AutoModelForCausalLM.from_pretrained(
model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
Expand All @@ -127,19 +177,21 @@ def test_quant_not_text(self):
bits=bits, group_size=group_size, sym=sym, iters=1, nsamples=1,quant_nontext_module=True)
autoround.quantize()

quantized_model_path = "./saved"
autoround.save_quantized(quantized_model_path, format='auto_round', inplace=False)
self.inference(quantized_model_path)
quantized_model_path = "./saved/Phi-3.5-vision-instruct-w4g128-auto_round"
autoround.save_quantized(quantized_model_path, format='auto_round', inplace=False, safe_serialization=False)
self.phi3_infernece("./saved")
shutil.rmtree("./saved", ignore_errors=True)

def test_quant_not_text_fp_layers(self):
import os
python_path = sys.executable
absolute_path = os.path.abspath(self.save_dir)
res = os.system(
f"cd .. && {python_path} -m auto_round --mllm --model /models/Qwen2-VL-2B-Instruct --fp_layers model.layers.27,visual.blocks.29 --quant_nontext_module --iters 1 --nsamples 1 --output_dir {absolute_path}")
self.inference(os.path.join(absolute_path,"Qwen2-VL-2B-Instruct-w4g128-auto_round"))
shutil.rmtree(os.path.join(absolute_path,"Qwen2-VL-2B-Instruct-w4g128-auto_round"), ignore_errors=True)
f"cd .. && {python_path} -m auto_round --mllm --model /models/Phi-3.5-vision-instruct "
f"--fp_layers model.layers.27,model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16 "
f"--quant_nontext_module --iters 1 --nsamples 1 --output_dir {absolute_path}")
self.phi3_infernece(absolute_path)
shutil.rmtree(absolute_path, ignore_errors=True)



Expand Down

0 comments on commit ee1f6fa

Please sign in to comment.