llama2 initial commits #11

Open
wants to merge 3 commits into base: llama2
Changes from all commits
Empty file added 1-build/*.py/config.properties
Empty file.
9 changes: 0 additions & 9 deletions 1-build/Dockerfile-base-arm

This file was deleted.

9 changes: 0 additions & 9 deletions 1-build/Dockerfile-base-graviton

This file was deleted.

19 changes: 19 additions & 0 deletions 1-build/Dockerfile-base-inf
@@ -0,0 +1,19 @@
FROM amazonlinux:2

LABEL description="Base container for Inferentia1 models"
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ADD ./1-build/etc /etc
RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo
RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
RUN yum update -y && \
yum install -y python3 python3-devel gcc-c++ && \
yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \
yum install -y aws-neuronx-tools-2.*
RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuron-cc[tensorflow] torch-neuron transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com
RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser
RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc
RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc
ADD ./1-build/*.py /app/

43 changes: 11 additions & 32 deletions 1-build/Dockerfile-base-inf2
@@ -1,42 +1,21 @@
FROM amazonlinux:2

LABEL description="Base container for Inferentia2 models"
LABEL description="Base container for Inferentia1 models"
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ADD ./1-build/etc /etc
# Neuron SDK components version numbers
ARG NEURONX_RUNTIME_LIB_VERSION=2.16.*
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.16.*
ARG NEURONX_TOOLS_VERSION=2.13.*
ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.10.*
ARG NEURONX_TRANSFORMERS_VERSION=0.6.*
ARG NEURONX_CC_VERSION=2.9.*
ARG TORCHSERVE_VERSION=0.8.2

RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo
RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
RUN amazon-linux-extras install -y python3.8
RUN yum update -y && \
yum install -y git tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \
yum install -y gcc-c++ && \
yum install -y jq java-11-amazon-corretto-headless # for torchserve
RUN yum install -y aws-neuronx-collectives-${NEURONX_COLLECTIVES_LIB_VERSION} && \
yum install -y aws-neuronx-runtime-lib-${NEURONX_RUNTIME_LIB_VERSION} && \
yum install -y aws-neuronx-tools-${NEURONX_TOOLS_VERSION}
ENV PATH="/opt/aws/neuron/bin:${PATH}"
RUN echo 'alias python=python3.8' >> ~/.bashrc
RUN echo 'alias pip=pip3.8' >> ~/.bashrc
RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3.8 1

RUN pip3.8 install --extra-index-url https://pip.repos.neuron.amazonaws.com \
neuronx-cc==$NEURONX_CC_VERSION \
torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION
RUN pip3.8 install "protobuf<4" \
&& pip3.8 install torchserve==${TORCHSERVE_VERSION} \
&& pip3.8 install torch-model-archiver==${TORCHSERVE_VERSION} \
&& pip3.8 install --no-deps --no-cache-dir -U torchvision==0.14.* captum==0.6.0 configparser

yum install -y python3 python3-devel gcc-c++ && \
yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \
yum install -y aws-neuronx-tools-2.*
RUN yum install -y aws-neuronx-collectives-2.* && \
yum install -y aws-neuronx-runtime-lib-2.*
RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuronx-cc[tensorflow] torch-neuronx transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com
RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser
RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc
RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc
ADD ./1-build/*.py /app/
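A quick way to sanity-check an image built from this Dockerfile is to import the packages the added yum/pip3 lines install. The sketch below is not part of the PR; it only assumes the packages above installed successfully and uses standard version attributes of torch and transformers.

# Hypothetical smoke test: run inside the inf2 base image to verify that the
# packages installed above (torch-neuronx, transformers) import cleanly.
import torch
import torch_neuronx   # Neuron PyTorch integration for Inferentia2
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)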

Empty file added 1-build/etc/hostname
Empty file.
Empty file added 1-build/etc/hosts
Empty file.
Empty file added 1-build/etc/resolv.conf
Empty file.
153 changes: 28 additions & 125 deletions 2-trace/model-tracer.py
@@ -1,127 +1,30 @@
######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0 #
######################################################################

import platform
import torch
import os
import importlib
import torch
from configparser import ConfigParser

machine=platform.uname().machine
device_type='cpu'
if machine == 'aarch64':
device_type='arm'

try:
import torch_neuron
device_type='inf1'
except ImportError:
print('[WARN] Torch Neuron not Found')
pass
try:
import torch_neuronx
device_type='inf2'
except ImportError:
print('[WARN] Torch Neuronx not Found')
pass

import os

# 1. READ config.properties
print("\nParsing configuration ...")
path_prefix = os.getcwd()
with open(path_prefix + '/../config.properties') as f:
config_lines = '[global]\n' + f.read()
f.close()
config = ConfigParser()
config.read_string(config_lines)

model_name = config['global']['huggingface_model_name']
tokenizer_class_name = config['global']['huggingface_tokenizer_class']
model_class_name = config['global']['huggingface_model_class']
sequence_length=int(config['global']['sequence_length'])
processor=config['global']['processor']
pipeline_cores=config['global']['pipeline_cores']
batch_size=int(config['global']['batch_size'])
test=config['global']['test']

question = "What does the little engine say?"

context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain.
Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story
about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is
pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could."""


# 2. LOAD PRE-TRAINED MODEL
print(f'\nLoading pre-trained model: {model_name}')
transformers = importlib.import_module("transformers")
tokenizer_class = getattr(transformers, tokenizer_class_name)
model_class = getattr(transformers, model_class_name)
tokenizer = tokenizer_class.from_pretrained(model_name)
model = model_class.from_pretrained(model_name, return_dict=False)

# 3. TOKENIZE THE INPUT
print('\nTokenizing input sample ...')
inputs = tokenizer.encode_plus(question,
context,
return_tensors="pt",
max_length=sequence_length,
padding='max_length',
truncation=True)
if device_type not in ['inf1', 'inf2']:
if torch.cuda.is_available():
device = torch.device("cuda")
device_type = "gpu"
model.to(device)
inputs.to(device)
else:
device = torch.device("cpu")

if device_type == processor:
print(f" ... Using device: {device_type}")
else:
print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})")

# 2. COMPILE THE MODEL
print('\nTracing model ...')
example_inputs = (
torch.cat([inputs['input_ids']] * batch_size,0),
torch.cat([inputs['attention_mask']] * batch_size,0)
)
os.makedirs(f'traced-{model_name}', exist_ok=True)
torch.set_num_threads(6)
if 'inf' == processor:
model_traced = torch.neuron.trace(model,
example_inputs,
verbose=1,
compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}',
compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)])
elif 'inf2' == processor:
model_traced = torch_neuronx.trace(model,
example_inputs)
else:
model_traced = torch.jit.trace(model, example_inputs)

# 3. TEST THE COMPILED MODEL (Optional)
if test.lower() == 'true':
print("\nTesting traced model ...")
print(f"Question: {question}")
# Testing the traced model
answer_logits = model_traced(*example_inputs)
answer_start = answer_logits[0].argmax().item()
answer_end = answer_logits[1].argmax().item()+1
answer_txt = ""
if answer_end > answer_start:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
else:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:]))
print(f'Model Answer: {answer_txt}')

# 4. SAVE THE COMPILED MODEL
print('\nSaving traced model ...')
model_path=f'./traced-{model_name}/{model_name}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}_{processor}.pt'
model_traced.save(model_path)

print(f'Done. Model saved as: {model_path}')
import os
import torch
from transformers_neuronx.llama.model import LlamaForSampling
from transformers import AutoModelForCausalLM
from transformers_neuronx.module import save_pretrained_split
tp_degree = 2
batch_size = 1
sequence_length = 256
amp_type = 'bf16'
os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree)
os.environ["NEURONX_CACHE"]= "on"
os.environ["NEURONX_DUMP_TO"] = f"./neuron_cache/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}"
# create a directory for model
model_dir = "/app/llama_model" # hugging face format
os.makedirs(model_dir, exist_ok=True)
# initialize the model
model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
# serialize the model
serialized_model_dir = os.path.join(model_dir, 'serialized')
os.makedirs(serialized_model_dir, exist_ok=True)
save_pretrained_split(model, serialized_model_dir)
# create neuron model
#transformers_neuronx = importlib.import_module("transformers_neuronx")
#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name)
neuron_model = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type)
# compile model for neuron
neuron_model.to_neuron()
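For context, a short continuation sketch (not part of this diff) showing how the compiled model above could be exercised. It reuses model_dir, sequence_length, and neuron_model from the script, assumes the Hugging Face tokenizer files are also present in /app/llama_model, and the prompt and top_k value are illustrative only.

# Continuation sketch: sample from the Neuron-compiled LLaMA model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_dir)   # assumes tokenizer files exist in model_dir
prompt = "What does the little engine say?"            # illustrative prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

with torch.inference_mode():
    generated = neuron_model.sample(input_ids,
                                    sequence_length=sequence_length,
                                    top_k=50)           # top_k chosen for illustration
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])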
124 changes: 124 additions & 0 deletions 2-trace/old_model-tracer.py
@@ -0,0 +1,124 @@
######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0 #
######################################################################

import torch
import importlib
from configparser import ConfigParser

device_type='cpu'

try:
import torch_neuron
device_type='inf1'
except ImportError:
print('[WARN] Torch Neuron not Found')
pass
try:
import torch_neuronx
device_type='inf2'
except ImportError:
print('[WARN] Torch Neuronx not Found')
pass

import os

# 1. READ config.properties
print("\nParsing configuration ...")
path_prefix = os.getcwd()
with open(path_prefix + '/../config.properties') as f:
config_lines = '[global]\n' + f.read()
f.close()
config = ConfigParser()
config.read_string(config_lines)

model_name = config['global']['huggingface_model_name']
tokenizer_class_name = config['global']['huggingface_tokenizer_class']
model_class_name = config['global']['huggingface_model_class']
sequence_length=int(config['global']['sequence_length'])
processor=config['global']['processor']
pipeline_cores=config['global']['pipeline_cores']
batch_size=int(config['global']['batch_size'])
test=config['global']['test']

question = "What does the little engine say?"

context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain.
Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story
about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is
pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could."""


# 2. LOAD PRE-TRAINED MODEL
print(f'\nLoading pre-trained model: {model_name}')
transformers = importlib.import_module("transformers")
tokenizer_class = getattr(transformers, tokenizer_class_name)
model_class = getattr(transformers, model_class_name)
tokenizer = tokenizer_class.from_pretrained(model_name)
model = model_class.from_pretrained(model_name, return_dict=False)

# 3. TOKENIZE THE INPUT
print('\nTokenizing input sample ...')
inputs = tokenizer.encode_plus(question,
context,
return_tensors="pt",
max_length=sequence_length,
padding='max_length',
truncation=True)
if device_type not in ['inf1', 'inf2']:
if torch.cuda.is_available():
device = torch.device("cuda")
device_type = "gpu"
model.to(device)
inputs.to(device)
else:
device = torch.device("cpu")
device_type = 'cpu'

if device_type == processor:
print(f" ... Using device: {device_type}")
else:
print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})")

# 2. COMPILE THE MODEL
print('\nTracing model ...')
example_inputs = (
torch.cat([inputs['input_ids']] * batch_size,0),
torch.cat([inputs['attention_mask']] * batch_size,0)
)
os.makedirs(f'traced-{model_name}', exist_ok=True)
torch.set_num_threads(6)
# Check inf2 first: 'inf' is a substring of 'inf2', so testing it first would
# send Inferentia2 configs down the torch.neuron (inf1) path.
if 'inf2' in processor:
    model_traced = torch_neuronx.trace(model,
                                       example_inputs)
elif 'inf' in processor:
    model_traced = torch.neuron.trace(model,
                                      example_inputs,
                                      verbose=1,
                                      compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}',
                                      compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)])
else:
    model_traced = torch.jit.trace(model, example_inputs)

# 3. TEST THE COMPILED MODEL (Optional)
if test.lower() == 'true':
print("\nTesting traced model ...")
print(f"Question: {question}")
# Testing the traced model
answer_logits = model_traced(*example_inputs)
answer_start = answer_logits[0].argmax().item()
answer_end = answer_logits[1].argmax().item()+1
answer_txt = ""
if answer_end > answer_start:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
else:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:]))
print(f'Model Answer: {answer_txt}')

# 4. SAVE THE COMPILED MODEL
print('\nSaving traced model ...')
model_path=f'./traced-{model_name}/{model_name}_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}.pt'
model_traced.save(model_path)

print(f'Done. Model saved as: {model_path}')
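As a usage note (not part of this file), the traced artifact saved above can later be reloaded with torch.jit.load; on inf1/inf2 the torch_neuron or torch_neuronx import must happen before loading so the compiled Neuron operators resolve. The sketch reuses model_path, tokenizer, inputs, and example_inputs from the script above.

# Reload-and-query sketch for the traced model saved by this script.
reloaded = torch.jit.load(model_path)            # requires torch_neuron(x) import on Inferentia
answer_logits = reloaded(*example_inputs)
answer_start = answer_logits[0].argmax().item()
answer_end = answer_logits[1].argmax().item() + 1
print(tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])))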