llama2 initial commits #11

Open
wants to merge 3 commits into base: llama2
Changes from all commits
Empty file added 1-build/*.py/config.properties
Empty file.
9 changes: 0 additions & 9 deletions 1-build/Dockerfile-base-arm

This file was deleted.

9 changes: 0 additions & 9 deletions 1-build/Dockerfile-base-graviton

This file was deleted.

19 changes: 19 additions & 0 deletions 1-build/Dockerfile-base-inf
@@ -0,0 +1,19 @@
FROM amazonlinux:2

LABEL description="Base container for Inferentia1 models"
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ADD ./1-build/etc /etc
RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo
RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
RUN yum update -y && \
yum install -y python3 python3-devel gcc-c++ && \
yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \
yum install -y aws-neuronx-tools-2.*
RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuron-cc[tensorflow] torch-neuron transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com
RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser
RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc
RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc
ADD ./1-build/*.py /app/

43 changes: 11 additions & 32 deletions 1-build/Dockerfile-base-inf2
@@ -1,42 +1,21 @@
FROM amazonlinux:2

LABEL description="Base container for Inferentia2 models"
LABEL description="Base container for Inferentia1 models"
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ADD ./1-build/etc /etc
# Neuron SDK components version numbers
ARG NEURONX_RUNTIME_LIB_VERSION=2.16.*
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.16.*
ARG NEURONX_TOOLS_VERSION=2.13.*
ARG NEURONX_FRAMEWORK_VERSION=1.13.1.1.10.*
ARG NEURONX_TRANSFORMERS_VERSION=0.6.*
ARG NEURONX_CC_VERSION=2.9.*
ARG TORCHSERVE_VERSION=0.8.2

RUN echo -e '[neuron]\nname=Neuron YUM Repository\nbaseurl=https://yum.repos.neuron.amazonaws.com\nenabled=1\nmetadata_expire=0\n' >> /etc/yum.repos.d/neuron.repo
RUN rpm --import https://yum.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB
RUN amazon-linux-extras install -y python3.8
RUN yum update -y && \
yum install -y git tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \
yum install -y gcc-c++ && \
yum install -y jq java-11-amazon-corretto-headless # for torchserve
RUN yum install -y aws-neuronx-collectives-${NEURONX_COLLECTIVES_LIB_VERSION} && \
yum install -y aws-neuronx-runtime-lib-${NEURONX_RUNTIME_LIB_VERSION} && \
yum install -y aws-neuronx-tools-${NEURONX_TOOLS_VERSION}
ENV PATH="/opt/aws/neuron/bin:${PATH}"
RUN echo 'alias python=python3.8' >> ~/.bashrc
RUN echo 'alias pip=pip3.8' >> ~/.bashrc
RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3.8 1

RUN pip3.8 install --extra-index-url https://pip.repos.neuron.amazonaws.com \
neuronx-cc==$NEURONX_CC_VERSION \
torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
transformers-neuronx==$NEURONX_TRANSFORMERS_VERSION
RUN pip3.8 install "protobuf<4" \
&& pip3.8 install torchserve==${TORCHSERVE_VERSION} \
&& pip3.8 install torch-model-archiver==${TORCHSERVE_VERSION} \
&& pip3.8 install --no-deps --no-cache-dir -U torchvision==0.14.* captum==0.6.0 configparser

yum install -y python3 python3-devel gcc-c++ && \
yum install -y tar gzip ca-certificates procps net-tools which vim wget libgomp htop jq bind-utils bc pciutils && \
yum install -y aws-neuronx-tools-2.*
RUN yum install -y aws-neuronx-collectives-2.* && \
yum install -y aws-neuronx-runtime-lib-2.*
RUN pip3 install --upgrade --force-reinstall --no-cache-dir neuronx-cc[tensorflow] torch-neuronx transformers==4.2.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com
RUN pip3 install --no-cache-dir torchserve==0.3.0 torch-model-archiver==0.3.0 configparser
RUN alternatives --install /usr/bin/python python /usr/bin/python3 1; alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
RUN echo "export PATH=/opt/aws/neuron/bin:$PATH" >> /root/.bashrc
RUN echo "alias ll='ls -alh --color=auto'" >> /root/.bashrc
ADD ./1-build/*.py /app/
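A quick way to sanity-check an image built from this Dockerfile is to import the packages the added yum/pip3 lines install. The sketch below is not part of the PR; it only assumes the packages above installed successfully and uses standard version attributes of torch and transformers.

# Hypothetical smoke test: run inside the inf2 base image to verify that the
# packages installed above (torch-neuronx, transformers) import cleanly.
import torch
import torch_neuronx   # Neuron PyTorch integration for Inferentia2
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)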

Empty file added 1-build/etc/hostname
Empty file.
Empty file added 1-build/etc/hosts
Empty file.
Empty file added 1-build/etc/resolv.conf
Empty file.
153 changes: 28 additions & 125 deletions 2-trace/model-tracer.py
@@ -1,127 +1,30 @@
######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0 #
######################################################################

import platform
import torch
import os
import importlib
import torch
from configparser import ConfigParser

machine=platform.uname().machine
device_type='cpu'
if machine == 'aarch64':
device_type='arm'

try:
import torch_neuron
device_type='inf1'
except ImportError:
print('[WARN] Torch Neuron not Found')
pass
try:
import torch_neuronx
device_type='inf2'
except ImportError:
print('[WARN] Torch Neuronx not Found')
pass

import os

# 1. READ config.properties
print("\nParsing configuration ...")
path_prefix = os.getcwd()
with open(path_prefix + '/../config.properties') as f:
config_lines = '[global]\n' + f.read()
f.close()
config = ConfigParser()
config.read_string(config_lines)

model_name = config['global']['huggingface_model_name']
tokenizer_class_name = config['global']['huggingface_tokenizer_class']
model_class_name = config['global']['huggingface_model_class']
sequence_length=int(config['global']['sequence_length'])
processor=config['global']['processor']
pipeline_cores=config['global']['pipeline_cores']
batch_size=int(config['global']['batch_size'])
test=config['global']['test']

question = "What does the little engine say?"

context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain.
Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story
about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is
pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could."""


# 2. LOAD PRE-TRAINED MODEL
print(f'\nLoading pre-trained model: {model_name}')
transformers = importlib.import_module("transformers")
tokenizer_class = getattr(transformers, tokenizer_class_name)
model_class = getattr(transformers, model_class_name)
tokenizer = tokenizer_class.from_pretrained(model_name)
model = model_class.from_pretrained(model_name, return_dict=False)

# 3. TOKENIZE THE INPUT
print('\nTokenizing input sample ...')
inputs = tokenizer.encode_plus(question,
context,
return_tensors="pt",
max_length=sequence_length,
padding='max_length',
truncation=True)
if device_type not in ['inf1', 'inf2']:
if torch.cuda.is_available():
device = torch.device("cuda")
device_type = "gpu"
model.to(device)
inputs.to(device)
else:
device = torch.device("cpu")

if device_type == processor:
print(f" ... Using device: {device_type}")
else:
print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})")

# 2. COMPILE THE MODEL
print('\nTracing model ...')
example_inputs = (
torch.cat([inputs['input_ids']] * batch_size,0),
torch.cat([inputs['attention_mask']] * batch_size,0)
)
os.makedirs(f'traced-{model_name}', exist_ok=True)
torch.set_num_threads(6)
if 'inf' == processor:
model_traced = torch.neuron.trace(model,
example_inputs,
verbose=1,
compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}',
compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)])
elif 'inf2' == processor:
model_traced = torch_neuronx.trace(model,
example_inputs)
else:
model_traced = torch.jit.trace(model, example_inputs)

# 3. TEST THE COMPILED MODEL (Optional)
if test.lower() == 'true':
print("\nTesting traced model ...")
print(f"Question: {question}")
# Testing the traced model
answer_logits = model_traced(*example_inputs)
answer_start = answer_logits[0].argmax().item()
answer_end = answer_logits[1].argmax().item()+1
answer_txt = ""
if answer_end > answer_start:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
else:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:]))
print(f'Model Answer: {answer_txt}')

# 4. SAVE THE COMPILED MODEL
print('\nSaving traced model ...')
model_path=f'./traced-{model_name}/{model_name}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}_{processor}.pt'
model_traced.save(model_path)

print(f'Done. Model saved as: {model_path}')
import os
import torch
from transformers_neuronx.llama.model import LlamaForSampling
from transformers import AutoModelForCausalLM
from transformers_neuronx.module import save_pretrained_split
tp_degree = 2
batch_size = 1
sequence_length = 256
amp_type = 'bf16'
os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
os.environ['NEURON_RT_NUM_CORES'] = str(tp_degree)
os.environ["NEURONX_CACHE"]= "on"
os.environ["NEURONX_DUMP_TO"] = f"./neuron_cache/tp{tp_degree}_bs{batch_size}_seqlen{sequence_length}"
# create a directory for model
model_dir = "/app/llama_model" # hugging face format
os.makedirs(model_dir, exist_ok=True)
# initialize the model
model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16)
# serialize the model
serialized_model_dir = os.path.join(model_dir, 'serialized')
os.makedirs(serialized_model_dir, exist_ok=True)
save_pretrained_split(model, serialized_model_dir)
# create neuron model
#transformers_neuronx = importlib.import_module("transformers_neuronx")
#neuron_model_class = getattr(transformers_neuronx, neuron_model_class_name)
neuron_model = LlamaForSampling.from_pretrained(serialized_model_dir, tp_degree=tp_degree, batch_size=batch_size, amp=amp_type)
# compile model for neuron
neuron_model.to_neuron()
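For context, a short continuation sketch (not part of this diff) showing how the compiled model above could be exercised. It reuses model_dir, sequence_length, and neuron_model from the script, assumes the Hugging Face tokenizer files are also present in /app/llama_model, and the prompt and top_k value are illustrative only.

# Continuation sketch: sample from the Neuron-compiled LLaMA model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_dir)   # assumes tokenizer files exist in model_dir
prompt = "What does the little engine say?"            # illustrative prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

with torch.inference_mode():
    generated = neuron_model.sample(input_ids,
                                    sequence_length=sequence_length,
                                    top_k=50)           # top_k chosen for illustration
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])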
124 changes: 124 additions & 0 deletions 2-trace/old_model-tracer.py
@@ -0,0 +1,124 @@
######################################################################
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. #
# SPDX-License-Identifier: MIT-0 #
######################################################################

import torch
import importlib
from configparser import ConfigParser

device_type='cpu'

try:
import torch_neuron
device_type='inf1'
except ImportError:
print('[WARN] Torch Neuron not Found')
pass
try:
import torch_neuronx
device_type='inf2'
except ImportError:
print('[WARN] Torch Neuronx not Found')
pass

import os

# 1. READ config.properties
print("\nParsing configuration ...")
path_prefix = os.getcwd()
with open(path_prefix + '/../config.properties') as f:
config_lines = '[global]\n' + f.read()
f.close()
config = ConfigParser()
config.read_string(config_lines)

model_name = config['global']['huggingface_model_name']
tokenizer_class_name = config['global']['huggingface_tokenizer_class']
model_class_name = config['global']['huggingface_model_class']
sequence_length=int(config['global']['sequence_length'])
processor=config['global']['processor']
pipeline_cores=config['global']['pipeline_cores']
batch_size=int(config['global']['batch_size'])
test=config['global']['test']

question = "What does the little engine say?"

context = """In the childrens story about the little engine a small locomotive is pulling a large load up a mountain.
Since the load is heavy and the engine is small it is not sure whether it will be able to do the job. This is a story
about how an optimistic attitude empowers everyone to achieve more. In the story the little engine says: 'I think I can' as it is
pulling the heavy load all the way to the top of the mountain. On the way down it says: I thought I could."""


# 2. LOAD PRE-TRAINED MODEL
print(f'\nLoading pre-trained model: {model_name}')
transformers = importlib.import_module("transformers")
tokenizer_class = getattr(transformers, tokenizer_class_name)
model_class = getattr(transformers, model_class_name)
tokenizer = tokenizer_class.from_pretrained(model_name)
model = model_class.from_pretrained(model_name, return_dict=False)

# 3. TOKENIZE THE INPUT
print('\nTokenizing input sample ...')
inputs = tokenizer.encode_plus(question,
context,
return_tensors="pt",
max_length=sequence_length,
padding='max_length',
truncation=True)
if device_type not in ['inf1', 'inf2']:
if torch.cuda.is_available():
device = torch.device("cuda")
device_type = "gpu"
model.to(device)
inputs.to(device)
else:
device = torch.device("cpu")
device_type = 'cpu'

if device_type == processor:
print(f" ... Using device: {device_type}")
else:
print(f"[WARN] detected device_type ({device_type}) does not match the configured processor ({processor})")

# 2. COMPILE THE MODEL
print('\nTracing model ...')
example_inputs = (
torch.cat([inputs['input_ids']] * batch_size,0),
torch.cat([inputs['attention_mask']] * batch_size,0)
)
os.makedirs(f'traced-{model_name}', exist_ok=True)
torch.set_num_threads(6)
# Check inf2 first: 'inf' is a substring of 'inf2', so testing it first would
# send Inferentia2 configs down the torch.neuron (inf1) path.
if 'inf2' in processor:
    model_traced = torch_neuronx.trace(model,
                                       example_inputs)
elif 'inf' in processor:
    model_traced = torch.neuron.trace(model,
                                      example_inputs,
                                      verbose=1,
                                      compiler_workdir=f'./traced-{model_name}/compile_wd_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}',
                                      compiler_args = ['--neuroncore-pipeline-cores', str(pipeline_cores)])
else:
    model_traced = torch.jit.trace(model, example_inputs)

# 3. TEST THE COMPILED MODEL (Optional)
if test.lower() == 'true':
print("\nTesting traced model ...")
print(f"Question: {question}")
# Testing the traced model
answer_logits = model_traced(*example_inputs)
answer_start = answer_logits[0].argmax().item()
answer_end = answer_logits[1].argmax().item()+1
answer_txt = ""
if answer_end > answer_start:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))
else:
answer_txt = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:]))
print(f'Model Answer: {answer_txt}')

# 4. SAVE THE COMPILED MODEL
print('\nSaving traced model ...')
model_path=f'./traced-{model_name}/{model_name}_{processor}_bs{batch_size}_seq{sequence_length}_pc{pipeline_cores}.pt'
model_traced.save(model_path)

print(f'Done. Model saved as: {model_path}')
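As a usage note (not part of this file), the traced artifact saved above can later be reloaded with torch.jit.load; on inf1/inf2 the torch_neuron or torch_neuronx import must happen before loading so the compiled Neuron operators resolve. The sketch reuses model_path, tokenizer, inputs, and example_inputs from the script above.

# Reload-and-query sketch for the traced model saved by this script.
reloaded = torch.jit.load(model_path)            # requires torch_neuron(x) import on Inferentia
answer_logits = reloaded(*example_inputs)
answer_start = answer_logits[0].argmax().item()
answer_end = answer_logits[1].argmax().item() + 1
print(tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])))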