diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..61668d828 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,102 @@ +sudo: required +dist: trusty +language: cpp +env: + global: + - secure: xN+lGOH8LzepE1OoOrHelDgu1wf7nL/B7nBWhN7EnCB7S7hZJL/AakruHy4lMfQfF4XkrnPWmBlmc4wdLH+o6jPkUISm4nLRSTMnRV2L+Mjyzg3aIEua0xpO6rLUNgsShB8mfkieTJq+kSj3Yp2CM7GEzm+UNNxeJcY0VdUHy9msRRRbXiLViIrfwBEVC9He7xG9NWfqkpsORfoiPmVDm7YzuXALdB8qkX4AWggysz/BCVj0PwBMr754eEpOodQ9GeKDF2Kwy5vPAqK5f7zwshJtF9VevyA1A2M9y8BHJMymz4wGaSxLNMeUU85AmVIvmzX0weG94JQ7mlUVszNpO5CCIyjwCOF+IDUI8HCDJGOY7+gGnv4H2LhDwAXvFLD65FlMntQQe2e4KRTnFxtJvvghjv5FyxJSHwctLsgeDpr2uZDcAhK1yf8TNsqqMiXQj2yGLByJy8j5PjUyd8oN47uZo0T5DDMd5c3ztUppc5+DisIoqmoYQeom3lYbpeudaf492ZDBWEV4rS9COl1h7CnpanMBpXWLFc2zXyfTpRn3GifutiF8M3rSS2KHcPyb9JLePTrC4+itMkwB4SHo1VYk4H2RQAdPMDFHMKCeVs2Z4sF9pGPJR+JzRekaKFLDm73ihsuE0cnx1oPVQMjSWa0e7A1a9W4UQBvp9xR++i4= + - OMP_NUM_THREADS=3 + - DO_BUILD=yes + - DO_TEST=yes +matrix: + include: + - compiler: gcc-4 + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } } + env: + - COMPILER=g++-4.9 + - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On" + - compiler: gcc-6 + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } } + env: + - COMPILER=g++-6 + - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On" + - compiler: gcc-7 + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-7, libtbb-dev ] } } + env: + - COMPILER=g++-7 + - CMAKE_EXTRA_FLAGS="-DENABLE_WARNINGS=On" + - compiler: clang-5 + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } } + env: + - COMPILER=clang++-5.0.0 + - LLVM_VERSION=5.0.0 + - DOWNLOAD_URL=http://releases.llvm.org/5.0.0/clang+llvm-5.0.0-linux-x86_64-ubuntu14.04.tar.xz + - CMAKE_EXTRA_FLAGS="-DCMAKE_CXX_FLAGS=-fmodules" + - compiler: clang-3.9 + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } } + env: + - 
COMPILER=clang++-3.9.1 + - LLVM_VERSION=3.9.1 + - compiler: clang-4.0 + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } } + env: + - COMPILER=clang++-4.0.0 + - LLVM_VERSION=4.0.0 + - compiler: intel-17 + env: + - COMPILER=icpc + - TRAVIS_INSTALL_COMPILER="intel" + - compiler: nvcc + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } } + env: + - COMPILER=g++-4.9 + - CMAKE_EXTRA_FLAGS="-DENABLE_CUDA=On" + - TRAVIS_INSTALL_COMPILER="nvcc" + - DO_TEST=no + - compiler: gcc-4.9-debug + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } } + env: + - COMPILER=g++-4.9 + - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_COVERAGE=On" + - compiler: clang-3.9-debug + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-6, libtbb-dev ] } } + env: + - COMPILER=clang++ + - LLVM_VERSION=3.9.1 + - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug" + - compiler: nvcc-debug + addons: { apt: { sources: [ ubuntu-toolchain-r-test ] , packages: [ g++-4.9, libtbb-dev ] } } + env: + - COMPILER=g++-4.9 + - CMAKE_EXTRA_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_CUDA=On" + - TRAVIS_INSTALL_COMPILER="nvcc" + - DO_TEST=no +cache: + directories: + - $HOME/llvm +before_install: +- sudo apt-get update -qq +- mkdir -p ${HOME}/download +- if [[ -n "${LLVM_VERSION}" ]]; then . 
./scripts/install_llvm.sh ; fi +- CMAKE_URL="https://cmake.org/files/v3.7/cmake-3.7.0-rc2-Linux-x86_64.tar.gz" +- curl -o ${HOME}/cmake-tarball.tar.gz ${CMAKE_URL} && + mkdir -p ${HOME}/cmake && + tar xf ${HOME}/cmake-tarball.tar.gz -C ${HOME}/cmake --strip-components=1 && + export PATH=${HOME}/cmake/bin:${PATH} +- if [[ "${TRAVIS_INSTALL_COMPILER}" == "intel" && -n "$INTEL_SERIAL_NUMBER" ]] ; then wget -q -O /dev/stdout 'https://raw.githubusercontent.com/nemequ/icc-travis/master/install-icc.sh' | /bin/sh; fi +- if [[ "${TRAVIS_INSTALL_COMPILER}" == "intel" && -z "$INTEL_SERIAL_NUMBER" ]] ; then export DO_BUILD=no ; export DO_TEST=no ; fi +- if [[ "${TRAVIS_INSTALL_COMPILER}" == "nvcc" ]]; then export DEBFILE=${HOME}/download/cuda-repo.deb; fi +- if [[ "${TRAVIS_INSTALL_COMPILER}" == "nvcc" ]]; then export DOWNLOAD_URL=http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.61-1_amd64.deb; fi +- if [[ "${TRAVIS_INSTALL_COMPILER}" == "nvcc" ]]; then if [[ ! -f /usr/local/cuda-8.0/bin/nvcc ]]; then if [[ ! 
-f ${DEBFILE} ]]; then travis_retry wget -O ${DEBFILE} ${DOWNLOAD_URL}; fi && + travis_retry sudo dpkg -i ${DEBFILE} && + travis_retry sudo apt-get update -qq && + travis_retry sudo apt-get install --no-install-suggests --no-install-recommends -y cuda-drivers cuda-core-8-0 cuda-cudart-dev-8-0 cuda-cufft-dev-8-0 && + travis_retry sudo apt-get clean; fi && + export CUDA_HOME=/usr/local/cuda-8.0 && + export CUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME} && + export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} && + export PATH=${CUDA_HOME}/bin:${PATH}; fi +script: +- ./scripts/travis_build_and_test.sh +after_success: +- if [[ "${CMAKE_EXTRA_FLAGS}" == *"ENABLE_COVERAGE"* ]] ; then bash <(curl -s https://codecov.io/bash) -a "-f"; fi +- if [[ "${TRAVIS_INSTALL_COMPILER}" == "intel" ]] ; then uninstall_intel_software ; fi diff --git a/CMakeLists.txt b/CMakeLists.txt index bc5578f1d..1dbab69f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ set(BLT_CXX_STANDARD 11) # set(ENABLE_TESTS On CACHE Bool "") -set(ENABLE_EXAMPLES Off CACHE Bool "") +set(ENABLE_EXAMPLES On CACHE Bool "") set(ENABLE_DOCUMENTATION Off CACHE Bool "") set(ENABLE_TBB Off CACHE Bool "") @@ -62,7 +62,7 @@ include_directories(${RAJA_INCLUDE_DIRS}) # set(RAJA_PERFSUITE_VERSION_MAJOR 0) -set(RAJA_PERFSUITE_VERSION_MINOR 1) +set(RAJA_PERFSUITE_VERSION_MINOR 2) set(RAJA_PERFSUITE_VERSION_PATCHLEVEL 0) set(RAJA_PERFSUITE_DEPENDS RAJA) diff --git a/README.md b/README.md index 810c20e78..4b587a63e 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ source repository. For example, ``` > mkdir RAJA-PERFSUITE > cd RAJA-PERFSUITE -> git clone --recursive https://github.com/llnl/rajaperf.git +> git clone --recursive https://github.com/llnl/RAJAPerf.git > ls -raja-perfsuite +RAJAPerf ``` The Performance Suite has [RAJA] and the CMake-based [BLT] build system @@ -51,23 +51,23 @@ the Performance Suite source code. 
Note that if you switch to a different branch, you will have to update the submodules; e.g., ``` -> cd raja-perfsuite +> cd RAJAPerf > git checkout > git submodule init > git submodule update ``` RAJA and the Performance Suite are built together using the same CMake -configuration. For convenience, we include some scripts in the 'scripts' -directory some associated configuration files in the 'host-configs' -that illustrate how to build the code on various platforms at LLNL. Each -build script will create a descriptively-named build space directory in -the top-level erformance Suite directory, and run CMake with a configuration -appropriate for the platform and compilers used. After CMake completes, -enter the build directory and type 'make' (or 'make -j' for parallel) to -build the code. The provided configurations will build RAJA unit tests by -default. After the code builds, you can type 'make test' to verify that -everything is working properly. For example, +configuration. For convenience, we include scripts in the 'scripts' +directory that invoke associated (CMake cache) configuration files in the +'host-configs' directory that illustrate how to build the code on various +platforms at LLNL. Each build script creates a descriptively-named build +space directory in the top-level Performance Suite directory and runs CMake +with a configuration appropriate for the platform and compilers used. After +CMake completes, enter the build directory and type 'make' (or 'make -j' for +a parallel build) to build the code. The provided configurations will build +RAJA unit tests by default. After the code builds, you can type 'make test' to +verify that the RAJA build is working properly. For example, ``` > ./scripts/blueos_nvcc8.0_clang-coral.sh @@ -127,6 +127,21 @@ Lastly, the program will emit a summary of provided input if it is given something that it does not understand. Hopefully, this will make it easy for users to understand and correct erroneous usage. 
+# Important notes + + * The kernels that use RAJA 'nested' loop constructs will be replaced + at some point with new RAJA nested capabilities that are being developed. + The new nested constructs are simpler, more flexible, and perform better. + + * The OpenMP target variants of the kernels in the Suite are a + work-in-progress. They are incomplete (a few RAJA features must be + filled in to make them comparable to other variants). + + * The build system for the Suite needs to be reworked to have the + OpenMP target kernel variants run from the same executable as the CUDA + variants. Currently, a separate executable `./bin/raja-perf-nolibs.exe` + is generated for running OpenMP target variants when they are enabled. + * * * # Generated output @@ -165,7 +180,7 @@ Adding a new kernel to the suite involves three main steps: 1. Add unique kernel ID and unique name to the suite. 2. If the kernel is part of a new kernel group, also add a unique group ID and name for the group. -3. Implement a kernel class that contains all operations needed to run it. +3. Implement a kernel class that contains all operations needed to run it, with source files organized as described below. These steps are described in the following sections. @@ -174,11 +189,11 @@ These steps are described in the following sections. Two key pieces of information identify a kernel: the group in which it resides and the name of the kernel itself. For concreteness, we describe how to add a kernel "Foo" that lives in the kernel group "Bar". The files -`RAJAPerfSuite.hxx` and `RAJAPerfSuite.cxx` define enumeration +`RAJAPerfSuite.hpp` and `RAJAPerfSuite.cpp` define enumeration values and arrays of string names for the kernels, respectively. 
First, add an enumeration value identifier for the kernel, that is unique -among all kernels, in the enum 'KerneID' in the header file `RAJAPerfSuite.hxx`: +among all kernels, in the enum 'KernelID' in the header file `RAJAPerfSuite.hpp`: ```cpp enum KernelID { @@ -194,7 +209,7 @@ this convention so that the kernel works with existing performance suite machinery. Second, add the kernel name to the array of strings 'KernelNames' in the file -`RAJAPerfSuite.cxx`: +`RAJAPerfSuite.cpp`: ```cpp static const std::string KernelNames [] = @@ -216,8 +231,8 @@ and matching one-to-one). If a kernel is added as part of a new group of kernels in the suite, a new value must be added to the 'GroupID' enum in the header file -`RAJAPerfSuite.hxx` and an associated group string name must be added to -the 'GroupNames' array of strings in the file `RAJAPerfSuite.cxx`. Again, +`RAJAPerfSuite.hpp` and an associated group string name must be added to +the 'GroupNames' array of strings in the file `RAJAPerfSuite.cpp`. Again, the enumeration values and items in the string array must be kept consistent. @@ -231,7 +246,8 @@ all operations needed to execute and record execution timing and result checksum information for each variant of the kernel. Continuing with our example, we add 'Foo' class header and implementation -files 'Foo.hxx' and 'Foo.cxx', respectively, to the 'src/bar' directory. +files 'Foo.hpp', 'Foo.cpp' (CPU variants), `Foo-Cuda.cpp` (CUDA variants), +and `Foo-OMPTarget.cpp` (OpenMP target variants) to the 'src/bar' directory. The class must inherit from the 'KernelBase' base class that defines the interface for kernels in the suite. @@ -243,7 +259,15 @@ Here is what the header file for the Foo kernel object may look like: #ifndef RAJAPerf_Bar_Foo_HXX #define RAJAPerf_Bar_Foo_HXX -#include "common/KernelBase.hxx" + +/// +/// Foo kernel reference implementation: +/// +/// Describe it here... 
+/// + + +#include "common/KernelBase.hpp" namespace rajaperf { @@ -265,6 +289,9 @@ public: void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: // Kernel-specific data (pointers, scalars, etc.) used in kernel... }; @@ -345,7 +372,8 @@ checksums can be compared at the end of a run. Note: to simplify these operations and help ensure consistency, there exist utility methods to allocate, initialize, deallocate, and copy data, and compute -checksums defined in the `DataUtils.hxx` header file in the 'common' directory. +checksums defined in the `DataUtils.hpp`, `CudaDataUtils.hpp`, and +`OpenMPTargetDataUtils.hpp` header files in the 'common' directory. ##### runKernel() method @@ -360,18 +388,61 @@ void Foo::runKernel(VariantID vid) const Index_type run_reps = getRunReps(); // ... - // Declare data for vid variant of kernel... + switch ( vid ) { - startTimer(); - for (SampIndex_type irep = 0; irep < run_reps; ++irep) { - // Implementation of vid variant of kernel... - } - stopTimer(); + case Base_Seq : { - // ... + // Declare data for baseline sequential variant of kernel... + + startTimer(); + for (SampIndex_type irep = 0; irep < run_reps; ++irep) { + // Implementation of kernel variant... + } + stopTimer(); + + // ... + + break; + } + + // case statements for other CPU kernel variants.... + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n : Unknown variant id = " << vid << std::endl; + } + + } } ``` +All kernel implementation files are organized in this way. So following this +pattern will keep all new additions consistent. 
+ +Note: There are three source files for each kernel: 'Foo.cpp' contains CPU +variants, `Foo-Cuda.cpp` contains CUDA variants, and `Foo-OMPTarget.cpp` +contains OpenMP target variants. The reason for this is that it makes it +easier to apply unique compiler flags to different variants and to manage +compilation and linking issues that arise when some kernel variants are +combined in the same translation unit. + Note: for convenience, we make heavy use of macros to define data declarations and kernel bodies in the suite. This significantly reduces the amount of redundant code required to implement multiple variants @@ -391,7 +462,7 @@ compared to help identify differences, and potentially errors, in implementations, compiler optimizations, programming model execution, etc. Note: to simplify checksum computations and help ensure consistency, there -are methods to compute checksums defined in the `DataUtils.hxx` header file +are methods to compute checksums defined in the `DataUtils.hpp` header file in the 'common' directory. ##### tearDown() method @@ -407,7 +478,7 @@ The 'Executor' class object is responsible for creating kernel objects for the kernels to be run based on the suite input options. To ensure a new kernel object will be created properly, add a call to its class constructor based on its 'KernelID' in the 'getKernelObject()' method in the -`RAJAPerfSuite.cxx` file. +`RAJAPerfSuite.cpp` file. ## Adding a variant @@ -420,7 +491,7 @@ items similar to adding a kernel as described above. 
First, add an enumeration value identifier for the variant, that is unique among all variants, in the enum 'VariantID' in the header file -`RAJAPerfSuite.hxx`: +`RAJAPerfSuite.hpp`: ```cpp enum VariantID { @@ -431,7 +502,7 @@ enum VariantID { ``` Second, add the variant name to the array of strings 'VariantNames' in the file -`RAJAPerfSuite.cxx`: +`RAJAPerfSuite.cpp`: ```cpp static const std::string VariantNames [] = diff --git a/host-configs/blueos/clang_coral_2017_09_06.cmake b/host-configs/blueos/clang_coral_2017_09_18.cmake similarity index 93% rename from host-configs/blueos/clang_coral_2017_09_06.cmake rename to host-configs/blueos/clang_coral_2017_09_18.cmake index 21e89ac64..61a069f6b 100755 --- a/host-configs/blueos/clang_coral_2017_09_06.cmake +++ b/host-configs/blueos/clang_coral_2017_09_18.cmake @@ -14,8 +14,8 @@ set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") -set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.06/bin/clang++" CACHE PATH "") -set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.06/bin/clang" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.18/bin/clang++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.18/bin/clang" CACHE PATH "") set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") diff --git a/host-configs/blueos/clang_coral_2017_10_13.cmake b/host-configs/blueos/clang_coral_2017_10_13.cmake new file mode 100755 index 000000000..ca2adfacb --- /dev/null +++ b/host-configs/blueos/clang_coral_2017_10_13.cmake @@ -0,0 +1,29 @@ +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.10.13/bin/clang++-gpu" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.10.13/bin/clang-gpu" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fno-vectorize" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -fno-vectorize" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") +set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/blueos/nvcc_clang_coral_2017_09_06.cmake b/host-configs/blueos/nvcc_clang_coral_2017_09_18.cmake similarity index 96% rename from host-configs/blueos/nvcc_clang_coral_2017_09_06.cmake rename to host-configs/blueos/nvcc_clang_coral_2017_09_18.cmake index 179e46648..90fce06a5 100755 --- a/host-configs/blueos/nvcc_clang_coral_2017_09_06.cmake +++ b/host-configs/blueos/nvcc_clang_coral_2017_09_18.cmake @@ -14,8 +14,8 @@ set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") -set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.06/bin/clang++" CACHE PATH "") -set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.06/bin/clang" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.18/bin/clang++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.09.18/bin/clang" CACHE PATH "") set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") diff --git a/host-configs/blueos/nvcc_clang_coral_2017_10_13.cmake b/host-configs/blueos/nvcc_clang_coral_2017_10_13.cmake new file mode 100755 index 000000000..12748c3cb --- /dev/null +++ b/host-configs/blueos/nvcc_clang_coral_2017_10_13.cmake @@ -0,0 +1,42 @@ +## +## Copyright (c) 2017, Lawrence 
Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. +## + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.10.13/bin/clang++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.10.13/bin/clang" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(CUDA_COMMON_OPT_FLAGS -restrict; -arch sm_60; -std c++11; --expt-extended-lambda) +set(CUDA_COMMON_DEBUG_FLAGS -restrict; -arch compute_30; -std c++11; --expt-extended-lambda) + +set(HOST_OPT_FLAGS -Xcompiler -O3 -Xcompiler -fopenmp) + +if(CMAKE_BUILD_TYPE MATCHES Release) + set(RAJA_NVCC_FLAGS -O3; ${CUDA_COMMON_OPT_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; ${HOST_OPT_FLAGS} CACHE LIST "") +elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) + set(RAJA_NVCC_FLAGS -g; -G; -O3; ${CUDA_COMMON_OPT_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; ${HOST_OPT_FLAGS} CACHE LIST "") +elseif(CMAKE_BUILD_TYPE MATCHES Debug) + set(RAJA_NVCC_FLAGS -g; -G; -O0; ${CUDA_COMMON_DEBUG_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; -Xcompiler -fopenmp CACHE LIST "") +endif() + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") +set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/blueos/nvcc_clang_coral_2017_11_03.cmake b/host-configs/blueos/nvcc_clang_coral_2017_11_03.cmake new file mode 100755 index 000000000..f4d3ada18 --- /dev/null +++ b/host-configs/blueos/nvcc_clang_coral_2017_11_03.cmake @@ -0,0 +1,42 @@ +## +## Copyright (c) 2017, Lawrence 
Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. +## + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/clang/clang-coral-2017.11.03/bin/clang++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/clang/clang-coral-2017.11.03/bin/clang" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(CUDA_COMMON_OPT_FLAGS -restrict; -arch sm_60; -std c++11; --expt-extended-lambda) +set(CUDA_COMMON_DEBUG_FLAGS -restrict; -arch compute_30; -std c++11; --expt-extended-lambda) + +set(HOST_OPT_FLAGS -Xcompiler -O3 -Xcompiler -fopenmp) + +if(CMAKE_BUILD_TYPE MATCHES Release) + set(RAJA_NVCC_FLAGS -O3; ${CUDA_COMMON_OPT_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; ${HOST_OPT_FLAGS} CACHE LIST "") +elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) + set(RAJA_NVCC_FLAGS -g; -G; -O3; ${CUDA_COMMON_OPT_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; ${HOST_OPT_FLAGS} CACHE LIST "") +elseif(CMAKE_BUILD_TYPE MATCHES Debug) + set(RAJA_NVCC_FLAGS -g; -G; -O0; ${CUDA_COMMON_DEBUG_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; -Xcompiler -fopenmp CACHE LIST "") +endif() + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") +set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/blueos/nvcc_xl-beta-2017.09.13.cmake b/host-configs/blueos/nvcc_xl-beta-2017.09.13.cmake new file mode 100755 index 000000000..f9fe38ca3 --- /dev/null +++ b/host-configs/blueos/nvcc_xl-beta-2017.09.13.cmake @@ -0,0 +1,42 @@ +## +## Copyright (c) 2017, Lawrence Livermore 
National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. +## + +set(RAJA_COMPILER "RAJA_COMPILER_XLC" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/xl/xl-beta-2017.09.13/bin/xlC_r" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/xl/xl-beta-2017.09.13/bin/xlC_r" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(CUDA_COMMON_OPT_FLAGS -restrict; -arch sm_60; -std c++11; --expt-extended-lambda) +set(CUDA_COMMON_DEBUG_FLAGS -restrict; -arch compute_30; -std c++11; --expt-extended-lambda) + +set(HOST_OPT_FLAGS -Xcompiler -O3 -Xcompiler -m64 -Xcompiler -fopenmp) + +if(CMAKE_BUILD_TYPE MATCHES Release) + set(RAJA_NVCC_FLAGS -O3; ${CUDA_COMMON_OPT_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; ${HOST_OPT_FLAGS} CACHE LIST "") +elseif(CMAKE_BUILD_TYPE MATCHES RelWithDebInfo) + set(RAJA_NVCC_FLAGS -g; -G; -O3; ${CUDA_COMMON_OPT_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; ${HOST_OPT_FLAGS} CACHE LIST "") +elseif(CMAKE_BUILD_TYPE MATCHES Debug) + set(RAJA_NVCC_FLAGS -g; -G; -O0; ${CUDA_COMMON_DEBUG_FLAGS}; -ccbin; ${CMAKE_CXX_COMPILER} ; -Xcompiler -fopenmp CACHE LIST "") +endif() + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") +set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/chaos/nvcc_gcc4_9_3.cmake b/host-configs/chaos/nvcc_gcc4_9_3.cmake index b83cd8216..d661040d9 100755 --- a/host-configs/chaos/nvcc_gcc4_9_3.cmake +++ b/host-configs/chaos/nvcc_gcc4_9_3.cmake @@ -21,7 +21,7 @@ set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -mavx -finline-functions" CACHE STRING "") 
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -mavx -finline-functions" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") -set(CUDA_COMMON_OPT_FLAGS -restrict; -arch compute_37; -std c++11; --expt-extended-lambda) +set(CUDA_COMMON_OPT_FLAGS -restrict; -arch sm_35; -std c++11; --expt-extended-lambda) set(CUDA_COMMON_DEBUG_FLAGS -restrict; -arch compute_30; -std c++11; --expt-extended-lambda) set(HOST_OPT_FLAGS -Xcompiler -Ofast -Xcompiler -mavx -Xcompiler -finline-functions -Xcompiler -fopenmp) diff --git a/host-configs/toss3/gcc_6_1_0.cmake b/host-configs/toss3/gcc_6_1_0.cmake new file mode 100755 index 000000000..9cd6b769c --- /dev/null +++ b/host-configs/toss3/gcc_6_1_0.cmake @@ -0,0 +1,29 @@ +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/gcc/gcc-6.1.0/bin/g++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/gcc/gcc-6.1.0/bin/gcc" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") +set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/toss3/gcc_7_1_0.cmake b/host-configs/toss3/gcc_7_1_0.cmake new file mode 100755 index 000000000..010f82c0e --- /dev/null +++ b/host-configs/toss3/gcc_7_1_0.cmake @@ -0,0 +1,29 @@ +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "/usr/tce/packages/gcc/gcc-7.1.0/bin/g++" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/gcc/gcc-7.1.0/bin/gcc" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -g -finline-functions -finline-limit=20000" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") +set(RAJA_COHERENCE_BLOCK_SIZE 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE Bool "") diff --git a/host-configs/toss3/icpc_18_0_beta.cmake b/host-configs/toss3/icpc_18_0_0.cmake similarity index 78% rename from host-configs/toss3/icpc_18_0_beta.cmake rename to host-configs/toss3/icpc_18_0_0.cmake index 7a524a6c4..fee6082b3 100755 --- a/host-configs/toss3/icpc_18_0_beta.cmake +++ b/host-configs/toss3/icpc_18_0_0.cmake @@ -14,10 +14,10 @@ set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "") -set(CMAKE_CXX_COMPILER "/usr/tce/packages/intel/intel-18.0-beta/bin/icpc" CACHE PATH "") -set(CMAKE_C_COMPILER "/usr/tce/packages/intel/intel-18.0-beta/bin/icc" CACHE PATH "") +set(CMAKE_CXX_COMPILER "/usr/tce/packages/intel/intel-18.0.0/bin/icpc" CACHE PATH "") +set(CMAKE_C_COMPILER "/usr/tce/packages/intel/intel-18.0.0/bin/icc" CACHE PATH "") -set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-4.9.3/bin/g++ -std=c++17") +set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-7.1.0/bin/g++ -std=c++11") set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -finline-functions -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -finline-functions -axCORE-AVX2 -diag-disable cpu-dispatch" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") diff --git 
a/scripts/blueos_clang-coral-2017.09.06.sh b/scripts/blueos_clang-coral-2017.09.18.sh similarity index 67% rename from scripts/blueos_clang-coral-2017.09.06.sh rename to scripts/blueos_clang-coral-2017.09.18.sh index 70904b95a..47fe041dd 100755 --- a/scripts/blueos_clang-coral-2017.09.06.sh +++ b/scripts/blueos_clang-coral-2017.09.18.sh @@ -14,8 +14,8 @@ ## For details about use and distribution, please read raja-perfsuite/LICENSE. ## -rm -rf build_blueos-clang-coral-2017.09.06 2>/dev/null -mkdir build_blueos-clang-coral-2017.09.06 && cd build_blueos-clang-coral-2017.09.06 +rm -rf build_blueos-clang-coral-2017.09.18 2>/dev/null +mkdir build_blueos-clang-coral-2017.09.18 && cd build_blueos-clang-coral-2017.09.18 module load cmake/3.7.2 @@ -23,10 +23,10 @@ PERFSUITE_DIR=$(git rev-parse --show-toplevel) cmake \ -DCMAKE_BUILD_TYPE=Release \ - -C ${PERFSUITE_DIR}/host-configs/blueos/clang_coral_2017_09_06.cmake \ + -C ${PERFSUITE_DIR}/host-configs/blueos/clang_coral_2017_09_18.cmake \ -DENABLE_OPENMP=On \ -DPERFSUITE_ENABLE_WARNINGS=Off \ -DENABLE_ALL_WARNINGS=Off \ - -DCMAKE_INSTALL_PREFIX=../install_blueos-clang-coral-2017.09.06 \ + -DCMAKE_INSTALL_PREFIX=../install_blueos-clang-coral-2017.09.18 \ "$@" \ ${PERFSUITE_DIR} diff --git a/scripts/blueos_clang-coral-2017.10.13_omptarget-nvcc8.0.sh b/scripts/blueos_clang-coral-2017.10.13_omptarget-nvcc8.0.sh new file mode 100755 index 000000000..6bbdc152b --- /dev/null +++ b/scripts/blueos_clang-coral-2017.10.13_omptarget-nvcc8.0.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +rm -rf build_blueos_clang-coral-2017.10.13_omptarget-nvcc8.0 >/dev/null +mkdir build_blueos_clang-coral-2017.10.13_omptarget-nvcc8.0 && cd build_blueos_clang-coral-2017.10.13_omptarget-nvcc8.0 + +module load cmake/3.9.2 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/blueos/clang_coral_2017_10_13.cmake \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=Off \ + -DENABLE_TARGET_OPENMP=On \ + -DOpenMP_CXX_FLAGS="-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-implicit-declare-target" \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-8.0 \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_clang-coral-2017.10.13_omptarget-nvcc8.0 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/blueos_clang-coral-2017.10.13_omptarget-nvcc9.0.sh b/scripts/blueos_clang-coral-2017.10.13_omptarget-nvcc9.0.sh new file mode 100755 index 000000000..75aaad97f --- /dev/null +++ b/scripts/blueos_clang-coral-2017.10.13_omptarget-nvcc9.0.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +rm -rf build_blueos_clang-coral-2017.10.13_omptarget-nvcc9.0 >/dev/null +mkdir build_blueos_clang-coral-2017.10.13_omptarget-nvcc9.0 && cd build_blueos_clang-coral-2017.10.13_omptarget-nvcc9.0 + +module load cmake/3.9.2 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/blueos/clang_coral_2017_10_13.cmake \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=Off \ + -DENABLE_TARGET_OPENMP=On \ + -DOpenMP_CXX_FLAGS="-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-implicit-declare-target" \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_clang-coral-2017.10.13_omptarget-nvcc9.0 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/blueos_nvcc8.0_clang-coral-2017.09.06.sh b/scripts/blueos_nvcc8.0_clang-coral-2017.09.18.sh similarity index 80% rename from scripts/blueos_nvcc8.0_clang-coral-2017.09.06.sh rename to scripts/blueos_nvcc8.0_clang-coral-2017.09.18.sh index 95e910072..14374f997 100755 --- a/scripts/blueos_nvcc8.0_clang-coral-2017.09.06.sh +++ b/scripts/blueos_nvcc8.0_clang-coral-2017.09.18.sh @@ -14,8 +14,8 @@ ## For details about use and distribution, please read raja-perfsuite/LICENSE. 
## -rm -rf build_blueos_nvcc8.0_clang-coral-2017.09.06 >/dev/null -mkdir build_blueos_nvcc8.0_clang-coral-2017.09.06 && cd build_blueos_nvcc8.0_clang-coral-2017.09.06 +rm -rf build_blueos_nvcc8.0_clang-coral-2017.09.18 >/dev/null +mkdir build_blueos_nvcc8.0_clang-coral-2017.09.18 && cd build_blueos_nvcc8.0_clang-coral-2017.09.18 module load cmake/3.7.2 @@ -23,12 +23,12 @@ PERFSUITE_DIR=$(git rev-parse --show-toplevel) cmake \ -DCMAKE_BUILD_TYPE=Release \ - -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_09_06.cmake \ + -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_09_18.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-8.0 \ -DPERFSUITE_ENABLE_WARNINGS=Off \ -DENABLE_ALL_WARNINGS=Off \ - -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc8.0_clang-coral-2017.09.06 \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc8.0_clang-coral-2017.09.18 \ "$@" \ ${PERFSUITE_DIR} diff --git a/scripts/blueos_nvcc9.0_clang-coral-2017.08.31.sh b/scripts/blueos_nvcc9.0_clang-coral-2017.08.31.sh index 4c20433d4..aaaa3ec76 100755 --- a/scripts/blueos_nvcc9.0_clang-coral-2017.08.31.sh +++ b/scripts/blueos_nvcc9.0_clang-coral-2017.08.31.sh @@ -26,7 +26,7 @@ cmake \ -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_08_31.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0rc1 \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ -DPERFSUITE_ENABLE_WARNINGS=Off \ -DENABLE_ALL_WARNINGS=Off \ -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_clang-coral-2017.08.31 \ diff --git a/scripts/blueos_nvcc9.0_clang-coral-2017.09.06.sh b/scripts/blueos_nvcc9.0_clang-coral-2017.09.18.sh similarity index 74% rename from scripts/blueos_nvcc9.0_clang-coral-2017.09.06.sh rename to scripts/blueos_nvcc9.0_clang-coral-2017.09.18.sh index 63547f65a..f3129af5c 100755 --- a/scripts/blueos_nvcc9.0_clang-coral-2017.09.06.sh +++ 
b/scripts/blueos_nvcc9.0_clang-coral-2017.09.18.sh @@ -14,8 +14,8 @@ ## For details about use and distribution, please read raja-perfsuite/LICENSE. ## -rm -rf build_blueos_nvcc9.0_clang-coral-2017.09.06 >/dev/null -mkdir build_blueos_nvcc9.0_clang-coral-2017.09.06 && cd build_blueos_nvcc9.0_clang-coral-2017.09.06 +rm -rf build_blueos_nvcc9.0_clang-coral-2017.09.18 >/dev/null +mkdir build_blueos_nvcc9.0_clang-coral-2017.09.18 && cd build_blueos_nvcc9.0_clang-coral-2017.09.18 module load cmake/3.7.2 @@ -23,12 +23,12 @@ PERFSUITE_DIR=$(git rev-parse --show-toplevel) cmake \ -DCMAKE_BUILD_TYPE=Release \ - -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_09_06.cmake \ + -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_09_18.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0rc1 \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ -DPERFSUITE_ENABLE_WARNINGS=Off \ -DENABLE_ALL_WARNINGS=Off \ - -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_clang-coral-2017.09.06 \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_clang-coral-2017.09.18 \ "$@" \ ${PERFSUITE_DIR} diff --git a/scripts/blueos_nvcc9.0_clang-coral-2017.10.13.sh b/scripts/blueos_nvcc9.0_clang-coral-2017.10.13.sh new file mode 100755 index 000000000..f93e43984 --- /dev/null +++ b/scripts/blueos_nvcc9.0_clang-coral-2017.10.13.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +rm -rf build_blueos_nvcc9.0_clang-coral-2017.10.13 >/dev/null +mkdir build_blueos_nvcc9.0_clang-coral-2017.10.13 && cd build_blueos_nvcc9.0_clang-coral-2017.10.13 + +module load cmake/3.7.2 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_10_13.cmake \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_clang-coral-2017.10.13 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/blueos_nvcc9.0_clang-coral-2017.11.03.sh b/scripts/blueos_nvcc9.0_clang-coral-2017.11.03.sh new file mode 100755 index 000000000..d56bd0ad0 --- /dev/null +++ b/scripts/blueos_nvcc9.0_clang-coral-2017.11.03.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +rm -rf build_blueos_nvcc9.0_clang-coral-2017.11.03 >/dev/null +mkdir build_blueos_nvcc9.0_clang-coral-2017.11.03 && cd build_blueos_nvcc9.0_clang-coral-2017.11.03 + +module load cmake/3.7.2 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_clang_coral_2017_11_03.cmake \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_clang-coral-2017.11.03 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/blueos_nvcc9.0_gcc4.9.3.sh b/scripts/blueos_nvcc9.0_gcc4.9.3.sh index 00a71643a..d45012db5 100755 --- a/scripts/blueos_nvcc9.0_gcc4.9.3.sh +++ b/scripts/blueos_nvcc9.0_gcc4.9.3.sh @@ -26,7 +26,7 @@ cmake \ -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_gcc_4_9_3.cmake \ -DENABLE_OPENMP=On \ -DENABLE_CUDA=On \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0rc1 \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ -DPERFSUITE_ENABLE_WARNINGS=Off \ -DENABLE_ALL_WARNINGS=Off \ -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_gcc4.9.3 \ diff --git a/scripts/blueos_nvcc9.0_xl-beta-2017.09.13.sh b/scripts/blueos_nvcc9.0_xl-beta-2017.09.13.sh new file mode 100755 index 000000000..43ea68fd2 --- /dev/null +++ b/scripts/blueos_nvcc9.0_xl-beta-2017.09.13.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. 
+## + +rm -rf build_blueos_nvcc9.0_xl-beta-2017.09.13 >/dev/null +mkdir build_blueos_nvcc9.0_xl-beta-2017.09.13 && cd build_blueos_nvcc9.0_xl-beta-2017.09.13 + +module load cmake/3.7.2 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/blueos/nvcc_xl-beta-2017.09.13.cmake \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/usr/tcetmp/packages/cuda-9.0.176 \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_blueos_nvcc9.0_xl-beta-2017.09.13 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/chaos_nvcc9.0_gcc4.9.3.sh b/scripts/chaos_nvcc9.0_gcc4.9.3.sh new file mode 100755 index 000000000..50fd503b4 --- /dev/null +++ b/scripts/chaos_nvcc9.0_gcc4.9.3.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. +## + +rm -rf build_chaos-nvcc9.0_gcc4.9.3 2>/dev/null +mkdir build_chaos-nvcc9.0_gcc4.9.3 && cd build_chaos-nvcc9.0_gcc4.9.3 +. 
/usr/local/tools/dotkit/init.sh && use cmake-3.4.1 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/chaos/nvcc_gcc4_9_3.cmake \ + -DENABLE_OPENMP=On \ + -DENABLE_CUDA=On \ + -DCUDA_TOOLKIT_ROOT_DIR=/opt/cudatoolkit-9.0 \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_chaos-nvcc9.0_gcc4.9.3 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/install_llvm.sh b/scripts/install_llvm.sh new file mode 100755 index 000000000..7848d99f5 --- /dev/null +++ b/scripts/install_llvm.sh @@ -0,0 +1,15 @@ +export LLVM_PATH=${HOME}/llvm/ +export PATH=${LLVM_PATH}/bin:${PATH} +export LD_LIBRARY_PATH=${LLVM_PATH}/lib:${LD_LIBRARY_PATH} +[[ -z ${DOWNLOAD_URL+x} ]] && export DOWNLOAD_URL=http://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-14.04.tar.xz +export TARFILE=${HOME}/download/llvm-${LLVM_VERSION}.tar.xz +if ! [[ -x "${LLVM_PATH}/bin/clang++" ]]; then + if ! [[ -f ${TARFILE} ]]; then + echo "curl -o ${TARFILE} ${DOWNLOAD_URL}" + curl -o ${TARFILE} ${DOWNLOAD_URL} + fi + tar xf ${TARFILE} -C ${HOME}/llvm --strip-components 1 + ln -s ${LLVM_PATH}/bin/clang++ ${LLVM_PATH}/bin/clang++-${LLVM_VERSION} + ln -s ${LLVM_PATH}/bin/clang ${LLVM_PATH}/bin/clang-${LLVM_VERSION} +fi + diff --git a/scripts/toss3_icpc18.0-beta.sh b/scripts/toss3_gcc6.1.0.sh similarity index 68% rename from scripts/toss3_icpc18.0-beta.sh rename to scripts/toss3_gcc6.1.0.sh index 80d678b54..9e4b73f61 100755 --- a/scripts/toss3_icpc18.0-beta.sh +++ b/scripts/toss3_gcc6.1.0.sh @@ -14,20 +14,19 @@ ## For details about use and distribution, please read raja-perfsuite/LICENSE. 
## -rm -rf build_toss3-icpc-18.0-beta 2>/dev/null -mkdir build_toss3-icpc-18.0-beta && cd build_toss3-icpc-18.0-beta +rm -rf build_toss3-gcc-6.1.0 2>/dev/null +mkdir build_toss3-gcc-6.1.0 && cd build_toss3-gcc-6.1.0 module load cmake/3.5.2 -module load gcc/4.9.3 PERFSUITE_DIR=$(git rev-parse --show-toplevel) cmake \ -DCMAKE_BUILD_TYPE=Release \ - -C ${PERFSUITE_DIR}/host-configs/toss3/icpc_18_0_beta.cmake \ + -C ${PERFSUITE_DIR}/host-configs/toss3/gcc_6_1_0.cmake \ -DENABLE_OPENMP=On \ -DPERFSUITE_ENABLE_WARNINGS=Off \ -DENABLE_ALL_WARNINGS=Off \ - -DCMAKE_INSTALL_PREFIX=../install_toss3-icpc-18.0-beta \ + -DCMAKE_INSTALL_PREFIX=../install_toss3-gcc-6.1.0 \ "$@" \ ${PERFSUITE_DIR} diff --git a/scripts/toss3_gcc7.1.0.sh b/scripts/toss3_gcc7.1.0.sh new file mode 100755 index 000000000..deb3c79c8 --- /dev/null +++ b/scripts/toss3_gcc7.1.0.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. +## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. +## + +rm -rf build_toss3-gcc-7.1.0 2>/dev/null +mkdir build_toss3-gcc-7.1.0 && cd build_toss3-gcc-7.1.0 + +module load cmake/3.5.2 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/toss3/gcc_7_1_0.cmake \ + -DENABLE_OPENMP=On \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_toss3-gcc-7.1.0 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/toss3_icpc18.0.0.sh b/scripts/toss3_icpc18.0.0.sh new file mode 100755 index 000000000..590a69c6a --- /dev/null +++ b/scripts/toss3_icpc18.0.0.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +## +## Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
+## +## Produced at the Lawrence Livermore National Laboratory. +## +## LLNL-CODE-738930 +## +## All rights reserved. +## +## This file is part of the RAJA Performance Suite. +## +## For details about use and distribution, please read raja-perfsuite/LICENSE. +## + +rm -rf build_toss3-icpc-18.0.0 2>/dev/null +mkdir build_toss3-icpc-18.0.0 && cd build_toss3-icpc-18.0.0 + +module load cmake/3.5.2 +module load gcc/7.1.0 + +PERFSUITE_DIR=$(git rev-parse --show-toplevel) + +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -C ${PERFSUITE_DIR}/host-configs/toss3/icpc_18_0_0.cmake \ + -DENABLE_OPENMP=On \ + -DPERFSUITE_ENABLE_WARNINGS=Off \ + -DENABLE_ALL_WARNINGS=Off \ + -DCMAKE_INSTALL_PREFIX=../install_toss3-icpc-18.0.0 \ + "$@" \ + ${PERFSUITE_DIR} diff --git a/scripts/travis_build_and_test.sh b/scripts/travis_build_and_test.sh new file mode 100755 index 000000000..85e4fe161 --- /dev/null +++ b/scripts/travis_build_and_test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +function or_die () { + "$@" + local status=$? 
+ if [[ $status != 0 ]] ; then + echo ERROR $status command: $@ + exit $status + fi +} + +source ~/.bashrc +cd ${TRAVIS_BUILD_DIR} +or_die mkdir travis-build +cd travis-build +if [[ "$DO_BUILD" == "yes" ]] ; then + or_die cmake -DCMAKE_CXX_COMPILER="${COMPILER}" ${CMAKE_EXTRA_FLAGS} ../ + cat CMakeCache.txt + or_die make -j 3 VERBOSE=1 + if [[ "${DO_TEST}" == "yes" ]] ; then + or_die ./bin/raja-perf.exe --checkrun + fi +fi + +exit 0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dccf5fd76..74f50b7c6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -36,3 +36,106 @@ blt_add_executable( SOURCES RAJAPerfSuiteDriver.cpp DEPENDS_ON ${RAJA_PERFSUITE_EXECUTABLE_DEPENDS} ) + + +blt_add_executable( + NAME raja-perf-nolibs.exe + SOURCES RAJAPerfSuiteDriver.cpp + apps/AppsData.cpp + apps/DEL_DOT_VEC_2D.cpp + apps/DEL_DOT_VEC_2D-Cuda.cpp + apps/DEL_DOT_VEC_2D-OMPTarget.cpp + apps/ENERGY.cpp + apps/ENERGY-Cuda.cpp + apps/ENERGY-OMPTarget.cpp + apps/FIR.cpp + apps/FIR-Cuda.cpp + apps/FIR-OMPTarget.cpp + apps/PRESSURE.cpp + apps/PRESSURE-Cuda.cpp + apps/PRESSURE-OMPTarget.cpp + apps/VOL3D.cpp + apps/VOL3D-Cuda.cpp + apps/VOL3D-OMPTarget.cpp + apps/LTIMES.cpp + apps/LTIMES-Cuda.cpp + apps/LTIMES-OMPTarget.cpp + apps/LTIMES_NOVIEW.cpp + apps/LTIMES_NOVIEW-Cuda.cpp + apps/LTIMES_NOVIEW-OMPTarget.cpp + apps/WIP-COUPLE.cpp + basic/IF_QUAD.cpp + basic/IF_QUAD-Cuda.cpp + basic/IF_QUAD-OMPTarget.cpp + basic/INIT3.cpp + basic/INIT3-Cuda.cpp + basic/INIT3-OMPTarget.cpp + basic/MULADDSUB.cpp + basic/MULADDSUB-Cuda.cpp + basic/MULADDSUB-OMPTarget.cpp + basic/NESTED_INIT.cpp + basic/NESTED_INIT-Cuda.cpp + basic/NESTED_INIT-OMPTarget.cpp + basic/REDUCE3_INT.cpp + basic/REDUCE3_INT-Cuda.cpp + basic/REDUCE3_INT-OMPTarget.cpp + basic/TRAP_INT.cpp + basic/TRAP_INT-Cuda.cpp + basic/TRAP_INT-OMPTarget.cpp + basic/INIT_VIEW1D.cpp + basic/INIT_VIEW1D-Cuda.cpp + basic/INIT_VIEW1D-OMPTarget.cpp + basic/INIT_VIEW1D_OFFSET.cpp + basic/INIT_VIEW1D_OFFSET-Cuda.cpp + 
basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp + common/DataUtils.cpp + common/Executor.cpp + common/KernelBase.cpp + common/OutputUtils.cpp + common/RAJAPerfSuite.cpp + common/RPTypes.hpp + common/RunParams.cpp + lcals/DIFF_PREDICT.cpp + lcals/DIFF_PREDICT-Cuda.cpp + lcals/DIFF_PREDICT-OMPTarget.cpp + lcals/EOS.cpp + lcals/EOS-Cuda.cpp + lcals/EOS-OMPTarget.cpp + lcals/FIRST_DIFF.cpp + lcals/FIRST_DIFF-Cuda.cpp + lcals/FIRST_DIFF-OMPTarget.cpp + lcals/HYDRO_1D.cpp + lcals/HYDRO_1D-Cuda.cpp + lcals/HYDRO_1D-OMPTarget.cpp + lcals/INT_PREDICT.cpp + lcals/INT_PREDICT-Cuda.cpp + lcals/INT_PREDICT-OMPTarget.cpp + lcals/PLANCKIAN.cpp + lcals/PLANCKIAN-Cuda.cpp + lcals/PLANCKIAN-OMPTarget.cpp + polybench/POLYBENCH_2MM.cpp + polybench/POLYBENCH_2MM-Cuda.cpp + polybench/POLYBENCH_2MM-OMPTarget.cpp + polybench/POLYBENCH_3MM.cpp + polybench/POLYBENCH_3MM-Cuda.cpp + polybench/POLYBENCH_3MM-OMPTarget.cpp + polybench/POLYBENCH_GEMMVER.cpp + polybench/POLYBENCH_GEMMVER-Cuda.cpp + polybench/POLYBENCH_GEMMVER-OMPTarget.cpp + stream/ADD.cpp + stream/ADD-Cuda.cpp + stream/ADD-OMPTarget.cpp + stream/COPY.cpp + stream/COPY-Cuda.cpp + stream/COPY-OMPTarget.cpp + stream/DOT.cpp + stream/DOT-Cuda.cpp + stream/DOT-OMPTarget.cpp + stream/MUL.cpp + stream/MUL-Cuda.cpp + stream/MUL-OMPTarget.cpp + stream/TRIAD.cpp + stream/TRIAD-Cuda.cpp + stream/TRIAD-OMPTarget.cpp + DEPENDS_ON ${RAJA_PERFSUITE_DEPENDS} +) diff --git a/src/RAJAPerfSuiteDriver.cpp b/src/RAJAPerfSuiteDriver.cpp index 02d5d2c4e..3fc4dc3f2 100644 --- a/src/RAJAPerfSuiteDriver.cpp +++ b/src/RAJAPerfSuiteDriver.cpp @@ -37,5 +37,6 @@ int main( int argc, char** argv ) executor.outputRunData(); std::cout << "\n\nDONE!!!...." 
<< std::endl; + return 0; } diff --git a/src/apps/AppsData.cpp b/src/apps/AppsData.cpp new file mode 100644 index 000000000..1002d54c7 --- /dev/null +++ b/src/apps/AppsData.cpp @@ -0,0 +1,120 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Set mesh positions for 2d mesh. +// +void setMeshPositions_2d(Real_ptr x, Real_type dx, + Real_ptr y, Real_type dy, + const ADomain& domain) +{ + if (domain.ndims != 2) { + std::cout << "\n******* ERROR!!! domain is not 2d *******" << std::endl; + return; + } + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + + Index_type jp = domain.jp; + + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; + + Real_ptr x1, x2, x3, x4; + Real_ptr y1, y2, y3, y4; + NDSET2D(domain.jp, x, x1,x2,x3,x4) ; + NDSET2D(domain.jp, y, y1,y2,y3,y4) ; + + for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { + for (Index_type i = imin - npnl; i < imax + npnr; i++) { + Index_type iz = i + j*jp ; + + x3[iz] = x4[iz] = i*dx; + x1[iz] = x2[iz] = (i+1)*dx; + + y1[iz] = y4[iz] = j*dy; + y2[iz] = y3[iz] = (j+1)*dy; + + } + } +} + + +// +// Set mesh positions for 2d mesh. +// +void setMeshPositions_3d(Real_ptr x, Real_type dx, + Real_ptr y, Real_type dy, + Real_ptr z, Real_type dz, + const ADomain& domain) +{ + if (domain.ndims != 3) { + std::cout << "\n******* ERROR!!! 
domain is not 3d *******" << std::endl; + return; + } + + Index_type imin = domain.imin; + Index_type imax = domain.imax; + Index_type jmin = domain.jmin; + Index_type jmax = domain.jmax; + Index_type kmin = domain.kmin; + Index_type kmax = domain.kmax; + + Index_type jp = domain.jp; + Index_type kp = domain.kp; + + Index_type npnl = domain.NPNL; + Index_type npnr = domain.NPNR; + + Real_ptr x0, x1, x2, x3, x4, x5, x6, x7; + Real_ptr y0, y1, y2, y3, y4, y5, y6, y7; + Real_ptr z0, z1, z2, z3, z4, z5, z6, z7; + NDPTRSET(domain.jp, domain.kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(domain.jp, domain.kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(domain.jp, domain.kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + for (Index_type k = kmin - npnl; k < kmax + npnr; k++) { + for (Index_type j = jmin - npnl; j < jmax + npnr; j++) { + for (Index_type i = imin - npnl; i < imax + npnr; i++) { + Index_type iz = i + j*jp + kp*k ; + + x0[iz] = x2[iz] = x4[iz] = x6[iz] = i*dx; + x1[iz] = x3[iz] = x5[iz] = x7[iz] = (i+1)*dx; + + y0[iz] = y1[iz] = y4[iz] = y5[iz] = j*dy; + y2[iz] = y3[iz] = y6[iz] = y7[iz] = (j+1)*dy; + + z0[iz] = z1[iz] = z2[iz] = z3[iz] = k*dz; + z4[iz] = z5[iz] = z6[iz] = z7[iz] = (k+1)*dz; + + } + } + } +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/AppsData.hpp b/src/apps/AppsData.hpp index c62bc7188..438641580 100644 --- a/src/apps/AppsData.hpp +++ b/src/apps/AppsData.hpp @@ -16,6 +16,7 @@ #ifndef RAJAPerf_AppsData_HPP #define RAJAPerf_AppsData_HPP +#include "common/RPTypes.hpp" namespace rajaperf { @@ -47,8 +48,12 @@ namespace apps // // Domain structure to mimic structured mesh loops code style. // -struct ADomain +class ADomain { +public: + + ADomain() = delete; + ADomain( Index_type rzmax, Index_type ndims ) : ndims(ndims), NPNL(2), NPNR(1) { @@ -145,6 +150,18 @@ struct ADomain Index_type n_real_zones; }; +// +// Routines for initializing mesh positions for 2d/3d domains. 
+// +void setMeshPositions_2d(Real_ptr x, Real_type dx, + Real_ptr y, Real_type dy, + const ADomain& domain); + +void setMeshPositions_3d(Real_ptr x, Real_type dx, + Real_ptr y, Real_type dy, + Real_ptr z, Real_type dz, + const ADomain& domain); + } // end namespace apps } // end namespace rajaperf diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 637cc4cc8..64f5ad912 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -15,11 +15,28 @@ blt_add_library( NAME apps - SOURCES PRESSURE.cpp + SOURCES AppsData.cpp + PRESSURE.cpp + PRESSURE-Cuda.cpp + PRESSURE-OMPTarget.cpp ENERGY.cpp + ENERGY-Cuda.cpp + ENERGY-OMPTarget.cpp VOL3D.cpp + VOL3D-Cuda.cpp + VOL3D-OMPTarget.cpp DEL_DOT_VEC_2D.cpp - WIP-COUPLE.cpp + DEL_DOT_VEC_2D-Cuda.cpp + DEL_DOT_VEC_2D-OMPTarget.cpp FIR.cpp + FIR-Cuda.cpp + FIR-OMPTarget.cpp + LTIMES.cpp + LTIMES-Cuda.cpp + LTIMES-OMPTarget.cpp + LTIMES_NOVIEW.cpp + LTIMES_NOVIEW-Cuda.cpp + LTIMES_NOVIEW-OMPTarget.cpp + WIP-COUPLE.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/DEL_DOT_VEC_2D-Cuda.cpp b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp new file mode 100644 index 000000000..c01bd153a --- /dev/null +++ b/src/apps/DEL_DOT_VEC_2D-Cuda.cpp @@ -0,0 +1,157 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DEL_DOT_VEC_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define DEL_DOT_VEC_2D_DATA_SETUP_CUDA \ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr xdot; \ + Real_ptr ydot; \ + Real_ptr div; \ + Index_ptr real_zones; \ +\ + const Real_type ptiny = m_ptiny; \ + const Real_type half = m_half; \ +\ + Real_ptr x1,x2,x3,x4 ; \ + Real_ptr y1,y2,y3,y4 ; \ + Real_ptr fx1,fx2,fx3,fx4 ; \ + Real_ptr fy1,fy2,fy3,fy4 ; \ +\ + allocAndInitCudaDeviceData(x, m_x, m_array_length); \ + allocAndInitCudaDeviceData(y, m_y, m_array_length); \ + allocAndInitCudaDeviceData(xdot, m_xdot, m_array_length); \ + allocAndInitCudaDeviceData(ydot, m_ydot, m_array_length); \ + allocAndInitCudaDeviceData(div, m_div, m_array_length); \ + allocAndInitCudaDeviceData(real_zones, m_domain->real_zones, iend); + +#define DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_div, div, m_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); \ + deallocCudaDeviceData(xdot); \ + deallocCudaDeviceData(ydot); \ + deallocCudaDeviceData(div); \ + deallocCudaDeviceData(real_zones); + +__global__ void deldotvec2d(Real_ptr div, + const Real_ptr x1, const Real_ptr x2, + const Real_ptr x3, const Real_ptr x4, + const Real_ptr y1, const Real_ptr y2, + const Real_ptr y3, const Real_ptr y4, + const Real_ptr fx1, const Real_ptr fx2, + const Real_ptr fx3, const Real_ptr fx4, + const Real_ptr fy1, const Real_ptr fy2, + const Real_ptr fy3, const Real_ptr fy4, + const Index_ptr real_zones, + const Real_type half, const Real_type ptiny, + Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + if (ii < iend) { + DEL_DOT_VEC_2D_BODY_INDEX; + 
DEL_DOT_VEC_2D_BODY; + } +} + + +void DEL_DOT_VEC_2D::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type iend = m_domain->n_real_zones; + + if ( vid == Base_CUDA ) { + + DEL_DOT_VEC_2D_DATA_SETUP_CUDA; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + deldotvec2d<<>>(div, + x1, x2, x3, x4, + y1, y2, y3, y4, + fx1, fx2, fx3, fx4, + fy1, fy2, fy3, fy4, + real_zones, + half, ptiny, + iend); + + } + stopTimer(); + + DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + DEL_DOT_VEC_2D_DATA_SETUP_CUDA; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + + RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + zones, [=] __device__ (Index_type i) { + DEL_DOT_VEC_2D_BODY; + }); + + } + stopTimer(); + + DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n DEL_DOT_VEC_2D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp new file mode 100644 index 000000000..e36f8c490 --- /dev/null +++ b/src/apps/DEL_DOT_VEC_2D-OMPTarget.cpp @@ -0,0 +1,143 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
+// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DEL_DOT_VEC_2D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_ptr xdot = m_xdot; \ + Real_ptr ydot = m_ydot; \ + Real_ptr div = m_div; \ + Index_ptr real_zones; \ +\ + const Real_type ptiny = m_ptiny; \ + const Real_type half = m_half; \ +\ + Real_ptr x1,x2,x3,x4 ; \ + Real_ptr y1,y2,y3,y4 ; \ + Real_ptr fx1,fx2,fx3,fx4 ; \ + Real_ptr fy1,fy2,fy3,fy4 ; \ +\ + allocAndInitOpenMPDeviceData(x, m_x, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(xdot, m_xdot, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(ydot, m_ydot, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(div, m_div, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(real_zones, m_domain->real_zones, iend, did, hid); + +#define DEL_DOT_VEC_2D_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_div, div, m_array_length, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); \ + deallocOpenMPDeviceData(xdot, did); \ + deallocOpenMPDeviceData(ydot, did); \ + deallocOpenMPDeviceData(div, did); \ + deallocOpenMPDeviceData(real_zones, did); + + +void DEL_DOT_VEC_2D::runOpenMPTargetVariant(VariantID vid) +{ + 
const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = m_domain->n_real_zones; + + if ( vid == Base_OpenMPTarget ) { + + DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x1,x2,x3,x4, y1,y2,y3,y4, \ + fx1,fx2,fx3,fx4, fy1,fy2,fy3,fy4, \ + div, real_zones) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; + } + + } + stopTimer(); + + DEL_DOT_VEC_2D_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + DEL_DOT_VEC_2D_DATA_SETUP_OMP_TARGET; + + NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; + NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; + NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; + NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + +#if 0 +// we need to fix the fact that list segment data is in UM iff CUDA is +// enabled... 
+ RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones); +#endif + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type ii) { + DEL_DOT_VEC_2D_BODY_INDEX; + DEL_DOT_VEC_2D_BODY; + }); + + } + stopTimer(); + + DEL_DOT_VEC_2D_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n DEL_DOT_VEC_2D : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/DEL_DOT_VEC_2D.cpp b/src/apps/DEL_DOT_VEC_2D.cpp index 993d7878e..528b620f1 100644 --- a/src/apps/DEL_DOT_VEC_2D.cpp +++ b/src/apps/DEL_DOT_VEC_2D.cpp @@ -13,44 +13,13 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// DEL_DOT_VEC_2D kernel reference implementation: -/// -/// for (Index_type ii = ibegin; ii < iend; ++ii ) { -/// Index_type i = real_zones[ii]; -/// -/// Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; -/// Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; -/// -/// Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; -/// Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; -/// -/// Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; -/// Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; -/// -/// Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; -/// Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; -/// -/// Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; -/// -/// Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; -/// -/// Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; -/// -/// Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / -/// ( y1[i] + y2[i] + y3[i] + y4[i] ) ; -/// -/// div[i] = dfxdx + dfydy + affine ; -/// } -/// - #include "DEL_DOT_VEC_2D.hpp" +#include "RAJA/RAJA.hpp" + #include "AppsData.hpp" #include 
"common/DataUtils.hpp" -#include "RAJA/RAJA.hpp" - #include namespace rajaperf @@ -58,13 +27,12 @@ namespace rajaperf namespace apps { -#define DEL_DOT_VEC_2D_DATA \ +#define DEL_DOT_VEC_2D_DATA_SETUP_CPU \ ResReal_ptr x = m_x; \ ResReal_ptr y = m_y; \ ResReal_ptr xdot = m_xdot; \ ResReal_ptr ydot = m_ydot; \ ResReal_ptr div = m_div; \ - Index_ptr real_zones = m_domain->real_zones; \ \ const Real_type ptiny = m_ptiny; \ const Real_type half = m_half; \ @@ -75,96 +43,6 @@ namespace apps ResReal_ptr fy1,fy2,fy3,fy4 ; -#define DEL_DOT_VEC_2D_BODY \ - Index_type i = real_zones[ii]; \ -\ - Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; \ - Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; \ - \ - Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; \ - Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; \ - \ - Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; \ - Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; \ - \ - Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; \ - Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; \ - \ - Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; \ - \ - Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; \ - \ - Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; \ - \ - Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / \ - ( y1[i] + y2[i] + y3[i] + y4[i] ) ; \ - \ - div[i] = dfxdx + dfydy + affine ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define DEL_DOT_VEC_2D_DATA_SETUP_CUDA \ - Real_ptr x; \ - Real_ptr y; \ - Real_ptr xdot; \ - Real_ptr ydot; \ - Real_ptr div; \ - Index_ptr real_zones; \ -\ - const Real_type ptiny = m_ptiny; \ - const Real_type half = m_half; \ -\ - Real_ptr x1,x2,x3,x4 ; \ - Real_ptr y1,y2,y3,y4 ; \ - Real_ptr fx1,fx2,fx3,fx4 ; \ - Real_ptr fy1,fy2,fy3,fy4 ; \ -\ - allocAndInitCudaDeviceData(x, m_x, m_domain->nnalls); \ - 
allocAndInitCudaDeviceData(y, m_y, m_domain->nnalls); \ - allocAndInitCudaDeviceData(xdot, m_xdot, m_domain->nnalls); \ - allocAndInitCudaDeviceData(ydot, m_ydot, m_domain->nnalls); \ - allocAndInitCudaDeviceData(div, m_div, m_domain->nnalls); \ - allocAndInitCudaDeviceData(real_zones, m_domain->real_zones, m_domain->n_real_zones); - -#define DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_div, div, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(xdot); \ - deallocCudaDeviceData(ydot); \ - deallocCudaDeviceData(div); - -// getCudaDeviceData(m_div, div, m_domain->nnalls); - -__global__ void deldotvec2d(Real_ptr div, - const Real_ptr x1, const Real_ptr x2, - const Real_ptr x3, const Real_ptr x4, - const Real_ptr y1, const Real_ptr y2, - const Real_ptr y3, const Real_ptr y4, - const Real_ptr fx1, const Real_ptr fx2, - const Real_ptr fx3, const Real_ptr fx4, - const Real_ptr fy1, const Real_ptr fy2, - const Real_ptr fy3, const Real_ptr fy4, - const Index_ptr real_zones, - const Real_type half, const Real_type ptiny, - Index_type iend) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - if (ii < iend) { - DEL_DOT_VEC_2D_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - - DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) : KernelBase(rajaperf::Apps_DEL_DOT_VEC_2D, params) { @@ -172,6 +50,8 @@ DEL_DOT_VEC_2D::DEL_DOT_VEC_2D(const RunParams& params) setDefaultReps(1050); m_domain = new ADomain(getRunSize(), /* ndims = */ 2); + + m_array_length = m_domain->nnalls; } DEL_DOT_VEC_2D::~DEL_DOT_VEC_2D() @@ -186,13 +66,17 @@ Index_type DEL_DOT_VEC_2D::getItsPerRep() const void DEL_DOT_VEC_2D::setUp(VariantID vid) { - int max_loop_index = m_domain->nnalls; + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitDataConst(m_y, m_array_length, 0.0, vid); + + Real_type dx = 0.2; + Real_type dy = 0.1; + setMeshPositions_2d(m_x, dx, m_y, dy, *m_domain); - allocAndInitData(m_x, max_loop_index, 
vid); - allocAndInitData(m_y, max_loop_index, vid); - allocAndInitData(m_xdot, max_loop_index, vid); - allocAndInitData(m_ydot, max_loop_index, vid); - allocAndInitData(m_div, max_loop_index, vid); + allocAndInitData(m_xdot, m_array_length, vid); + allocAndInitData(m_ydot, m_array_length, vid); + + allocAndInitDataConst(m_div, m_array_length, 0.0, vid); m_ptiny = 1.0e-20; m_half = 0.5; @@ -208,7 +92,8 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) case Base_Seq : { - DEL_DOT_VEC_2D_DATA; + DEL_DOT_VEC_2D_DATA_SETUP_CPU; + DEL_DOT_VEC_2D_DATA_INDEX; NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; @@ -219,6 +104,7 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) for (RepIndex_type irep = 0; irep < run_reps; ++irep) { for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; } @@ -230,17 +116,19 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) case RAJA_Seq : { - DEL_DOT_VEC_2D_DATA; + DEL_DOT_VEC_2D_DATA_SETUP_CPU; NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int ii) { + RAJA::forall(zones, [=](Index_type i) { DEL_DOT_VEC_2D_BODY; }); @@ -253,7 +141,8 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - DEL_DOT_VEC_2D_DATA; + DEL_DOT_VEC_2D_DATA_SETUP_CPU; + DEL_DOT_VEC_2D_DATA_INDEX; NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; @@ -265,6 +154,7 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) #pragma omp parallel for for (Index_type ii = ibegin ; ii < iend ; ++ii ) { + DEL_DOT_VEC_2D_BODY_INDEX; DEL_DOT_VEC_2D_BODY; } @@ -274,24 +164,21 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) break; } - case 
RAJALike_OpenMP : { - // Not applicable - break; - } - case RAJA_OpenMP : { - DEL_DOT_VEC_2D_DATA; + DEL_DOT_VEC_2D_DATA_SETUP_CPU; NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; + RAJA::ListSegment zones(m_domain->real_zones, m_domain->n_real_zones); + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int ii) { + RAJA::forall(zones, [=](Index_type i) { DEL_DOT_VEC_2D_BODY; }); @@ -302,75 +189,26 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - DEL_DOT_VEC_2D_DATA_SETUP_CUDA; - - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - deldotvec2d<<>>(div, - x1, x2, x3, x4, - y1, y2, y3, y4, - fx1, fx2, fx3, fx4, - fy1, fy2, fy3, fy4, - real_zones, - half, ptiny, - iend); - - } - stopTimer(); - - DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - DEL_DOT_VEC_2D_DATA_SETUP_CUDA; - - NDSET2D(m_domain->jp, x,x1,x2,x3,x4) ; - NDSET2D(m_domain->jp, y,y1,y2,y3,y4) ; - NDSET2D(m_domain->jp, xdot,fx1,fx2,fx3,fx4) ; - NDSET2D(m_domain->jp, ydot,fy1,fy2,fy3,fy4) ; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type ii) { - DEL_DOT_VEC_2D_BODY; - }); - - } - stopTimer(); - - DEL_DOT_VEC_2D_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in 
later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n DEL_DOT_VEC_2D : Unknown variant id = " << vid << std::endl; } } @@ -378,7 +216,7 @@ void DEL_DOT_VEC_2D::runKernel(VariantID vid) void DEL_DOT_VEC_2D::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_div, getRunSize()); + checksum[vid] += calcChecksum(m_div, m_array_length); } void DEL_DOT_VEC_2D::tearDown(VariantID vid) diff --git a/src/apps/DEL_DOT_VEC_2D.hpp b/src/apps/DEL_DOT_VEC_2D.hpp index d92273f44..5d7fcf360 100644 --- a/src/apps/DEL_DOT_VEC_2D.hpp +++ b/src/apps/DEL_DOT_VEC_2D.hpp @@ -13,13 +13,75 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// DEL_DOT_VEC_2D kernel reference implementation: +/// +/// for (Index_type ii = ibegin; ii < iend; ++ii ) { +/// Index_type i = real_zones[ii]; +/// +/// Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; +/// Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; +/// +/// Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; +/// Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; +/// +/// Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; +/// Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; +/// +/// Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; +/// Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; +/// +/// Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; +/// +/// Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; +/// +/// Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; +/// +/// Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / +/// ( y1[i] + y2[i] + y3[i] + y4[i] ) ; +/// +/// div[i] = dfxdx + dfydy + affine ; +/// } +/// #ifndef RAJAPerf_Apps_DEL_DOT_VEC_2D_HPP #define RAJAPerf_Apps_DEL_DOT_VEC_2D_HPP -#include 
"common/KernelBase.hpp" + +#define DEL_DOT_VEC_2D_DATA_INDEX \ + Index_ptr real_zones = m_domain->real_zones; + +#define DEL_DOT_VEC_2D_BODY_INDEX \ + Index_type i = real_zones[ii]; + +#define DEL_DOT_VEC_2D_BODY \ +\ + Real_type xi = half * ( x1[i] + x2[i] - x3[i] - x4[i] ) ; \ + Real_type xj = half * ( x2[i] + x3[i] - x4[i] - x1[i] ) ; \ + \ + Real_type yi = half * ( y1[i] + y2[i] - y3[i] - y4[i] ) ; \ + Real_type yj = half * ( y2[i] + y3[i] - y4[i] - y1[i] ) ; \ + \ + Real_type fxi = half * ( fx1[i] + fx2[i] - fx3[i] - fx4[i] ) ; \ + Real_type fxj = half * ( fx2[i] + fx3[i] - fx4[i] - fx1[i] ) ; \ + \ + Real_type fyi = half * ( fy1[i] + fy2[i] - fy3[i] - fy4[i] ) ; \ + Real_type fyj = half * ( fy2[i] + fy3[i] - fy4[i] - fy1[i] ) ; \ + \ + Real_type rarea = 1.0 / ( xi * yj - xj * yi + ptiny ) ; \ + \ + Real_type dfxdx = rarea * ( fxi * yj - fxj * yi ) ; \ + \ + Real_type dfydy = rarea * ( fyj * xi - fyi * xj ) ; \ + \ + Real_type affine = ( fy1[i] + fy2[i] + fy3[i] + fy4[i] ) / \ + ( y1[i] + y2[i] + y3[i] + y4[i] ) ; \ + \ + div[i] = dfxdx + dfydy + affine ; +#include "common/KernelBase.hpp" + namespace rajaperf { class RunParams; @@ -43,6 +105,9 @@ class DEL_DOT_VEC_2D : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; @@ -54,6 +119,7 @@ class DEL_DOT_VEC_2D : public KernelBase Real_type m_half; ADomain* m_domain; + Index_type m_array_length; }; } // end namespace apps diff --git a/src/apps/ENERGY-Cuda.cpp b/src/apps/ENERGY-Cuda.cpp new file mode 100644 index 000000000..e05ee6fd0 --- /dev/null +++ b/src/apps/ENERGY-Cuda.cpp @@ -0,0 +1,274 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. 
+// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ENERGY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define ENERGY_DATA_SETUP_CUDA \ + Real_ptr e_new; \ + Real_ptr e_old; \ + Real_ptr delvc; \ + Real_ptr p_new; \ + Real_ptr p_old; \ + Real_ptr q_new; \ + Real_ptr q_old; \ + Real_ptr work; \ + Real_ptr compHalfStep; \ + Real_ptr pHalfStep; \ + Real_ptr bvc; \ + Real_ptr pbvc; \ + Real_ptr ql_old; \ + Real_ptr qq_old; \ + Real_ptr vnewc; \ + const Real_type rho0 = m_rho0; \ + const Real_type e_cut = m_e_cut; \ + const Real_type emin = m_emin; \ + const Real_type q_cut = m_q_cut; \ +\ + allocAndInitCudaDeviceData(e_new, m_e_new, iend); \ + allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ + allocAndInitCudaDeviceData(delvc, m_delvc, iend); \ + allocAndInitCudaDeviceData(p_new, m_p_new, iend); \ + allocAndInitCudaDeviceData(p_old, m_p_old, iend); \ + allocAndInitCudaDeviceData(q_new, m_q_new, iend); \ + allocAndInitCudaDeviceData(q_old, m_q_old, iend); \ + allocAndInitCudaDeviceData(work, m_work, iend); \ + allocAndInitCudaDeviceData(compHalfStep, m_compHalfStep, iend); \ + allocAndInitCudaDeviceData(pHalfStep, m_pHalfStep, iend); \ + allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ + allocAndInitCudaDeviceData(pbvc, m_pbvc, iend); \ + allocAndInitCudaDeviceData(ql_old, m_ql_old, iend); \ + allocAndInitCudaDeviceData(qq_old, m_qq_old, iend); \ + allocAndInitCudaDeviceData(vnewc, m_vnewc, iend); + +#define ENERGY_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_e_new, e_new, iend); \ + getCudaDeviceData(m_q_new, q_new, iend); \ + deallocCudaDeviceData(e_new); \ + 
deallocCudaDeviceData(e_old); \ + deallocCudaDeviceData(delvc); \ + deallocCudaDeviceData(p_new); \ + deallocCudaDeviceData(p_old); \ + deallocCudaDeviceData(q_new); \ + deallocCudaDeviceData(q_old); \ + deallocCudaDeviceData(work); \ + deallocCudaDeviceData(compHalfStep); \ + deallocCudaDeviceData(pHalfStep); \ + deallocCudaDeviceData(bvc); \ + deallocCudaDeviceData(pbvc); \ + deallocCudaDeviceData(ql_old); \ + deallocCudaDeviceData(qq_old); \ + deallocCudaDeviceData(vnewc); + +__global__ void energycalc1(Real_ptr e_new, Real_ptr e_old, Real_ptr delvc, + Real_ptr p_old, Real_ptr q_old, Real_ptr work, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ENERGY_BODY1; + } +} + +__global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, + Real_ptr compHalfStep, Real_ptr pHalfStep, + Real_ptr e_new, Real_ptr bvc, Real_ptr pbvc, + Real_ptr ql_old, Real_ptr qq_old, + Real_type rho0, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ENERGY_BODY2; + } +} + +__global__ void energycalc3(Real_ptr e_new, Real_ptr delvc, + Real_ptr p_old, Real_ptr q_old, + Real_ptr pHalfStep, Real_ptr q_new, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ENERGY_BODY3; + } +} + +__global__ void energycalc4(Real_ptr e_new, Real_ptr work, + Real_type e_cut, Real_type emin, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ENERGY_BODY4; + } +} + +__global__ void energycalc5(Real_ptr delvc, + Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, + Real_ptr bvc, Real_ptr p_new, + Real_ptr ql_old, Real_ptr qq_old, + Real_ptr p_old, Real_ptr q_old, + Real_ptr pHalfStep, Real_ptr q_new, + Real_type rho0, Real_type e_cut, Real_type emin, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ENERGY_BODY5; + } +} + +__global__ void energycalc6(Real_ptr delvc, + Real_ptr pbvc, 
Real_ptr e_new, Real_ptr vnewc, + Real_ptr bvc, Real_ptr p_new, + Real_ptr q_new, + Real_ptr ql_old, Real_ptr qq_old, + Real_type rho0, Real_type q_cut, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ENERGY_BODY6; + } +} + + +void ENERGY::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + ENERGY_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + energycalc1<<>>( e_new, e_old, delvc, + p_old, q_old, work, + iend ); + + energycalc2<<>>( delvc, q_new, + compHalfStep, pHalfStep, + e_new, bvc, pbvc, + ql_old, qq_old, + rho0, + iend ); + + energycalc3<<>>( e_new, delvc, + p_old, q_old, + pHalfStep, q_new, + iend ); + + energycalc4<<>>( e_new, work, + e_cut, emin, + iend ); + + energycalc5<<>>( delvc, + pbvc, e_new, vnewc, + bvc, p_new, + ql_old, qq_old, + p_old, q_old, + pHalfStep, q_new, + rho0, e_cut, emin, + iend ); + + energycalc6<<>>( delvc, + pbvc, e_new, vnewc, + bvc, p_new, + q_new, + ql_old, qq_old, + rho0, q_cut, + iend ); + + } + stopTimer(); + + ENERGY_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + ENERGY_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ENERGY_BODY1; + }); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ENERGY_BODY2; + }); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ENERGY_BODY3; + }); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ENERGY_BODY4; + }); + + RAJA::forall< RAJA::cuda_exec >( + 
RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ENERGY_BODY5; + }); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ENERGY_BODY6; + }); + + } + stopTimer(); + + ENERGY_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n ENERGY : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/ENERGY-OMPTarget.cpp b/src/apps/ENERGY-OMPTarget.cpp new file mode 100644 index 000000000..192dcc59b --- /dev/null +++ b/src/apps/ENERGY-OMPTarget.cpp @@ -0,0 +1,207 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ENERGY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define ENERGY_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr e_new; \ + Real_ptr e_old; \ + Real_ptr delvc; \ + Real_ptr p_new; \ + Real_ptr p_old; \ + Real_ptr q_new; \ + Real_ptr q_old; \ + Real_ptr work; \ + Real_ptr compHalfStep; \ + Real_ptr pHalfStep; \ + Real_ptr bvc; \ + Real_ptr pbvc; \ + Real_ptr ql_old; \ + Real_ptr qq_old; \ + Real_ptr vnewc; \ + const Real_type rho0 = m_rho0; \ + const Real_type e_cut = m_e_cut; \ + const Real_type emin = m_emin; \ + const Real_type q_cut = m_q_cut; \ +\ + allocAndInitOpenMPDeviceData(e_new, m_e_new, iend, did, hid); \ + allocAndInitOpenMPDeviceData(e_old, m_e_old, iend, did, hid); \ + allocAndInitOpenMPDeviceData(delvc, m_delvc, iend, did, hid); \ + allocAndInitOpenMPDeviceData(p_new, m_p_new, iend, did, hid); \ + allocAndInitOpenMPDeviceData(p_old, m_p_old, iend, did, hid); \ + allocAndInitOpenMPDeviceData(q_new, m_q_new, iend, did, hid); \ + allocAndInitOpenMPDeviceData(q_old, m_q_old, iend, did, hid); \ + allocAndInitOpenMPDeviceData(work, m_work, iend, did, hid); \ + allocAndInitOpenMPDeviceData(compHalfStep, m_compHalfStep, iend, did, hid); \ + allocAndInitOpenMPDeviceData(pHalfStep, m_pHalfStep, iend, did, hid); \ + allocAndInitOpenMPDeviceData(bvc, m_bvc, iend, did, hid); \ + allocAndInitOpenMPDeviceData(pbvc, m_pbvc, iend, did, hid); \ + allocAndInitOpenMPDeviceData(ql_old, m_ql_old, iend, did, hid); \ + allocAndInitOpenMPDeviceData(qq_old, m_qq_old, iend, did, hid); \ + allocAndInitOpenMPDeviceData(vnewc, m_vnewc, iend, did, hid); + +#define ENERGY_DATA_TEARDOWN_OMP_TARGET \ + 
getOpenMPDeviceData(m_e_new, e_new, iend, hid, did); \ + getOpenMPDeviceData(m_q_new, q_new, iend, hid, did); \ + deallocOpenMPDeviceData(e_new, did); \ + deallocOpenMPDeviceData(e_old, did); \ + deallocOpenMPDeviceData(delvc, did); \ + deallocOpenMPDeviceData(p_new, did); \ + deallocOpenMPDeviceData(p_old, did); \ + deallocOpenMPDeviceData(q_new, did); \ + deallocOpenMPDeviceData(q_old, did); \ + deallocOpenMPDeviceData(work, did); \ + deallocOpenMPDeviceData(compHalfStep, did); \ + deallocOpenMPDeviceData(pHalfStep, did); \ + deallocOpenMPDeviceData(bvc, did); \ + deallocOpenMPDeviceData(pbvc, did); \ + deallocOpenMPDeviceData(ql_old, did); \ + deallocOpenMPDeviceData(qq_old, did); \ + deallocOpenMPDeviceData(vnewc, did); + +void ENERGY::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + ENERGY_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(e_new, e_old, delvc, \ + p_old, q_old, work) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY1; + } + + #pragma omp target is_device_ptr(delvc, q_new, compHalfStep, \ + pHalfStep, e_new, bvc, pbvc, \ + ql_old, qq_old) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY2; + } + + #pragma omp target is_device_ptr(e_new, delvc, p_old, \ + q_old, pHalfStep, q_new) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY3; + } + + #pragma omp target is_device_ptr(e_new, work) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for 
(Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY4; + } + + #pragma omp target is_device_ptr(delvc, pbvc, e_new, vnewc, \ + bvc, p_new, ql_old, qq_old, \ + p_old, q_old, pHalfStep, q_new) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY5; + } + + #pragma omp target is_device_ptr(delvc, pbvc, e_new, vnewc, \ + bvc, p_new, q_new, ql_old, qq_old) \ + device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + ENERGY_BODY6; + } + + } + stopTimer(); + + ENERGY_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + ENERGY_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + ENERGY_BODY1; + }); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + ENERGY_BODY2; + }); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + ENERGY_BODY3; + }); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + ENERGY_BODY4; + }); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + ENERGY_BODY5; + }); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + ENERGY_BODY6; + }); + + } + stopTimer(); + + ENERGY_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n ENERGY : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/ENERGY.cpp b/src/apps/ENERGY.cpp index e956b466e..7283ce62c 100644 --- a/src/apps/ENERGY.cpp +++ b/src/apps/ENERGY.cpp @@ -13,90 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// ENERGY kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// 
e_new[i] = e_old[i] - 0.5 * delvc[i] * -/// (p_old[i] + q_old[i]) + 0.5 * work[i]; -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// if ( delvc[i] > 0.0 ) { -/// q_new[i] = 0.0 ; -/// } -/// else { -/// Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; -/// Real_type ssc = ( pbvc[i] * e_new[i] -/// + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; -/// if ( ssc <= 0.1111111e-36 ) { -/// ssc = 0.3333333e-18 ; -/// } else { -/// ssc = sqrt(ssc) ; -/// } -/// q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; -/// } -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// e_new[i] = e_new[i] + 0.5 * delvc[i] -/// * ( 3.0*(p_old[i] + q_old[i]) -/// - 4.0*(pHalfStep[i] + q_new[i])) ; -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// e_new[i] += 0.5 * work[i]; -/// if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } -/// if ( e_new[i] < emin ) { e_new[i] = emin ; } -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// Real_type q_tilde ; -/// if (delvc[i] > 0.0) { -/// q_tilde = 0. 
; -/// } -/// else { -/// Real_type ssc = ( pbvc[i] * e_new[i] -/// + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; -/// if ( ssc <= 0.1111111e-36 ) { -/// ssc = 0.3333333e-18 ; -/// } else { -/// ssc = sqrt(ssc) ; -/// } -/// q_tilde = (ssc*ql_old[i] + qq_old[i]) ; -/// } -/// e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) -/// - 8.0*(pHalfStep[i] + q_new[i]) -/// + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; -/// if ( fabs(e_new[i]) < e_cut ) { -/// e_new[i] = 0.0 ; -/// } -/// if ( e_new[i] < emin ) { -/// e_new[i] = emin ; -/// } -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// if ( delvc[i] <= 0.0 ) { -/// Real_type ssc = ( pbvc[i] * e_new[i] -/// + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; -/// if ( ssc <= 0.1111111e-36 ) { -/// ssc = 0.3333333e-18 ; -/// } else { -/// ssc = sqrt(ssc) ; -/// } -/// q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; -/// if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; -/// } -/// } -/// - #include "ENERGY.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -104,7 +26,7 @@ namespace rajaperf namespace apps { -#define ENERGY_DATA \ +#define ENERGY_DATA_SETUP_CPU \ ResReal_ptr e_new = m_e_new; \ ResReal_ptr e_old = m_e_old; \ ResReal_ptr delvc = m_delvc; \ @@ -126,214 +48,6 @@ namespace apps const Real_type q_cut = m_q_cut; -#define ENERGY_BODY1 \ - e_new[i] = e_old[i] - 0.5 * delvc[i] * \ - (p_old[i] + q_old[i]) + 0.5 * work[i]; - -#define ENERGY_BODY2 \ - if ( delvc[i] > 0.0 ) { \ - q_new[i] = 0.0 ; \ - } \ - else { \ - Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; \ - Real_type ssc = ( pbvc[i] * e_new[i] \ - + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; \ - if ( ssc <= 0.1111111e-36 ) { \ - ssc = 0.3333333e-18 ; \ - } else { \ - ssc = sqrt(ssc) ; \ - } \ - q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; \ - } - -#define ENERGY_BODY3 \ - e_new[i] = e_new[i] + 0.5 * delvc[i] \ - * ( 3.0*(p_old[i] + q_old[i]) \ - - 
4.0*(pHalfStep[i] + q_new[i])) ; - -#define ENERGY_BODY4 \ - e_new[i] += 0.5 * work[i]; \ - if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } \ - if ( e_new[i] < emin ) { e_new[i] = emin ; } - -#define ENERGY_BODY5 \ - Real_type q_tilde ; \ - if (delvc[i] > 0.0) { \ - q_tilde = 0. ; \ - } \ - else { \ - Real_type ssc = ( pbvc[i] * e_new[i] \ - + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; \ - if ( ssc <= 0.1111111e-36 ) { \ - ssc = 0.3333333e-18 ; \ - } else { \ - ssc = sqrt(ssc) ; \ - } \ - q_tilde = (ssc*ql_old[i] + qq_old[i]) ; \ - } \ - e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) \ - - 8.0*(pHalfStep[i] + q_new[i]) \ - + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; \ - if ( fabs(e_new[i]) < e_cut ) { \ - e_new[i] = 0.0 ; \ - } \ - if ( e_new[i] < emin ) { \ - e_new[i] = emin ; \ - } - -#define ENERGY_BODY6 \ - if ( delvc[i] <= 0.0 ) { \ - Real_type ssc = ( pbvc[i] * e_new[i] \ - + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; \ - if ( ssc <= 0.1111111e-36 ) { \ - ssc = 0.3333333e-18 ; \ - } else { \ - ssc = sqrt(ssc) ; \ - } \ - q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; \ - if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; \ - } - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define ENERGY_DATA_SETUP_CUDA \ - Real_ptr e_new; \ - Real_ptr e_old; \ - Real_ptr delvc; \ - Real_ptr p_new; \ - Real_ptr p_old; \ - Real_ptr q_new; \ - Real_ptr q_old; \ - Real_ptr work; \ - Real_ptr compHalfStep; \ - Real_ptr pHalfStep; \ - Real_ptr bvc; \ - Real_ptr pbvc; \ - Real_ptr ql_old; \ - Real_ptr qq_old; \ - Real_ptr vnewc; \ - const Real_type rho0 = m_rho0; \ - const Real_type e_cut = m_e_cut; \ - const Real_type emin = m_emin; \ - const Real_type q_cut = m_q_cut; \ -\ - allocAndInitCudaDeviceData(e_new, m_e_new, iend); \ - allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ - allocAndInitCudaDeviceData(delvc, m_delvc, iend); \ - allocAndInitCudaDeviceData(p_new, m_p_new, iend); \ - 
allocAndInitCudaDeviceData(p_old, m_p_old, iend); \ - allocAndInitCudaDeviceData(q_new, m_q_new, iend); \ - allocAndInitCudaDeviceData(q_old, m_q_old, iend); \ - allocAndInitCudaDeviceData(work, m_work, iend); \ - allocAndInitCudaDeviceData(compHalfStep, m_compHalfStep, iend); \ - allocAndInitCudaDeviceData(pHalfStep, m_pHalfStep, iend); \ - allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ - allocAndInitCudaDeviceData(pbvc, m_pbvc, iend); \ - allocAndInitCudaDeviceData(ql_old, m_ql_old, iend); \ - allocAndInitCudaDeviceData(qq_old, m_qq_old, iend); \ - allocAndInitCudaDeviceData(vnewc, m_vnewc, iend); - -#define ENERGY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_e_new, e_new, iend); \ - getCudaDeviceData(m_q_new, q_new, iend); \ - deallocCudaDeviceData(e_new); \ - deallocCudaDeviceData(e_old); \ - deallocCudaDeviceData(delvc); \ - deallocCudaDeviceData(p_new); \ - deallocCudaDeviceData(p_old); \ - deallocCudaDeviceData(q_new); \ - deallocCudaDeviceData(q_old); \ - deallocCudaDeviceData(work); \ - deallocCudaDeviceData(compHalfStep); \ - deallocCudaDeviceData(pHalfStep); \ - deallocCudaDeviceData(bvc); \ - deallocCudaDeviceData(pbvc); \ - deallocCudaDeviceData(ql_old); \ - deallocCudaDeviceData(qq_old); \ - deallocCudaDeviceData(vnewc); - -__global__ void energycalc1(Real_ptr e_new, Real_ptr e_old, Real_ptr delvc, - Real_ptr p_old, Real_ptr q_old, Real_ptr work, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ENERGY_BODY1; - } -} - -__global__ void energycalc2(Real_ptr delvc, Real_ptr q_new, - Real_ptr compHalfStep, Real_ptr pHalfStep, - Real_ptr e_new, Real_ptr bvc, Real_ptr pbvc, - Real_ptr ql_old, Real_ptr qq_old, - Real_type rho0, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ENERGY_BODY2; - } -} - -__global__ void energycalc3(Real_ptr e_new, Real_ptr delvc, - Real_ptr p_old, Real_ptr q_old, - Real_ptr pHalfStep, Real_ptr q_new, - Index_type iend) -{ - Index_type 
i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ENERGY_BODY3; - } -} - -__global__ void energycalc4(Real_ptr e_new, Real_ptr work, - Real_type e_cut, Real_type emin, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ENERGY_BODY4; - } -} - -__global__ void energycalc5(Real_ptr delvc, - Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, - Real_ptr bvc, Real_ptr p_new, - Real_ptr ql_old, Real_ptr qq_old, - Real_ptr p_old, Real_ptr q_old, - Real_ptr pHalfStep, Real_ptr q_new, - Real_type rho0, Real_type e_cut, Real_type emin, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ENERGY_BODY5; - } -} - -__global__ void energycalc6(Real_ptr delvc, - Real_ptr pbvc, Real_ptr e_new, Real_ptr vnewc, - Real_ptr bvc, Real_ptr p_new, - Real_ptr q_new, - Real_ptr ql_old, Real_ptr qq_old, - Real_type rho0, Real_type q_cut, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ENERGY_BODY6; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - - ENERGY::ENERGY(const RunParams& params) : KernelBase(rajaperf::Apps_ENERGY, params) { @@ -347,12 +61,12 @@ ENERGY::~ENERGY() void ENERGY::setUp(VariantID vid) { - allocAndInitData(m_e_new, getRunSize(), vid); + allocAndInitDataConst(m_e_new, getRunSize(), 0.0, vid); allocAndInitData(m_e_old, getRunSize(), vid); allocAndInitData(m_delvc, getRunSize(), vid); allocAndInitData(m_p_new, getRunSize(), vid); allocAndInitData(m_p_old, getRunSize(), vid); - allocAndInitData(m_q_new, getRunSize(), vid); + allocAndInitDataConst(m_q_new, getRunSize(), 0.0, vid); allocAndInitData(m_q_old, getRunSize(), vid); allocAndInitData(m_work, getRunSize(), vid); allocAndInitData(m_compHalfStep, getRunSize(), vid); @@ -379,7 +93,7 @@ void ENERGY::runKernel(VariantID vid) case Base_Seq : { - ENERGY_DATA; + ENERGY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -416,32 +130,38 @@ 
void ENERGY::runKernel(VariantID vid) case RAJA_Seq : { - ENERGY_DATA; + ENERGY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY1; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY2; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY3; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY4; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY5; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY6; }); @@ -454,83 +174,48 @@ void ENERGY::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - ENERGY_DATA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - #pragma omp parallel - { - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY1; - } - - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY2; - } - - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY3; - } - - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY4; - } - - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY5; - } - - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - ENERGY_BODY6; - } - } // omp parallel - - } - stopTimer(); - - break; - } - - case RAJALike_OpenMP : { +// +// NOTE: This kernel should be written to have an OpenMP parallel +// region around it and 
then use an OpenMP for-nowait for +// each loop inside it. We currently don't have a clean way to +// do this in RAJA. So, the base OpenMP variant is coded the +// way it is to be able to do an "apples to apples" comparison. +// +// This will be changed in the future when the required feature +// is added to RAJA. +// - ENERGY_DATA; + ENERGY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY1; } - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY2; } - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY3; } - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY4; } - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY5; } - #pragma omp parallel for schedule(static) + #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { ENERGY_BODY6; } @@ -543,32 +228,38 @@ void ENERGY::runKernel(VariantID vid) case RAJA_OpenMP : { - ENERGY_DATA; + ENERGY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY1; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY2; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY3; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY4; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + 
RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY5; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { ENERGY_BODY6; }); @@ -578,122 +269,26 @@ void ENERGY::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - ENERGY_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - energycalc1<<>>( e_new, e_old, delvc, - p_old, q_old, work, - iend ); - - energycalc2<<>>( delvc, q_new, - compHalfStep, pHalfStep, - e_new, bvc, pbvc, - ql_old, qq_old, - rho0, - iend ); - - energycalc3<<>>( e_new, delvc, - p_old, q_old, - pHalfStep, q_new, - iend ); - - energycalc4<<>>( e_new, work, - e_cut, emin, - iend ); - - energycalc5<<>>( delvc, - pbvc, e_new, vnewc, - bvc, p_new, - ql_old, qq_old, - p_old, q_old, - pHalfStep, q_new, - rho0, e_cut, emin, - iend ); - - energycalc6<<>>( delvc, - pbvc, e_new, vnewc, - bvc, p_new, - q_new, - ql_old, qq_old, - rho0, q_cut, - iend ); - - } - stopTimer(); - - ENERGY_DATA_TEARDOWN_CUDA; - - } - - case RAJA_CUDA : { - - ENERGY_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ENERGY_BODY1; - }); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ENERGY_BODY2; - }); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ENERGY_BODY3; - }); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ENERGY_BODY4; - }); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ENERGY_BODY5; - }); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ENERGY_BODY6; - }); - - } - stopTimer(); - - ENERGY_DATA_TEARDOWN_CUDA; - +#if 
defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n ENERGY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/ENERGY.hpp b/src/apps/ENERGY.hpp index a45742aa8..c959fede3 100644 --- a/src/apps/ENERGY.hpp +++ b/src/apps/ENERGY.hpp @@ -13,12 +13,158 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// ENERGY kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// e_new[i] = e_old[i] - 0.5 * delvc[i] * +/// (p_old[i] + q_old[i]) + 0.5 * work[i]; +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// if ( delvc[i] > 0.0 ) { +/// q_new[i] = 0.0 ; +/// } +/// else { +/// Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; +/// Real_type ssc = ( pbvc[i] * e_new[i] +/// + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; +/// if ( ssc <= 0.1111111e-36 ) { +/// ssc = 0.3333333e-18 ; +/// } else { +/// ssc = sqrt(ssc) ; +/// } +/// q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; +/// } +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// e_new[i] = e_new[i] + 0.5 * delvc[i] +/// * ( 3.0*(p_old[i] + q_old[i]) +/// - 4.0*(pHalfStep[i] + q_new[i])) ; +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// e_new[i] += 0.5 * work[i]; +/// if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } +/// if ( e_new[i] < emin ) { e_new[i] = emin ; } +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// Real_type q_tilde ; +/// if (delvc[i] > 0.0) { +/// q_tilde = 0. 
; +/// } +/// else { +/// Real_type ssc = ( pbvc[i] * e_new[i] +/// + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; +/// if ( ssc <= 0.1111111e-36 ) { +/// ssc = 0.3333333e-18 ; +/// } else { +/// ssc = sqrt(ssc) ; +/// } +/// q_tilde = (ssc*ql_old[i] + qq_old[i]) ; +/// } +/// e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) +/// - 8.0*(pHalfStep[i] + q_new[i]) +/// + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; +/// if ( fabs(e_new[i]) < e_cut ) { +/// e_new[i] = 0.0 ; +/// } +/// if ( e_new[i] < emin ) { +/// e_new[i] = emin ; +/// } +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// if ( delvc[i] <= 0.0 ) { +/// Real_type ssc = ( pbvc[i] * e_new[i] +/// + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; +/// if ( ssc <= 0.1111111e-36 ) { +/// ssc = 0.3333333e-18 ; +/// } else { +/// ssc = sqrt(ssc) ; +/// } +/// q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; +/// if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; +/// } +/// } +/// #ifndef RAJAPerf_Apps_ENERGY_HPP #define RAJAPerf_Apps_ENERGY_HPP -#include "common/KernelBase.hpp" +#define ENERGY_BODY1 \ + e_new[i] = e_old[i] - 0.5 * delvc[i] * \ + (p_old[i] + q_old[i]) + 0.5 * work[i]; + +#define ENERGY_BODY2 \ + if ( delvc[i] > 0.0 ) { \ + q_new[i] = 0.0 ; \ + } \ + else { \ + Real_type vhalf = 1.0 / (1.0 + compHalfStep[i]) ; \ + Real_type ssc = ( pbvc[i] * e_new[i] \ + + vhalf * vhalf * bvc[i] * pHalfStep[i] ) / rho0 ; \ + if ( ssc <= 0.1111111e-36 ) { \ + ssc = 0.3333333e-18 ; \ + } else { \ + ssc = sqrt(ssc) ; \ + } \ + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; \ + } + +#define ENERGY_BODY3 \ + e_new[i] = e_new[i] + 0.5 * delvc[i] \ + * ( 3.0*(p_old[i] + q_old[i]) \ + - 4.0*(pHalfStep[i] + q_new[i])) ; + +#define ENERGY_BODY4 \ + e_new[i] += 0.5 * work[i]; \ + if ( fabs(e_new[i]) < e_cut ) { e_new[i] = 0.0 ; } \ + if ( e_new[i] < emin ) { e_new[i] = emin ; } + +#define ENERGY_BODY5 \ + Real_type q_tilde ; \ + if (delvc[i] > 0.0) { \ + q_tilde = 0. 
; \ + } \ + else { \ + Real_type ssc = ( pbvc[i] * e_new[i] \ + + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; \ + if ( ssc <= 0.1111111e-36 ) { \ + ssc = 0.3333333e-18 ; \ + } else { \ + ssc = sqrt(ssc) ; \ + } \ + q_tilde = (ssc*ql_old[i] + qq_old[i]) ; \ + } \ + e_new[i] = e_new[i] - ( 7.0*(p_old[i] + q_old[i]) \ + - 8.0*(pHalfStep[i] + q_new[i]) \ + + (p_new[i] + q_tilde)) * delvc[i] / 6.0 ; \ + if ( fabs(e_new[i]) < e_cut ) { \ + e_new[i] = 0.0 ; \ + } \ + if ( e_new[i] < emin ) { \ + e_new[i] = emin ; \ + } + +#define ENERGY_BODY6 \ + if ( delvc[i] <= 0.0 ) { \ + Real_type ssc = ( pbvc[i] * e_new[i] \ + + vnewc[i] * vnewc[i] * bvc[i] * p_new[i] ) / rho0 ; \ + if ( ssc <= 0.1111111e-36 ) { \ + ssc = 0.3333333e-18 ; \ + } else { \ + ssc = sqrt(ssc) ; \ + } \ + q_new[i] = (ssc*ql_old[i] + qq_old[i]) ; \ + if (fabs(q_new[i]) < q_cut) q_new[i] = 0.0 ; \ + } + + +#include "common/KernelBase.hpp" namespace rajaperf { @@ -40,6 +186,9 @@ class ENERGY : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_e_new; Real_ptr m_e_old; diff --git a/src/apps/FIR-Cuda.cpp b/src/apps/FIR-Cuda.cpp new file mode 100644 index 000000000..d18545412 --- /dev/null +++ b/src/apps/FIR-Cuda.cpp @@ -0,0 +1,166 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace apps +{ + +#define USE_CUDA_CONSTANT_MEMORY +//#undef USE_CUDA_CONSTANT_MEMORY + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#if defined(USE_CUDA_CONSTANT_MEMORY) + +__constant__ Real_type coeff[FIR_COEFFLEN]; + +#define FIR_DATA_SETUP_CUDA \ + Real_ptr in; \ + Real_ptr out; \ +\ + const Index_type coefflen = m_coefflen; \ +\ + allocAndInitCudaDeviceData(in, m_in, getRunSize()); \ + allocAndInitCudaDeviceData(out, m_out, getRunSize()); \ + cudaMemcpyToSymbol(coeff, coeff_array, FIR_COEFFLEN * sizeof(Real_type)); + + +#define FIR_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_out, out, getRunSize()); \ + deallocCudaDeviceData(in); \ + deallocCudaDeviceData(out); + +__global__ void fir(Real_ptr out, Real_ptr in, + const Index_type coefflen, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + FIR_BODY; + } +} + +#else // use global memry for coefficients + +#define FIR_DATA_SETUP_CUDA \ + Real_ptr in; \ + Real_ptr out; \ + Real_ptr coeff; \ +\ + const Index_type coefflen = m_coefflen; \ +\ + allocAndInitCudaDeviceData(in, m_in, getRunSize()); \ + allocAndInitCudaDeviceData(out, m_out, getRunSize()); \ + Real_ptr tcoeff = &coeff_array[0]; \ + allocAndInitCudaDeviceData(coeff, tcoeff, FIR_COEFFLEN); + + +#define FIR_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_out, out, getRunSize()); \ + deallocCudaDeviceData(in); \ + deallocCudaDeviceData(out); \ + deallocCudaDeviceData(coeff); + +__global__ void fir(Real_ptr out, Real_ptr in, + Real_ptr coeff, + const Index_type coefflen, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + FIR_BODY; + } +} + +#endif + + +void 
FIR::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize() - m_coefflen; + + if ( vid == Base_CUDA ) { + + FIR_COEFF; + + FIR_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + +#if defined(USE_CUDA_CONSTANT_MEMORY) + fir<<>>( out, in, + coefflen, + iend ); +#else + fir<<>>( out, in, + coeff, + coefflen, + iend ); +#endif + + } + stopTimer(); + + FIR_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + FIR_COEFF; + + FIR_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + FIR_BODY; + }); + + } + stopTimer(); + + FIR_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n FIR : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/FIR-OMPTarget.cpp b/src/apps/FIR-OMPTarget.cpp new file mode 100644 index 000000000..f61069e82 --- /dev/null +++ b/src/apps/FIR-OMPTarget.cpp @@ -0,0 +1,113 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIR.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define FIR_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr in; \ + Real_ptr out; \ + Real_ptr coeff; \ +\ + const Index_type coefflen = m_coefflen; \ +\ + allocAndInitOpenMPDeviceData(in, m_in, getRunSize(), did, hid); \ + allocAndInitOpenMPDeviceData(out, m_out, getRunSize(), did, hid); \ + Real_ptr tcoeff = &coeff_array[0]; \ + allocAndInitOpenMPDeviceData(coeff, tcoeff, FIR_COEFFLEN, did, hid); + + +#define FIR_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_out, out, getRunSize(), hid, did); \ + deallocOpenMPDeviceData(in, did); \ + deallocOpenMPDeviceData(out, did); \ + deallocOpenMPDeviceData(coeff, did); + + +void FIR::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize() - m_coefflen; + + if ( vid == Base_OpenMPTarget ) { + + FIR_COEFF; + + FIR_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(in, out, coeff) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + FIR_BODY; + } + + } + stopTimer(); + + FIR_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + FIR_COEFF; + + FIR_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + FIR_BODY; + }); + + } + stopTimer(); + + FIR_DATA_TEARDOWN_OMP_TARGET; + + } 
else { + std::cout << "\n FIR : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/FIR.cpp b/src/apps/FIR.cpp index 3cbf2b764..0d3e76ead 100644 --- a/src/apps/FIR.cpp +++ b/src/apps/FIR.cpp @@ -13,28 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// FIR kernel reference implementation: -/// -/// Real_type coeff[COEFFLEN] = { 3.0, -1.0, -1.0, -1.0, -/// -1.0, 3.0, -1.0, -1.0, -/// -1.0, -1.0, 3.0, -1.0, -/// -1.0, -1.0, -1.0, 3.0 }; -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// Real_type sum = 0.0; -/// for (Index_type j = 0; j < coefflen; ++j ) { -/// sum += coeff[j]*in[i+j]; -/// } -/// out[i] = sum; -/// } - #include "FIR.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include #include @@ -43,118 +27,24 @@ namespace rajaperf namespace apps { -#define USE_CONSTANT_MEMORY -//#undef USE_CONSTANT_MEMORY -#define COEFFLEN (16) - -#define FIR_COEFF \ - Real_type coeff_array[COEFFLEN] = { 3.0, -1.0, -1.0, -1.0, \ - -1.0, 3.0, -1.0, -1.0, \ - -1.0, -1.0, 3.0, -1.0, \ - -1.0, -1.0, -1.0, 3.0 }; - - -#define FIR_DATA \ +#define FIR_DATA_SETUP_CPU \ ResReal_ptr in = m_in; \ ResReal_ptr out = m_out; \ \ - Real_type coeff[COEFFLEN]; \ + Real_type coeff[FIR_COEFFLEN]; \ std::copy(std::begin(coeff_array), std::end(coeff_array), std::begin(coeff));\ \ const Index_type coefflen = m_coefflen; -#define FIR_BODY \ - Real_type sum = 0.0; \ -\ - for (Index_type j = 0; j < coefflen; ++j ) { \ - sum += coeff[j]*in[i+j]; \ - } \ - out[i] = sum; - - -#if defined(RAJA_ENABLE_CUDA) - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#if defined(USE_CONSTANT_MEMORY) - -__constant__ Real_type coeff[COEFFLEN]; - -#define FIR_DATA_SETUP_CUDA \ - Real_ptr in; \ - Real_ptr out; \ -\ - const Index_type 
coefflen = m_coefflen; \ -\ - allocAndInitCudaDeviceData(in, m_in, getRunSize()); \ - allocAndInitCudaDeviceData(out, m_out, getRunSize()); \ - cudaMemcpyToSymbol(coeff, coeff_array, COEFFLEN * sizeof(Real_type)); - - -#define FIR_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out, out, getRunSize()); \ - deallocCudaDeviceData(in); \ - deallocCudaDeviceData(out); - -__global__ void fir(Real_ptr out, Real_ptr in, - const Index_type coefflen, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - FIR_BODY; - } -} - -#else // use global memry for coefficients - -#define FIR_DATA_SETUP_CUDA \ - Real_ptr in; \ - Real_ptr out; \ - Real_ptr coeff; \ -\ - const Index_type coefflen = m_coefflen; \ -\ - allocAndInitCudaDeviceData(in, m_in, getRunSize()); \ - allocAndInitCudaDeviceData(out, m_out, getRunSize()); \ - Real_ptr tcoeff = &coeff_array[0]; \ - allocAndInitCudaDeviceData(coeff, tcoeff, COEFFLEN); - - -#define FIR_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out, out, getRunSize()); \ - deallocCudaDeviceData(in); \ - deallocCudaDeviceData(out); \ - deallocCudaDeviceData(coeff); - -__global__ void fir(Real_ptr out, Real_ptr in, - Real_ptr coeff, - const Index_type coefflen, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - FIR_BODY; - } -} - -#endif - -#endif // if defined(RAJA_ENABLE_CUDA) - - FIR::FIR(const RunParams& params) : KernelBase(rajaperf::Apps_FIR, params) { setDefaultSize(100000); setDefaultReps(1600); - m_coefflen = COEFFLEN; + m_coefflen = FIR_COEFFLEN; } FIR::~FIR() @@ -168,7 +58,7 @@ Index_type FIR::getItsPerRep() const { void FIR::setUp(VariantID vid) { allocAndInitData(m_in, getRunSize(), vid); - allocAndInitData(m_out, getRunSize(), vid); + allocAndInitDataConst(m_out, getRunSize(), 0.0, vid); } void FIR::runKernel(VariantID vid) @@ -183,7 +73,7 @@ void FIR::runKernel(VariantID vid) FIR_COEFF; - FIR_DATA; + FIR_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep 
= 0; irep < run_reps; ++irep) { @@ -202,12 +92,13 @@ void FIR::runKernel(VariantID vid) FIR_COEFF; - FIR_DATA; + FIR_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { FIR_BODY; }); @@ -222,10 +113,10 @@ void FIR::runKernel(VariantID vid) FIR_COEFF; - FIR_DATA; + FIR_DATA_SETUP_CPU; startTimer(); - for (RepIndex_type irep = ibegin; irep < run_reps; ++irep) { + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { #pragma omp parallel for for (Index_type i = ibegin; i < iend; ++i ) { @@ -238,21 +129,17 @@ void FIR::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // Not applicable... - break; - } - case RAJA_OpenMP : { FIR_COEFF; - FIR_DATA; + FIR_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { FIR_BODY; }); @@ -263,71 +150,26 @@ void FIR::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - FIR_COEFF; - - FIR_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - -#if defined(USE_CONSTANT_MEMORY) - fir<<>>( out, in, - coefflen, - iend ); -#else - fir<<>>( out, in, - coeff, - coefflen, - iend ); -#endif - - } - stopTimer(); - - FIR_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - FIR_COEFF; - - FIR_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - FIR_BODY; - }); - - } - stopTimer(); - - FIR_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case 
Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n FIR : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/FIR.hpp b/src/apps/FIR.hpp index 6824ceb63..6a347d0ce 100644 --- a/src/apps/FIR.hpp +++ b/src/apps/FIR.hpp @@ -13,13 +13,48 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// FIR kernel reference implementation: +/// +/// #define FIR_COEFFLEN (16) +/// +/// Real_type coeff[FIR_COEFFLEN] = { 3.0, -1.0, -1.0, -1.0, +/// -1.0, 3.0, -1.0, -1.0, +/// -1.0, -1.0, 3.0, -1.0, +/// -1.0, -1.0, -1.0, 3.0 }; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// Real_type sum = 0.0; +/// for (Index_type j = 0; j < coefflen; ++j ) { +/// sum += coeff[j]*in[i+j]; +/// } +/// out[i] = sum; +/// } +/// #ifndef RAJAPerf_Apps_FIR_HPP #define RAJAPerf_Apps_FIR_HPP -#include "common/KernelBase.hpp" + +#define FIR_COEFFLEN (16) + +#define FIR_COEFF \ + Real_type coeff_array[FIR_COEFFLEN] = { 3.0, -1.0, -1.0, -1.0, \ + -1.0, 3.0, -1.0, -1.0, \ + -1.0, -1.0, 3.0, -1.0, \ + -1.0, -1.0, -1.0, 3.0 }; + +#define FIR_BODY \ + Real_type sum = 0.0; \ +\ + for (Index_type j = 0; j < coefflen; ++j ) { \ + sum += coeff[j]*in[i+j]; \ + } \ + out[i] = sum; +#include "common/KernelBase.hpp" + namespace rajaperf { class RunParams; @@ -42,6 +77,9 @@ class FIR : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_in; Real_ptr m_out; diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp new file mode 100644 index 000000000..96a85df45 --- /dev/null +++ b/src/apps/LTIMES-Cuda.cpp @@ -0,0 +1,125 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +#define LTIMES_DATA_SETUP_CUDA \ + Real_ptr phidat; \ + Real_ptr elldat; \ + Real_ptr psidat; \ +\ + Index_type num_d = m_num_d; \ + Index_type num_z = m_num_z; \ + Index_type num_g = m_num_g; \ + Index_type num_m = m_num_m; \ +\ + allocAndInitCudaDeviceData(phidat, m_phidat, m_philen); \ + allocAndInitCudaDeviceData(elldat, m_elldat, m_elllen); \ + allocAndInitCudaDeviceData(psidat, m_psidat, m_psilen); + +#define LTIMES_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_phidat, phidat, m_philen); \ + deallocCudaDeviceData(phidat); \ + deallocCudaDeviceData(elldat); \ + deallocCudaDeviceData(psidat); + +__global__ void ltimes(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, + Index_type num_d, Index_type num_g, Index_type num_m) +{ + Index_type m = threadIdx.x; + Index_type g = blockIdx.y; + Index_type z = blockIdx.z; + + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } +} + + +void LTIMES::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_CUDA ) { + + LTIMES_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(num_m, 1, 1); + dim3 nblocks(1, num_g, num_z); + + ltimes<<>>(phidat, elldat, psidat, + num_d, num_g, num_m); + + } + stopTimer(); + + LTIMES_DATA_TEARDOWN_CUDA; + + } else 
if ( vid == RAJA_CUDA ) { + + LTIMES_DATA_SETUP_CUDA; + + LTIMES_VIEWS_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::CudaCollapse< + RAJA::nested::For<1, RAJA::cuda_block_z_exec>, //z + RAJA::nested::For<2, RAJA::cuda_block_y_exec>, //g + RAJA::nested::For<3, RAJA::cuda_thread_x_exec> >, //m + RAJA::nested::For<0, RAJA::cuda_loop_exec> >; //d + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=] __device__ (ID d, IZ z, IG g, IM m) { + LTIMES_BODY_RAJA; + }); + + } + stopTimer(); + + LTIMES_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n LTIMES : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/LTIMES-OMPTarget.cpp b/src/apps/LTIMES-OMPTarget.cpp new file mode 100644 index 000000000..725b01cf6 --- /dev/null +++ b/src/apps/LTIMES-OMPTarget.cpp @@ -0,0 +1,130 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define LTIMES_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr phidat; \ + Real_ptr elldat; \ + Real_ptr psidat; \ +\ + Index_type num_d = m_num_d; \ + Index_type num_z = m_num_z; \ + Index_type num_g = m_num_g; \ + Index_type num_m = m_num_m; \ +\ + allocAndInitOpenMPDeviceData(phidat, m_phidat, m_philen, did, hid); \ + allocAndInitOpenMPDeviceData(elldat, m_elldat, m_elllen, did, hid); \ + allocAndInitOpenMPDeviceData(psidat, m_psidat, m_psilen, did, hid); + +#define LTIMES_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_phidat, phidat, m_philen, hid, did); \ + deallocOpenMPDeviceData(phidat, did); \ + deallocOpenMPDeviceData(elldat, did); \ + deallocOpenMPDeviceData(psidat, did); + + +void LTIMES::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_OpenMPTarget ) { + + LTIMES_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(phidat, elldat, psidat) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(3) + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + } + } + } + + } + stopTimer(); + + LTIMES_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + +#if 0 // disabled until RAJA::nested::OmpTargetCollapse works. 
+ + LTIMES_DATA_SETUP_OMP_TARGET; + + LTIMES_VIEWS_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::OmpTargetCollapse< + RAJA::nested::For<1>, // z + RAJA::nested::For<2>, // g + RAJA::nested::For<3> >, // m + RAJA::nested::For<0, RAJA::loop_exec> >; // d + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=] (Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_BODY_RAJA; + }); + + } + stopTimer(); + + LTIMES_DATA_TEARDOWN_OMP_TARGET; + +#endif + + } else { + std::cout << "\n LTIMES : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/LTIMES.cpp b/src/apps/LTIMES.cpp new file mode 100644 index 000000000..e5470ccdf --- /dev/null +++ b/src/apps/LTIMES.cpp @@ -0,0 +1,229 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +#define LTIMES_DATA_SETUP_CPU \ + ResReal_ptr phidat = m_phidat; \ + ResReal_ptr elldat = m_elldat; \ + ResReal_ptr psidat = m_psidat; \ +\ + Index_type num_d = m_num_d; \ + Index_type num_z = m_num_z; \ + Index_type num_g = m_num_g; \ + Index_type num_m = m_num_m; + + +LTIMES::LTIMES(const RunParams& params) + : KernelBase(rajaperf::Apps_LTIMES, params) +{ + m_num_d_default = 64; + m_num_z_default = 500; + m_num_g_default = 32; + m_num_m_default = 25; + + setDefaultSize(m_num_d_default * m_num_m_default * + m_num_g_default * m_num_z_default); + setDefaultReps(50); +} + +LTIMES::~LTIMES() +{ +} + +void LTIMES::setUp(VariantID vid) +{ + m_num_z = run_params.getSizeFactor() * m_num_z_default; + m_num_g = m_num_g_default; + m_num_m = m_num_m_default; + m_num_d = m_num_d_default; + + m_philen = m_num_m * m_num_g * m_num_z; + m_elllen = m_num_d * m_num_m; + m_psilen = m_num_d * m_num_g * m_num_z; + + allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); + allocAndInitData(m_elldat, int(m_elllen), vid); + allocAndInitData(m_psidat, int(m_psilen), vid); +} + +void LTIMES::runKernel(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + switch ( vid ) { + + case Base_Seq : { + + LTIMES_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + LTIMES_DATA_SETUP_CPU; + + LTIMES_VIEWS_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::seq_exec>, + RAJA::nested::For<2, RAJA::seq_exec>, + 
RAJA::nested::For<3, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=](ID d, IZ z, IG g, IM m) { + LTIMES_BODY_RAJA; + }); + + } + stopTimer(); + + break; + } + +#if defined(RAJA_ENABLE_OPENMP) + case Base_OpenMP : { + + LTIMES_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_BODY; + } + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + LTIMES_DATA_SETUP_CPU; + + LTIMES_VIEWS_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::omp_parallel_for_exec>, + RAJA::nested::For<2, RAJA::seq_exec>, + RAJA::nested::For<3, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=](ID d, IZ z, IG g, IM m) { + LTIMES_BODY_RAJA; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n LTIMES : Unknown variant id = " << vid << std::endl; + } + + } +} + +void LTIMES::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_phidat, m_philen); +} + +void LTIMES::tearDown(VariantID vid) +{ + (void) vid; + + deallocData(m_phidat); 
+ deallocData(m_elldat); + deallocData(m_psidat); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp new file mode 100644 index 000000000..90b8695cc --- /dev/null +++ b/src/apps/LTIMES.hpp @@ -0,0 +1,133 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// LTIMES kernel reference implementation: +/// +/// for (Index_type z = 0; z < num_z; ++z ) { +/// for (Index_type g = 0; g < num_g; ++g ) { +/// for (Index_type m = 0; m < num_m; ++m ) { +/// for (Index_type d = 0; d < num_d; ++d ) { +/// +/// phi[m+ (g * num_m) + (z * num_m * num_g)] += +/// ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g)]; +/// +/// } +/// } +/// } +/// } +/// +/// The RAJA variants of this kernel use RAJA multi-dimensional data layouts +/// and views to do the same thing without explicit index calculations (see +/// the loop body definitions below).
+/// + +#ifndef RAJAPerf_Apps_LTIMES_HPP +#define RAJAPerf_Apps_LTIMES_HPP + + +#define LTIMES_BODY \ + phidat[m+ (g * num_m) + (z * num_m * num_g)] += \ + elldat[d+ (m * num_d)] * psidat[d+ (g * num_d) + (z * num_d * num_g)]; + +#define LTIMES_BODY_RAJA \ + phi(z, g, m) += ell(m, d) * psi(z, g, d); + + +#define LTIMES_VIEWS_RANGES_RAJA \ + using namespace ltimes_idx; \ +\ + using PSI_VIEW = RAJA::TypedView, IZ, IG, ID>; \ + using ELL_VIEW = RAJA::TypedView, IM, ID>; \ + using PHI_VIEW = RAJA::TypedView, IZ, IG, IM>; \ +\ + PSI_VIEW psi(psidat, \ + RAJA::make_permuted_layout( {num_z, num_g, num_d}, \ + RAJA::as_array >::get() ) ); \ + ELL_VIEW ell(elldat, \ + RAJA::make_permuted_layout( {num_m, num_d}, \ + RAJA::as_array >::get() ) ); \ + PHI_VIEW phi(phidat, \ + RAJA::make_permuted_layout( {num_z, num_g, num_m}, \ + RAJA::as_array >::get() ) ); \ +\ + using IDRange = RAJA::TypedRangeSegment; \ + using IZRange = RAJA::TypedRangeSegment; \ + using IGRange = RAJA::TypedRangeSegment; \ + using IMRange = RAJA::TypedRangeSegment; + + +#include "common/KernelBase.hpp" + +#include "RAJA/RAJA.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ + +// +// These index value types cannot be defined in function scope for +// RAJA CUDA variant to work. 
+// +namespace ltimes_idx { + RAJA_INDEX_VALUE(ID, "ID"); + RAJA_INDEX_VALUE(IZ, "IZ"); + RAJA_INDEX_VALUE(IG, "IG"); + RAJA_INDEX_VALUE(IM, "IM"); +} + +class LTIMES : public KernelBase +{ +public: + + LTIMES(const RunParams& params); + + ~LTIMES(); + + void setUp(VariantID vid); + void runKernel(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_phidat; + Real_ptr m_elldat; + Real_ptr m_psidat; + + Index_type m_num_d_default; + Index_type m_num_z_default; + Index_type m_num_g_default; + Index_type m_num_m_default; + + Index_type m_num_d; + Index_type m_num_z; + Index_type m_num_g; + Index_type m_num_m; + + Index_type m_philen; + Index_type m_elllen; + Index_type m_psilen; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp new file mode 100644 index 000000000..38dd8bb2a --- /dev/null +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -0,0 +1,125 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +#define LTIMES_NOVIEW_DATA_SETUP_CUDA \ + Real_ptr phidat; \ + Real_ptr elldat; \ + Real_ptr psidat; \ +\ + Index_type num_d = m_num_d; \ + Index_type num_z = m_num_z; \ + Index_type num_g = m_num_g; \ + Index_type num_m = m_num_m; \ +\ + allocAndInitCudaDeviceData(phidat, m_phidat, m_philen); \ + allocAndInitCudaDeviceData(elldat, m_elldat, m_elllen); \ + allocAndInitCudaDeviceData(psidat, m_psidat, m_psilen); + +#define LTIMES_NOVIEW_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_phidat, phidat, m_philen); \ + deallocCudaDeviceData(phidat); \ + deallocCudaDeviceData(elldat); \ + deallocCudaDeviceData(psidat); + +__global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, + Index_type num_d, Index_type num_g, Index_type num_m) +{ + Index_type m = threadIdx.x; + Index_type g = blockIdx.y; + Index_type z = blockIdx.z; + + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_NOVIEW_BODY; + } +} + + +void LTIMES_NOVIEW::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_CUDA ) { + + LTIMES_NOVIEW_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(num_m, 1, 1); + dim3 nblocks(1, num_g, num_z); + + ltimes_noview<<>>(phidat, elldat, psidat, + num_d, num_g, num_m); + + } + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + LTIMES_NOVIEW_DATA_SETUP_CUDA; + + LTIMES_NOVIEW_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::CudaCollapse< + RAJA::nested::For<1, RAJA::cuda_block_z_exec>, //z + RAJA::nested::For<2, RAJA::cuda_block_y_exec>, //g + RAJA::nested::For<3, RAJA::cuda_thread_x_exec> >, //m + 
RAJA::nested::For<0, RAJA::cuda_loop_exec> >; //d + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=] __device__ (Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n LTIMES_NOVIEW : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/LTIMES_NOVIEW-OMPTarget.cpp b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp new file mode 100644 index 000000000..581ef0b18 --- /dev/null +++ b/src/apps/LTIMES_NOVIEW-OMPTarget.cpp @@ -0,0 +1,155 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define LTIMES_NOVIEW_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr phidat; \ + Real_ptr elldat; \ + Real_ptr psidat; \ +\ + Index_type num_d = m_num_d; \ + Index_type num_z = m_num_z; \ + Index_type num_g = m_num_g; \ + Index_type num_m = m_num_m; \ +\ + allocAndInitOpenMPDeviceData(phidat, m_phidat, m_philen, did, hid); \ + allocAndInitOpenMPDeviceData(elldat, m_elldat, m_elllen, did, hid); \ + allocAndInitOpenMPDeviceData(psidat, m_psidat, m_psilen, did, hid); + +#define LTIMES_NOVIEW_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_phidat, phidat, m_philen, hid, did); \ + deallocOpenMPDeviceData(phidat, did); \ + deallocOpenMPDeviceData(elldat, did); \ + deallocOpenMPDeviceData(psidat, did); + + +void LTIMES_NOVIEW::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_OpenMPTarget ) { + + LTIMES_NOVIEW_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(phidat, elldat, psidat) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(3) + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_NOVIEW_BODY; + } + } + } + } + + } + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + +#if 1 // temporary implementation until RAJA::nested::OmpTargetCollapse works. 
+ + LTIMES_NOVIEW_DATA_SETUP_OMP_TARGET; + + LTIMES_NOVIEW_RANGES_RAJA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + IZRange(0, num_z), [=](Index_type z) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_NOVIEW_BODY; + } + } + } + }); + + } + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_OMP_TARGET; + +#else + + LTIMES_NOVIEW_DATA_SETUP_OMP_TARGET; + + LTIMES_NOVIEW_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::OmpTargetCollapse< + RAJA::nested::For<1>, // z + RAJA::nested::For<2>, // g + RAJA::nested::For<3> >, // m + RAJA::nested::For<0, RAJA::loop_exec> >; // d + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=] (Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + stopTimer(); + + LTIMES_NOVIEW_DATA_TEARDOWN_OMP_TARGET; + +#endif + + } else { + std::cout << "\n LTIMES_NOVIEW : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/LTIMES_NOVIEW.cpp b/src/apps/LTIMES_NOVIEW.cpp new file mode 100644 index 000000000..9254806b7 --- /dev/null +++ b/src/apps/LTIMES_NOVIEW.cpp @@ -0,0 +1,229 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "LTIMES_NOVIEW.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +#define LTIMES_NOVIEW_DATA_SETUP_CPU \ + ResReal_ptr phidat = m_phidat; \ + ResReal_ptr elldat = m_elldat; \ + ResReal_ptr psidat = m_psidat; \ +\ + Index_type num_d = m_num_d; \ + Index_type num_z = m_num_z; \ + Index_type num_g = m_num_g; \ + Index_type num_m = m_num_m; + + +LTIMES_NOVIEW::LTIMES_NOVIEW(const RunParams& params) + : KernelBase(rajaperf::Apps_LTIMES_NOVIEW, params) +{ + m_num_d_default = 64; + m_num_z_default = 500; + m_num_g_default = 32; + m_num_m_default = 25; + + setDefaultSize(m_num_d_default * m_num_m_default * + m_num_g_default * m_num_z_default); + setDefaultReps(50); +} + +LTIMES_NOVIEW::~LTIMES_NOVIEW() +{ +} + +void LTIMES_NOVIEW::setUp(VariantID vid) +{ + m_num_z = run_params.getSizeFactor() * m_num_z_default; + m_num_g = m_num_g_default; + m_num_m = m_num_m_default; + m_num_d = m_num_d_default; + + m_philen = m_num_m * m_num_g * m_num_z; + m_elllen = m_num_d * m_num_m; + m_psilen = m_num_d * m_num_g * m_num_z; + + allocAndInitDataConst(m_phidat, int(m_philen), Real_type(0.0), vid); + allocAndInitData(m_elldat, int(m_elllen), vid); + allocAndInitData(m_psidat, int(m_psilen), vid); +} + +void LTIMES_NOVIEW::runKernel(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + switch ( vid ) { + + case Base_Seq : { + + LTIMES_NOVIEW_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_NOVIEW_BODY; + } + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + LTIMES_NOVIEW_DATA_SETUP_CPU; + + LTIMES_NOVIEW_RANGES_RAJA; + + using EXEC_POL = 
RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::seq_exec>, + RAJA::nested::For<2, RAJA::seq_exec>, + RAJA::nested::For<3, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=](Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + stopTimer(); + + break; + } + +#if defined(RAJA_ENABLE_OPENMP) + case Base_OpenMP : { + + LTIMES_NOVIEW_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type z = 0; z < num_z; ++z ) { + for (Index_type g = 0; g < num_g; ++g ) { + for (Index_type m = 0; m < num_m; ++m ) { + for (Index_type d = 0; d < num_d; ++d ) { + LTIMES_NOVIEW_BODY; + } + } + } + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + LTIMES_NOVIEW_DATA_SETUP_CPU; + + LTIMES_NOVIEW_RANGES_RAJA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::omp_parallel_for_exec>, + RAJA::nested::For<2, RAJA::seq_exec>, + RAJA::nested::For<3, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(IDRange(0, num_d), + IZRange(0, num_z), + IGRange(0, num_g), + IMRange(0, num_m)), + [=](Index_type d, Index_type z, Index_type g, Index_type m) { + LTIMES_NOVIEW_BODY; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n LTIMES_NOVIEW : Unknown variant id = " << vid << std::endl; 
+ } + + } +} + +void LTIMES_NOVIEW::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_phidat, m_philen); +} + +void LTIMES_NOVIEW::tearDown(VariantID vid) +{ + (void) vid; + + deallocData(m_phidat); + deallocData(m_elldat); + deallocData(m_psidat); +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp new file mode 100644 index 000000000..4346bcf44 --- /dev/null +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -0,0 +1,96 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// LTIMES_NOVIEW kernel reference implementation: +/// +/// for (Index_type z = 0; z < num_z; ++z ) { +/// for (Index_type g = 0; g < num_g; ++g ) { +/// for (Index_type m = 0; m < num_m; ++m ) { +/// for (Index_type d = 0; d < num_d; ++d ) { +/// +/// phi[m+ (g * num_m) + (z * num_m * num_g)] += +/// ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g)]; +/// +/// } +/// } +/// } +/// } +/// + +#ifndef RAJAPerf_Apps_LTIMES_NOVIEW_HPP +#define RAJAPerf_Apps_LTIMES_NOVIEW_HPP + + +#define LTIMES_NOVIEW_BODY \ + phidat[m+ (g * num_m) + (z * num_m * num_g)] += \ + elldat[d+ (m * num_d)] * psidat[d+ (g * num_d) + (z * num_d * num_g)]; + +#define LTIMES_NOVIEW_RANGES_RAJA \ + using IDRange = RAJA::RangeSegment; \ + using IZRange = RAJA::RangeSegment; \ + using IGRange = RAJA::RangeSegment; \ + using IMRange = RAJA::RangeSegment; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ + +class LTIMES_NOVIEW : public KernelBase +{
+public: + + LTIMES_NOVIEW(const RunParams& params); + + ~LTIMES_NOVIEW(); + + void setUp(VariantID vid); + void runKernel(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_phidat; + Real_ptr m_elldat; + Real_ptr m_psidat; + + Index_type m_num_d_default; + Index_type m_num_z_default; + Index_type m_num_g_default; + Index_type m_num_m_default; + + Index_type m_num_d; + Index_type m_num_z; + Index_type m_num_g; + Index_type m_num_m; + + Index_type m_philen; + Index_type m_elllen; + Index_type m_psilen; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/apps/PRESSURE-Cuda.cpp b/src/apps/PRESSURE-Cuda.cpp new file mode 100644 index 000000000..4ce57c58a --- /dev/null +++ b/src/apps/PRESSURE-Cuda.cpp @@ -0,0 +1,144 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PRESSURE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define PRESSURE_DATA_SETUP_CUDA \ + Real_ptr compression; \ + Real_ptr bvc; \ + Real_ptr p_new; \ + Real_ptr e_old; \ + Real_ptr vnewc; \ + const Real_type cls = m_cls; \ + const Real_type p_cut = m_p_cut; \ + const Real_type pmin = m_pmin; \ + const Real_type eosvmax = m_eosvmax; \ +\ + allocAndInitCudaDeviceData(compression, m_compression, iend); \ + allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ + allocAndInitCudaDeviceData(p_new, m_p_new, iend); \ + allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ + allocAndInitCudaDeviceData(vnewc, m_vnewc, iend); + +#define PRESSURE_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_p_new, p_new, iend); \ + deallocCudaDeviceData(compression); \ + deallocCudaDeviceData(bvc); \ + deallocCudaDeviceData(p_new); \ + deallocCudaDeviceData(e_old); \ + deallocCudaDeviceData(vnewc); + +__global__ void pressurecalc1(Real_ptr bvc, Real_ptr compression, + const Real_type cls, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + PRESSURE_BODY1; + } +} + +__global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, + Real_ptr vnewc, + const Real_type p_cut, const Real_type eosvmax, + const Real_type pmin, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + PRESSURE_BODY2; + } +} + + +void PRESSURE::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + PRESSURE_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + 
+ const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + pressurecalc1<<>>( bvc, compression, + cls, + iend ); + + pressurecalc2<<>>( p_new, bvc, e_old, + vnewc, + p_cut, eosvmax, pmin, + iend ); + + } + stopTimer(); + + PRESSURE_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + PRESSURE_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PRESSURE_BODY1; + }); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PRESSURE_BODY2; + }); + + } + stopTimer(); + + PRESSURE_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n PRESSURE : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/PRESSURE-OMPTarget.cpp b/src/apps/PRESSURE-OMPTarget.cpp new file mode 100644 index 000000000..e501d2d96 --- /dev/null +++ b/src/apps/PRESSURE-OMPTarget.cpp @@ -0,0 +1,125 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PRESSURE.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define PRESSURE_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr compression; \ + Real_ptr bvc; \ + Real_ptr p_new; \ + Real_ptr e_old; \ + Real_ptr vnewc; \ + const Real_type cls = m_cls; \ + const Real_type p_cut = m_p_cut; \ + const Real_type pmin = m_pmin; \ + const Real_type eosvmax = m_eosvmax; \ +\ + allocAndInitOpenMPDeviceData(compression, m_compression, iend, did, hid); \ + allocAndInitOpenMPDeviceData(bvc, m_bvc, iend, did, hid); \ + allocAndInitOpenMPDeviceData(p_new, m_p_new, iend, did, hid); \ + allocAndInitOpenMPDeviceData(e_old, m_e_old, iend, did, hid); \ + allocAndInitOpenMPDeviceData(vnewc, m_vnewc, iend, did, hid); + +#define PRESSURE_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_p_new, p_new, iend, hid, did); \ + deallocOpenMPDeviceData(compression, did); \ + deallocOpenMPDeviceData(bvc, did); \ + deallocOpenMPDeviceData(p_new, did); \ + deallocOpenMPDeviceData(e_old, did); \ + deallocOpenMPDeviceData(vnewc, did); + + +void PRESSURE::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + PRESSURE_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(compression, bvc) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + PRESSURE_BODY1; + } + + #pragma omp target is_device_ptr(bvc, p_new, e_old, 
vnewc) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + PRESSURE_BODY2; + } + + } + stopTimer(); + + PRESSURE_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + PRESSURE_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + PRESSURE_BODY1; + }); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](int i) { + PRESSURE_BODY2; + }); + + } + stopTimer(); + + PRESSURE_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n PRESSURE : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/PRESSURE.cpp b/src/apps/PRESSURE.cpp index 260b87e17..a6bd24795 100644 --- a/src/apps/PRESSURE.cpp +++ b/src/apps/PRESSURE.cpp @@ -13,27 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// PRESSURE kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// bvc[i] = cls * (compression[i] + 1.0); -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// p_new[i] = bvc[i] * e_old[i] ; -/// if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; -/// if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; -/// if ( p_new[i] < pmin ) p_new[i] = pmin ; -/// } -/// - #include "PRESSURE.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -41,7 +26,7 @@ namespace rajaperf namespace apps { -#define PRESSURE_DATA \ +#define PRESSURE_DATA_SETUP_CPU \ ResReal_ptr compression = m_compression; \ ResReal_ptr bvc = m_bvc; \ ResReal_ptr p_new = m_p_new; \ @@ -53,74 +38,6 @@ namespace apps const Real_type eosvmax = m_eosvmax; -#define PRESSURE_BODY1 \ - bvc[i] = cls * (compression[i] + 1.0); 
- -#define PRESSURE_BODY2 \ - p_new[i] = bvc[i] * e_old[i] ; \ - if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; \ - if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; \ - if ( p_new[i] < pmin ) p_new[i] = pmin ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define PRESSURE_DATA_SETUP_CUDA \ - Real_ptr compression; \ - Real_ptr bvc; \ - Real_ptr p_new; \ - Real_ptr e_old; \ - Real_ptr vnewc; \ - const Real_type cls = m_cls; \ - const Real_type p_cut = m_p_cut; \ - const Real_type pmin = m_pmin; \ - const Real_type eosvmax = m_eosvmax; \ -\ - allocAndInitCudaDeviceData(compression, m_compression, iend); \ - allocAndInitCudaDeviceData(bvc, m_bvc, iend); \ - allocAndInitCudaDeviceData(p_new, m_p_new, iend); \ - allocAndInitCudaDeviceData(e_old, m_e_old, iend); \ - allocAndInitCudaDeviceData(vnewc, m_vnewc, iend); - -#define PRESSURE_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_p_new, p_new, iend); \ - deallocCudaDeviceData(compression); \ - deallocCudaDeviceData(bvc); \ - deallocCudaDeviceData(p_new); \ - deallocCudaDeviceData(e_old); \ - deallocCudaDeviceData(vnewc); - -__global__ void pressurecalc1(Real_ptr bvc, Real_ptr compression, - const Real_type cls, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - PRESSURE_BODY1; - } -} - -__global__ void pressurecalc2(Real_ptr p_new, Real_ptr bvc, Real_ptr e_old, - Real_ptr vnewc, - const Real_type p_cut, const Real_type eosvmax, - const Real_type pmin, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - PRESSURE_BODY2; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - - PRESSURE::PRESSURE(const RunParams& params) : KernelBase(rajaperf::Apps_PRESSURE, params) { @@ -136,7 +53,7 @@ void PRESSURE::setUp(VariantID vid) { allocAndInitData(m_compression, getRunSize(), vid); allocAndInitData(m_bvc, getRunSize(), vid); - allocAndInitData(m_p_new, getRunSize(), 
vid); + allocAndInitDataConst(m_p_new, getRunSize(), 0.0, vid); allocAndInitData(m_e_old, getRunSize(), vid); allocAndInitData(m_vnewc, getRunSize(), vid); @@ -156,7 +73,7 @@ void PRESSURE::runKernel(VariantID vid) case Base_Seq : { - PRESSURE_DATA; + PRESSURE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -177,16 +94,18 @@ void PRESSURE::runKernel(VariantID vid) case RAJA_Seq : { - PRESSURE_DATA; + PRESSURE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { PRESSURE_BODY1; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { PRESSURE_BODY2; }); @@ -199,33 +118,18 @@ void PRESSURE::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - PRESSURE_DATA; - - startTimer(); - for (RepIndex_type irep = ibegin; irep < run_reps; ++irep) { - - #pragma omp parallel - { - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - PRESSURE_BODY1; - } - - #pragma omp for nowait schedule(static) - for (Index_type i = ibegin; i < iend; ++i ) { - PRESSURE_BODY2; - } - } // omp parallel - - } - stopTimer(); - - break; - } - - case RAJALike_OpenMP : { +// +// NOTE: This kernel should be written to have an OpenMP parallel +// region around it and then use an OpenMP for-nowait for +// each loop inside it. We currently don't have a clean way to +// do this in RAJA. So, the base OpenMP variant is coded the +// way it is to be able to do an "apples to apples" comparison. +// +// This will be changed in the future when the required feature +// is added to RAJA. 
+// - PRESSURE_DATA; + PRESSURE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -248,16 +152,18 @@ void PRESSURE::runKernel(VariantID vid) case RAJA_OpenMP : { - PRESSURE_DATA; + PRESSURE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { PRESSURE_BODY1; }); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { PRESSURE_BODY2; }); @@ -268,71 +174,26 @@ void PRESSURE::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - PRESSURE_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - pressurecalc1<<>>( bvc, compression, - cls, - iend ); - - pressurecalc2<<>>( p_new, bvc, e_old, - vnewc, - p_cut, eosvmax, pmin, - iend ); - - } - stopTimer(); - - PRESSURE_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - PRESSURE_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - PRESSURE_BODY1; - }); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - PRESSURE_BODY2; - }); - - } - stopTimer(); - - PRESSURE_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n PRESSURE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/PRESSURE.hpp b/src/apps/PRESSURE.hpp index 63c6f043c..803dbb7c8 100644 --- a/src/apps/PRESSURE.hpp +++ b/src/apps/PRESSURE.hpp @@ -13,13 +13,37 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// PRESSURE kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// bvc[i] = cls * (compression[i] + 1.0); +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// p_new[i] = bvc[i] * e_old[i] ; +/// if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; +/// if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; +/// if ( p_new[i] < pmin ) p_new[i] = pmin ; +/// } +/// #ifndef RAJAPerf_Apps_PRESSURE_HPP #define RAJAPerf_Apps_PRESSURE_HPP -#include "common/KernelBase.hpp" + +#define PRESSURE_BODY1 \ + bvc[i] = cls * (compression[i] + 1.0); + +#define PRESSURE_BODY2 \ + p_new[i] = bvc[i] * e_old[i] ; \ + if ( fabs(p_new[i]) < p_cut ) p_new[i] = 0.0 ; \ + if ( vnewc[i] >= eosvmax ) p_new[i] = 0.0 ; \ + if ( p_new[i] < pmin ) p_new[i] = pmin ; +#include "common/KernelBase.hpp" + namespace rajaperf { class RunParams; @@ -40,6 +64,9 @@ class PRESSURE : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_compression; Real_ptr m_bvc; diff --git a/src/apps/VOL3D-Cuda.cpp b/src/apps/VOL3D-Cuda.cpp new file mode 100644 index 000000000..0aa2d228f --- /dev/null +++ b/src/apps/VOL3D-Cuda.cpp @@ -0,0 +1,147 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
+// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "VOL3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define VOL3D_DATA_SETUP_CUDA \ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ + Real_ptr vol; \ +\ + const Real_type vnormq = m_vnormq; \ +\ + Real_ptr x0,x1,x2,x3,x4,x5,x6,x7 ; \ + Real_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; \ + Real_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; \ +\ + allocAndInitCudaDeviceData(x, m_x, m_array_length); \ + allocAndInitCudaDeviceData(y, m_y, m_array_length); \ + allocAndInitCudaDeviceData(z, m_z, m_array_length); \ + allocAndInitCudaDeviceData(vol, m_vol, m_array_length); + +#define VOL3D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_vol, vol, m_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); \ + deallocCudaDeviceData(z); \ + deallocCudaDeviceData(vol); + +__global__ void vol3d(Real_ptr vol, + const Real_ptr x0, const Real_ptr x1, + const Real_ptr x2, const Real_ptr x3, + const Real_ptr x4, const Real_ptr x5, + const Real_ptr x6, const Real_ptr x7, + const Real_ptr y0, const Real_ptr y1, + const Real_ptr y2, const Real_ptr y3, + const Real_ptr y4, const Real_ptr y5, + const Real_ptr y6, const Real_ptr y7, + const Real_ptr z0, const Real_ptr z1, + const Real_ptr z2, const Real_ptr z3, + const Real_ptr z4, const Real_ptr z5, + const Real_ptr z6, const Real_ptr z7, + const Real_type vnormq, + Index_type ibegin, Index_type iend) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + 
Index_type i = ii + ibegin; + if (i < iend) { + VOL3D_BODY; + } +} + + +void VOL3D::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + if ( vid == Base_CUDA ) { + + VOL3D_DATA_SETUP_CUDA; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + + vol3d<<>>(vol, + x0, x1, x2, x3, x4, x5, x6, x7, + y0, y1, y2, y3, y4, y5, y6, y7, + z0, z1, z2, z3, z4, z5, z6, z7, + vnormq, + ibegin, iend); + + } + stopTimer(); + + VOL3D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + VOL3D_DATA_SETUP_CUDA; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + VOL3D_BODY; + }); + + } + stopTimer(); + + VOL3D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n VOL3D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/apps/VOL3D-OMPTarget.cpp b/src/apps/VOL3D-OMPTarget.cpp new file mode 100644 index 000000000..a5a0a3754 --- /dev/null +++ b/src/apps/VOL3D-OMPTarget.cpp @@ -0,0 +1,127 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
+// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "VOL3D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define VOL3D_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ + Real_ptr vol; \ +\ + const Real_type vnormq = m_vnormq; \ +\ + Real_ptr x0,x1,x2,x3,x4,x5,x6,x7 ; \ + Real_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; \ + Real_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; \ +\ + allocAndInitOpenMPDeviceData(x, m_x, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(z, m_z, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(vol, m_vol, m_array_length, did, hid); + +#define VOL3D_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_vol, vol, m_array_length, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); \ + deallocOpenMPDeviceData(z, did); \ + deallocOpenMPDeviceData(vol, did); + + +void VOL3D::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = m_domain->fpz; + const Index_type iend = m_domain->lpz+1; + + if ( vid == Base_OpenMPTarget ) { + + VOL3D_DATA_SETUP_OMP_TARGET; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + 
+ startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x0,x1,x2,x3,x4,x5,x6,x7, \ + y0,y1,y2,y3,y4,y5,y6,y7, \ + z0,z1,z2,z3,z4,z5,z6,z7, \ + vol) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin ; i < iend ; ++i ) { + VOL3D_BODY; + } + + } + stopTimer(); + + VOL3D_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + VOL3D_DATA_SETUP_OMP_TARGET; + + NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; + NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; + NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + + VOL3D_BODY; + }); + + } + stopTimer(); + + VOL3D_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n VOL3D : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace apps +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/apps/VOL3D.cpp b/src/apps/VOL3D.cpp index 4a58cb4f5..282bed1af 100644 --- a/src/apps/VOL3D.cpp +++ b/src/apps/VOL3D.cpp @@ -13,73 +13,13 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// VOL3D kernel reference implementation: -/// -/// NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; -/// NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; -/// NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; -/// -/// for (Index_type i = ibegin ; i < iend ; ++i ) { -/// Real_type x71 = x7[i] - x1[i] ; -/// Real_type x72 = x7[i] - x2[i] ; -/// Real_type x74 = x7[i] - x4[i] ; -/// Real_type x30 = x3[i] - x0[i] ; -/// Real_type x50 = x5[i] - x0[i] ; -/// Real_type x60 = x6[i] - x0[i] ; -/// -/// Real_type y71 = y7[i] - y1[i] ; -/// Real_type y72 = y7[i] - 
y2[i] ; -/// Real_type y74 = y7[i] - y4[i] ; -/// Real_type y30 = y3[i] - y0[i] ; -/// Real_type y50 = y5[i] - y0[i] ; -/// Real_type y60 = y6[i] - y0[i] ; -/// -/// Real_type z71 = z7[i] - z1[i] ; -/// Real_type z72 = z7[i] - z2[i] ; -/// Real_type z74 = z7[i] - z4[i] ; -/// Real_type z30 = z3[i] - z0[i] ; -/// Real_type z50 = z5[i] - z0[i] ; -/// Real_type z60 = z6[i] - z0[i] ; -/// -/// Real_type xps = x71 + x60 ; -/// Real_type yps = y71 + y60 ; -/// Real_type zps = z71 + z60 ; -/// -/// Real_type cyz = y72 * z30 - z72 * y30 ; -/// Real_type czx = z72 * x30 - x72 * z30 ; -/// Real_type cxy = x72 * y30 - y72 * x30 ; -/// vol[i] = xps * cyz + yps * czx + zps * cxy ; -/// -/// xps = x72 + x50 ; -/// yps = y72 + y50 ; -/// zps = z72 + z50 ; -/// -/// cyz = y74 * z60 - z74 * y60 ; -/// czx = z74 * x60 - x74 * z60 ; -/// cxy = x74 * y60 - y74 * x60 ; -/// vol[i] += xps * cyz + yps * czx + zps * cxy ; -/// -/// xps = x74 + x30 ; -/// yps = y74 + y30 ; -/// zps = z74 + z30 ; -/// -/// cyz = y71 * z50 - z71 * y50 ; -/// czx = z71 * x50 - x71 * z50 ; -/// cxy = x71 * y50 - y71 * x50 ; -/// vol[i] += xps * cyz + yps * czx + zps * cxy ; -/// -/// vol[i] *= vnormq ; -/// } -/// - #include "VOL3D.hpp" +#include "RAJA/RAJA.hpp" + #include "AppsData.hpp" #include "common/DataUtils.hpp" -#include "RAJA/RAJA.hpp" - #include namespace rajaperf @@ -87,127 +27,17 @@ namespace rajaperf namespace apps { -#define VOL3D_DATA \ - ResReal_ptr x = m_x; \ - ResReal_ptr y = m_y; \ - ResReal_ptr z = m_z; \ +#define VOL3D_DATA_SETUP_CPU \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_ptr z = m_z; \ ResReal_ptr vol = m_vol; \ \ const Real_type vnormq = m_vnormq; -\ - ResReal_ptr x0,x1,x2,x3,x4,x5,x6,x7 ; \ - ResReal_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; \ - ResReal_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; - - -#define VOL3D_BODY \ - Real_type x71 = x7[i] - x1[i] ; \ - Real_type x72 = x7[i] - x2[i] ; \ - Real_type x74 = x7[i] - x4[i] ; \ - Real_type x30 = x3[i] - x0[i] ; \ - Real_type x50 = x5[i] - x0[i] ; \ 
- Real_type x60 = x6[i] - x0[i] ; \ - \ - Real_type y71 = y7[i] - y1[i] ; \ - Real_type y72 = y7[i] - y2[i] ; \ - Real_type y74 = y7[i] - y4[i] ; \ - Real_type y30 = y3[i] - y0[i] ; \ - Real_type y50 = y5[i] - y0[i] ; \ - Real_type y60 = y6[i] - y0[i] ; \ - \ - Real_type z71 = z7[i] - z1[i] ; \ - Real_type z72 = z7[i] - z2[i] ; \ - Real_type z74 = z7[i] - z4[i] ; \ - Real_type z30 = z3[i] - z0[i] ; \ - Real_type z50 = z5[i] - z0[i] ; \ - Real_type z60 = z6[i] - z0[i] ; \ - \ - Real_type xps = x71 + x60 ; \ - Real_type yps = y71 + y60 ; \ - Real_type zps = z71 + z60 ; \ - \ - Real_type cyz = y72 * z30 - z72 * y30 ; \ - Real_type czx = z72 * x30 - x72 * z30 ; \ - Real_type cxy = x72 * y30 - y72 * x30 ; \ - vol[i] = xps * cyz + yps * czx + zps * cxy ; \ - \ - xps = x72 + x50 ; \ - yps = y72 + y50 ; \ - zps = z72 + z50 ; \ - \ - cyz = y74 * z60 - z74 * y60 ; \ - czx = z74 * x60 - x74 * z60 ; \ - cxy = x74 * y60 - y74 * x60 ; \ - vol[i] += xps * cyz + yps * czx + zps * cxy ; \ - \ - xps = x74 + x30 ; \ - yps = y74 + y30 ; \ - zps = z74 + z30 ; \ - \ - cyz = y71 * z50 - z71 * y50 ; \ - czx = z71 * x50 - x71 * z50 ; \ - cxy = x71 * y50 - y71 * x50 ; \ - vol[i] += xps * cyz + yps * czx + zps * cxy ; \ - \ - vol[i] *= vnormq ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define VOL3D_DATA_SETUP_CUDA \ - Real_ptr x; \ - Real_ptr y; \ - Real_ptr z; \ - Real_ptr vol; \ -\ - const Real_type vnormq = m_vnormq; \ \ Real_ptr x0,x1,x2,x3,x4,x5,x6,x7 ; \ Real_ptr y0,y1,y2,y3,y4,y5,y6,y7 ; \ - Real_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; \ -\ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); \ - allocAndInitCudaDeviceData(z, m_z, iend); \ - allocAndInitCudaDeviceData(vol, m_vol, iend); - -#define VOL3D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_vol, vol, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); \ - 
deallocCudaDeviceData(vol); - -__global__ void vol3d(Real_ptr vol, - const Real_ptr x0, const Real_ptr x1, - const Real_ptr x2, const Real_ptr x3, - const Real_ptr x4, const Real_ptr x5, - const Real_ptr x6, const Real_ptr x7, - const Real_ptr y0, const Real_ptr y1, - const Real_ptr y2, const Real_ptr y3, - const Real_ptr y4, const Real_ptr y5, - const Real_ptr y6, const Real_ptr y7, - const Real_ptr z0, const Real_ptr z1, - const Real_ptr z2, const Real_ptr z3, - const Real_ptr z4, const Real_ptr z5, - const Real_ptr z6, const Real_ptr z7, - const Real_type vnormq, - Index_type ibegin, Index_type ilen) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - if (ii < ilen) { - Index_type i = ii + ibegin; - VOL3D_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) + Real_ptr z0,z1,z2,z3,z4,z5,z6,z7 ; VOL3D::VOL3D(const RunParams& params) @@ -217,6 +47,8 @@ VOL3D::VOL3D(const RunParams& params) setDefaultReps(300); m_domain = new ADomain(getRunSize(), /* ndims = */ 3); + + m_array_length = m_domain->nnalls;; } VOL3D::~VOL3D() @@ -230,12 +62,16 @@ Index_type VOL3D::getItsPerRep() const { void VOL3D::setUp(VariantID vid) { - int max_loop_index = m_domain->lpn; + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitDataConst(m_y, m_array_length, 0.0, vid); + allocAndInitDataConst(m_z, m_array_length, 0.0, vid); + + Real_type dx = 0.3; + Real_type dy = 0.2; + Real_type dz = 0.1; + setMeshPositions_3d(m_x, dx, m_y, dy, m_z, dz, *m_domain); - allocAndInitData(m_x, max_loop_index, vid); - allocAndInitData(m_y, max_loop_index, vid); - allocAndInitData(m_z, max_loop_index, vid); - allocAndInitData(m_vol, max_loop_index, vid); + allocAndInitDataConst(m_vol, m_array_length, 0.0, vid); m_vnormq = 0.083333333333333333; /* vnormq = 1/12 */ } @@ -250,7 +86,7 @@ void VOL3D::runKernel(VariantID vid) case Base_Seq : { - VOL3D_DATA; + VOL3D_DATA_SETUP_CPU; NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; NDPTRSET(m_domain->jp, m_domain->kp, 
y,y0,y1,y2,y3,y4,y5,y6,y7) ; @@ -271,7 +107,7 @@ void VOL3D::runKernel(VariantID vid) case RAJA_Seq : { - VOL3D_DATA; + VOL3D_DATA_SETUP_CPU; NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; @@ -280,7 +116,8 @@ void VOL3D::runKernel(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { VOL3D_BODY; }); @@ -293,7 +130,7 @@ void VOL3D::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - VOL3D_DATA; + VOL3D_DATA_SETUP_CPU; NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; @@ -313,14 +150,9 @@ void VOL3D::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // Not applicable - break; - } - case RAJA_OpenMP : { - VOL3D_DATA; + VOL3D_DATA_SETUP_CPU; NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; @@ -329,7 +161,8 @@ void VOL3D::runKernel(VariantID vid) startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { VOL3D_BODY; }); @@ -340,74 +173,26 @@ void VOL3D::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - VOL3D_DATA_SETUP_CUDA; - - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - const Index_type ibegin = m_domain->fpz; - const Index_type ilen = m_domain->lpz+1 - ibegin; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - - 
vol3d<<>>(vol, - x0, x1, x2, x3, x4, x5, x6, x7, - y0, y1, y2, y3, y4, y5, y6, y7, - z0, z1, z2, z3, z4, z5, z6, z7, - vnormq, - ibegin, ilen); - - } - stopTimer(); - - VOL3D_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - VOL3D_DATA_SETUP_CUDA; - - NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; - NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; - NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - VOL3D_BODY; - }); - - } - stopTimer(); - - VOL3D_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n VOL3D : Unknown variant id = " << vid << std::endl; } } @@ -415,7 +200,7 @@ void VOL3D::runKernel(VariantID vid) void VOL3D::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_vol, getRunSize()); + checksum[vid] += calcChecksum(m_vol, m_array_length); } void VOL3D::tearDown(VariantID vid) diff --git a/src/apps/VOL3D.hpp b/src/apps/VOL3D.hpp index 232891f52..ed82359e1 100644 --- a/src/apps/VOL3D.hpp +++ b/src/apps/VOL3D.hpp @@ -13,13 +13,133 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// VOL3D kernel reference implementation: +/// +/// NDPTRSET(m_domain->jp, m_domain->kp, x,x0,x1,x2,x3,x4,x5,x6,x7) ; +/// NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; +/// NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ; +/// +/// for 
(Index_type i = ibegin ; i < iend ; ++i ) { +/// Real_type x71 = x7[i] - x1[i] ; +/// Real_type x72 = x7[i] - x2[i] ; +/// Real_type x74 = x7[i] - x4[i] ; +/// Real_type x30 = x3[i] - x0[i] ; +/// Real_type x50 = x5[i] - x0[i] ; +/// Real_type x60 = x6[i] - x0[i] ; +/// +/// Real_type y71 = y7[i] - y1[i] ; +/// Real_type y72 = y7[i] - y2[i] ; +/// Real_type y74 = y7[i] - y4[i] ; +/// Real_type y30 = y3[i] - y0[i] ; +/// Real_type y50 = y5[i] - y0[i] ; +/// Real_type y60 = y6[i] - y0[i] ; +/// +/// Real_type z71 = z7[i] - z1[i] ; +/// Real_type z72 = z7[i] - z2[i] ; +/// Real_type z74 = z7[i] - z4[i] ; +/// Real_type z30 = z3[i] - z0[i] ; +/// Real_type z50 = z5[i] - z0[i] ; +/// Real_type z60 = z6[i] - z0[i] ; +/// +/// Real_type xps = x71 + x60 ; +/// Real_type yps = y71 + y60 ; +/// Real_type zps = z71 + z60 ; +/// +/// Real_type cyz = y72 * z30 - z72 * y30 ; +/// Real_type czx = z72 * x30 - x72 * z30 ; +/// Real_type cxy = x72 * y30 - y72 * x30 ; +/// vol[i] = xps * cyz + yps * czx + zps * cxy ; +/// +/// xps = x72 + x50 ; +/// yps = y72 + y50 ; +/// zps = z72 + z50 ; +/// +/// cyz = y74 * z60 - z74 * y60 ; +/// czx = z74 * x60 - x74 * z60 ; +/// cxy = x74 * y60 - y74 * x60 ; +/// vol[i] += xps * cyz + yps * czx + zps * cxy ; +/// +/// xps = x74 + x30 ; +/// yps = y74 + y30 ; +/// zps = z74 + z30 ; +/// +/// cyz = y71 * z50 - z71 * y50 ; +/// czx = z71 * x50 - x71 * z50 ; +/// cxy = x71 * y50 - y71 * x50 ; +/// vol[i] += xps * cyz + yps * czx + zps * cxy ; +/// +/// vol[i] *= vnormq ; +/// } +/// #ifndef RAJAPerf_Apps_VOL3D_HPP #define RAJAPerf_Apps_VOL3D_HPP -#include "common/KernelBase.hpp" + +#define VOL3D_BODY \ + Real_type x71 = x7[i] - x1[i] ; \ + Real_type x72 = x7[i] - x2[i] ; \ + Real_type x74 = x7[i] - x4[i] ; \ + Real_type x30 = x3[i] - x0[i] ; \ + Real_type x50 = x5[i] - x0[i] ; \ + Real_type x60 = x6[i] - x0[i] ; \ + \ + Real_type y71 = y7[i] - y1[i] ; \ + Real_type y72 = y7[i] - y2[i] ; \ + Real_type y74 = y7[i] - y4[i] ; \ + Real_type y30 = y3[i] 
- y0[i] ; \ + Real_type y50 = y5[i] - y0[i] ; \ + Real_type y60 = y6[i] - y0[i] ; \ + \ + Real_type z71 = z7[i] - z1[i] ; \ + Real_type z72 = z7[i] - z2[i] ; \ + Real_type z74 = z7[i] - z4[i] ; \ + Real_type z30 = z3[i] - z0[i] ; \ + Real_type z50 = z5[i] - z0[i] ; \ + Real_type z60 = z6[i] - z0[i] ; \ + \ + Real_type xps = x71 + x60 ; \ + Real_type yps = y71 + y60 ; \ + Real_type zps = z71 + z60 ; \ + \ + Real_type cyz = y72 * z30 - z72 * y30 ; \ + Real_type czx = z72 * x30 - x72 * z30 ; \ + Real_type cxy = x72 * y30 - y72 * x30 ; \ + vol[i] = xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x72 + x50 ; \ + yps = y72 + y50 ; \ + zps = z72 + z50 ; \ + \ + cyz = y74 * z60 - z74 * y60 ; \ + czx = z74 * x60 - x74 * z60 ; \ + cxy = x74 * y60 - y74 * x60 ; \ + vol[i] += xps * cyz + yps * czx + zps * cxy ; \ + \ + xps = x74 + x30 ; \ + yps = y74 + y30 ; \ + zps = z74 + z30 ; \ + \ + cyz = y71 * z50 - z71 * y50 ; \ + czx = z71 * x50 - x71 * z50 ; \ + cxy = x71 * y50 - y71 * x50 ; \ + vol[i] += xps * cyz + yps * czx + zps * cxy ; \ + \ + vol[i] *= vnormq ; +#include "common/KernelBase.hpp" + namespace rajaperf { class RunParams; @@ -43,6 +163,9 @@ class VOL3D : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; @@ -52,6 +175,7 @@ class VOL3D : public KernelBase Real_type m_vnormq; ADomain* m_domain; + Index_type m_array_length; }; } // end namespace apps diff --git a/src/apps/WIP-COUPLE.cpp b/src/apps/WIP-COUPLE.cpp index cf077c988..4a687ca36 100644 --- a/src/apps/WIP-COUPLE.cpp +++ b/src/apps/WIP-COUPLE.cpp @@ -13,72 +13,13 @@ // 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// COUPLE kernel reference implementation: -/// -/// for (Index_type k = kmin ; k < kmax ; ++k ) { -/// for (Index_type j = jmin; j < jmax; j++) { -/// -/// Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; -/// Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; -/// -/// for (Index_type i = imin; i < imax; i++) { -/// -/// Complex_type c1 = c10 * denac[idenac+i]; -/// Complex_type c2 = c20 * denlw[it0+i]; -/// -/// /* promote to doubles to avoid possible divide by zero */ -/// Real_type c1re = real(c1); Real_type c1im = imag(c1); -/// Real_type c2re = real(c2); Real_type c2im = imag(c2); -/// -/// /* lamda = sqrt(|c1|^2 + |c2|^2) uses doubles to avoid underflow. */ -/// Real_type zlam = c1re*c1re + c1im*c1im + -/// c2re*c2re + c2im*c2im + 1.0e-34; -/// zlam = sqrt(zlam); -/// Real_type snlamt = sin(zlam * dt * 0.5); -/// Real_type cslamt = cos(zlam * dt * 0.5); -/// -/// Complex_type a0t = t0[it0+i]; -/// Complex_type a1t = t1[it0+i]; -/// Complex_type a2t = t2[it0+i] * fratio; -/// -/// Real_type r_zlam= 1.0/zlam; -/// c1 *= r_zlam; -/// c2 *= r_zlam; -/// Real_type zac1 = zabs2(c1); -/// Real_type zac2 = zabs2(c2); -/// -/// /* compute new A0 */ -/// Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; -/// t0[it0+i] = a0t * cslamt - ireal * z3; -/// -/// /* compute new A1 */ -/// Real_type r = zac1 * cslamt + zac2; -/// Complex_type z5 = c2 * a2t; -/// Complex_type z4 = conj(c1) * z5 * (cslamt-1); -/// z3 = conj(c1) * a0t * snlamt; -/// t1[it0+i] = a1t * r + z4 - ireal * z3; -/// -/// /* compute new A2 */ -/// r = zac1 + zac2 * cslamt; -/// z5 = c1 * a1t; -/// z4 = conj(c2) * z5 * (cslamt-1); -/// z3 = conj(c2) * a0t * snlamt; -/// t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; -/// -/// } /* i loop */ -/// -/// } /* j loop */ -/// } /* k loop */ -/// - #include "WIP-COUPLE.hpp" +#include "RAJA/RAJA.hpp" + #include "AppsData.hpp" #include "common/DataUtils.hpp" -#include 
"RAJA/RAJA.hpp" - #include namespace rajaperf @@ -86,7 +27,7 @@ namespace rajaperf namespace apps { -#define COUPLE_DATA \ +#define COUPLE_DATA_SETUP_CPU \ ResComplex_ptr t0 = m_t0; \ ResComplex_ptr t1 = m_t1; \ ResComplex_ptr t2 = m_t2; \ @@ -107,61 +48,6 @@ namespace apps const Index_type kmax = m_kmax; -#define COUPLE_BODY \ -for (Index_type j = jmin; j < jmax; j++) { \ - \ - Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; \ - Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; \ - \ - for (Index_type i = imin; i < imax; i++) { \ - \ - Complex_type c1 = c10 * denac[idenac+i]; \ - Complex_type c2 = c20 * denlw[it0+i]; \ - \ - /* promote to doubles to avoid possible divide by zero */ \ - Real_type c1re = real(c1); Real_type c1im = imag(c1); \ - Real_type c2re = real(c2); Real_type c2im = imag(c2); \ - \ - /* lamda = sqrt(|c1|^2 + |c2|^2) uses doubles to avoid underflow. */ \ - Real_type zlam = c1re*c1re + c1im*c1im + \ - c2re*c2re + c2im*c2im + 1.0e-34; \ - zlam = sqrt(zlam); \ - Real_type snlamt = sin(zlam * dt * 0.5); \ - Real_type cslamt = cos(zlam * dt * 0.5); \ - \ - Complex_type a0t = t0[it0+i]; \ - Complex_type a1t = t1[it0+i]; \ - Complex_type a2t = t2[it0+i] * fratio; \ - \ - Real_type r_zlam= 1.0/zlam; \ - c1 *= r_zlam; \ - c2 *= r_zlam; \ - Real_type zac1 = zabs2(c1); \ - Real_type zac2 = zabs2(c2); \ - \ - /* compute new A0 */ \ - Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; \ - t0[it0+i] = a0t * cslamt - ireal * z3; \ - \ - /* compute new A1 */ \ - Real_type r = zac1 * cslamt + zac2; \ - Complex_type z5 = c2 * a2t; \ - Complex_type z4 = conj(c1) * z5 * (cslamt-1); \ - z3 = conj(c1) * a0t * snlamt; \ - t1[it0+i] = a1t * r + z4 - ireal * z3; \ - \ - /* compute new A2 */ \ - r = zac1 + zac2 * cslamt; \ - z5 = c1 * a1t; \ - z4 = conj(c2) * z5 * (cslamt-1); \ - z3 = conj(c2) * a0t * snlamt; \ - t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; \ - \ - } /* i loop */ \ - \ -} /* j loop */ - - COUPLE::COUPLE(const RunParams& params) : 
KernelBase(rajaperf::Apps_COUPLE, params) { @@ -214,15 +100,11 @@ void COUPLE::runKernel(VariantID vid) { const Index_type run_reps = getRunReps(); -// -// RDH: Should we use forallN for this kernel??? -// - switch ( vid ) { case Base_Seq : { - COUPLE_DATA; + COUPLE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -239,12 +121,13 @@ void COUPLE::runKernel(VariantID vid) case RAJA_Seq : { - COUPLE_DATA; + COUPLE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(kmin, kmax, [=](int k) { + RAJA::forall( + RAJA::RangeSegment(kmin, kmax), [=](int k) { COUPLE_BODY; }); @@ -256,7 +139,7 @@ void COUPLE::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - COUPLE_DATA; + COUPLE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -271,19 +154,15 @@ void COUPLE::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // Not applicable - break; - } - case RAJA_OpenMP : { - COUPLE_DATA; + COUPLE_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(kmin, kmax, [=](int k) { + RAJA::forall( + RAJA::RangeSegment(kmin, kmax), [=](int k) { COUPLE_BODY; }); @@ -294,24 +173,26 @@ void COUPLE::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : - case RAJA_CUDA : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_TARGET_OPENMP) && 0 + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) && 0 + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n COUPLE : Unknown variant id = " << vid << std::endl; } } diff --git a/src/apps/WIP-COUPLE.hpp b/src/apps/WIP-COUPLE.hpp index c258a0bb4..2c24cbcd7 100644 --- a/src/apps/WIP-COUPLE.hpp +++ b/src/apps/WIP-COUPLE.hpp @@ -13,13 +13,126 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// COUPLE kernel reference implementation: +/// +/// for (Index_type k = kmin ; k < kmax ; ++k ) { +/// for (Index_type j = jmin; j < jmax; j++) { +/// +/// Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; +/// Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; +/// +/// for (Index_type i = imin; i < imax; i++) { +/// +/// Complex_type c1 = c10 * denac[idenac+i]; +/// Complex_type c2 = c20 * denlw[it0+i]; +/// +/// /* promote to doubles to avoid possible divide by zero */ +/// Real_type c1re = real(c1); Real_type c1im = imag(c1); +/// Real_type c2re = real(c2); Real_type c2im = imag(c2); +/// +/// /* lamda = sqrt(|c1|^2 + |c2|^2) uses doubles to avoid underflow. 
*/ +/// Real_type zlam = c1re*c1re + c1im*c1im + +/// c2re*c2re + c2im*c2im + 1.0e-34; +/// zlam = sqrt(zlam); +/// Real_type snlamt = sin(zlam * dt * 0.5); +/// Real_type cslamt = cos(zlam * dt * 0.5); +/// +/// Complex_type a0t = t0[it0+i]; +/// Complex_type a1t = t1[it0+i]; +/// Complex_type a2t = t2[it0+i] * fratio; +/// +/// Real_type r_zlam= 1.0/zlam; +/// c1 *= r_zlam; +/// c2 *= r_zlam; +/// Real_type zac1 = zabs2(c1); +/// Real_type zac2 = zabs2(c2); +/// +/// /* compute new A0 */ +/// Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; +/// t0[it0+i] = a0t * cslamt - ireal * z3; +/// +/// /* compute new A1 */ +/// Real_type r = zac1 * cslamt + zac2; +/// Complex_type z5 = c2 * a2t; +/// Complex_type z4 = conj(c1) * z5 * (cslamt-1); +/// z3 = conj(c1) * a0t * snlamt; +/// t1[it0+i] = a1t * r + z4 - ireal * z3; +/// +/// /* compute new A2 */ +/// r = zac1 + zac2 * cslamt; +/// z5 = c1 * a1t; +/// z4 = conj(c2) * z5 * (cslamt-1); +/// z3 = conj(c2) * a0t * snlamt; +/// t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; +/// +/// } /* i loop */ +/// +/// } /* j loop */ +/// } /* k loop */ +/// #ifndef RAJAPerf_Apps_COUPLE_HPP #define RAJAPerf_Apps_COUPLE_HPP -#include "common/KernelBase.hpp" + +#define COUPLE_BODY \ +for (Index_type j = jmin; j < jmax; j++) { \ + \ + Index_type it0= ((k)*(jmax+1) + (j))*(imax+1) ; \ + Index_type idenac= ((k)*(jmax+2) + (j))*(imax+2) ; \ + \ + for (Index_type i = imin; i < imax; i++) { \ + \ + Complex_type c1 = c10 * denac[idenac+i]; \ + Complex_type c2 = c20 * denlw[it0+i]; \ + \ + /* promote to doubles to avoid possible divide by zero */ \ + Real_type c1re = real(c1); Real_type c1im = imag(c1); \ + Real_type c2re = real(c2); Real_type c2im = imag(c2); \ + \ + /* lamda = sqrt(|c1|^2 + |c2|^2) uses doubles to avoid underflow. 
*/ \ + Real_type zlam = c1re*c1re + c1im*c1im + \ + c2re*c2re + c2im*c2im + 1.0e-34; \ + zlam = sqrt(zlam); \ + Real_type snlamt = sin(zlam * dt * 0.5); \ + Real_type cslamt = cos(zlam * dt * 0.5); \ + \ + Complex_type a0t = t0[it0+i]; \ + Complex_type a1t = t1[it0+i]; \ + Complex_type a2t = t2[it0+i] * fratio; \ + \ + Real_type r_zlam= 1.0/zlam; \ + c1 *= r_zlam; \ + c2 *= r_zlam; \ + Real_type zac1 = zabs2(c1); \ + Real_type zac2 = zabs2(c2); \ + \ + /* compute new A0 */ \ + Complex_type z3 = ( c1 * a1t + c2 * a2t ) * snlamt ; \ + t0[it0+i] = a0t * cslamt - ireal * z3; \ + \ + /* compute new A1 */ \ + Real_type r = zac1 * cslamt + zac2; \ + Complex_type z5 = c2 * a2t; \ + Complex_type z4 = conj(c1) * z5 * (cslamt-1); \ + z3 = conj(c1) * a0t * snlamt; \ + t1[it0+i] = a1t * r + z4 - ireal * z3; \ + \ + /* compute new A2 */ \ + r = zac1 + zac2 * cslamt; \ + z5 = c1 * a1t; \ + z4 = conj(c2) * z5 * (cslamt-1); \ + z3 = conj(c2) * a0t * snlamt; \ + t2[it0+i] = ( a2t * r + z4 - ireal * z3 ) * r_fratio; \ + \ + } /* i loop */ \ + \ +} /* j loop */ +#include "common/KernelBase.hpp" + namespace rajaperf { class RunParams; @@ -43,6 +156,9 @@ class COUPLE : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Complex_ptr m_t0; Complex_ptr m_t1; diff --git a/src/basic/CMakeLists.txt b/src/basic/CMakeLists.txt index 9d4cd1f8a..c1da874ab 100644 --- a/src/basic/CMakeLists.txt +++ b/src/basic/CMakeLists.txt @@ -15,11 +15,29 @@ blt_add_library( NAME basic - SOURCES MULADDSUB.cpp + SOURCES MULADDSUB.cpp + MULADDSUB-Cuda.cpp + MULADDSUB-OMPTarget.cpp IF_QUAD.cpp + IF_QUAD-Cuda.cpp + IF_QUAD-OMPTarget.cpp TRAP_INT.cpp + TRAP_INT-Cuda.cpp + TRAP_INT-OMPTarget.cpp INIT3.cpp + INIT3-Cuda.cpp + INIT3-OMPTarget.cpp REDUCE3_INT.cpp + REDUCE3_INT-Cuda.cpp + REDUCE3_INT-OMPTarget.cpp NESTED_INIT.cpp + NESTED_INIT-Cuda.cpp + NESTED_INIT-OMPTarget.cpp + INIT_VIEW1D.cpp 
+ INIT_VIEW1D-Cuda.cpp + INIT_VIEW1D-OMPTarget.cpp + INIT_VIEW1D_OFFSET.cpp + INIT_VIEW1D_OFFSET-Cuda.cpp + INIT_VIEW1D_OFFSET-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/basic/IF_QUAD-Cuda.cpp b/src/basic/IF_QUAD-Cuda.cpp new file mode 100644 index 000000000..add440b2b --- /dev/null +++ b/src/basic/IF_QUAD-Cuda.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define IF_QUAD_DATA_SETUP_CUDA \ + Real_ptr a; \ + Real_ptr b; \ + Real_ptr c; \ + Real_ptr x1; \ + Real_ptr x2; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ + allocAndInitCudaDeviceData(b, m_b, iend); \ + allocAndInitCudaDeviceData(c, m_c, iend); \ + allocAndInitCudaDeviceData(x1, m_x1, iend); \ + allocAndInitCudaDeviceData(x2, m_x2, iend); + +#define IF_QUAD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x1, x1, iend); \ + getCudaDeviceData(m_x2, x2, iend); \ + deallocCudaDeviceData(a); \ + deallocCudaDeviceData(b); \ + deallocCudaDeviceData(c); \ + deallocCudaDeviceData(x1); \ + deallocCudaDeviceData(x2); + +__global__ void ifquad(Real_ptr x1, Real_ptr x2, + Real_ptr a, Real_ptr b, Real_ptr c, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + IF_QUAD_BODY; + } +} + + +void 
IF_QUAD::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + IF_QUAD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + ifquad<<>>( x1, x2, a, b, c, + iend ); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + IF_QUAD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + IF_QUAD_BODY; + }); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n IF_QUAD : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/IF_QUAD-OMPTarget.cpp b/src/basic/IF_QUAD-OMPTarget.cpp new file mode 100644 index 000000000..b1dc399dc --- /dev/null +++ b/src/basic/IF_QUAD-OMPTarget.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "IF_QUAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define IF_QUAD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + Real_ptr b; \ + Real_ptr c; \ + Real_ptr x1; \ + Real_ptr x2; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ + allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ + allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); \ + allocAndInitOpenMPDeviceData(x1, m_x1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(x2, m_x2, iend, did, hid); + +#define IF_QUAD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_x1, x1, iend, hid, did); \ + getOpenMPDeviceData(m_x2, x2, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); \ + deallocOpenMPDeviceData(b, did); \ + deallocOpenMPDeviceData(c, did); \ + deallocOpenMPDeviceData(x1, did); \ + deallocOpenMPDeviceData(x2, did); + +void IF_QUAD::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + IF_QUAD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a, b, c, x1, x2) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + IF_QUAD_BODY; + } + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + IF_QUAD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + IF_QUAD_BODY; + }); + + } + stopTimer(); + + IF_QUAD_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n IF_QUAD : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/IF_QUAD.cpp b/src/basic/IF_QUAD.cpp index 53ed75037..9207a4fbc 100644 --- a/src/basic/IF_QUAD.cpp +++ b/src/basic/IF_QUAD.cpp @@ -13,29 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -/// -/// IF_QUAD kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// Real_type s = b[i]*b[i] - 4.0*a[i]*c[i]; -/// if ( s >= 0 ) { -/// s = sqrt(s); -/// x2[i] = (-b[i]+s)/(2.0*a[i]); -/// x1[i] = (-b[i]-s)/(2.0*a[i]); -/// } else { -/// x2[i] = 0.0; -/// x1[i] = 0.0; -/// } -/// } -/// - #include "IF_QUAD.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -43,66 +26,14 @@ namespace rajaperf namespace basic { -#define IF_QUAD_DATA \ + +#define IF_QUAD_DATA_SETUP_CPU \ ResReal_ptr a = m_a; \ ResReal_ptr b = m_b; \ ResReal_ptr c = m_c; \ ResReal_ptr x1 = m_x1; \ ResReal_ptr x2 = m_x2; -#define IF_QUAD_BODY \ - Real_type s = b[i]*b[i] - 4.0*a[i]*c[i]; \ - if ( s >= 0 ) { \ - s = sqrt(s); \ - x2[i] = (-b[i]+s)/(2.0*a[i]); \ - x1[i] = (-b[i]-s)/(2.0*a[i]); \ - } else { \ - x2[i] = 0.0; \ - x1[i] = 0.0; \ - } - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define IF_QUAD_DATA_SETUP_CUDA \ - Real_ptr a; \ - Real_ptr b; \ - Real_ptr c; \ - Real_ptr x1; \ - Real_ptr x2; \ -\ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); \ - allocAndInitCudaDeviceData(x1, m_x1, iend); \ - allocAndInitCudaDeviceData(x2, 
m_x2, iend); - -#define IF_QUAD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x1, x1, iend); \ - getCudaDeviceData(m_x2, x2, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c); \ - deallocCudaDeviceData(x1); \ - deallocCudaDeviceData(x2); - -__global__ void ifquad(Real_ptr x1, Real_ptr x2, - Real_ptr a, Real_ptr b, Real_ptr c, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - IF_QUAD_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - IF_QUAD::IF_QUAD(const RunParams& params) : KernelBase(rajaperf::Basic_IF_QUAD, params) @@ -120,8 +51,8 @@ void IF_QUAD::setUp(VariantID vid) allocAndInitDataRandSign(m_a, getRunSize(), vid); allocAndInitData(m_b, getRunSize(), vid); allocAndInitData(m_c, getRunSize(), vid); - allocAndInitData(m_x1, getRunSize(), vid); - allocAndInitData(m_x2, getRunSize(), vid); + allocAndInitDataConst(m_x1, getRunSize(), 0.0, vid); + allocAndInitDataConst(m_x2, getRunSize(), 0.0, vid); } void IF_QUAD::runKernel(VariantID vid) @@ -134,7 +65,7 @@ void IF_QUAD::runKernel(VariantID vid) case Base_Seq : { - IF_QUAD_DATA; + IF_QUAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -151,12 +82,13 @@ void IF_QUAD::runKernel(VariantID vid) case RAJA_Seq : { - IF_QUAD_DATA; + IF_QUAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { IF_QUAD_BODY; }); @@ -169,7 +101,7 @@ void IF_QUAD::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - IF_QUAD_DATA; + IF_QUAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -185,19 +117,15 @@ void IF_QUAD::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - IF_QUAD_DATA; + IF_QUAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { IF_QUAD_BODY; }); @@ -209,58 +137,26 @@ void IF_QUAD::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - IF_QUAD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - ifquad<<>>( x1, x2, a, b, c, - iend ); - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - IF_QUAD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - IF_QUAD_BODY; - }); - - } - stopTimer(); - - IF_QUAD_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n IF_QUAD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/IF_QUAD.hpp b/src/basic/IF_QUAD.hpp index b26960f09..ac71e4d7c 100644 --- a/src/basic/IF_QUAD.hpp +++ b/src/basic/IF_QUAD.hpp @@ -13,12 +13,40 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// IF_QUAD kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// Real_type s = b[i]*b[i] - 4.0*a[i]*c[i]; +/// if ( s >= 0 ) { +/// s = sqrt(s); +/// x2[i] = (-b[i]+s)/(2.0*a[i]); +/// x1[i] = (-b[i]-s)/(2.0*a[i]); +/// } else { +/// x2[i] = 0.0; +/// x1[i] = 0.0; +/// } +/// } +/// #ifndef RAJAPerf_Basic_IF_QUAD_HPP #define RAJAPerf_Basic_IF_QUAD_HPP #include "common/KernelBase.hpp" + +#define IF_QUAD_BODY \ + Real_type s = b[i]*b[i] - 4.0*a[i]*c[i]; \ + if ( s >= 0 ) { \ + s = sqrt(s); \ + x2[i] = (-b[i]+s)/(2.0*a[i]); \ + x1[i] = (-b[i]-s)/(2.0*a[i]); \ + } else { \ + x2[i] = 0.0; \ + x1[i] = 0.0; \ + } + + namespace rajaperf { class RunParams; @@ -39,6 +67,9 @@ class IF_QUAD : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_a; Real_ptr m_b; diff --git a/src/basic/INIT3-Cuda.cpp b/src/basic/INIT3-Cuda.cpp new file mode 100644 index 000000000..a3fc91b96 --- /dev/null +++ b/src/basic/INIT3-Cuda.cpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. 
+// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INIT3_DATA_SETUP_CUDA \ + Real_ptr out1; \ + Real_ptr out2; \ + Real_ptr out3; \ + Real_ptr in1; \ + Real_ptr in2; \ +\ + allocAndInitCudaDeviceData(out1, m_out1, iend); \ + allocAndInitCudaDeviceData(out2, m_out2, iend); \ + allocAndInitCudaDeviceData(out3, m_out3, iend); \ + allocAndInitCudaDeviceData(in1, m_in1, iend); \ + allocAndInitCudaDeviceData(in2, m_in2, iend); + +#define INIT3_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_out1, out1, iend); \ + getCudaDeviceData(m_out2, out2, iend); \ + getCudaDeviceData(m_out3, out3, iend); \ + deallocCudaDeviceData(out1); \ + deallocCudaDeviceData(out2); \ + deallocCudaDeviceData(out3); \ + deallocCudaDeviceData(in1); \ + deallocCudaDeviceData(in2); + +__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, + Real_ptr in1, Real_ptr in2, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INIT3_BODY; + } +} + + +void INIT3::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + INIT3_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + init3<<>>( out1, out2, out3, in1, in2, + iend ); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INIT3_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + 
RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INIT3 : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INIT3-OMPTarget.cpp b/src/basic/INIT3-OMPTarget.cpp new file mode 100644 index 000000000..f0cf7fba0 --- /dev/null +++ b/src/basic/INIT3-OMPTarget.cpp @@ -0,0 +1,112 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT3.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define INIT3_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr out1; \ + Real_ptr out2; \ + Real_ptr out3; \ + Real_ptr in1; \ + Real_ptr in2; \ +\ + allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); + +#define INIT3_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ + getOpenMPDeviceData(m_out2, out2, iend, hid, 
did); \ + getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ + deallocOpenMPDeviceData(out1, did); \ + deallocOpenMPDeviceData(out2, did); \ + deallocOpenMPDeviceData(out3, did); \ + deallocOpenMPDeviceData(in1, did); \ + deallocOpenMPDeviceData(in2, did); + + +void INIT3::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + INIT3_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(out1, out2, out3, in1, in2) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INIT3_BODY; + } + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INIT3_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT3_BODY; + }); + + } + stopTimer(); + + INIT3_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INIT3 : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/INIT3.cpp b/src/basic/INIT3.cpp index 366a7aa89..61744b773 100644 --- a/src/basic/INIT3.cpp +++ b/src/basic/INIT3.cpp @@ -13,20 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// INIT3 kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; -/// } -/// - #include "INIT3.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -34,60 +26,14 @@ namespace rajaperf namespace basic { -#define 
INIT3_DATA \ + +#define INIT3_DATA_SETUP_CPU \ ResReal_ptr out1 = m_out1; \ ResReal_ptr out2 = m_out2; \ ResReal_ptr out3 = m_out3; \ ResReal_ptr in1 = m_in1; \ ResReal_ptr in2 = m_in2; -#define INIT3_BODY \ - out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define INIT3_DATA_SETUP_CUDA \ - Real_ptr out1; \ - Real_ptr out2; \ - Real_ptr out3; \ - Real_ptr in1; \ - Real_ptr in2; \ -\ - allocAndInitCudaDeviceData(out1, m_out1, iend); \ - allocAndInitCudaDeviceData(out2, m_out2, iend); \ - allocAndInitCudaDeviceData(out3, m_out3, iend); \ - allocAndInitCudaDeviceData(in1, m_in1, iend); \ - allocAndInitCudaDeviceData(in2, m_in2, iend); - -#define INIT3_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out1, out1, iend); \ - getCudaDeviceData(m_out2, out2, iend); \ - getCudaDeviceData(m_out3, out3, iend); \ - deallocCudaDeviceData(out1); \ - deallocCudaDeviceData(out2); \ - deallocCudaDeviceData(out3); \ - deallocCudaDeviceData(in1); \ - deallocCudaDeviceData(in2); - -__global__ void init3(Real_ptr out1, Real_ptr out2, Real_ptr out3, - Real_ptr in1, Real_ptr in2, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - INIT3_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - INIT3::INIT3(const RunParams& params) : KernelBase(rajaperf::Basic_INIT3, params) @@ -102,9 +48,9 @@ INIT3::~INIT3() void INIT3::setUp(VariantID vid) { - allocAndInitData(m_out1, getRunSize(), vid); - allocAndInitData(m_out2, getRunSize(), vid); - allocAndInitData(m_out3, getRunSize(), vid); + allocAndInitDataConst(m_out1, getRunSize(), 0.0, vid); + allocAndInitDataConst(m_out2, getRunSize(), 0.0, vid); + allocAndInitDataConst(m_out3, getRunSize(), 0.0, vid); allocAndInitData(m_in1, getRunSize(), vid); allocAndInitData(m_in2, getRunSize(), vid); } @@ -119,7 +65,7 @@ void INIT3::runKernel(VariantID vid) case Base_Seq : { - 
INIT3_DATA; + INIT3_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -136,12 +82,13 @@ void INIT3::runKernel(VariantID vid) case RAJA_Seq : { - INIT3_DATA; + INIT3_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { INIT3_BODY; }); @@ -154,7 +101,7 @@ void INIT3::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - INIT3_DATA; + INIT3_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -170,20 +117,15 @@ void INIT3::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... - break; - } - case RAJA_OpenMP : { - INIT3_DATA; + INIT3_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { INIT3_BODY; }); @@ -194,58 +136,26 @@ void INIT3::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - INIT3_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - init3<<>>( out1, out2, out3, in1, in2, - iend ); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - INIT3_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - INIT3_BODY; - }); - - } - stopTimer(); - - INIT3_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the 
idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n INIT3 : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/INIT3.hpp b/src/basic/INIT3.hpp index 4e1c79a3b..156de7957 100644 --- a/src/basic/INIT3.hpp +++ b/src/basic/INIT3.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// INIT3 kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; +/// } +/// #ifndef RAJAPerf_Basic_INIT3_HPP #define RAJAPerf_Basic_INIT3_HPP + +#define INIT3_BODY \ + out1[i] = out2[i] = out3[i] = - in1[i] - in2[i] ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +51,9 @@ class INIT3 : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/INIT_VIEW1D-Cuda.cpp b/src/basic/INIT_VIEW1D-Cuda.cpp new file mode 100644 index 000000000..c4c63398e --- /dev/null +++ b/src/basic/INIT_VIEW1D-Cuda.cpp @@ -0,0 +1,116 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INIT_VIEW1D_DATA_SETUP_CUDA \ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); + +#define INIT_VIEW1D_DATA_RAJA_SETUP_CUDA \ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ +\ + using ViewType = RAJA::View >; \ + const RAJA::Layout<1> my_layout(iend); \ + ViewType view(a, my_layout); + +#define INIT_VIEW1D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_a, a, iend); \ + deallocCudaDeviceData(a); + +__global__ void initview1d(Real_ptr a, + Real_type v, + const Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INIT_VIEW1D_BODY; + } +} + + +void INIT_VIEW1D::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + INIT_VIEW1D_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + initview1d<<>>( a, + v, + iend ); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INIT_VIEW1D_DATA_RAJA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INIT_VIEW1D : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace 
basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INIT_VIEW1D-OMPTarget.cpp b/src/basic/INIT_VIEW1D-OMPTarget.cpp new file mode 100644 index 000000000..ffa97370e --- /dev/null +++ b/src/basic/INIT_VIEW1D-OMPTarget.cpp @@ -0,0 +1,112 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define INIT_VIEW1D_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); + +#define INIT_VIEW1D_DATA_SETUP_RAJA_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ +\ + using ViewType = RAJA::View >; \ + const RAJA::Layout<1> my_layout(iend); \ + ViewType view(a, my_layout); + +#define INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_a, a, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); + + +void INIT_VIEW1D::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + 
INIT_VIEW1D_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_BODY; + } + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INIT_VIEW1D_DATA_SETUP_RAJA_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INIT_VIEW1D : Unknown OMP Targetvariant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/INIT_VIEW1D.cpp b/src/basic/INIT_VIEW1D.cpp new file mode 100644 index 000000000..37c2a0bcb --- /dev/null +++ b/src/basic/INIT_VIEW1D.cpp @@ -0,0 +1,179 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + +#define INIT_VIEW1D_DATA_SETUP_CPU \ + Real_ptr a = m_a; \ + const Real_type v = m_val; + +#define INIT_VIEW1D_DATA_RAJA_SETUP_CPU \ + Real_ptr a = m_a; \ + const Real_type v = m_val; \ +\ + using ViewType = RAJA::View >; \ + const RAJA::Layout<1> my_layout(iend); \ + ViewType view(a, my_layout); + + +INIT_VIEW1D::INIT_VIEW1D(const RunParams& params) + : KernelBase(rajaperf::Basic_INIT_VIEW1D, params) +{ + setDefaultSize(500000); + setDefaultReps(5000); +} + +INIT_VIEW1D::~INIT_VIEW1D() +{ +} + +void INIT_VIEW1D::setUp(VariantID vid) +{ + allocAndInitDataConst(m_a, getRunSize(), 0.0, vid); + m_val = 0.123; +} + +void INIT_VIEW1D::runKernel(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + switch ( vid ) { + + case Base_Seq : { + + INIT_VIEW1D_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_BODY; + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + INIT_VIEW1D_DATA_RAJA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + break; + } + +#if defined(RAJA_ENABLE_OPENMP) + case Base_OpenMP : { + + INIT_VIEW1D_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_BODY; + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + INIT_VIEW1D_DATA_RAJA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { 
 + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_BODY_RAJA; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n INIT_VIEW1D : Unknown variant id = " << vid << std::endl; + } + + } + +} + +void INIT_VIEW1D::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_a, getRunSize()); +} + +void INIT_VIEW1D::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_a); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D.hpp b/src/basic/INIT_VIEW1D.hpp new file mode 100644 index 000000000..1b26cfed6 --- /dev/null +++ b/src/basic/INIT_VIEW1D.hpp @@ -0,0 +1,74 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// INIT_VIEW1D kernel reference implementation: +/// +/// const Real_type val = ...; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i] = val; +/// } +/// +/// RAJA variants use a "View" and "Layout" to do the same thing. These +/// RAJA constructs provide little benefit in 1D, but they are used here +/// to exercise those RAJA mechanics in the simplest scenario. 
+/// + +#ifndef RAJAPerf_Basic_INIT_VIEW1D_HPP +#define RAJAPerf_Basic_INIT_VIEW1D_HPP + + +#define INIT_VIEW1D_BODY \ + a[i] = v; + +#define INIT_VIEW1D_BODY_RAJA \ + view(i) = v; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class INIT_VIEW1D : public KernelBase +{ +public: + + INIT_VIEW1D(const RunParams& params); + + ~INIT_VIEW1D(); + + void setUp(VariantID vid); + void runKernel(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_a; + Real_type m_val; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp new file mode 100644 index 000000000..06ba6d510 --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET-Cuda.cpp @@ -0,0 +1,116 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA \ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); + +#define INIT_VIEW1D_OFFSET_DATA_RAJA_SETUP_CUDA \ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ +\ + using ViewType = RAJA::View >; \ + ViewType view(a, RAJA::make_offset_layout<1>({{1}}, {{iend+1}})); + +#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_a, a, iend); \ + deallocCudaDeviceData(a); + +__global__ void initview1d_offset(Real_ptr a, + Real_type v, + const Index_type ibegin, + const Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INIT_VIEW1D_OFFSET_BODY; + } +} + + +void INIT_VIEW1D_OFFSET::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + if ( vid == Base_CUDA ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + initview1d_offset<<>>( a, v, + ibegin, + iend ); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INIT_VIEW1D_OFFSET_DATA_RAJA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_CUDA; + + } 
else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp new file mode 100644 index 000000000..7be3fd582 --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET-OMPTarget.cpp @@ -0,0 +1,112 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_RAJA_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + const Real_type v = m_val; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ +\ + using ViewType = RAJA::View >; \ + ViewType view(a, RAJA::make_offset_layout<1>({{1}}, {{iend+1}})); + +#define INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_a, a, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); + + +void 
INIT_VIEW1D_OFFSET::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + if ( vid == Base_OpenMPTarget ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_OFFSET_BODY; + } + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INIT_VIEW1D_OFFSET_DATA_SETUP_RAJA_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + INIT_VIEW1D_OFFSET_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown OMP Targetvariant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/basic/INIT_VIEW1D_OFFSET.cpp b/src/basic/INIT_VIEW1D_OFFSET.cpp new file mode 100644 index 000000000..f9f29b30c --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET.cpp @@ -0,0 +1,178 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INIT_VIEW1D_OFFSET.hpp" + +#include "RAJA/RAJA.hpp" + +#include "common/DataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define INIT_VIEW1D_OFFSET_DATA_SETUP_CPU \ + Real_ptr a = m_a; \ + const Real_type v = m_val; + +#define INIT_VIEW1D_OFFSET_DATA_RAJA_SETUP_CPU \ + Real_ptr a = m_a; \ + const Real_type v = m_val; \ +\ + ViewType view(a, RAJA::make_offset_layout<1>({{1}}, {{iend+1}})); + + +INIT_VIEW1D_OFFSET::INIT_VIEW1D_OFFSET(const RunParams& params) + : KernelBase(rajaperf::Basic_INIT_VIEW1D_OFFSET, params) +{ + setDefaultSize(500000); + setDefaultReps(5000); +} + +INIT_VIEW1D_OFFSET::~INIT_VIEW1D_OFFSET() +{ +} + +void INIT_VIEW1D_OFFSET::setUp(VariantID vid) +{ + allocAndInitDataConst(m_a, getRunSize(), 0.0, vid); + m_val = 0.123; +} + +void INIT_VIEW1D_OFFSET::runKernel(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 1; + const Index_type iend = getRunSize()+1; + + using ViewType = RAJA::View >; + + switch ( vid ) { + + case Base_Seq : { + + INIT_VIEW1D_OFFSET_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_OFFSET_BODY; + } + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + INIT_VIEW1D_OFFSET_DATA_RAJA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + break; + } + +#if defined(RAJA_ENABLE_OPENMP) + case Base_OpenMP : { + + INIT_VIEW1D_OFFSET_DATA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp parallel for + for (Index_type i = ibegin; i < iend; ++i ) { + INIT_VIEW1D_OFFSET_BODY; + } + + } + stopTimer(); + + break; + } + + case RAJA_OpenMP : { + + 
INIT_VIEW1D_OFFSET_DATA_RAJA_SETUP_CPU; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INIT_VIEW1D_OFFSET_BODY_RAJA; + }); + + } + stopTimer(); + + break; + } +#endif + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } +#endif + +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); + break; + } +#endif + + default : { + std::cout << "\n INIT_VIEW1D_OFFSET : Unknown variant id = " << vid << std::endl; + } + + } + +} + +void INIT_VIEW1D_OFFSET::updateChecksum(VariantID vid) +{ + checksum[vid] += calcChecksum(m_a, getRunSize()); +} + +void INIT_VIEW1D_OFFSET::tearDown(VariantID vid) +{ + (void) vid; + deallocData(m_a); +} + +} // end namespace basic +} // end namespace rajaperf diff --git a/src/basic/INIT_VIEW1D_OFFSET.hpp b/src/basic/INIT_VIEW1D_OFFSET.hpp new file mode 100644 index 000000000..191cfc2fc --- /dev/null +++ b/src/basic/INIT_VIEW1D_OFFSET.hpp @@ -0,0 +1,74 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// INIT_VIEW1D_OFFSET kernel reference implementation: +/// +/// const Real_type val = ...; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i-ibegin] = val; +/// } +/// +/// RAJA variants use a "View" and "Layout" to do the same thing. 
These +/// RAJA constructs provide little benefit in 1D, but they are used here +/// to exercise those RAJA mechanics in the simplest scenario. +/// + +#ifndef RAJAPerf_Basic_INIT_VIEW1D_OFFSET_HPP +#define RAJAPerf_Basic_INIT_VIEW1D_OFFSET_HPP + + +#define INIT_VIEW1D_OFFSET_BODY \ + a[i-ibegin] = v; + +#define INIT_VIEW1D_OFFSET_BODY_RAJA \ + view(i) = v; + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace basic +{ + +class INIT_VIEW1D_OFFSET : public KernelBase +{ +public: + + INIT_VIEW1D_OFFSET(const RunParams& params); + + ~INIT_VIEW1D_OFFSET(); + + void setUp(VariantID vid); + void runKernel(VariantID vid); + void updateChecksum(VariantID vid); + void tearDown(VariantID vid); + + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + +private: + Real_ptr m_a; + Real_type m_val; +}; + +} // end namespace basic +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/basic/MULADDSUB-Cuda.cpp b/src/basic/MULADDSUB-Cuda.cpp new file mode 100644 index 000000000..dde3c68d6 --- /dev/null +++ b/src/basic/MULADDSUB-Cuda.cpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define MULADDSUB_DATA_SETUP_CUDA \ + Real_ptr out1; \ + Real_ptr out2; \ + Real_ptr out3; \ + Real_ptr in1; \ + Real_ptr in2; \ +\ + allocAndInitCudaDeviceData(out1, m_out1, iend); \ + allocAndInitCudaDeviceData(out2, m_out2, iend); \ + allocAndInitCudaDeviceData(out3, m_out3, iend); \ + allocAndInitCudaDeviceData(in1, m_in1, iend); \ + allocAndInitCudaDeviceData(in2, m_in2, iend); + +#define MULADDSUB_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_out1, out1, iend); \ + getCudaDeviceData(m_out2, out2, iend); \ + getCudaDeviceData(m_out3, out3, iend); \ + deallocCudaDeviceData(out1); \ + deallocCudaDeviceData(out2); \ + deallocCudaDeviceData(out3); \ + deallocCudaDeviceData(in1); \ + deallocCudaDeviceData(in2); + +__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, + Real_ptr in1, Real_ptr in2, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + MULADDSUB_BODY; + } +} + + +void MULADDSUB::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + MULADDSUB_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + muladdsub<<>>( out1, out2, out3, in1, in2, + iend ); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + MULADDSUB_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + 
RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n MULADDSUB : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/MULADDSUB-OMPTarget.cpp b/src/basic/MULADDSUB-OMPTarget.cpp new file mode 100644 index 000000000..e4a0496b5 --- /dev/null +++ b/src/basic/MULADDSUB-OMPTarget.cpp @@ -0,0 +1,112 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MULADDSUB.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define MULADDSUB_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr out1; \ + Real_ptr out2; \ + Real_ptr out3; \ + Real_ptr in1; \ + Real_ptr in2; \ +\ + allocAndInitOpenMPDeviceData(out1, m_out1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out2, m_out2, iend, did, hid); \ + allocAndInitOpenMPDeviceData(out3, m_out3, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in1, m_in1, iend, did, hid); \ + allocAndInitOpenMPDeviceData(in2, m_in2, iend, did, hid); + +#define MULADDSUB_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_out1, out1, iend, hid, did); \ + getOpenMPDeviceData(m_out2, out2, iend, hid, 
did); \ + getOpenMPDeviceData(m_out3, out3, iend, hid, did); \ + deallocOpenMPDeviceData(out1, did); \ + deallocOpenMPDeviceData(out2, did); \ + deallocOpenMPDeviceData(out3, did); \ + deallocOpenMPDeviceData(in1, did); \ + deallocOpenMPDeviceData(in2, did); + + +void MULADDSUB::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + MULADDSUB_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(out1, out2, out3, in1, in2) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + MULADDSUB_BODY; + } + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + MULADDSUB_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MULADDSUB_BODY; + }); + + } + stopTimer(); + + MULADDSUB_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n MULADDSUB : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/MULADDSUB.cpp b/src/basic/MULADDSUB.cpp index 639ee58b0..391e7a419 100644 --- a/src/basic/MULADDSUB.cpp +++ b/src/basic/MULADDSUB.cpp @@ -13,22 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// MULADDSUB kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// out1[i] = in1[i] * in2[i] ; -/// out2[i] = in1[i] + in2[i] ; -/// out3[i] = in1[i] - in2[i] ; -/// } -/// - #include "MULADDSUB.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include 
"common/DataUtils.hpp" + #include namespace rajaperf @@ -36,62 +26,14 @@ namespace rajaperf namespace basic { -#define MULADDSUB_DATA \ + +#define MULADDSUB_DATA_SETUP_CPU \ ResReal_ptr out1 = m_out1; \ ResReal_ptr out2 = m_out2; \ ResReal_ptr out3 = m_out3; \ ResReal_ptr in1 = m_in1; \ ResReal_ptr in2 = m_in2; -#define MULADDSUB_BODY \ - out1[i] = in1[i] * in2[i] ; \ - out2[i] = in1[i] + in2[i] ; \ - out3[i] = in1[i] - in2[i] ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define MULADDSUB_DATA_SETUP_CUDA \ - Real_ptr out1; \ - Real_ptr out2; \ - Real_ptr out3; \ - Real_ptr in1; \ - Real_ptr in2; \ -\ - allocAndInitCudaDeviceData(out1, m_out1, iend); \ - allocAndInitCudaDeviceData(out2, m_out2, iend); \ - allocAndInitCudaDeviceData(out3, m_out3, iend); \ - allocAndInitCudaDeviceData(in1, m_in1, iend); \ - allocAndInitCudaDeviceData(in2, m_in2, iend); - -#define MULADDSUB_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_out1, out1, iend); \ - getCudaDeviceData(m_out2, out2, iend); \ - getCudaDeviceData(m_out3, out3, iend); \ - deallocCudaDeviceData(out1); \ - deallocCudaDeviceData(out2); \ - deallocCudaDeviceData(out3); \ - deallocCudaDeviceData(in1); \ - deallocCudaDeviceData(in2); - -__global__ void muladdsub(Real_ptr out1, Real_ptr out2, Real_ptr out3, - Real_ptr in1, Real_ptr in2, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - MULADDSUB_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - MULADDSUB::MULADDSUB(const RunParams& params) : KernelBase(rajaperf::Basic_MULADDSUB, params) @@ -106,9 +48,9 @@ MULADDSUB::~MULADDSUB() void MULADDSUB::setUp(VariantID vid) { - allocAndInitData(m_out1, getRunSize(), vid); - allocAndInitData(m_out2, getRunSize(), vid); - allocAndInitData(m_out3, getRunSize(), vid); + allocAndInitDataConst(m_out1, getRunSize(), 0.0, vid); + allocAndInitDataConst(m_out2, getRunSize(), 0.0, vid); + 
allocAndInitDataConst(m_out3, getRunSize(), 0.0, vid); allocAndInitData(m_in1, getRunSize(), vid); allocAndInitData(m_in2, getRunSize(), vid); } @@ -123,7 +65,7 @@ void MULADDSUB::runKernel(VariantID vid) case Base_Seq : { - MULADDSUB_DATA; + MULADDSUB_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -140,12 +82,13 @@ void MULADDSUB::runKernel(VariantID vid) case RAJA_Seq : { - MULADDSUB_DATA; + MULADDSUB_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { MULADDSUB_BODY; }); @@ -158,7 +101,7 @@ void MULADDSUB::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - MULADDSUB_DATA; + MULADDSUB_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -174,20 +117,15 @@ void MULADDSUB::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - MULADDSUB_DATA; + MULADDSUB_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { MULADDSUB_BODY; }); @@ -198,58 +136,26 @@ void MULADDSUB::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - MULADDSUB_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - muladdsub<<>>( out1, out2, out3, in1, in2, - iend ); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - MULADDSUB_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - MULADDSUB_BODY; - }); - - } - stopTimer(); - - MULADDSUB_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n MULADDSUB : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/MULADDSUB.hpp b/src/basic/MULADDSUB.hpp index b2a9f5e6f..829ced00e 100644 --- a/src/basic/MULADDSUB.hpp +++ b/src/basic/MULADDSUB.hpp @@ -13,10 +13,26 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// MULADDSUB kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// out1[i] = in1[i] * in2[i] ; +/// out2[i] = in1[i] + in2[i] ; +/// out3[i] = in1[i] - in2[i] ; +/// } +/// #ifndef RAJAPerf_Basic_MULADDSUB_HPP #define RAJAPerf_Basic_MULADDSUB_HPP + +#define MULADDSUB_BODY \ + out1[i] = in1[i] * in2[i] ; \ + out2[i] = in1[i] + in2[i] ; \ + out3[i] = in1[i] - in2[i] ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +55,9 @@ class MULADDSUB : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_out1; Real_ptr m_out2; diff --git a/src/basic/NESTED_INIT-Cuda.cpp b/src/basic/NESTED_INIT-Cuda.cpp new file mode 100644 index 000000000..e7fafbf0f --- /dev/null +++ b/src/basic/NESTED_INIT-Cuda.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +#define NESTED_INIT_DATA_SETUP_CUDA \ + Real_ptr array; \ + Index_type ni = m_ni; \ + Index_type nj = m_nj; \ + Index_type nk = m_nk; \ +\ + allocAndInitCudaDeviceData(array, m_array, m_array_length); + +#define NESTED_INIT_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_array, array, m_array_length); \ + deallocCudaDeviceData(array); + +__global__ void nested_init(Real_ptr array, + Index_type ni, Index_type nj) +{ + Index_type i = threadIdx.x; + Index_type j = blockIdx.y; + Index_type k = blockIdx.z; + + NESTED_INIT_BODY; +} + + +void NESTED_INIT::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_CUDA ) { + + NESTED_INIT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + dim3 nthreads_per_block(ni, 1, 1); + dim3 nblocks(1, nj, nk); + + nested_init<<>>(array, + ni, nj); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + NESTED_INIT_DATA_SETUP_CUDA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::CudaCollapse< + RAJA::nested::For<2, RAJA::cuda_block_z_exec>, //k + RAJA::nested::For<1, RAJA::cuda_block_y_exec>, //j + RAJA::nested::For<0, RAJA::cuda_thread_x_exec> > >;//i + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=] __device__ (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n NESTED_INIT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace 
basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/NESTED_INIT-OMPTarget.cpp b/src/basic/NESTED_INIT-OMPTarget.cpp new file mode 100644 index 000000000..452224d7e --- /dev/null +++ b/src/basic/NESTED_INIT-OMPTarget.cpp @@ -0,0 +1,137 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "NESTED_INIT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define NESTED_INIT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr array = m_array; \ + Index_type ni = m_ni; \ + Index_type nj = m_nj; \ + Index_type nk = m_nk; \ +\ + allocAndInitOpenMPDeviceData(array, m_array, m_array_length, did, hid); + +#define NESTED_INIT_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_array, array, m_array_length, hid, did); \ + deallocOpenMPDeviceData(array, did); + + +void NESTED_INIT::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_OpenMPTarget ) { + + NESTED_INIT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(array) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(3) + for (Index_type k = 0; k < nk; ++k ) { + for (Index_type j = 
0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + NESTED_INIT_BODY; + } + } + } + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + +#if 1 // temporary implementation until RAJA::nested::OmpCollapse works. + + NESTED_INIT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(0, nk), [=](Index_type k) { + for (Index_type j = 0; j < nj; ++j ) { + for (Index_type i = 0; i < ni; ++i ) { + NESTED_INIT_BODY; + } + } + }); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; + +#else + + NESTED_INIT_DATA_SETUP_OMP_TARGET; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::OmpTargetCollapse< + RAJA::nested::For<2>, //k + RAJA::nested::For<1>, //j + RAJA::nested::For<0> > >; //i + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=] __device__ (Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; + }); + + } + stopTimer(); + + NESTED_INIT_DATA_TEARDOWN_OMP_TARGET; + +#endif + + } else { + std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/NESTED_INIT.cpp b/src/basic/NESTED_INIT.cpp index d4e45237e..803300e6a 100644 --- a/src/basic/NESTED_INIT.cpp +++ b/src/basic/NESTED_INIT.cpp @@ -13,71 +13,25 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// NESTED_INIT kernel reference implementation: -/// -/// for (Index_type k = 0; k < nk; ++k ) { -/// for (Index_type j = 0; j < nj; ++j ) { -/// for (Index_type i = 0; i < ni; ++i ) { -/// array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; -/// } -/// } -/// } -/// - #include "NESTED_INIT.hpp" 
-#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" -#include "RAJA/internal/MemUtils_CPU.hpp" - -#include -//#define USE_COLLAPSE -#undef USE_COLLAPSE +#include "common/DataUtils.hpp" +#include namespace rajaperf { namespace basic { -#define NESTED_INIT_DATA \ - ResReal_ptr array = m_array; \ - Int_type ni = m_ni; \ - Int_type nj = m_nj; \ - Int_type nk = m_nk; - -#define NESTED_INIT_BODY \ - array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; - - -#if defined(RAJA_ENABLE_CUDA) - -#define NESTED_INIT_DATA_SETUP_CUDA \ - Real_ptr array; \ - Int_type ni = m_ni; \ - Int_type nj = m_nj; \ - Int_type nk = m_nk; \ -\ - allocAndInitCudaDeviceData(array, m_array, ni * nj * nk); - -#define NESTED_INIT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_array, array, ni * nj * nk); \ - deallocCudaDeviceData(array); - -__global__ void nested_init(Real_ptr array, - Int_type ni, Int_type nj) -{ - Index_type i = threadIdx.x; - Index_type j = blockIdx.y; - Index_type k = blockIdx.z; - - NESTED_INIT_BODY; -} -#endif // if defined(RAJA_ENABLE_CUDA) +#define NESTED_INIT_DATA_SETUP_CPU \ + ResReal_ptr array = m_array; \ + Index_type ni = m_ni; \ + Index_type nj = m_nj; \ + Index_type nk = m_nk; NESTED_INIT::NESTED_INIT(const RunParams& params) @@ -97,13 +51,10 @@ NESTED_INIT::~NESTED_INIT() void NESTED_INIT::setUp(VariantID vid) { - (void) vid; - m_nk = m_nk_init * static_cast( getRunSize() ) / getDefaultSize(); + m_array_length = m_ni * m_nj * m_nk; - int len = m_ni * m_nj * m_nk; - m_array = RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, - len*sizeof(Real_type)); + allocAndInitDataConst(m_array, m_array_length, 0.0, vid); } void NESTED_INIT::runKernel(VariantID vid) @@ -114,7 +65,7 @@ void NESTED_INIT::runKernel(VariantID vid) case Base_Seq : { - NESTED_INIT_DATA; + NESTED_INIT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -135,21 +86,22 @@ void NESTED_INIT::runKernel(VariantID vid) case RAJA_Seq : { - NESTED_INIT_DATA; + 
NESTED_INIT_DATA_SETUP_CPU; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<2, RAJA::seq_exec>, // k + RAJA::nested::For<1, RAJA::seq_exec>, // j + RAJA::nested::For<0, RAJA::simd_exec> >; // i startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forallN< RAJA::NestedPolicy< - RAJA::ExecList< RAJA::simd_exec, - RAJA::seq_exec, - RAJA::seq_exec >, - RAJA::Permute > >( - RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk), - [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=](Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; }); } @@ -161,25 +113,14 @@ void NESTED_INIT::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - NESTED_INIT_DATA; + NESTED_INIT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -#if defined(USE_COLLAPSE) - #pragma omp parallel - { - #pragma omp for nowait collapse(3) - for (Index_type k = 0; k < nk; ++k ) { - for (Index_type j = 0; j < nj; ++j ) { - for (Index_type i = 0; i < ni; ++i ) { - NESTED_INIT_BODY; - } - } - } - } // omp parallel -#else - #pragma omp parallel for +// using collapse here doesn't appear to yield a performance benefit +// #pragma omp parallel for collapse(2) + #pragma omp parallel for for (Index_type k = 0; k < nk; ++k ) { for (Index_type j = 0; j < nj; ++j ) { for (Index_type i = 0; i < ni; ++i ) { @@ -187,7 +128,6 @@ void NESTED_INIT::runKernel(VariantID vid) } } } -#endif } stopTimer(); @@ -195,44 +135,25 @@ void NESTED_INIT::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - NESTED_INIT_DATA; + NESTED_INIT_DATA_SETUP_CPU; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<2, RAJA::omp_parallel_for_exec>,//k + RAJA::nested::For<1, RAJA::seq_exec>, //j + RAJA::nested::For<0, RAJA::simd_exec> >; //i startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { -#if defined(USE_COLLAPSE) - // impact....is there something wrong with the forallN implementation? - RAJA::forallN< RAJA::NestedPolicy< - RAJA::ExecList< RAJA::simd_exec, - RAJA::omp_collapse_nowait_exec, - RAJA::omp_collapse_nowait_exec >, - RAJA::Permute > >( - RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk), - [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); -#else - RAJA::forallN< RAJA::NestedPolicy< - RAJA::ExecList< RAJA::simd_exec, - RAJA::seq_exec, - RAJA::omp_parallel_for_exec >, - RAJA::Permute > >( - RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk), - [=](Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nk)), + [=](Index_type i, Index_type j, Index_type k) { + NESTED_INIT_BODY; }); -#endif } stopTimer(); @@ -241,66 +162,26 @@ void NESTED_INIT::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - NESTED_INIT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - dim3 nthreads_per_block(ni, 1, 1); - dim3 nblocks(1, nj, nk); - - nested_init<<>>(array, - ni, nj); - - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - NESTED_INIT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forallN< RAJA::NestedPolicy< - RAJA::ExecList< RAJA::cuda_thread_x_exec, - RAJA::cuda_block_y_exec, - RAJA::cuda_block_z_exec >, 
- RAJA::Permute > >( - RAJA::RangeSegment(0, ni), - RAJA::RangeSegment(0, nj), - RAJA::RangeSegment(0, nk), - [=] __device__ (Index_type i, Index_type j, Index_type k) { - NESTED_INIT_BODY; - }); - - } - stopTimer(); - - NESTED_INIT_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n NESTED_INIT : Unknown variant id = " << vid << std::endl; } } @@ -309,7 +190,7 @@ void NESTED_INIT::runKernel(VariantID vid) void NESTED_INIT::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_array, m_ni * m_nj * m_nk); + checksum[vid] += calcChecksum(m_array, m_array_length); } void NESTED_INIT::tearDown(VariantID vid) diff --git a/src/basic/NESTED_INIT.hpp b/src/basic/NESTED_INIT.hpp index 985d10ac5..dcdbb1e76 100644 --- a/src/basic/NESTED_INIT.hpp +++ b/src/basic/NESTED_INIT.hpp @@ -13,10 +13,26 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// NESTED_INIT kernel reference implementation: +/// +/// for (Index_type k = 0; k < nk; ++k ) { +/// for (Index_type j = 0; j < nj; ++j ) { +/// for (Index_type i = 0; i < ni; ++i ) { +/// array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; +/// } +/// } +/// } +/// #ifndef RAJAPerf_Basic_NESTED_INIT_HPP #define RAJAPerf_Basic_NESTED_INIT_HPP + +#define NESTED_INIT_BODY \ + array[i+ni*(j+nj*k)] = 0.00000001 * i * j * k ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,12 +55,18 @@ class NESTED_INIT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void 
runOpenMPTargetVariant(VariantID vid); + private: + Index_type m_array_length; + Real_ptr m_array; - Int_type m_ni; - Int_type m_nj; - Int_type m_nk; - Int_type m_nk_init; + + Index_type m_ni; + Index_type m_nj; + Index_type m_nk; + Index_type m_nk_init; }; } // end namespace basic diff --git a/src/basic/REDUCE3_INT-Cuda.cpp b/src/basic/REDUCE3_INT-Cuda.cpp new file mode 100644 index 000000000..96bdbd4ff --- /dev/null +++ b/src/basic/REDUCE3_INT-Cuda.cpp @@ -0,0 +1,183 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define REDUCE3_INT_DATA_SETUP_CUDA \ + Int_ptr vec; \ +\ + allocAndInitCudaDeviceData(vec, m_vec, iend); + +#define REDUCE3_INT_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(vec); + + +__global__ void reduce3int(Int_ptr vec, + Int_ptr vsum, Int_type vsum_init, + Int_ptr vmin, Int_type vmin_init, + Int_ptr vmax, Int_type vmax_init, + Index_type iend) +{ + extern __shared__ Int_type psum[ ]; + Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; + Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + psum[ threadIdx.x ] = vsum_init; + pmin[ threadIdx.x ] = vmin_init; + pmax[ threadIdx.x ] = vmax_init; + + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + psum[ threadIdx.x ] 
+= vec[ i ]; + pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); + pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; + pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); + pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + atomicAdd( vsum, psum[ 0 ] ); + atomicMin( vmin, pmin[ 0 ] ); + atomicMax( vmax, pmax[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *vsum += psum[ 0 ]; + *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); + *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); + } +#endif +} + + +void REDUCE3_INT::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + REDUCE3_INT_DATA_SETUP_CUDA; + + Int_ptr vsum; + allocAndInitCudaDeviceData(vsum, &m_vsum_init, 1); + Int_ptr vmin; + allocAndInitCudaDeviceData(vmin, &m_vmin_init, 1); + Int_ptr vmax; + allocAndInitCudaDeviceData(vmax, &m_vmax_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(vsum, &m_vsum_init, 1); + initCudaDeviceData(vmin, &m_vmin_init, 1); + initCudaDeviceData(vmax, &m_vmax_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + reduce3int<<>>(vec, + vsum, m_vsum_init, + vmin, m_vmin_init, + vmax, m_vmax_init, + iend ); + + Int_type lsum; + Int_ptr plsum = &lsum; + getCudaDeviceData(plsum, vsum, 1); + m_vsum += lsum; + + Int_type lmin; + Int_ptr plmin = &lmin; + getCudaDeviceData(plmin, vmin, 1); + m_vmin = RAJA_MIN(m_vmin, lmin); + + Int_type lmax; + Int_ptr plmax = &lmax; + getCudaDeviceData(plmax, vmax, 1); + m_vmax = RAJA_MAX(m_vmax, 
lmax); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_CUDA; + + deallocCudaDeviceData(vsum); + deallocCudaDeviceData(vmin); + deallocCudaDeviceData(vmax); + + } else if ( vid == RAJA_CUDA ) { + + REDUCE3_INT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum, Int_type> vsum(m_vsum_init); + RAJA::ReduceMin, Int_type> vmin(m_vmin_init); + RAJA::ReduceMax, Int_type> vmax(m_vmax_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n REDUCE3_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/REDUCE3_INT-OMPTarget.cpp b/src/basic/REDUCE3_INT-OMPTarget.cpp new file mode 100644 index 000000000..8389b0060 --- /dev/null +++ b/src/basic/REDUCE3_INT-OMPTarget.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "REDUCE3_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define REDUCE3_INT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Int_ptr vec; \ +\ + allocAndInitOpenMPDeviceData(vec, m_vec, iend, did, hid); + +#define REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(vec, did); \ + + +void REDUCE3_INT::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + REDUCE3_INT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Int_type vsum = m_vsum_init; + Int_type vmin = m_vmin_init; + Int_type vmax = m_vmax_init; + + #pragma omp target is_device_ptr(vec) device( did ) map(tofrom:vsum, vmin, vmax) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static,1) \ + reduction(+:vsum) \ + reduction(min:vmin) \ + reduction(max:vmax) + for (Index_type i = ibegin; i < iend; ++i ) { + REDUCE3_INT_BODY; + } + + m_vsum += vsum; + m_vmin = RAJA_MIN(m_vmin, vmin); + m_vmax = RAJA_MAX(m_vmax, vmax); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + REDUCE3_INT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum, Int_type> vsum(m_vsum_init); + RAJA::ReduceMin, Int_type> vmin(m_vmin_init); + RAJA::ReduceMax, Int_type> vmax(m_vmax_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), + [=](Index_type i) { + REDUCE3_INT_BODY_RAJA; + }); + + m_vsum += 
static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + + } + stopTimer(); + + REDUCE3_INT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n REDUCE3_INT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/REDUCE3_INT.cpp b/src/basic/REDUCE3_INT.cpp index 997a1fe1d..30e02587a 100644 --- a/src/basic/REDUCE3_INT.cpp +++ b/src/basic/REDUCE3_INT.cpp @@ -13,30 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// REDUCE3_INT kernel reference implementation: -/// -/// Int_type vsum = m_vsum_init; -/// Int_type vmin = m_vmin_init; -/// Int_type vmax = m_vmax_init; -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// vsum += vec[i] ; -/// vmin = RAJA_MIN(vmin, vec[i]) ; -/// vmax = RAJA_MAX(vmax, vec[i]) ; -/// } -/// -/// m_vsum += vsum; -/// m_vmin = RAJA_MIN(m_vmin, vmin); -/// m_vmax = RAJA_MAX(m_vmax, vmax); -/// - #include "REDUCE3_INT.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include #include @@ -45,93 +27,19 @@ namespace rajaperf namespace basic { -#define REDUCE3_INT_DATA \ - Int_ptr vec = m_vec; \ - - -#define REDUCE3_INT_BODY \ - vsum += vec[i] ; \ - vmin = RAJA_MIN(vmin, vec[i]) ; \ - vmax = RAJA_MAX(vmax, vec[i]) ; - -#define REDUCE3_INT_BODY_RAJA \ - vsum += vec[i] ; \ - vmin.min(vec[i]) ; \ - vmax.max(vec[i]) ; - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define REDUCE3_INT_DATA_SETUP_CUDA \ - Int_ptr vec; \ -\ - allocAndInitCudaDeviceData(vec, m_vec, iend); - -#define REDUCE3_INT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(vec); - - -__global__ void reduce3int(Int_ptr vec, - Int_ptr vsum, Int_type vsum_init, - Int_ptr vmin, Int_type vmin_init, 
- Int_ptr vmax, Int_type vmax_init, - Index_type iend) -{ - extern __shared__ Int_type psum[ ]; - Int_type* pmin = (Int_type*)&psum[ 1 * blockDim.x ]; - Int_type* pmax = (Int_type*)&psum[ 2 * blockDim.x ]; - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - psum[ threadIdx.x ] = vsum_init; - pmin[ threadIdx.x ] = vmin_init; - pmax[ threadIdx.x ] = vmax_init; - - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - psum[ threadIdx.x ] += vec[ i ]; - pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], vec[ i ] ); - pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], vec[ i ] ); - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - psum[ threadIdx.x ] += psum[ threadIdx.x + i ]; - pmin[ threadIdx.x ] = RAJA_MIN( pmin[ threadIdx.x ], pmin[ threadIdx.x + i ] ); - pmax[ threadIdx.x ] = RAJA_MAX( pmax[ threadIdx.x ], pmax[ threadIdx.x + i ] ); - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - atomicAdd( vsum, psum[ 0 ] ); - atomicMin( vmin, pmin[ 0 ] ); - atomicMax( vmax, pmax[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *vsum += psum[ 0 ]; - *vmin = RAJA_MIN( *vmin, pmin[ 0 ] ); - *vmax = RAJA_MAX( *vmax, pmax[ 0 ] ); - } -#endif -} - -#endif // if defined(RAJA_ENABLE_CUDA) +#define REDUCE3_INT_DATA_SETUP_CPU \ + Int_ptr vec = m_vec; \ REDUCE3_INT::REDUCE3_INT(const RunParams& params) : KernelBase(rajaperf::Basic_REDUCE3_INT, params) { setDefaultSize(1000000); - setDefaultReps(5000); +// setDefaultReps(5000); +// Set reps to low value until we resolve RAJA omp-target +// reduction performance issues + setDefaultReps(100); } REDUCE3_INT::~REDUCE3_INT() @@ -160,7 +68,7 @@ void REDUCE3_INT::runKernel(VariantID vid) case Base_Seq : { - REDUCE3_INT_DATA; + REDUCE3_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -185,7 +93,7 @@ void REDUCE3_INT::runKernel(VariantID vid) case 
RAJA_Seq : { - REDUCE3_INT_DATA; + REDUCE3_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -194,13 +102,14 @@ void REDUCE3_INT::runKernel(VariantID vid) RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { REDUCE3_INT_BODY_RAJA; }); - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); } stopTimer(); @@ -211,7 +120,7 @@ void REDUCE3_INT::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - REDUCE3_INT_DATA; + REDUCE3_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -237,14 +146,9 @@ void REDUCE3_INT::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - REDUCE3_INT_DATA; + REDUCE3_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -253,14 +157,14 @@ void REDUCE3_INT::runKernel(VariantID vid) RAJA::ReduceMin vmin(m_vmin_init); RAJA::ReduceMax vmax(m_vmax_init); - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { REDUCE3_INT_BODY_RAJA; }); - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); + m_vsum += static_cast(vsum.get()); + m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); + m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); } stopTimer(); @@ -269,94 +173,26 @@ void REDUCE3_INT::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - REDUCE3_INT_DATA_SETUP_CUDA; - Int_ptr vsum; - allocAndInitCudaDeviceData(vsum, &m_vsum_init, 1); - Int_ptr vmin; - allocAndInitCudaDeviceData(vmin, &m_vmin_init, 1); - Int_ptr vmax; - allocAndInitCudaDeviceData(vmax, &m_vmax_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(vsum, &m_vsum_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - reduce3int<<>>(vec, - vsum, m_vsum_init, - vmin, m_vmin_init, - vmax, m_vmax_init, - iend ); - - Int_type lsum; - Int_ptr plsum = &lsum; - getCudaDeviceData(plsum, vsum, 1); - m_vsum += lsum; - - Int_type lmin; - Int_ptr plmin = &lmin; - getCudaDeviceData(plmin, vmin, 1); - m_vmin = RAJA_MIN(m_vmin, lmin); - - Int_type lmax; - Int_ptr plmax = &lmax; - getCudaDeviceData(plmax, vmax, 1); - m_vmax = RAJA_MAX(m_vmax, lmax); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_CUDA; - deallocCudaDeviceData(vsum); - - break; - } - - case RAJA_CUDA : { - - REDUCE3_INT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum, Int_type> 
vsum(m_vsum_init); - RAJA::ReduceMin, Int_type> vmin(m_vmin_init); - RAJA::ReduceMax, Int_type> vmax(m_vmax_init); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - REDUCE3_INT_BODY_RAJA; - }); - - m_vsum += static_cast(vsum.get()); - m_vmin = RAJA_MIN(m_vmin, static_cast(vmin.get())); - m_vmax = RAJA_MAX(m_vmax, static_cast(vmax.get())); - - } - stopTimer(); - - REDUCE3_INT_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n REDUCE3_INT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/basic/REDUCE3_INT.hpp b/src/basic/REDUCE3_INT.hpp index 12decfdcb..8f3293aa3 100644 --- a/src/basic/REDUCE3_INT.hpp +++ b/src/basic/REDUCE3_INT.hpp @@ -13,10 +13,41 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// REDUCE3_INT kernel reference implementation: +/// +/// Int_type vsum = m_vsum_init; +/// Int_type vmin = m_vmin_init; +/// Int_type vmax = m_vmax_init; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// vsum += vec[i] ; +/// vmin = RAJA_MIN(vmin, vec[i]) ; +/// vmax = RAJA_MAX(vmax, vec[i]) ; +/// } +/// +/// m_vsum += vsum; +/// m_vmin = RAJA_MIN(m_vmin, vmin); +/// m_vmax = RAJA_MAX(m_vmax, vmax); +/// +/// RAJA_MIN/MAX are macros that do what you would expect. 
+/// #ifndef RAJAPerf_Basic_REDUCE3_INT_HPP #define RAJAPerf_Basic_REDUCE3_INT_HPP + +#define REDUCE3_INT_BODY \ + vsum += vec[i] ; \ + vmin = RAJA_MIN(vmin, vec[i]) ; \ + vmax = RAJA_MAX(vmax, vec[i]) ; + +#define REDUCE3_INT_BODY_RAJA \ + vsum += vec[i] ; \ + vmin.min(vec[i]) ; \ + vmax.max(vec[i]) ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +70,9 @@ class REDUCE3_INT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Int_ptr m_vec; Int_type m_vsum; diff --git a/src/basic/TRAP_INT-Cuda.cpp b/src/basic/TRAP_INT-Cuda.cpp new file mode 100644 index 000000000..ea92f7804 --- /dev/null +++ b/src/basic/TRAP_INT-Cuda.cpp @@ -0,0 +1,168 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. 
+// +RAJA_INLINE +RAJA_DEVICE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define TRAP_INT_DATA_SETUP_CUDA \ + Real_type x0 = m_x0; \ + Real_type xp = m_xp; \ + Real_type y = m_y; \ + Real_type yp = m_yp; \ + Real_type h = m_h; + +#define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... + + +__global__ void trapint(Real_type x0, Real_type xp, + Real_type y, Real_type yp, + Real_type h, + Real_ptr sumx, + Index_type iend) +{ + extern __shared__ Real_type psumx[ ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + psumx[ threadIdx.x ] = 0.0; + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + Real_type x = x0 + i*h; + Real_type val = trap_int_func(x, y, xp, yp); + psumx[ threadIdx.x ] += val; + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::atomic::atomicAdd( sumx, psumx[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *sumx += psumx[ 0 ]; + } +#endif + +} + + +void TRAP_INT::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + TRAP_INT_DATA_SETUP_CUDA; + + Real_ptr sumx; + allocAndInitCudaDeviceData(sumx, &m_sumx_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(sumx, &m_sumx_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + trapint<<>>(x0, xp, + y, yp, + h, + sumx, + iend); + + Real_type lsumx; + Real_ptr plsumx = &lsumx; + 
getCudaDeviceData(plsumx, sumx, 1); + m_sumx += lsumx * h; + + } + stopTimer(); + + deallocCudaDeviceData(sumx); + + TRAP_INT_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + TRAP_INT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum, Real_type> sumx(m_sumx_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + TRAP_INT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n TRAP_INT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/basic/TRAP_INT-OMPTarget.cpp b/src/basic/TRAP_INT-OMPTarget.cpp new file mode 100644 index 000000000..2f9c9c4fa --- /dev/null +++ b/src/basic/TRAP_INT-OMPTarget.cpp @@ -0,0 +1,120 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRAP_INT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace basic +{ + +// +// Function used in TRAP_INT loop. 
+// +RAJA_INLINE +Real_type trap_int_func(Real_type x, + Real_type y, + Real_type xp, + Real_type yp) +{ + Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); + denom = 1.0/sqrt(denom); + return denom; +} + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define TRAP_INT_DATA_SETUP_OMP_TARGET \ + Real_type x0 = m_x0; \ + Real_type xp = m_xp; \ + Real_type y = m_y; \ + Real_type yp = m_yp; \ + Real_type h = m_h; + +#define TRAP_INT_DATA_TEARDOWN_OMP_TARGET // nothing to do here... + + +void TRAP_INT::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + TRAP_INT_DATA_SETUP_OMP_TARGET; + + #pragma omp target data map(to:x0,xp,y,yp,h) + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type sumx = m_sumx_init; + + #pragma omp target teams distribute parallel for map(tofrom: sumx) reduction(+:sumx) \ + num_teams(NUMTEAMS) schedule(static, 1) + + for (Index_type i = ibegin; i < iend; ++i ) { + TRAP_INT_BODY; + } + + m_sumx += sumx * h; + + } + stopTimer(); + + #pragma omp target exit data map(delete: x0,xp,y,yp,h) + + } else if ( vid == RAJA_OpenMPTarget ) { + + TRAP_INT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum, Real_type> sumx(m_sumx_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRAP_INT_BODY; + }); + + m_sumx += static_cast(sumx.get()) * h; + + } + stopTimer(); + + TRAP_INT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n TRAP_INT : Unknown OMP Targetvariant id = " << vid << std::endl; + } +} + +} // end namespace basic +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/basic/TRAP_INT.cpp b/src/basic/TRAP_INT.cpp index 24de91a11..8058d055b 100644 --- a/src/basic/TRAP_INT.cpp +++ 
b/src/basic/TRAP_INT.cpp @@ -13,31 +13,11 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// TRAP_INT kernel reference implementation: -/// -/// Real_type trap_int_func(Real_type x, -/// Real_type y, -/// Real_type xp, -/// Real_type yp) -/// { -/// Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); -/// denom = 1.0/sqrt(denom); -/// return denom; -/// } -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// Real_type x = x0 + i*h; -/// sumx += trap_int_func(x, y, xp, yp); -/// } -/// - #include "TRAP_INT.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" -#include "RAJA/policy/cuda.hpp" + +#include "common/DataUtils.hpp" #include @@ -50,7 +30,6 @@ namespace basic // Function used in TRAP_INT loop. // RAJA_INLINE -RAJA_HOST_DEVICE Real_type trap_int_func(Real_type x, Real_type y, Real_type xp, @@ -62,69 +41,13 @@ Real_type trap_int_func(Real_type x, } -#define TRAP_INT_DATA \ +#define TRAP_INT_DATA_SETUP_CPU \ Real_type x0 = m_x0; \ Real_type xp = m_xp; \ Real_type y = m_y; \ Real_type yp = m_yp; \ Real_type h = m_h; -#define TRAP_INT_BODY \ - Real_type x = x0 + i*h; \ - sumx += trap_int_func(x, y, xp, yp); - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define TRAP_INT_DATA_SETUP_CUDA // nothing to do here... - -#define TRAP_INT_DATA_TEARDOWN_CUDA // nothing to do here... 
- -__global__ void trapint(Real_type x0, Real_type xp, - Real_type y, Real_type yp, - Real_type h, - Real_ptr sumx, - Index_type iend) -{ - extern __shared__ Real_type psumx[ ]; - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - psumx[ threadIdx.x ] = 0.0; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - Real_type x = x0 + i*h; - Real_type val = trap_int_func(x, y, xp, yp); - psumx[ threadIdx.x ] += val; - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - psumx[ threadIdx.x ] += psumx[ threadIdx.x + i ]; - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::atomic::atomicAdd( sumx, psumx[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *sumx += psumx[ 0 ]; - } -#endif - -} - -#endif // if defined(RAJA_ENABLE_CUDA) - TRAP_INT::TRAP_INT(const RunParams& params) : KernelBase(rajaperf::Basic_TRAP_INT, params) @@ -149,8 +72,7 @@ void TRAP_INT::setUp(VariantID vid) m_h = xn - m_x0; - m_sumx_init = 0.5*( trap_int_func(m_x0, m_y, m_xp, m_yp) + - trap_int_func(xn, m_y, m_xp, m_yp) ); + m_sumx_init = 0.0; m_sumx = 0; } @@ -165,7 +87,7 @@ void TRAP_INT::runKernel(VariantID vid) case Base_Seq : { - TRAP_INT_DATA; + TRAP_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -186,14 +108,15 @@ void TRAP_INT::runKernel(VariantID vid) case RAJA_Seq : { - TRAP_INT_DATA; + TRAP_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall(ibegin, iend, [=](int i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](int i) { TRAP_INT_BODY; }); @@ -208,7 +131,7 @@ void TRAP_INT::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - TRAP_INT_DATA; + TRAP_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -228,22 +151,17 @@ void 
TRAP_INT::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... - break; - } - case RAJA_OpenMP : { - TRAP_INT_DATA; + TRAP_INT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::ReduceSum sumx(m_sumx_init); - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { TRAP_INT_BODY; }); @@ -256,91 +174,35 @@ void TRAP_INT::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - TRAP_INT_DATA; - - Real_ptr sumx; - allocAndInitCudaDeviceData(sumx, &m_sumx_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(sumx, &m_sumx_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - trapint<<>>(x0, xp, - y, yp, - h, - sumx, - iend); - - Real_type lsumx; - Real_ptr plsumx = &lsumx; - getCudaDeviceData(plsumx, sumx, 1); - m_sumx += lsumx * h; - - } - stopTimer(); - - deallocCudaDeviceData(sumx); - - break; - } - - case RAJA_CUDA : { - - TRAP_INT_DATA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum, Real_type> sumx(m_sumx_init); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - TRAP_INT_BODY; - }); - - m_sumx += static_cast(sumx.get()) * h; - - } - stopTimer(); - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n TRAP_INT : Unknown variant id = " << vid << std::endl; } } -#if 0 -std::cout << "\t\t sumx = " - << std::setprecision(20) << m_sumx << std::endl; -#endif } void TRAP_INT::updateChecksum(VariantID vid) { -#if 1 - checksum[vid] += (m_sumx + 0.00123) / (m_sumx - 0.00123); -#else checksum[vid] += m_sumx; -#endif } void TRAP_INT::tearDown(VariantID vid) diff --git a/src/basic/TRAP_INT.hpp b/src/basic/TRAP_INT.hpp index c31e1d677..41d36fc85 100644 --- a/src/basic/TRAP_INT.hpp +++ b/src/basic/TRAP_INT.hpp @@ -13,10 +13,34 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// TRAP_INT kernel reference implementation: +/// +/// Real_type trap_int_func(Real_type x, +/// Real_type y, +/// Real_type xp, +/// Real_type yp) +/// { +/// Real_type denom = (x - xp)*(x - xp) + (y - yp)*(y - yp); +/// denom = 1.0/sqrt(denom); +/// return denom; +/// } +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// Real_type x = x0 + i*h; +/// sumx += trap_int_func(x, y, xp, yp); +/// } +/// #ifndef RAJAPerf_Basic_TRAP_INT_HPP #define RAJAPerf_Basic_TRAP_INT_HPP + +#define TRAP_INT_BODY \ + Real_type x = x0 + i*h; \ + sumx += trap_int_func(x, y, xp, yp); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +63,9 @@ class TRAP_INT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_type m_x0; Real_type m_xp; diff --git a/src/common/CudaDataUtils.hpp b/src/common/CudaDataUtils.hpp new file mode 100644 index 000000000..6e45abc3d --- /dev/null +++ b/src/common/CudaDataUtils.hpp @@ -0,0 +1,94 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, 
Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Methods for CUDA kernel data allocation, initialization, and deallocation. +/// + + +#ifndef RAJAPerf_CudaDataUtils_HPP +#define RAJAPerf_CudaDataUtils_HPP + +#include "RPTypes.hpp" + +#if defined(RAJA_ENABLE_CUDA) + + +#include "RAJA/policy/cuda/raja_cudaerrchk.hpp" + + +namespace rajaperf +{ + +/*! + * \brief Copy given hptr (host) data to CUDA device (dptr). + * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void initCudaDeviceData(T& dptr, const T hptr, int len) +{ + cudaErrchk( cudaMemcpy( dptr, hptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemcpyHostToDevice ) ); + + incDataInitCount(); +} + +/*! + * \brief Allocate CUDA device data array (dptr) and copy given hptr (host) + * data to device array. + */ +template +void allocAndInitCudaDeviceData(T& dptr, const T hptr, int len) +{ + cudaErrchk( cudaMalloc( (void**)&dptr, + len * sizeof(typename std::remove_pointer::type) ) ); + + initCudaDeviceData(dptr, hptr, len); +} + +/*! + * \brief Copy given dptr (CUDA device) data to host (hptr). + * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void getCudaDeviceData(T& hptr, const T dptr, int len) +{ + cudaErrchk( cudaMemcpy( hptr, dptr, + len * sizeof(typename std::remove_pointer::type), + cudaMemcpyDeviceToHost ) ); +} + +/*! + * \brief Free device data array. 
+ */ +template +void deallocCudaDeviceData(T& dptr) +{ + cudaErrchk( cudaFree( dptr ) ); + dptr = 0; +} + + +} // closing brace for rajaperf namespace + +#endif // RAJA_ENABLE_CUDA + +#endif // closing endif for header file include guard + diff --git a/src/common/DataUtils.cpp b/src/common/DataUtils.cpp index f2ceb9393..7074174c7 100644 --- a/src/common/DataUtils.cpp +++ b/src/common/DataUtils.cpp @@ -16,6 +16,7 @@ #include "DataUtils.hpp" + #include "RAJA/internal/MemUtils_CPU.hpp" #include @@ -63,6 +64,17 @@ void allocAndInitData(Real_ptr& ptr, int len, VariantID vid ) initData(ptr, len, vid); } +void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, + VariantID vid) +{ + (void) vid; + + ptr = + RAJA::allocate_aligned_type(RAJA::DATA_ALIGN, + len*sizeof(Real_type)); + initDataConst(ptr, len, val, vid); +} + void allocAndInitDataRandSign(Real_ptr& ptr, int len, VariantID vid) { ptr = @@ -80,7 +92,7 @@ void allocAndInitData(Complex_ptr& ptr, int len, VariantID vid) /* - * Free data arrays. + * Free data arrays of given type. */ void deallocData(Int_ptr& ptr) { @@ -108,7 +120,8 @@ void deallocData(Complex_ptr& ptr) /* - * \brief Initialize Int_type data array. + * \brief Initialize Int_type data array to + * randomly signed positive and negative values. */ void initData(Int_ptr& ptr, int len, VariantID vid) { @@ -117,7 +130,6 @@ void initData(Int_ptr& ptr, int len, VariantID vid) // First touch... #if defined(RAJA_ENABLE_OPENMP) if ( vid == Base_OpenMP || - vid == RAJALike_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for for (int i = 0; i < len; ++i) { @@ -147,7 +159,9 @@ void initData(Int_ptr& ptr, int len, VariantID vid) } /* - * Initialize Real_type data array. + * Initialize Real_type data array to non-random + * positive values (0.0, 1.0) based on their array position + * (index) and the order in which this method is called. 
*/ void initData(Real_ptr& ptr, int len, VariantID vid) { @@ -158,7 +172,6 @@ void initData(Real_ptr& ptr, int len, VariantID vid) // first touch... #if defined(RAJA_ENABLE_OPENMP) if ( vid == Base_OpenMP || - vid == RAJALike_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for for (int i = 0; i < len; ++i) { @@ -174,6 +187,31 @@ void initData(Real_ptr& ptr, int len, VariantID vid) incDataInitCount(); } +/* + * Initialize Real_type data array to constant values. + */ +void initDataConst(Real_ptr& ptr, int len, Real_type val, + VariantID vid) +{ + +// first touch... +#if defined(RAJA_ENABLE_OPENMP) + if ( vid == Base_OpenMP || + vid == RAJA_OpenMP ) { + #pragma omp parallel for + for (int i = 0; i < len; ++i) { + ptr[i] = 0; + }; + } +#endif + + for (int i = 0; i < len; ++i) { + ptr[i] = val; + }; + + incDataInitCount(); +} + /* * Initialize Real_type data array with random sign. */ @@ -184,7 +222,6 @@ void initDataRandSign(Real_ptr& ptr, int len, VariantID vid) // First touch... #if defined(RAJA_ENABLE_OPENMP) if ( vid == Base_OpenMP || - vid == RAJALike_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for for (int i = 0; i < len; ++i) { @@ -218,7 +255,6 @@ void initData(Complex_ptr& ptr, int len, VariantID vid) #if defined(RAJA_ENABLE_OPENMP) if ( vid == Base_OpenMP || - vid == RAJALike_OpenMP || vid == RAJA_OpenMP ) { #pragma omp parallel for for (int i = 0; i < len; ++i) { diff --git a/src/common/DataUtils.hpp b/src/common/DataUtils.hpp index 999fd4c55..99c62ccc4 100644 --- a/src/common/DataUtils.hpp +++ b/src/common/DataUtils.hpp @@ -24,7 +24,6 @@ #include "RAJAPerfSuite.hpp" #include "RPTypes.hpp" -#include "RAJA/policy/cuda/raja_cudaerrchk.hpp" namespace rajaperf { @@ -43,18 +42,33 @@ void incDataInitCount(); /*! * \brief Allocate and initialize Int_type data array. + * + * Array is initialized using method initData(Int_ptr& ptr...) below. */ void allocAndInitData(Int_ptr& ptr, int len, VariantID vid = NumVariants); /*! 
* \brief Allocate and initialize aligned Real_type data array. + * + * Array is initialized using method initData(Real_ptr& ptr...) below. */ void allocAndInitData(Real_ptr& ptr, int len, VariantID vid = NumVariants); +/*! + * \brief Allocate and initialize aligned Real_type data array. + * + * Array entries are initialized using the method + * initDataConst(Real_ptr& ptr...) below. + */ +void allocAndInitDataConst(Real_ptr& ptr, int len, Real_type val, + VariantID vid = NumVariants); + /*! * \brief Allocate and initialize aligned Real_type data array with random sign. + * + * Array is initialized using method initDataRandSign(Real_ptr& ptr...) below. */ void allocAndInitDataRandSign(Real_ptr& ptr, int len, VariantID vid = NumVariants); @@ -78,76 +92,66 @@ void deallocData(Complex_ptr& ptr); /*! * \brief Initialize Int_type data array. + * + * Array entries are randomly initialized to +/-1. + * Then, two randomly-chosen entries are reset, one to + * a value > 1, one to a value < -1. */ void initData(Int_ptr& ptr, int len, VariantID vid = NumVariants); /*! * \brief Initialize Real_type data array. + * + * Array entries are set (non-randomly) to positive values + * in the interval (0.0, 1.0) based on their array position (index) + * and the order in which this method is called. */ void initData(Real_ptr& ptr, int len, VariantID vid = NumVariants); +/*! + * \brief Initialize Real_type data array. + * + * Array entries are set to given constant value. + */ +void initDataConst(Real_ptr& ptr, int len, Real_type val, + VariantID vid = NumVariants); + /*! * \brief Initialize Real_type data array with random sign. + * + * Array entries are initialized in the same way as the method + * initData(Real_ptr& ptr...) above, but with random sign. */ void initDataRandSign(Real_ptr& ptr, int len, VariantID vid = NumVariants); /*! * \brief Initialize Complex_type data array. 
+ * + * Real and imaginary array entries are initialized in the same way as the + * method allocAndInitData(Real_ptr& ptr...) above. */ void initData(Complex_ptr& ptr, int len, VariantID vid = NumVariants); /*! * \brief Initialize Real_type scalar data. + * + * Data is set similarly to an array enttry in the method + * initData(Real_ptr& ptr...) above. */ void initData(Real_type& d, VariantID vid = NumVariants); - -#if defined(RAJA_ENABLE_CUDA) - -template -void initCudaDeviceData(T& dptr, const T hptr, int len) -{ - cudaErrchk( cudaMemcpy( dptr, hptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyHostToDevice ) ); - - incDataInitCount(); -} - -template -void allocAndInitCudaDeviceData(T& dptr, const T hptr, int len) -{ - cudaErrchk( cudaMalloc( (void**)&dptr, - len * sizeof(typename std::remove_pointer::type) ) ); - - initCudaDeviceData(dptr, hptr, len); -} - -template -void getCudaDeviceData(T& hptr, const T dptr, int len) -{ - cudaErrchk( cudaMemcpy( hptr, dptr, - len * sizeof(typename std::remove_pointer::type), - cudaMemcpyDeviceToHost ) ); -} - -template -void deallocCudaDeviceData(T& dptr) -{ - cudaErrchk( cudaFree( dptr ) ); - dptr = 0; -} - -#endif - - /*! * \brief Calculate and return checksum for data arrays. + * + * Checksums are computed as a weighted sum of array entries, + * where weight is a simple function of elemtn index. + * + * Checksumn is multiplied by given scale factor. 
*/ long double calcChecksum(Real_ptr d, int len, Real_type scale_factor = 1.0); @@ -155,7 +159,6 @@ long double calcChecksum(Real_ptr d, int len, long double calcChecksum(Complex_ptr d, int len, Real_type scale_factor = 1.0); - } // closing brace for rajaperf namespace #endif // closing endif for header file include guard diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp index 085142971..46026580b 100644 --- a/src/common/Executor.cpp +++ b/src/common/Executor.cpp @@ -136,7 +136,7 @@ void Executor::setupSuite() for (size_t ik = 0; ik < NumKernels && !found_it; ++ik) { KernelID kid = static_cast(ik); - if ( getFullKernelName(kid).find(*it) != string::npos ) { + if ( getKernelName(kid) == *it || getFullKernelName(kid) == *it ) { run_kern.insert(kid); found_it = true; } @@ -236,7 +236,10 @@ void Executor::setupSuite() for (KIDset::iterator kid = run_kern.begin(); kid != run_kern.end(); ++kid) { - kernels.push_back( getKernelObject(*kid, run_params) ); +/// RDH DISABLE COUPLE KERNEL + if ( *kid != Apps_COUPLE ) { + kernels.push_back( getKernelObject(*kid, run_params) ); + } } if ( !(run_params.getInvalidVariantInput().empty()) ) { @@ -350,13 +353,29 @@ void Executor::runSuite() cout << "\n\nRunning specified kernels and variants...\n"; const int npasses = run_params.getNumPasses(); - for (int ip = 0; ip < npasses; ++ip) { + for (int ip = 0; ip < npasses; ++ip) { + if ( run_params.showProgress() ) { + std::cout << "\nPass throught suite # " << ip << "\n"; + } + for (size_t ik = 0; ik < kernels.size(); ++ik) { + KernelBase* kernel = kernels[ik]; + if ( run_params.showProgress() ) { + std::cout << "\n Running kernel -- " << kernel->getName() << "\n"; + } + for (size_t iv = 0; iv < variant_ids.size(); ++iv) { + KernelBase* kern = kernels[ik]; + if ( run_params.showProgress() ) { + cout << kern->getName() << " " << getVariantName(variant_ids[iv]) << endl; + } kernels[ik]->execute( variant_ids[iv] ); - } - } - } + } // loop over variants + + } // loop over 
kernels + + } // loop over passes through suite + } void Executor::outputRunData() @@ -707,12 +726,12 @@ void Executor::writeChecksumReport(const string& filename) // Set basic table formatting parameters. // const string equal_line("==================================================================================================="); - const string dash_line("----------------------------------------------------------------------------------------------------"); + const string dash_line("----------------------------------------------------------------------------------------"); const string dash_line_short("-------------------------------------------------------"); string dot_line("........................................................"); size_t prec = 20; - size_t checksum_width = prec + 4; + size_t checksum_width = prec + 8; size_t namecol_width = 0; for (size_t ik = 0; ik < kernels.size(); ++ik) { diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp index 876dc49a7..64a117d06 100644 --- a/src/common/KernelBase.cpp +++ b/src/common/KernelBase.cpp @@ -17,16 +17,15 @@ #include "KernelBase.hpp" #include "RunParams.hpp" -#include "DataUtils.hpp" #include namespace rajaperf { KernelBase::KernelBase(KernelID kid, const RunParams& params) - : kernel_id(kid), + : run_params(params), + kernel_id(kid), name( getFullKernelName(kernel_id) ), - run_params(params), default_size(0), default_reps(0), running_variant(NumVariants) diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp index 316193fe1..71797f644 100644 --- a/src/common/KernelBase.hpp +++ b/src/common/KernelBase.hpp @@ -88,6 +88,8 @@ class KernelBase protected: int num_exec[NumVariants]; + const RunParams& run_params; + RAJA::Timer::ElapsedType min_time[NumVariants]; RAJA::Timer::ElapsedType max_time[NumVariants]; RAJA::Timer::ElapsedType tot_time[NumVariants]; @@ -103,8 +105,6 @@ class KernelBase KernelID kernel_id; std::string name; - const RunParams& run_params; - RAJA::Timer timer; 
Index_type default_size; diff --git a/src/common/OpenMPTargetDataUtils.hpp b/src/common/OpenMPTargetDataUtils.hpp new file mode 100644 index 000000000..0501690de --- /dev/null +++ b/src/common/OpenMPTargetDataUtils.hpp @@ -0,0 +1,94 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Methods for openmp target kernel data allocation, initialization, +/// and deallocation. +/// + + +#ifndef RAJAPerf_OpenMPTargetDataUtils_HPP +#define RAJAPerf_OpenMPTargetDataUtils_HPP + +#include "RPTypes.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + + +namespace rajaperf +{ + +/*! + * \brief Copy given hptr (host) data to device (dptr). + * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void initOpenMPDeviceData(T& dptr, const T hptr, int len, + int did, int hid) +{ + omp_target_memcpy( dptr, hptr, + len * sizeof(typename std::remove_pointer::type), + 0, 0, did, hid ); + + incDataInitCount(); +} + +/*! + * \brief Allocate device data array (dptr) and copy given hptr (host) + * data to device array. + */ +template +void allocAndInitOpenMPDeviceData(T& dptr, const T hptr, int len, + int did, int hid) +{ + dptr = static_cast( omp_target_alloc( + len * sizeof(typename std::remove_pointer::type), + did) ); + + initOpenMPDeviceData(dptr, hptr, len, did, hid); +} + +/*! + * \brief Copy given device ptr (dptr) data to host ptr (hptr). 
+ * + * Method assumes both host and device data arrays are allocated + * and of propoer size for copy operation to succeed. + */ +template +void getOpenMPDeviceData(T& hptr, const T dptr, int len, int hid, int did) +{ + omp_target_memcpy( hptr, dptr, + len * sizeof(typename std::remove_pointer::type), + 0, 0, hid, did ); +} + +/*! + * \brief Free device data array. + */ +template +void deallocOpenMPDeviceData(T& dptr, int did) +{ + omp_target_free( dptr, did ); + dptr = 0; +} + + +} // closing brace for rajaperf namespace + +#endif // RAJA_ENABLE_TARGET_OPENMP + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index f6365301c..3fc3eac8e 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -27,6 +27,8 @@ #include "basic/INIT3.hpp" #include "basic/REDUCE3_INT.hpp" #include "basic/NESTED_INIT.hpp" +#include "basic/INIT_VIEW1D.hpp" +#include "basic/INIT_VIEW1D_OFFSET.hpp" // // Lcals kernels... @@ -62,8 +64,10 @@ #include "apps/ENERGY.hpp" #include "apps/VOL3D.hpp" #include "apps/DEL_DOT_VEC_2D.hpp" -#include "apps/WIP-COUPLE.hpp" #include "apps/FIR.hpp" +#include "apps/LTIMES.hpp" +#include "apps/LTIMES_NOVIEW.hpp" +#include "apps/WIP-COUPLE.hpp" #include @@ -120,6 +124,8 @@ static const std::string KernelNames [] = std::string("Basic_INIT3"), std::string("Basic_REDUCE3_INT"), std::string("Basic_NESTED_INIT"), + std::string("Basic_INIT_VIEW1D"), + std::string("Basic_INIT_VIEW1D_OFFSET"), // // Lcals kernels... @@ -134,14 +140,10 @@ static const std::string KernelNames [] = // // Polybench kernels... // -#if 1 std::string("Polybench_2MM"), std::string("Polybench_3MM"), std::string("Polybench_GEMMVER"), - -#endif - // // Stream kernels... 
// @@ -158,8 +160,10 @@ static const std::string KernelNames [] = std::string("Apps_ENERGY"), std::string("Apps_VOL3D"), std::string("Apps_DEL_DOT_VEC_2D"), - std::string("Apps_COUPLE"), std::string("Apps_FIR"), + std::string("Apps_LTIMES"), + std::string("Apps_LTIMES_NOVIEW"), + std::string("Apps_COUPLE"), std::string("Unknown Kernel") // Keep this at the end and DO NOT remove.... @@ -183,19 +187,22 @@ static const std::string VariantNames [] = std::string("Base_Seq"), std::string("RAJA_Seq"), + #if defined(RAJA_ENABLE_OPENMP) std::string("Base_OpenMP"), - std::string("RAJALike_OpenMP"), std::string("RAJA_OpenMP"), + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + std::string("Base_OpenMPTarget"), + std::string("RAJA_OpenMPTarget"), +#endif + #endif + #if defined(RAJA_ENABLE_CUDA) std::string("Base_CUDA"), std::string("RAJA_CUDA"), #endif -#if 0 - std::string("Base_OpenMPTarget"), - std::string("RAJA_OpenMPTarget"), -#endif std::string("Unknown Variant") // Keep this at the end and DO NOT remove.... @@ -296,6 +303,14 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::NESTED_INIT(run_params); break; } + case Basic_INIT_VIEW1D : { + kernel = new basic::INIT_VIEW1D(run_params); + break; + } + case Basic_INIT_VIEW1D_OFFSET : { + kernel = new basic::INIT_VIEW1D_OFFSET(run_params); + break; + } // // Lcals kernels... @@ -328,22 +343,18 @@ KernelBase* getKernelObject(KernelID kid, // // Polybench kernels... // -#if 1 case Polybench_2MM : { kernel = new polybench::POLYBENCH_2MM(run_params); break; } - case Polybench_3MM : { kernel = new polybench::POLYBENCH_3MM(run_params); break; } - case Polybench_GEMMVER : { kernel = new polybench::POLYBENCH_GEMMVER(run_params); break; } -#endif // // Stream kernels... 
@@ -388,14 +399,22 @@ KernelBase* getKernelObject(KernelID kid, kernel = new apps::DEL_DOT_VEC_2D(run_params); break; } - case Apps_COUPLE : { - kernel = new apps::COUPLE(run_params); - break; - } case Apps_FIR : { kernel = new apps::FIR(run_params); break; } + case Apps_LTIMES : { + kernel = new apps::LTIMES(run_params); + break; + } + case Apps_LTIMES_NOVIEW : { + kernel = new apps::LTIMES_NOVIEW(run_params); + break; + } + case Apps_COUPLE : { + kernel = new apps::COUPLE(run_params); + break; + } default: { std::cout << "\n Unknown Kernel ID = " << kid << std::endl; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index f93c203c8..ab0c95a0a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -102,6 +102,8 @@ enum KernelID { Basic_INIT3, Basic_REDUCE3_INT, Basic_NESTED_INIT, + Basic_INIT_VIEW1D, + Basic_INIT_VIEW1D_OFFSET, // // Lcals kernels... @@ -116,11 +118,9 @@ enum KernelID { // // Polybench kernels... // -#if 1 Polybench_2MM, Polybench_3MM, Polybench_GEMMVER, -#endif // // Stream kernels... @@ -138,8 +138,10 @@ enum KernelID { Apps_ENERGY, Apps_VOL3D, Apps_DEL_DOT_VEC_2D, - Apps_COUPLE, Apps_FIR, + Apps_LTIMES, + Apps_LTIMES_NOVIEW, + Apps_COUPLE, NumKernels // Keep this one last and NEVER comment out (!!) @@ -162,19 +164,22 @@ enum VariantID { Base_Seq = 0, RAJA_Seq, + #if defined(RAJA_ENABLE_OPENMP) Base_OpenMP, - RAJALike_OpenMP, RAJA_OpenMP, + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + Base_OpenMPTarget, + RAJA_OpenMPTarget, #endif + +#endif + #if defined(RAJA_ENABLE_CUDA) Base_CUDA, RAJA_CUDA, #endif -#if 0 - Base_OpenMPTarget, - RAJA_OpenMPTarget, -#endif NumVariants // Keep this one last and NEVER comment out (!!) 
diff --git a/src/common/RPTypes.hpp b/src/common/RPTypes.hpp index 532bd4437..cefb77457 100644 --- a/src/common/RPTypes.hpp +++ b/src/common/RPTypes.hpp @@ -31,7 +31,7 @@ #undef RP_USE_FLOAT #define RP_USE_COMPLEX -//#undef RP_USE_DOUBLE +//#undef RP_USE_COMPLEX #if defined(RP_USE_COMPLEX) #include @@ -52,7 +52,7 @@ namespace rajaperf * ****************************************************************************** */ -typedef volatile int RepIndex_type; +using RepIndex_type = volatile int; /*! @@ -62,13 +62,9 @@ typedef volatile int RepIndex_type; * ****************************************************************************** */ -#if 0 // Index_type -typedef RAJA::Index_type Index_type; -#else -typedef int Index_type; -#endif +using Index_type = RAJA::Index_type; /// -typedef Index_type* Index_ptr; +using Index_ptr = Index_type*; /*! @@ -78,9 +74,9 @@ typedef Index_type* Index_ptr; * ****************************************************************************** */ -typedef int Int_type; +using Int_type = int; /// -typedef Int_type* Int_ptr; +using Int_ptr = Int_type*; /*! @@ -90,7 +86,7 @@ typedef Int_type* Int_ptr; * ****************************************************************************** */ -typedef long double Checksum_type; +using Checksum_type = long double; /*! @@ -102,26 +98,26 @@ typedef long double Checksum_type; */ #if defined(RP_USE_DOUBLE) /// -typedef double Real_type; +using Real_type = double; #elif defined(RP_USE_FLOAT) /// -typedef float Real_type; +using Real_type = float; #else #error Real_type is undefined! 
#endif -typedef Real_type* Real_ptr; +using Real_ptr = Real_type*; typedef Real_type* RAJA_RESTRICT ResReal_ptr; #if defined(RP_USE_COMPLEX) /// -typedef std::complex Complex_type; +using Complex_type = std::complex; -typedef Complex_type* Complex_ptr; +using Complex_ptr = Complex_type*; typedef Complex_type* RAJA_RESTRICT ResComplex_ptr; #endif diff --git a/src/common/RunParams.cpp b/src/common/RunParams.cpp index b603ecb12..78c360bff 100644 --- a/src/common/RunParams.cpp +++ b/src/common/RunParams.cpp @@ -35,6 +35,7 @@ namespace rajaperf */ RunParams::RunParams(int argc, char** argv) : input_state(Undefined), + show_progress(false), npasses(1), rep_fact(1.0), size_fact(1.0), @@ -75,6 +76,7 @@ RunParams::~RunParams() */ void RunParams::print(std::ostream& str) const { + str << "\n show_progress = " << show_progress; str << "\n npasses = " << npasses; str << "\n rep_fact = " << rep_fact; str << "\n size_fact = " << size_fact; @@ -127,6 +129,11 @@ void RunParams::parseCommandLineOptions(int argc, char** argv) printHelpMessage(std::cout); input_state = InfoRequest; + } else if ( opt == std::string("--show-progress") || + opt == std::string("-sp") ) { + + show_progress = true; + } else if ( opt == std::string("--print-kernels") || opt == std::string("-pk") ) { @@ -307,11 +314,13 @@ void RunParams::printHelpMessage(std::ostream& str) const str << "\nUsage: ./raja-perf.exe [options]\n"; str << "Valid options are:\n"; - str << "\t --help, -h (prints options with descriptions}\n\n"; + str << "\t --help, -h (print options with descriptions}\n\n"; + + str << "\t --show-progress, -sp (print progress during run}\n\n"; - str << "\t --print-kernels, -pk (prints valid kernel names}\n\n"; + str << "\t --print-kernels, -pk (print valid kernel names}\n\n"; - str << "\t --print-variants, -pv (prints valid variant names}\n\n"; + str << "\t --print-variants, -pv (print valid variant names}\n\n"; str << "\t --npasses [default is 1]\n" << "\t (num passes through suite)\n"; @@ -383,7 
+392,10 @@ void RunParams::printKernelNames(std::ostream& str) const str << "\nAvailable kernels:"; str << "\n------------------\n"; for (int ik = 0; ik < NumKernels; ++ik) { - str << getKernelName(static_cast(ik)) << std::endl; +/// RDH DISABLE COUPLE KERNEL + if (static_cast(ik) != Apps_COUPLE) { + str << getKernelName(static_cast(ik)) << std::endl; + } } str.flush(); } @@ -394,7 +406,10 @@ void RunParams::printFullKernelNames(std::ostream& str) const str << "\nAvailable kernels (_):"; str << "\n-----------------------------------------\n"; for (int ik = 0; ik < NumKernels; ++ik) { - str << getFullKernelName(static_cast(ik)) << std::endl; +/// RDH DISABLE COUPLE KERNEL + if (static_cast(ik) != Apps_COUPLE) { + str << getFullKernelName(static_cast(ik)) << std::endl; + } } str.flush(); } diff --git a/src/common/RunParams.hpp b/src/common/RunParams.hpp index d637667c4..611f3ab97 100644 --- a/src/common/RunParams.hpp +++ b/src/common/RunParams.hpp @@ -67,7 +67,9 @@ class RunParams { //@{ -//! @name Getters/setters for processing input +//! 
@name Getters/setters for processing input and run parameters + + bool showProgress() const { return show_progress; } int getNumPasses() const { return npasses; } @@ -127,6 +129,8 @@ class RunParams { InputOpt input_state; /*!< state of command line input */ + bool show_progress; /*!< true -> show run progress; false -> do not */ + int npasses; /*!< Number of passes through suite */ double rep_fact; /*!< pct of default kernel reps to run */ double size_fact; /*!< pct of default kernel iteration space to run */ diff --git a/src/lcals/CMakeLists.txt b/src/lcals/CMakeLists.txt index 7da10d0ff..24865d806 100644 --- a/src/lcals/CMakeLists.txt +++ b/src/lcals/CMakeLists.txt @@ -16,10 +16,22 @@ blt_add_library( NAME lcals SOURCES HYDRO_1D.cpp + HYDRO_1D-Cuda.cpp + HYDRO_1D-OMPTarget.cpp EOS.cpp + EOS-Cuda.cpp + EOS-OMPTarget.cpp INT_PREDICT.cpp + INT_PREDICT-Cuda.cpp + INT_PREDICT-OMPTarget.cpp DIFF_PREDICT.cpp + DIFF_PREDICT-Cuda.cpp + DIFF_PREDICT-OMPTarget.cpp FIRST_DIFF.cpp + FIRST_DIFF-Cuda.cpp + FIRST_DIFF-OMPTarget.cpp PLANCKIAN.cpp + PLANCKIAN-Cuda.cpp + PLANCKIAN-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/lcals/DIFF_PREDICT-Cuda.cpp b/src/lcals/DIFF_PREDICT-Cuda.cpp new file mode 100644 index 000000000..3495b696f --- /dev/null +++ b/src/lcals/DIFF_PREDICT-Cuda.cpp @@ -0,0 +1,109 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFF_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define DIFF_PREDICT_DATA_SETUP_CUDA \ + Real_ptr px; \ + Real_ptr cx; \ + const Index_type offset = m_offset; \ +\ + allocAndInitCudaDeviceData(px, m_px, m_array_length); \ + allocAndInitCudaDeviceData(cx, m_cx, m_array_length); + +#define DIFF_PREDICT_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_px, px, m_array_length); \ + deallocCudaDeviceData(px); \ + deallocCudaDeviceData(cx); + +__global__ void diff_predict(Real_ptr px, Real_ptr cx, + const Index_type offset, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + DIFF_PREDICT_BODY; + } +} + + +void DIFF_PREDICT::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + DIFF_PREDICT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + diff_predict<<>>( px, cx, + offset, + iend ); + + } + stopTimer(); + + DIFF_PREDICT_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + DIFF_PREDICT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DIFF_PREDICT_BODY; + }); + + } + stopTimer(); + + DIFF_PREDICT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n DIFF_PREDICT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git 
a/src/lcals/DIFF_PREDICT-OMPTarget.cpp b/src/lcals/DIFF_PREDICT-OMPTarget.cpp new file mode 100644 index 000000000..6854403d1 --- /dev/null +++ b/src/lcals/DIFF_PREDICT-OMPTarget.cpp @@ -0,0 +1,102 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DIFF_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define DIFF_PREDICT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr px; \ + Real_ptr cx; \ + const Index_type offset = m_offset; \ +\ + allocAndInitOpenMPDeviceData(px, m_px, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(cx, m_cx, m_array_length, did, hid); + +#define DIFF_PREDICT_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_px, px, m_array_length, hid, did); \ + deallocOpenMPDeviceData(px, did); \ + deallocOpenMPDeviceData(cx, did); + + +void DIFF_PREDICT::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + DIFF_PREDICT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(px, cx) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for 
(Index_type i = ibegin; i < iend; ++i ) { + DIFF_PREDICT_BODY; + } + + } + stopTimer(); + + DIFF_PREDICT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + DIFF_PREDICT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DIFF_PREDICT_BODY; + }); + + } + stopTimer(); + + DIFF_PREDICT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n DIFF_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/lcals/DIFF_PREDICT.cpp b/src/lcals/DIFF_PREDICT.cpp index 3812c591c..fd3505e6c 100644 --- a/src/lcals/DIFF_PREDICT.cpp +++ b/src/lcals/DIFF_PREDICT.cpp @@ -13,13 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - #include "DIFF_PREDICT.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -27,68 +26,12 @@ namespace rajaperf namespace lcals { -#define DIFF_PREDICT_DATA \ + +#define DIFF_PREDICT_DATA_SETUP_CPU \ ResReal_ptr px = m_px; \ ResReal_ptr cx = m_cx; \ const Index_type offset = m_offset; -#define DIFF_PREDICT_BODY \ - Real_type ar, br, cr; \ -\ - ar = cx[i + offset * 4]; \ - br = ar - px[i + offset * 4]; \ - px[i + offset * 4] = ar; \ - cr = br - px[i + offset * 5]; \ - px[i + offset * 5] = br; \ - ar = cr - px[i + offset * 6]; \ - px[i + offset * 6] = cr; \ - br = ar - px[i + offset * 7]; \ - px[i + offset * 7] = ar; \ - cr = br - px[i + offset * 8]; \ - px[i + offset * 8] = br; \ - ar = cr - px[i + offset * 9]; \ - px[i + offset * 9] = cr; \ - br = ar - px[i + offset * 10]; \ - px[i + offset * 10] = ar; \ - cr = br - px[i + offset * 11]; \ - px[i + offset * 11] = br; \ - px[i + offset * 13] = cr - px[i + offset * 12]; \ - px[i + offset * 12] = cr; - - -#if 
defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define DIFF_PREDICT_DATA_SETUP_CUDA \ - Real_ptr px; \ - Real_ptr cx; \ - const Index_type offset = m_offset; \ -\ - allocAndInitCudaDeviceData(px, m_px, m_offset*14); \ - allocAndInitCudaDeviceData(cx, m_cx, m_offset*14); - -#define DIFF_PREDICT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_px, px, m_offset*14); \ - deallocCudaDeviceData(px); \ - deallocCudaDeviceData(cx); - -__global__ void diff_predict(Real_ptr px, Real_ptr cx, - const Index_type offset, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - DIFF_PREDICT_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - DIFF_PREDICT::DIFF_PREDICT(const RunParams& params) : KernelBase(rajaperf::Lcals_DIFF_PREDICT, params) @@ -103,10 +46,11 @@ DIFF_PREDICT::~DIFF_PREDICT() void DIFF_PREDICT::setUp(VariantID vid) { - allocAndInitData(m_px, getRunSize()*14, vid); - allocAndInitData(m_cx, getRunSize()*14, vid); - + m_array_length = getRunSize() * 14; m_offset = getRunSize(); + + allocAndInitDataConst(m_px, m_array_length, 0.0, vid); + allocAndInitData(m_cx, m_array_length, vid); } void DIFF_PREDICT::runKernel(VariantID vid) @@ -119,7 +63,7 @@ void DIFF_PREDICT::runKernel(VariantID vid) case Base_Seq : { - DIFF_PREDICT_DATA; + DIFF_PREDICT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -136,12 +80,13 @@ void DIFF_PREDICT::runKernel(VariantID vid) case RAJA_Seq : { - DIFF_PREDICT_DATA; + DIFF_PREDICT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { DIFF_PREDICT_BODY; }); @@ -154,7 +99,7 @@ void DIFF_PREDICT::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - DIFF_PREDICT_DATA; + DIFF_PREDICT_DATA_SETUP_CPU; startTimer(); for 
(RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -170,20 +115,15 @@ void DIFF_PREDICT::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... - break; - } - case RAJA_OpenMP : { - DIFF_PREDICT_DATA; + DIFF_PREDICT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { DIFF_PREDICT_BODY; }); @@ -194,59 +134,26 @@ void DIFF_PREDICT::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - DIFF_PREDICT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - diff_predict<<>>( px, cx, - offset, - iend ); - - } - stopTimer(); - - DIFF_PREDICT_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - DIFF_PREDICT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - DIFF_PREDICT_BODY; - }); - - } - stopTimer(); - - DIFF_PREDICT_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n DIFF_PREDICT : Unknown variant id = " << vid << std::endl; } } @@ -255,7 +162,7 @@ void DIFF_PREDICT::runKernel(VariantID vid) void DIFF_PREDICT::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_px, m_offset*14); + checksum[vid] += calcChecksum(m_px, m_array_length); } void DIFF_PREDICT::tearDown(VariantID vid) diff --git a/src/lcals/DIFF_PREDICT.hpp b/src/lcals/DIFF_PREDICT.hpp index aaa6fc45f..7bed07c01 100644 --- a/src/lcals/DIFF_PREDICT.hpp +++ b/src/lcals/DIFF_PREDICT.hpp @@ -13,10 +13,62 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// DIFF_PREDICT kernel reference implementation: +/// +/// Index_type offset = iend - ibegin; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// ar = cx[i + offset * 4]; +/// br = ar - px[i + offset * 4]; +/// px[i + offset * 4] = ar; +/// cr = br - px[i + offset * 5]; +/// px[i + offset * 5] = br; +/// ar = cr - px[i + offset * 6]; +/// px[i + offset * 6] = cr; +/// br = ar - px[i + offset * 7]; +/// px[i + offset * 7] = ar; +/// cr = br - px[i + offset * 8]; +/// px[i + offset * 8] = br; +/// ar = cr - px[i + offset * 9]; +/// px[i + offset * 9] = cr; +/// br = ar - px[i + offset * 10]; +/// px[i + offset * 10] = ar; +/// cr = br - px[i + offset * 11]; +/// px[i + offset * 11] = br; +/// px[i + offset * 13] = cr - px[i + offset * 12]; +/// px[i + offset * 12] = cr; +/// } +/// #ifndef RAJAPerf_Basic_DIFF_PREDICT_HPP #define RAJAPerf_Basic_DIFF_PREDICT_HPP + +#define DIFF_PREDICT_BODY \ + Real_type ar, br, cr; \ +\ + ar = cx[i + offset * 4]; \ + br = ar - px[i + offset * 4]; \ + px[i + offset * 4] = ar; \ + cr = br - px[i + offset * 5]; \ + px[i + offset * 5] = br; \ + ar = cr - px[i + offset * 6]; \ + px[i + offset * 6] = cr; \ + br = ar - px[i + offset 
* 7]; \ + px[i + offset * 7] = ar; \ + cr = br - px[i + offset * 8]; \ + px[i + offset * 8] = br; \ + ar = cr - px[i + offset * 9]; \ + px[i + offset * 9] = cr; \ + br = ar - px[i + offset * 10]; \ + px[i + offset * 10] = ar; \ + cr = br - px[i + offset * 11]; \ + px[i + offset * 11] = br; \ + px[i + offset * 13] = cr - px[i + offset * 12]; \ + px[i + offset * 12] = cr; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,10 +91,14 @@ class DIFF_PREDICT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_px; Real_ptr m_cx; + Index_type m_array_length; Index_type m_offset; }; diff --git a/src/lcals/EOS-Cuda.cpp b/src/lcals/EOS-Cuda.cpp new file mode 100644 index 000000000..6de036083 --- /dev/null +++ b/src/lcals/EOS-Cuda.cpp @@ -0,0 +1,117 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EOS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define EOS_DATA_SETUP_CUDA \ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ + Real_ptr u; \ + const Real_type q = m_q; \ + const Real_type r = m_r; \ + const Real_type t = m_t; \ +\ + allocAndInitCudaDeviceData(x, m_x, m_array_length); \ + allocAndInitCudaDeviceData(y, m_y, m_array_length); \ + allocAndInitCudaDeviceData(z, m_z, m_array_length); \ + allocAndInitCudaDeviceData(u, m_u, m_array_length); + +#define EOS_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x, x, m_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); \ + deallocCudaDeviceData(z); \ + deallocCudaDeviceData(u); + +__global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, + Real_type q, Real_type r, Real_type t, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + EOS_BODY; + } +} + + +void EOS::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + EOS_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + eos<<>>( x, y, z, u, + q, r, t, + iend ); + + } + stopTimer(); + + EOS_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + EOS_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + EOS_BODY; + }); + + } + stopTimer(); + + EOS_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n 
EOS : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/lcals/EOS-OMPTarget.cpp b/src/lcals/EOS-OMPTarget.cpp new file mode 100644 index 000000000..3ed0bd065 --- /dev/null +++ b/src/lcals/EOS-OMPTarget.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "EOS.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define EOS_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ + Real_ptr u; \ + const Real_type q = m_q; \ + const Real_type r = m_r; \ + const Real_type t = m_t; \ +\ + allocAndInitOpenMPDeviceData(x, m_x, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(z, m_z, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(u, m_u, m_array_length, did, hid); + +#define EOS_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_x, x, m_array_length, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); \ + deallocOpenMPDeviceData(z, did); \ + deallocOpenMPDeviceData(u, did); + + +void EOS::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + 
const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + EOS_DATA_SETUP_OMP_TARGET + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y, z, u) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + EOS_BODY; + } + + } + stopTimer(); + + EOS_DATA_TEARDOWN_OMP_TARGET + + } else if ( vid == RAJA_OpenMPTarget ) { + + EOS_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + EOS_BODY; + }); + + } + stopTimer(); + + EOS_DATA_TEARDOWN_OMP_TARGET + + } else { + std::cout << "\n EOS : Unknown OMP Tagretvariant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/lcals/EOS.cpp b/src/lcals/EOS.cpp index 2e2655e58..02fd19780 100644 --- a/src/lcals/EOS.cpp +++ b/src/lcals/EOS.cpp @@ -13,13 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - #include "EOS.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -27,7 +26,8 @@ namespace rajaperf namespace lcals { -#define EOS_DATA \ + +#define EOS_DATA_SETUP_CPU \ ResReal_ptr x = m_x; \ ResReal_ptr y = m_y; \ ResReal_ptr z = m_z; \ @@ -37,53 +37,6 @@ namespace lcals const Real_type r = m_r; \ const Real_type t = m_t; -#define EOS_BODY \ - x[i] = u[i] + r*( z[i] + r*y[i] ) + \ - t*( u[i+3] + r*( u[i+2] + r*u[i+1] ) + \ - t*( u[i+6] + q*( u[i+5] + q*u[i+4] ) ) ); - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define EOS_DATA_SETUP_CUDA \ - Real_ptr x; \ - Real_ptr y; \ - Real_ptr z; \ - Real_ptr u; \ - const 
Real_type q = m_q; \ - const Real_type r = m_r; \ - const Real_type t = m_t; \ -\ - allocAndInitCudaDeviceData(x, m_x, iend+7); \ - allocAndInitCudaDeviceData(y, m_y, iend+7); \ - allocAndInitCudaDeviceData(z, m_z, iend+7); \ - allocAndInitCudaDeviceData(u, m_u, iend+7); - -#define EOS_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); \ - deallocCudaDeviceData(u); - -__global__ void eos(Real_ptr x, Real_ptr y, Real_ptr z, Real_ptr u, - Real_type q, Real_type r, Real_type t, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - EOS_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - EOS::EOS(const RunParams& params) : KernelBase(rajaperf::Lcals_EOS, params) @@ -98,10 +51,12 @@ EOS::~EOS() void EOS::setUp(VariantID vid) { - allocAndInitData(m_x, getRunSize()+7, vid); - allocAndInitData(m_y, getRunSize()+7, vid); - allocAndInitData(m_z, getRunSize()+7, vid); - allocAndInitData(m_u, getRunSize()+7, vid); + m_array_length = getRunSize() + 7; + + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitData(m_y, m_array_length, vid); + allocAndInitData(m_z, m_array_length, vid); + allocAndInitData(m_u, m_array_length, vid); initData(m_q, vid); initData(m_r, vid); @@ -118,7 +73,7 @@ void EOS::runKernel(VariantID vid) case Base_Seq : { - EOS_DATA; + EOS_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -135,12 +90,13 @@ void EOS::runKernel(VariantID vid) case RAJA_Seq : { - EOS_DATA; + EOS_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { EOS_BODY; }); @@ -153,7 +109,7 @@ void EOS::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - EOS_DATA; + EOS_DATA_SETUP_CPU; startTimer(); for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { @@ -169,20 +125,15 @@ void EOS::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... - break; - } - case RAJA_OpenMP : { - EOS_DATA; + EOS_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { EOS_BODY; }); @@ -193,59 +144,26 @@ void EOS::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - EOS_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - eos<<>>( x, y, z, u, - q, r, t, - iend ); - - } - stopTimer(); - - EOS_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - EOS_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - EOS_BODY; - }); - - } - stopTimer(); - - EOS_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n EOS : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/EOS.hpp b/src/lcals/EOS.hpp index 3e8beca29..c68eb0475 100644 --- a/src/lcals/EOS.hpp +++ b/src/lcals/EOS.hpp @@ -13,10 +13,26 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// EOS kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// x[i] = u[i] + r*( z[i] + r*y[i] ) + +/// t*( u[i+3] + r*( u[i+2] + r*u[i+1] ) + +/// t*( u[i+6] + q*( u[i+5] + q*u[i+4] ) ) ); +/// } +/// #ifndef RAJAPerf_Basic_EOS_HPP #define RAJAPerf_Basic_EOS_HPP + +#define EOS_BODY \ + x[i] = u[i] + r*( z[i] + r*y[i] ) + \ + t*( u[i+3] + r*( u[i+2] + r*u[i+1] ) + \ + t*( u[i+6] + q*( u[i+5] + q*u[i+4] ) ) ); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +55,9 @@ class EOS : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; @@ -48,6 +67,8 @@ class EOS : public KernelBase Real_type m_q; Real_type m_r; Real_type m_t; + + Index_type m_array_length; }; } // end namespace lcals diff --git a/src/lcals/FIRST_DIFF-Cuda.cpp b/src/lcals/FIRST_DIFF-Cuda.cpp new file mode 100644 index 000000000..a9fc8c1be --- /dev/null +++ b/src/lcals/FIRST_DIFF-Cuda.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_DIFF.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define FIRST_DIFF_DATA_SETUP_CUDA \ + Real_ptr x; \ + Real_ptr y; \ +\ + allocAndInitCudaDeviceData(x, m_x, m_array_length); \ + allocAndInitCudaDeviceData(y, m_y, m_array_length); + +#define FIRST_DIFF_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x, x, m_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); + +__global__ void first_diff(Real_ptr x, Real_ptr y, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + FIRST_DIFF_BODY; + } +} + + +void FIRST_DIFF::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + FIRST_DIFF_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + first_diff<<>>( x, y, + iend ); + + } + stopTimer(); + + FIRST_DIFF_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + FIRST_DIFF_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + FIRST_DIFF_BODY; + }); + + } + stopTimer(); + + FIRST_DIFF_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n FIRST_DIFF : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/lcals/FIRST_DIFF-OMPTarget.cpp b/src/lcals/FIRST_DIFF-OMPTarget.cpp new file mode 100644 index 000000000..d95e78b88 
--- /dev/null +++ b/src/lcals/FIRST_DIFF-OMPTarget.cpp @@ -0,0 +1,102 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "FIRST_DIFF.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define FIRST_DIFF_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr x; \ + Real_ptr y; \ +\ + allocAndInitOpenMPDeviceData(x, m_x, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, m_array_length, did, hid); + +#define FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_x, x, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); + + +void FIRST_DIFF::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + FIRST_DIFF_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + + for (Index_type i = ibegin; i < iend; ++i ) { + FIRST_DIFF_BODY; + } + + } + stopTimer(); + + FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + 
FIRST_DIFF_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + FIRST_DIFF_BODY; + }); + + } + stopTimer(); + + FIRST_DIFF_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n FIRST_DIFF : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/lcals/FIRST_DIFF.cpp b/src/lcals/FIRST_DIFF.cpp index 6001b8480..48dc47f0d 100644 --- a/src/lcals/FIRST_DIFF.cpp +++ b/src/lcals/FIRST_DIFF.cpp @@ -13,13 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - #include "FIRST_DIFF.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -27,45 +26,11 @@ namespace rajaperf namespace lcals { -#define FIRST_DIFF_DATA \ + +#define FIRST_DIFF_DATA_SETUP_CPU \ ResReal_ptr x = m_x; \ ResReal_ptr y = m_y; -#define FIRST_DIFF_BODY \ - x[i] = y[i+1] - y[i]; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define FIRST_DIFF_DATA_SETUP_CUDA \ - Real_ptr x; \ - Real_ptr y; \ -\ - allocAndInitCudaDeviceData(x, m_x, iend+1); \ - allocAndInitCudaDeviceData(y, m_y, iend+1); - -#define FIRST_DIFF_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); - -__global__ void first_diff(Real_ptr x, Real_ptr y, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - FIRST_DIFF_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - FIRST_DIFF::FIRST_DIFF(const RunParams& params) : KernelBase(rajaperf::Lcals_FIRST_DIFF, params) @@ -80,8 +45,9 @@ FIRST_DIFF::~FIRST_DIFF() void FIRST_DIFF::setUp(VariantID vid) { - allocAndInitData(m_x, getRunSize()+1, vid); - 
allocAndInitData(m_y, getRunSize()+1, vid); + m_array_length = getRunSize()+1; + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitData(m_y, m_array_length, vid); } void FIRST_DIFF::runKernel(VariantID vid) @@ -94,7 +60,7 @@ void FIRST_DIFF::runKernel(VariantID vid) case Base_Seq : { - FIRST_DIFF_DATA; + FIRST_DIFF_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -111,12 +77,13 @@ void FIRST_DIFF::runKernel(VariantID vid) case RAJA_Seq : { - FIRST_DIFF_DATA; + FIRST_DIFF_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { FIRST_DIFF_BODY; }); @@ -129,7 +96,7 @@ void FIRST_DIFF::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - FIRST_DIFF_DATA; + FIRST_DIFF_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -145,20 +112,15 @@ void FIRST_DIFF::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - FIRST_DIFF_DATA; + FIRST_DIFF_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { FIRST_DIFF_BODY; }); @@ -169,58 +131,26 @@ void FIRST_DIFF::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - FIRST_DIFF_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - first_diff<<>>( x, y, - iend ); - - } - stopTimer(); - - FIRST_DIFF_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - FIRST_DIFF_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - FIRST_DIFF_BODY; - }); - - } - stopTimer(); - - FIRST_DIFF_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n FIRST_DIFF : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/FIRST_DIFF.hpp b/src/lcals/FIRST_DIFF.hpp index 3f1dbf9af..d7d235e60 100644 --- a/src/lcals/FIRST_DIFF.hpp +++ b/src/lcals/FIRST_DIFF.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// FIRST_DIFF kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// x[i] = y[i+1] - y[i]; +/// } +/// #ifndef RAJAPerf_Basic_FIRST_DIFF_HPP #define RAJAPerf_Basic_FIRST_DIFF_HPP + +#define FIRST_DIFF_BODY \ + x[i] = y[i+1] - y[i]; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,9 +51,14 @@ class FIRST_DIFF : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; + + Index_type m_array_length; }; } // end namespace lcals diff --git a/src/lcals/HYDRO_1D-Cuda.cpp b/src/lcals/HYDRO_1D-Cuda.cpp new file mode 100644 index 000000000..3be4a8fdd --- /dev/null +++ b/src/lcals/HYDRO_1D-Cuda.cpp @@ -0,0 +1,114 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define HYDRO_1D_DATA_SETUP_CUDA \ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ + const Real_type q = m_q; \ + const Real_type r = m_r; \ + const Real_type t = m_t; \ +\ + allocAndInitCudaDeviceData(x, m_x, m_array_length); \ + allocAndInitCudaDeviceData(y, m_y, m_array_length); \ + allocAndInitCudaDeviceData(z, m_z, m_array_length); + +#define HYDRO_1D_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_x, x, m_array_length); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); \ + deallocCudaDeviceData(z); \ + +__global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, + Real_type q, Real_type r, Real_type t, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + HYDRO_1D_BODY; + } +} + + +void HYDRO_1D::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + HYDRO_1D_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + hydro_1d<<>>( x, y, z, + q, r, t, + iend ); + + } + stopTimer(); + + HYDRO_1D_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + HYDRO_1D_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + HYDRO_1D_BODY; + }); + + } + stopTimer(); + + HYDRO_1D_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n HYDRO_1D : Unknown Cuda variant id = " << vid << 
std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/lcals/HYDRO_1D-OMPTarget.cpp b/src/lcals/HYDRO_1D-OMPTarget.cpp new file mode 100644 index 000000000..60a892509 --- /dev/null +++ b/src/lcals/HYDRO_1D-OMPTarget.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "HYDRO_1D.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define HYDRO_1D_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ + const Real_type q = m_q; \ + const Real_type r = m_r; \ + const Real_type t = m_t; \ +\ + allocAndInitOpenMPDeviceData(x, m_x, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, m_array_length, did, hid); \ + allocAndInitOpenMPDeviceData(z, m_z, m_array_length, did, hid); + +#define HYDRO_1D_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_x, x, m_array_length, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); \ + deallocOpenMPDeviceData(z, did); \ + + +void HYDRO_1D::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + 
HYDRO_1D_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y, z) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + HYDRO_1D_BODY; + } + + } + stopTimer(); + + HYDRO_1D_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + HYDRO_1D_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + HYDRO_1D_BODY; + }); + + } + stopTimer(); + + HYDRO_1D_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n HYDRO_1D : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/lcals/HYDRO_1D.cpp b/src/lcals/HYDRO_1D.cpp index f0e0e554e..3dbdc6235 100644 --- a/src/lcals/HYDRO_1D.cpp +++ b/src/lcals/HYDRO_1D.cpp @@ -13,13 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - #include "HYDRO_1D.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -27,7 +26,8 @@ namespace rajaperf namespace lcals { -#define HYDRO_1D_DATA \ + +#define HYDRO_1D_DATA_SETUP_CPU \ ResReal_ptr x = m_x; \ ResReal_ptr y = m_y; \ ResReal_ptr z = m_z; \ @@ -36,48 +36,6 @@ namespace lcals const Real_type r = m_r; \ const Real_type t = m_t; -#define HYDRO_1D_BODY \ - x[i] = q + y[i]*( r*z[i+10] + t*z[i+11] ); - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define HYDRO_1D_DATA_SETUP_CUDA \ - Real_ptr x; \ - Real_ptr y; \ - Real_ptr z; \ - const Real_type q = m_q; \ - const Real_type r = m_r; \ - const Real_type t = m_t; \ -\ - allocAndInitCudaDeviceData(x, m_x, iend+12); 
\ - allocAndInitCudaDeviceData(y, m_y, iend+12); \ - allocAndInitCudaDeviceData(z, m_z, iend+12); - -#define HYDRO_1D_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_x, x, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); \ - -__global__ void hydro_1d(Real_ptr x, Real_ptr y, Real_ptr z, - Real_type q, Real_type r, Real_type t, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - HYDRO_1D_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - HYDRO_1D::HYDRO_1D(const RunParams& params) : KernelBase(rajaperf::Lcals_HYDRO_1D, params) @@ -92,9 +50,11 @@ HYDRO_1D::~HYDRO_1D() void HYDRO_1D::setUp(VariantID vid) { - allocAndInitData(m_x, getRunSize()+12, vid); - allocAndInitData(m_y, getRunSize()+12, vid); - allocAndInitData(m_z, getRunSize()+12, vid); + m_array_length = getRunSize() + 12; + + allocAndInitDataConst(m_x, m_array_length, 0.0, vid); + allocAndInitData(m_y, m_array_length, vid); + allocAndInitData(m_z, m_array_length, vid); initData(m_q, vid); initData(m_r, vid); @@ -111,7 +71,7 @@ void HYDRO_1D::runKernel(VariantID vid) case Base_Seq : { - HYDRO_1D_DATA; + HYDRO_1D_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -128,12 +88,13 @@ void HYDRO_1D::runKernel(VariantID vid) case RAJA_Seq : { - HYDRO_1D_DATA; + HYDRO_1D_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { HYDRO_1D_BODY; }); @@ -146,7 +107,7 @@ void HYDRO_1D::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - HYDRO_1D_DATA; + HYDRO_1D_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -162,20 +123,16 @@ void HYDRO_1D::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - HYDRO_1D_DATA; - + HYDRO_1D_DATA_SETUP_CPU; + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { HYDRO_1D_BODY; }); @@ -186,59 +143,26 @@ void HYDRO_1D::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - HYDRO_1D_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - hydro_1d<<>>( x, y, z, - q, r, t, - iend ); - - } - stopTimer(); - - HYDRO_1D_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - HYDRO_1D_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - HYDRO_1D_BODY; - }); - - } - stopTimer(); - - HYDRO_1D_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n HYDRO_1D : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/HYDRO_1D.hpp b/src/lcals/HYDRO_1D.hpp index f3c201e55..aa48c0925 100644 --- a/src/lcals/HYDRO_1D.hpp +++ b/src/lcals/HYDRO_1D.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// HYDRO_1D kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// x[i] = q + y[i]*( r*z[i+10] + t*z[i+11] ); +/// } +/// #ifndef RAJAPerf_Basic_HYDRO_1D_HPP #define RAJAPerf_Basic_HYDRO_1D_HPP + +#define HYDRO_1D_BODY \ + x[i] = q + y[i]*( r*z[i+10] + t*z[i+11] ); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +51,9 @@ class HYDRO_1D : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; @@ -47,6 +62,8 @@ class HYDRO_1D : public KernelBase Real_type m_q; Real_type m_r; Real_type m_t; + + Index_type m_array_length; }; } // end namespace lcals diff --git a/src/lcals/INT_PREDICT-Cuda.cpp b/src/lcals/INT_PREDICT-Cuda.cpp new file mode 100644 index 000000000..8c80fd8b5 --- /dev/null +++ b/src/lcals/INT_PREDICT-Cuda.cpp @@ -0,0 +1,119 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INT_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define INT_PREDICT_DATA_SETUP_CUDA \ + Real_ptr px; \ + Real_type dm22 = m_dm22; \ + Real_type dm23 = m_dm23; \ + Real_type dm24 = m_dm24; \ + Real_type dm25 = m_dm25; \ + Real_type dm26 = m_dm26; \ + Real_type dm27 = m_dm27; \ + Real_type dm28 = m_dm28; \ + Real_type c0 = m_c0; \ + const Index_type offset = m_offset; \ +\ + allocAndInitCudaDeviceData(px, m_px, m_array_length); + +#define INT_PREDICT_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_px, px, m_array_length); \ + deallocCudaDeviceData(px); + +__global__ void int_predict(Real_ptr px, + Real_type dm22, Real_type dm23, Real_type dm24, + Real_type dm25, Real_type dm26, Real_type dm27, + Real_type dm28, Real_type c0, + const Index_type offset, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + INT_PREDICT_BODY; + } +} + + +void INT_PREDICT::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + INT_PREDICT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + int_predict<<>>( px, + dm22, dm23, dm24, dm25, + dm26, dm27, dm28, c0, + offset, + iend ); + + } + stopTimer(); + + INT_PREDICT_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + INT_PREDICT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + 
INT_PREDICT_BODY; + }); + + } + stopTimer(); + + INT_PREDICT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n INT_PREDICT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/lcals/INT_PREDICT-OMPTarget.cpp b/src/lcals/INT_PREDICT-OMPTarget.cpp new file mode 100644 index 000000000..ab64bb0a7 --- /dev/null +++ b/src/lcals/INT_PREDICT-OMPTarget.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "INT_PREDICT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace lcals +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define INT_PREDICT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr px; \ + Real_type dm22 = m_dm22; \ + Real_type dm23 = m_dm23; \ + Real_type dm24 = m_dm24; \ + Real_type dm25 = m_dm25; \ + Real_type dm26 = m_dm26; \ + Real_type dm27 = m_dm27; \ + Real_type dm28 = m_dm28; \ + Real_type c0 = m_c0; \ + const Index_type offset = m_offset; \ +\ + allocAndInitOpenMPDeviceData(px, m_px, m_array_length, did, hid); + +#define INT_PREDICT_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_px, px, m_array_length, hid, did); \ + deallocOpenMPDeviceData(px, did); + + +void INT_PREDICT::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + 
const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + INT_PREDICT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(px) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + INT_PREDICT_BODY; + } + + } + stopTimer(); + + INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + INT_PREDICT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + INT_PREDICT_BODY; + }); + + } + stopTimer(); + + INT_PREDICT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n INT_PREDICT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/lcals/INT_PREDICT.cpp b/src/lcals/INT_PREDICT.cpp index 4fe142574..3df42fa73 100644 --- a/src/lcals/INT_PREDICT.cpp +++ b/src/lcals/INT_PREDICT.cpp @@ -13,13 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - #include "INT_PREDICT.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -27,7 +26,8 @@ namespace rajaperf namespace lcals { -#define INT_PREDICT_DATA \ + +#define INT_PREDICT_DATA_SETUP_CPU \ ResReal_ptr px = m_px; \ Real_type dm22 = m_dm22; \ Real_type dm23 = m_dm23; \ @@ -39,56 +39,6 @@ namespace lcals Real_type c0 = m_c0; \ const Index_type offset = m_offset; -#define INT_PREDICT_BODY \ - px[i] = dm28*px[i + offset * 12] + dm27*px[i + offset * 11] + \ - dm26*px[i + offset * 10] + dm25*px[i + offset * 9] + \ - dm24*px[i + offset * 8] + dm23*px[i + offset * 7] + \ - dm22*px[i + offset * 6] + \ - c0*( px[i + offset * 
4] + px[i + offset * 5] ) + \ - px[i + offset * 2]; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define INT_PREDICT_DATA_SETUP_CUDA \ - Real_ptr px; \ - Real_type dm22 = m_dm22; \ - Real_type dm23 = m_dm23; \ - Real_type dm24 = m_dm24; \ - Real_type dm25 = m_dm25; \ - Real_type dm26 = m_dm26; \ - Real_type dm27 = m_dm27; \ - Real_type dm28 = m_dm28; \ - Real_type c0 = m_c0; \ - const Index_type offset = m_offset; \ -\ - allocAndInitCudaDeviceData(px, m_px, m_offset*13); - -#define INT_PREDICT_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_px, px, m_offset*13); \ - deallocCudaDeviceData(px); - -__global__ void int_predict(Real_ptr px, - Real_type dm22, Real_type dm23, Real_type dm24, - Real_type dm25, Real_type dm26, Real_type dm27, - Real_type dm28, Real_type c0, - const Index_type offset, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - INT_PREDICT_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - INT_PREDICT::INT_PREDICT(const RunParams& params) : KernelBase(rajaperf::Lcals_INT_PREDICT, params) @@ -103,7 +53,11 @@ INT_PREDICT::~INT_PREDICT() void INT_PREDICT::setUp(VariantID vid) { - allocAndInitData(m_px, getRunSize()*13, vid); + m_array_length = getRunSize() * 13; + m_offset = getRunSize(); + + m_px_initval = 1.0; + allocAndInitDataConst(m_px, m_array_length, m_px_initval, vid); initData(m_dm22); initData(m_dm23); @@ -113,8 +67,6 @@ void INT_PREDICT::setUp(VariantID vid) initData(m_dm27); initData(m_dm28); initData(m_c0); - - m_offset = getRunSize(); } void INT_PREDICT::runKernel(VariantID vid) @@ -127,7 +79,7 @@ void INT_PREDICT::runKernel(VariantID vid) case Base_Seq : { - INT_PREDICT_DATA; + INT_PREDICT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -144,12 +96,13 @@ void INT_PREDICT::runKernel(VariantID vid) case RAJA_Seq : { - INT_PREDICT_DATA; + INT_PREDICT_DATA_SETUP_CPU; 
startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { INT_PREDICT_BODY; }); @@ -162,7 +115,7 @@ void INT_PREDICT::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - INT_PREDICT_DATA; + INT_PREDICT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -178,20 +131,15 @@ void INT_PREDICT::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... - break; - } - case RAJA_OpenMP : { - INT_PREDICT_DATA; + INT_PREDICT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { INT_PREDICT_BODY; }); @@ -202,61 +150,26 @@ void INT_PREDICT::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - INT_PREDICT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - int_predict<<>>( px, - dm22, dm23, dm24, dm25, - dm26, dm27, dm28, c0, - offset, - iend ); - - } - stopTimer(); - - INT_PREDICT_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - INT_PREDICT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - INT_PREDICT_BODY; - }); - - } - stopTimer(); - - INT_PREDICT_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n INT_PREDICT : Unknown variant id = " << vid << std::endl; } } @@ -265,7 +178,11 @@ void INT_PREDICT::runKernel(VariantID vid) void INT_PREDICT::updateChecksum(VariantID vid) { - checksum[vid] += calcChecksum(m_px, m_offset*13); + for (Index_type i = 0; i < getRunSize(); ++i) { + m_px[i] -= m_px_initval; + } + + checksum[vid] += calcChecksum(m_px, getRunSize()); } void INT_PREDICT::tearDown(VariantID vid) diff --git a/src/lcals/INT_PREDICT.hpp b/src/lcals/INT_PREDICT.hpp index 82b66c6e3..83b3c12de 100644 --- a/src/lcals/INT_PREDICT.hpp +++ b/src/lcals/INT_PREDICT.hpp @@ -13,10 +13,34 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// INT_PREDICT kernel reference implementation: +/// +/// Index_type offset = iend - ibegin; +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// px[i] = dm28*px[i + offset * 12] + dm27*px[i + offset * 11] + +/// dm26*px[i + offset * 10] + dm25*px[i + offset * 9] + +/// dm24*px[i + offset * 8] + dm23*px[i + offset * 7] + +/// dm22*px[i + offset * 6] + +/// c0*( px[i + offset * 4] + px[i + offset * 5] ) + +/// px[i + offset * 2]; +/// } +/// #ifndef RAJAPerf_Basic_INT_PREDICT_HPP #define RAJAPerf_Basic_INT_PREDICT_HPP + +#define INT_PREDICT_BODY \ + px[i] = dm28*px[i + offset * 12] + dm27*px[i + offset * 11] + \ + dm26*px[i + offset * 10] + dm25*px[i + offset * 9] + \ + dm24*px[i + offset * 8] + dm23*px[i + offset * 7] + \ + dm22*px[i + offset * 6] + \ + c0*( px[i + offset * 4] + px[i + offset * 5] ) + \ + px[i + offset * 2]; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,8 +63,15 @@ class INT_PREDICT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + 
private: + Index_type m_array_length; + Index_type m_offset; + Real_ptr m_px; + Real_type m_px_initval; Real_type m_dm22; Real_type m_dm23; @@ -50,8 +81,6 @@ class INT_PREDICT : public KernelBase Real_type m_dm27; Real_type m_dm28; Real_type m_c0; - - Index_type m_offset; }; } // end namespace lcals diff --git a/src/lcals/PLANCKIAN-Cuda.cpp b/src/lcals/PLANCKIAN-Cuda.cpp new file mode 100644 index 000000000..c810fe383 --- /dev/null +++ b/src/lcals/PLANCKIAN-Cuda.cpp @@ -0,0 +1,118 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PLANCKIAN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace lcals +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define PLANCKIAN_DATA_SETUP_CUDA \ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr u; \ + Real_ptr v; \ + Real_ptr w; \ +\ + allocAndInitCudaDeviceData(x, m_x, iend); \ + allocAndInitCudaDeviceData(y, m_y, iend); \ + allocAndInitCudaDeviceData(u, m_u, iend); \ + allocAndInitCudaDeviceData(v, m_v, iend); \ + allocAndInitCudaDeviceData(w, m_w, iend); + +#define PLANCKIAN_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_w, w, iend); \ + deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); \ + deallocCudaDeviceData(u); \ + deallocCudaDeviceData(v); \ + deallocCudaDeviceData(w); + +__global__ void planckian(Real_ptr x, Real_ptr y, + Real_ptr u, Real_ptr v, Real_ptr w, + Index_type iend) +{ + Index_type i = blockIdx.x * 
blockDim.x + threadIdx.x; + if (i < iend) { + PLANCKIAN_BODY; + } +} + + +void PLANCKIAN::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + PLANCKIAN_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + planckian<<>>( x, y, + u, v, w, + iend ); + + } + stopTimer(); + + PLANCKIAN_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + PLANCKIAN_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + PLANCKIAN_BODY; + }); + + } + stopTimer(); + + PLANCKIAN_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n PLANCKIAN : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/lcals/PLANCKIAN-OMPTarget.cpp b/src/lcals/PLANCKIAN-OMPTarget.cpp new file mode 100644 index 000000000..4a855f34f --- /dev/null +++ b/src/lcals/PLANCKIAN-OMPTarget.cpp @@ -0,0 +1,111 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "PLANCKIAN.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include +#include + +namespace rajaperf +{ +namespace lcals +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define PLANCKIAN_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr u; \ + Real_ptr v; \ + Real_ptr w; \ +\ + allocAndInitOpenMPDeviceData(x, m_x, iend, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, iend, did, hid); \ + allocAndInitOpenMPDeviceData(u, m_u, iend, did, hid); \ + allocAndInitOpenMPDeviceData(v, m_v, iend, did, hid); \ + allocAndInitOpenMPDeviceData(w, m_w, iend, did, hid); + +#define PLANCKIAN_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_w, w, iend, hid, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); \ + deallocOpenMPDeviceData(u, did); \ + deallocOpenMPDeviceData(v, did); \ + deallocOpenMPDeviceData(w, did); + + +void PLANCKIAN::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + PLANCKIAN_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(x, y, u, v, w) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + PLANCKIAN_BODY; + } + + } + stopTimer(); + + PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + PLANCKIAN_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { 
+ PLANCKIAN_BODY; + }); + + } + stopTimer(); + + PLANCKIAN_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n PLANCKIAN : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace lcals +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/lcals/PLANCKIAN.cpp b/src/lcals/PLANCKIAN.cpp index 733883b36..7f87f2088 100644 --- a/src/lcals/PLANCKIAN.cpp +++ b/src/lcals/PLANCKIAN.cpp @@ -13,13 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - #include "PLANCKIAN.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include #include @@ -28,59 +27,14 @@ namespace rajaperf namespace lcals { -#define PLANCKIAN_DATA \ + +#define PLANCKIAN_DATA_SETUP_CPU \ ResReal_ptr x = m_x; \ ResReal_ptr y = m_y; \ ResReal_ptr u = m_u; \ ResReal_ptr v = m_v; \ ResReal_ptr w = m_w; -#define PLANCKIAN_BODY \ - y[i] = u[i] / v[i]; \ - w[i] = x[i] / ( exp( y[i] ) - 1.0 ); - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define PLANCKIAN_DATA_SETUP_CUDA \ - Real_ptr x; \ - Real_ptr y; \ - Real_ptr u; \ - Real_ptr v; \ - Real_ptr w; \ -\ - allocAndInitCudaDeviceData(x, m_x, iend); \ - allocAndInitCudaDeviceData(y, m_y, iend); \ - allocAndInitCudaDeviceData(u, m_u, iend); \ - allocAndInitCudaDeviceData(v, m_v, iend); \ - allocAndInitCudaDeviceData(w, m_w, iend); - -#define PLANCKIAN_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_w, w, iend); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(u); \ - deallocCudaDeviceData(v); \ - deallocCudaDeviceData(w); - -__global__ void planckian(Real_ptr x, Real_ptr y, - Real_ptr u, Real_ptr v, Real_ptr w, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - PLANCKIAN_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - 
PLANCKIAN::PLANCKIAN(const RunParams& params) : KernelBase(rajaperf::Lcals_PLANCKIAN, params) @@ -99,7 +53,7 @@ void PLANCKIAN::setUp(VariantID vid) allocAndInitData(m_y, getRunSize(), vid); allocAndInitData(m_u, getRunSize(), vid); allocAndInitData(m_v, getRunSize(), vid); - allocAndInitData(m_w, getRunSize(), vid); + allocAndInitDataConst(m_w, getRunSize(), 0.0, vid); } void PLANCKIAN::runKernel(VariantID vid) @@ -112,7 +66,7 @@ void PLANCKIAN::runKernel(VariantID vid) case Base_Seq : { - PLANCKIAN_DATA; + PLANCKIAN_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -129,12 +83,13 @@ void PLANCKIAN::runKernel(VariantID vid) case RAJA_Seq : { - PLANCKIAN_DATA; + PLANCKIAN_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PLANCKIAN_BODY; }); @@ -147,7 +102,7 @@ void PLANCKIAN::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - PLANCKIAN_DATA; + PLANCKIAN_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -163,20 +118,15 @@ void PLANCKIAN::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - PLANCKIAN_DATA; + PLANCKIAN_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { PLANCKIAN_BODY; }); @@ -187,59 +137,26 @@ void PLANCKIAN::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - PLANCKIAN_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - planckian<<>>( x, y, - u, v, w, - iend ); - - } - stopTimer(); - - PLANCKIAN_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - PLANCKIAN_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - PLANCKIAN_BODY; - }); - - } - stopTimer(); - - PLANCKIAN_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n PLANCKIAN : Unknown variant id = " << vid << std::endl; } } diff --git a/src/lcals/PLANCKIAN.hpp b/src/lcals/PLANCKIAN.hpp index 28bf2f949..eedb840b5 100644 --- a/src/lcals/PLANCKIAN.hpp +++ b/src/lcals/PLANCKIAN.hpp @@ -13,10 +13,24 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// PLANCKIAN kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// y[i] = u[i] / v[i]; +/// w[i] = x[i] / ( exp( y[i] ) - 1.0 ); +/// } +/// #ifndef RAJAPerf_Basic_PLANCKIAN_HPP #define RAJAPerf_Basic_PLANCKIAN_HPP + +#define PLANCKIAN_BODY \ + y[i] = u[i] / v[i]; \ + w[i] = x[i] / ( exp( y[i] ) - 1.0 ); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,14 +53,15 @@ class PLANCKIAN : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_x; Real_ptr m_y; Real_ptr m_u; Real_ptr m_v; Real_ptr m_w; - - Real_type m_expmax; }; } // end namespace lcals diff --git a/src/polybench/CMakeLists.txt b/src/polybench/CMakeLists.txt index 3ebfdbca5..90f82e99e 100644 --- a/src/polybench/CMakeLists.txt +++ b/src/polybench/CMakeLists.txt @@ -16,7 +16,13 @@ blt_add_library( NAME polybench SOURCES POLYBENCH_2MM.cpp + POLYBENCH_2MM-Cuda.cpp + POLYBENCH_2MM-OMPTarget.cpp POLYBENCH_3MM.cpp + POLYBENCH_3MM-Cuda.cpp + POLYBENCH_3MM-OMPTarget.cpp POLYBENCH_GEMMVER.cpp + POLYBENCH_GEMMVER-Cuda.cpp + POLYBENCH_GEMMVER-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/polybench/POLYBENCH_2MM-Cuda.cpp b/src/polybench/POLYBENCH_2MM-Cuda.cpp new file mode 100644 index 000000000..6177c454e --- /dev/null +++ b/src/polybench/POLYBENCH_2MM-Cuda.cpp @@ -0,0 +1,185 @@ 
+ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_2MM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +// +// Define thread block size for CUDA execution +// +const size_t block_size = 256; + +#define POLYBENCH_2MM_DATA_SETUP_CUDA \ + Real_ptr tmp = m_tmp; \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr C = m_C; \ + Real_ptr D = m_D; \ + Real_type alpha = m_alpha; \ + Real_type beta = m_beta; \ +\ + memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); \ + allocAndInitCudaDeviceData(tmp, m_tmp, m_ni * m_nj); \ + allocAndInitCudaDeviceData(A, m_A, m_ni * m_nk); \ + allocAndInitCudaDeviceData(B, m_B, m_nk * m_nj); \ + allocAndInitCudaDeviceData(C, m_C, m_nj * m_nl); \ + allocAndInitCudaDeviceData(D, m_D, m_ni * m_nl); + + +#define POLYBENCH_2MM_TEARDOWN_CUDA \ + getCudaDeviceData(m_D, D, m_ni * m_nl); \ + deallocCudaDeviceData(tmp); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(C); \ + deallocCudaDeviceData(D); + +__global__ void polybench_2mm_cuda_1(Real_ptr tmp, Real_ptr A, + Real_ptr B, Real_ptr C, Real_ptr D, + Real_type alpha, Real_type beta, Index_type ni, Index_type nj, + Index_type nk, Index_type nl) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,j,k; + if (ii < ni * nj) { + *(tmp + ii) = 0.0; + i = ii/nj; j = ii % nj; + for (k=0; k < nk; k++) { + POLYBENCH_2MM_BODY2; + } + } + + +} + +__global__ void 
polybench_2mm_cuda_2(Real_ptr tmp, Real_ptr A, + Real_ptr B, Real_ptr C, Real_ptr D, + Real_type alpha, Real_type beta, Index_type ni, Index_type nj, + Index_type nk, Index_type nl) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,l,j; + if (ii < ni * nl) { + *(D + ii) *= beta; + i = ii/nl; l = ii % nl; + for (j=0; j < nj; j++) { + POLYBENCH_2MM_BODY4; + } + } +} + + +void POLYBENCH_2MM::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ni = m_ni; + const Index_type nj = m_nj; + const Index_type nk = m_nk; + const Index_type nl = m_nl; + + + if ( vid == Base_CUDA ) { + + POLYBENCH_2MM_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nj, block_size); + polybench_2mm_cuda_1<<>>(tmp,A,B,C,D,alpha,beta, + m_ni,m_nj,m_nk,m_nl); + + memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); + initCudaDeviceData(D,m_D,m_ni * m_nl ); + + grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nl, block_size); + polybench_2mm_cuda_2<<>>(tmp,A,B,C,D,alpha,beta, + m_ni,m_nj,m_nk,m_nl); + + } + cudaDeviceSynchronize(); + stopTimer(); + + POLYBENCH_2MM_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + POLYBENCH_2MM_DATA_SETUP_CUDA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::CudaCollapse< + RAJA::nested::For<1, RAJA::cuda_block_y_exec>, + RAJA::nested::For<0, RAJA::cuda_thread_x_exec> > >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj)), + [=] __device__ (Index_type i, Index_type j) { + + POLYBENCH_2MM_BODY1; + for (Index_type k=0;k + +namespace rajaperf +{ +namespace polybench +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define POLYBENCH_2MM_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = 
omp_get_default_device(); \ +\ + Real_ptr tmp; \ + Real_ptr A; \ + Real_ptr B; \ + Real_ptr C; \ + Real_ptr D; \ + Real_type alpha = m_alpha; \ + Real_type beta = m_beta; \ +\ + memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); \ + allocAndInitOpenMPDeviceData(tmp, m_tmp, m_ni * m_nj, did, hid); \ + allocAndInitOpenMPDeviceData(A, m_A, m_ni * m_nk, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, m_nk * m_nj, did, hid); \ + allocAndInitOpenMPDeviceData(C, m_C, m_nj * m_nl, did, hid); \ + allocAndInitOpenMPDeviceData(D, m_D, m_ni * m_nl, did, hid); + + +#define POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_D, D, m_ni * m_nl, hid, did); \ + deallocOpenMPDeviceData(tmp, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ + deallocOpenMPDeviceData(C, did); \ + deallocOpenMPDeviceData(D, did); + + +void POLYBENCH_2MM::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ni = m_ni; + const Index_type nj = m_nj; + const Index_type nk = m_nk; + const Index_type nl = m_nl; + + if ( vid == Base_OpenMPTarget ) { + + POLYBENCH_2MM_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(tmp,A,B,C,D) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + for (Index_type i = 0; i < ni; i++ ) { + for(Index_type j = 0; j < nj; j++) { + POLYBENCH_2MM_BODY1; + for(Index_type k = 0; k < nk; k++) { + POLYBENCH_2MM_BODY2; + } + } + } + + memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); + //#pragma omp target update to(D[0: m_ni * m_nl]) + initOpenMPDeviceData(D,m_D,m_ni * m_nl, did, hid); + + #pragma omp target is_device_ptr(tmp,A,B,C,D) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + for(Index_type i = 0; i < ni; i++) { + for(Index_type l = 0; l < nl; l++) { + POLYBENCH_2MM_BODY3; + 
for(Index_type j = 0; j < nj; j++) { + POLYBENCH_2MM_BODY4; + } + } + } + + } // end run_reps + stopTimer(); + POLYBENCH_2MM_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + POLYBENCH_2MM_DATA_SETUP_OMP_TARGET; + + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(0,ni * nj), [=](Index_type ii) { + Index_type i,j,k; + *(tmp + ii) = 0.0; + i = ii/nj; j = ii % nj; + for(k=0;k>( + RAJA::RangeSegment(0,ni * nl), [=](Index_type ii) { + *(D + ii) *= beta; + Index_type i,l,j; + i = ii/nl; l = ii % nl; + for(j=0;j #include #include + namespace rajaperf { namespace polybench { -#define POLYBENCH_2MM_DATA \ +#define POLYBENCH_2MM_DATA_SETUP_CPU \ ResReal_ptr tmp = m_tmp; \ ResReal_ptr A = m_A; \ ResReal_ptr B = m_B; \ @@ -60,105 +62,21 @@ namespace polybench Real_type alpha = m_alpha; \ Real_type beta = m_beta; - -#define POLYBENCH_2MM_BODY1 \ - *(tmp + i * nj + j) = 0.0; - -#define POLYBENCH_2MM_BODY2 \ - *(tmp + i * nj + j) += alpha * *(A + i * nk + k) * *(B + k * nj + j); - -#define POLYBENCH_2MM_BODY3 \ - *(D + i * nl + l) *= beta; - -#define POLYBENCH_2MM_BODY4 \ - *(D + i * nl + l) += *(tmp + i * nj + j) * *(C + j * nl + l); - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define POLYBENCH_2MM_DATA_SETUP_CUDA \ - Real_ptr tmp = m_tmp; \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ - Real_ptr C = m_C; \ - Real_ptr D = m_D; \ - Real_type alpha = m_alpha; \ - Real_type beta = m_beta; \ -\ - memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); \ - allocAndInitCudaDeviceData(tmp, m_tmp, m_ni * m_nj); \ - allocAndInitCudaDeviceData(A, m_A, m_ni * m_nk); \ - allocAndInitCudaDeviceData(B, m_B, m_nk * m_nj); \ - allocAndInitCudaDeviceData(C, m_C, m_nj * m_nl); \ - allocAndInitCudaDeviceData(D, m_D, m_ni * m_nl); - - -#define POLYBENCH_2MM_TEARDOWN_CUDA \ - getCudaDeviceData(m_D, D, m_ni * m_nl); \ - 
deallocCudaDeviceData(tmp); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(C); \ - deallocCudaDeviceData(D); - -__global__ void polybench_2mm_cuda_1(Real_ptr tmp, Real_ptr A, - Real_ptr B, Real_ptr C, Real_ptr D, - Real_type alpha, Real_type beta, Index_type ni, Index_type nj, - Index_type nk, Index_type nl) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,j,k; - if (ii < ni * nj) { - *(tmp + ii) = 0.0; - i = ii/nj; j = ii % nj; - for(k=0; k < nk; k++) { - POLYBENCH_2MM_BODY2; - } - } - - -} - -__global__ void polybench_2mm_cuda_2(Real_ptr tmp, Real_ptr A, - Real_ptr B, Real_ptr C, Real_ptr D, - Real_type alpha, Real_type beta, Index_type ni, Index_type nj, - Index_type nk, Index_type nl) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,l,j; - if (ii < ni * nl) { - *(D + ii) *= beta; - i = ii/nl; l = ii % nl; - for(j=0; j < nj; j++) { - POLYBENCH_2MM_BODY4; - } - } -} - - -#endif // if defined(RAJA_ENABLE_CUDA) POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) : KernelBase(rajaperf::Polybench_2MM, params) { - setDefaultReps(1); m_alpha = 1.5; m_beta = 1.2; SizeSpec_T lsizespec = KernelBase::getSizeSpec(); switch(lsizespec) { case Mini: m_ni=16; m_nj=18; m_nk=22; m_nl=24; - m_run_reps = 100000; + m_run_reps = 10000; break; case Small: m_ni=40; m_nj=50; m_nk=70; m_nl=80; - m_run_reps = 10000; + m_run_reps = 1000; break; case Medium: m_ni=180; m_nj=190; m_nk=210; m_nl=220; @@ -178,29 +96,24 @@ POLYBENCH_2MM::POLYBENCH_2MM(const RunParams& params) break; } + setDefaultSize( m_ni*m_nj*(1+m_nk) + m_ni*m_nl*(1+m_nj) ); setDefaultReps(m_run_reps); - allocAndInitData(m_tmp, m_ni * m_nj); - allocAndInitData(m_A, m_ni * m_nk); - allocAndInitData(m_B, m_nk * m_nj); - allocAndInitData(m_C, m_nj * m_nl); - allocAndInitData(m_D, m_ni * m_nl); - allocAndInitData(m_DD, m_ni * m_nl); - } POLYBENCH_2MM::~POLYBENCH_2MM() { - deallocData(m_tmp); - deallocData(m_A); - deallocData(m_B); - 
deallocData(m_C); - deallocData(m_D); - deallocData(m_DD); + } void POLYBENCH_2MM::setUp(VariantID vid) { (void) vid; + allocAndInitData(m_tmp, m_ni * m_nj, vid); + allocAndInitData(m_A, m_ni * m_nk, vid); + allocAndInitData(m_B, m_nk * m_nj, vid); + allocAndInitData(m_C, m_nj * m_nl, vid); + allocAndInitDataConst(m_D, m_ni * m_nl, 0.0, vid); + allocAndInitData(m_DD, m_ni * m_nl, vid); } void POLYBENCH_2MM::runKernel(VariantID vid) @@ -215,22 +128,31 @@ void POLYBENCH_2MM::runKernel(VariantID vid) case Base_Seq : { - POLYBENCH_2MM_DATA; + POLYBENCH_2MM_DATA_SETUP_CPU; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < ni; i++ ) - for(Index_type j = 0; j < nj; j++) { + + for (Index_type i = 0; i < ni; i++ ) { + for (Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; - for(Index_type k = 0; k < nk; k++) + for (Index_type k = 0; k < nk; k++) { POLYBENCH_2MM_BODY2; + } } + } + memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); - for(Index_type i = 0; i < ni; i++) - for(Index_type l = 0; l < nl; l++) { + + for (Index_type i = 0; i < ni; i++) { + for (Index_type l = 0; l < nl; l++) { POLYBENCH_2MM_BODY3; - for(Index_type j = 0; j < nj; j++) + for (Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY4; + } } + } + } stopTimer(); @@ -239,27 +161,39 @@ void POLYBENCH_2MM::runKernel(VariantID vid) case RAJA_Seq : { - POLYBENCH_2MM_DATA; - resetTimer(); + POLYBENCH_2MM_DATA_SETUP_CPU; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, [=] (int i, int j) { - POLYBENCH_2MM_BODY1; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj)), + [=](Index_type i, Index_type j) { + POLYBENCH_2MM_BODY1; - 
RAJA::forall (RAJA::RangeSegment{0, nk}, [=] (int k) { - POLYBENCH_2MM_BODY2; - }); + RAJA::forall ( + RAJA::RangeSegment{0, nk}, [=] (int k) { + POLYBENCH_2MM_BODY2; + }); }); memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, [=] (int i, int l) { - POLYBENCH_2MM_BODY3; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nl)), + [=](Index_type i, Index_type l) { + POLYBENCH_2MM_BODY3; - RAJA::forall (RAJA::RangeSegment{0, nj}, [=] (int j) { - POLYBENCH_2MM_BODY4; - }); + RAJA::forall ( + RAJA::RangeSegment{0, nj}, [=] (int j) { + POLYBENCH_2MM_BODY4; + }); }); } @@ -271,28 +205,32 @@ void POLYBENCH_2MM::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - POLYBENCH_2MM_DATA; + POLYBENCH_2MM_DATA_SETUP_CPU; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - #pragma omp parallel for - for (Index_type i = 0; i < ni; i++ ) + + #pragma omp parallel for collapse(2) + for (Index_type i = 0; i < ni; i++ ) { for(Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY1; - for(Index_type k = 0; k < nk; k++) { - + for (Index_type k = 0; k < nk; k++) { POLYBENCH_2MM_BODY2; } } - + } memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); - #pragma omp parallel for - for(Index_type i = 0; i < ni; i++) + + #pragma omp parallel for collapse(2) + for(Index_type i = 0; i < ni; i++) { for(Index_type l = 0; l < nl; l++) { POLYBENCH_2MM_BODY3; - for(Index_type j = 0; j < nj; j++) + for (Index_type j = 0; j < nj; j++) { POLYBENCH_2MM_BODY4; - } + } + } + } } stopTimer(); @@ -300,106 +238,70 @@ void POLYBENCH_2MM::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - POLYBENCH_2MM_DATA; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, [=] (int i, int j) { - POLYBENCH_2MM_BODY1; - - RAJA::forall (RAJA::RangeSegment{0, nk}, [=] (int k) { - POLYBENCH_2MM_BODY2; - }); - }); - memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, [=] (int i, int l) { - POLYBENCH_2MM_BODY3; - - RAJA::forall (RAJA::RangeSegment{0, nj}, [=] (int j) { - POLYBENCH_2MM_BODY4; - }); - }); + POLYBENCH_2MM_DATA_SETUP_CPU; + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::omp_parallel_for_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; - } - stopTimer(); - - break; - } -#endif - -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - POLYBENCH_2MM_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nj, block_size); - polybench_2mm_cuda_1<<>>(tmp,A,B,C,D,alpha,beta,m_ni,m_nj,m_nk,m_nl); - memcpy(m_D,m_DD,m_ni * m_nl * sizeof(Real_type)); - initCudaDeviceData(D,m_D,m_ni * m_nl ); - - grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nl, block_size); - polybench_2mm_cuda_2<<>>(tmp,A,B,C,D,alpha,beta,m_ni,m_nj,m_nk,m_nl); - } - cudaDeviceSynchronize(); - stopTimer(); - POLYBENCH_2MM_TEARDOWN_CUDA; - break; - } - - case RAJA_CUDA : { + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj)), + [=](Index_type i, Index_type j) { + POLYBENCH_2MM_BODY1; - POLYBENCH_2MM_DATA_SETUP_CUDA; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall> (RAJA::RangeSegment{0, ni * nj}, [=] __device__ (int ii) { - Index_type i,j,k; - *(tmp + ii) = 0.0; - i = ii/nj; j = ii % nj; - for(k=0;k ( + RAJA::RangeSegment{0, nk}, [=] (int k) { + POLYBENCH_2MM_BODY2; + }); }); memcpy(m_D,m_DD,m_ni * 
m_nl * sizeof(Real_type)); - initCudaDeviceData(D,m_D,m_ni * m_nl ); - - RAJA::forall> (RAJA::RangeSegment{0, ni * nl}, [=] __device__ (int ii) { - *(D + ii) *= beta; - Index_type i,l,j; - i = ii/nl; l = ii % nl; - for(j=0;j ( + RAJA::RangeSegment{0, nj}, [=] (int j) { + POLYBENCH_2MM_BODY4; + }); }); } stopTimer(); - POLYBENCH_2MM_TEARDOWN_CUDA; + break; } +#endif //RAJA_ENABLE_OPENMP +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n POLYBENCH_2MM : Unknown variant id = " << vid << std::endl; } } @@ -414,8 +316,13 @@ void POLYBENCH_2MM::updateChecksum(VariantID vid) void POLYBENCH_2MM::tearDown(VariantID vid) { (void) vid; - + deallocData(m_tmp); + deallocData(m_A); + deallocData(m_B); + deallocData(m_C); + deallocData(m_D); + deallocData(m_DD); } -} // end namespace basic +} // end namespace polybench } // end namespace rajaperf diff --git a/src/polybench/POLYBENCH_2MM.hpp b/src/polybench/POLYBENCH_2MM.hpp index 241d0ebed..e6a98feeb 100644 --- a/src/polybench/POLYBENCH_2MM.hpp +++ b/src/polybench/POLYBENCH_2MM.hpp @@ -12,11 +12,46 @@ // For details about use and distribution, please read raja-perfsuite/LICENSE. 
// //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// POLYBENCH_2MM kernel reference implementation: +/// +/// D := alpha*A*B*C + beta*D +/// +/// for (Index_type i = 0; i < m_ni; i++) { +/// for (Index_type j = 0; j < m_nj; j++) { +/// m_tmp[i][j] = 0.0; +/// for (Index_type k = 0; k < m_nk; ++k) { +/// m_tmp[i][j] += m_alpha * m_A[i][k] * m_B[k][j]; +/// } +/// } +/// } +/// for (Index_type i = 0; i < m_ni; i++) { +/// for (Index_type j = 0; j < m_nl; j++) { +/// m_D[i][j] *= m_beta; +/// for (Index_type k = 0; k < m_nj; ++k) { +/// m_D[i][j] += m_tmp[i][k] * m_C[k][j]; +/// } +/// } +/// } +/// + #ifndef RAJAPerf_POLYBENCH_2MM_HXX #define RAJAPerf_POLYBENCH_2MM_HXX +#define POLYBENCH_2MM_BODY1 \ + *(tmp + i * nj + j) = 0.0; + +#define POLYBENCH_2MM_BODY2 \ + *(tmp + i * nj + j) += alpha * *(A + i * nk + k) * *(B + k * nj + j); + +#define POLYBENCH_2MM_BODY3 \ + *(D + i * nl + l) *= beta; + +#define POLYBENCH_2MM_BODY4 \ + *(D + i * nl + l) += *(tmp + i * nj + j) * *(C + j * nl + l); + #include "common/KernelBase.hpp" namespace rajaperf @@ -40,6 +75,8 @@ class POLYBENCH_2MM : public KernelBase void runKernel(VariantID vid); void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); private: Index_type m_ni; diff --git a/src/polybench/POLYBENCH_3MM-Cuda.cpp b/src/polybench/POLYBENCH_3MM-Cuda.cpp new file mode 100644 index 000000000..9eac7a79f --- /dev/null +++ b/src/polybench/POLYBENCH_3MM-Cuda.cpp @@ -0,0 +1,213 @@ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_3MM.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +// +// Define thread block size for CUDA execution +// +const size_t block_size = 256; + +#define POLYBENCH_3MM_DATA_SETUP_CUDA \ + Real_ptr A = m_A; \ + Real_ptr B = m_B; \ + Real_ptr C = m_C; \ + Real_ptr D = m_D; \ + Real_ptr E = m_E; \ + Real_ptr F = m_F; \ + Real_ptr G = m_G; \ +\ + allocAndInitCudaDeviceData(A, m_A, m_ni * m_nk); \ + allocAndInitCudaDeviceData(B, m_B, m_nk * m_nj); \ + allocAndInitCudaDeviceData(C, m_C, m_nj * m_nm); \ + allocAndInitCudaDeviceData(D, m_D, m_nm * m_nl); \ + allocAndInitCudaDeviceData(E, m_E, m_ni * m_nj); \ + allocAndInitCudaDeviceData(F, m_F, m_nj * m_nl); \ + allocAndInitCudaDeviceData(G, m_G, m_ni * m_nl); + + +#define POLYBENCH_3MM_TEARDOWN_CUDA \ + getCudaDeviceData(m_G, G, m_ni * m_nl); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(B); \ + deallocCudaDeviceData(C); \ + deallocCudaDeviceData(D); \ + deallocCudaDeviceData(E); \ + deallocCudaDeviceData(F); \ + deallocCudaDeviceData(G); + +__global__ void polybench_3mm_cuda_1(Real_ptr A, + Real_ptr B, Real_ptr C, Real_ptr D, + Real_ptr E, Real_ptr F, Real_ptr G, + Index_type ni, Index_type nj, + Index_type nk, Index_type nl, Index_type nm) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,j,k; + if (ii < ni * nj) { + *(E + ii) = 0.0; + i = ii/nj; j = ii % nj; + for (k=0; k < nk; k++) { + POLYBENCH_3MM_BODY2; + } + } +} + +__global__ void polybench_3mm_cuda_2(Real_ptr A, + Real_ptr B, Real_ptr C, Real_ptr D, + Real_ptr E, Real_ptr F, Real_ptr G, + Index_type ni, Index_type nj, + Index_type nk, Index_type nl, Index_type nm) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type j,l,m; + if (ii < nj * nl) { + *(F + ii) = 0.0; + j = ii/nl; l = ii % 
nl; + for (m=0; m < nm; m++) { + POLYBENCH_3MM_BODY4; + } + } +} + + +__global__ void polybench_3mm_cuda_3(Real_ptr A, + Real_ptr B, Real_ptr C, Real_ptr D, + Real_ptr E, Real_ptr F, Real_ptr G, + Index_type ni, Index_type nj, + Index_type nk, Index_type nl, Index_type nm) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,l,j; + if (ii < ni * nl) { + *(G + ii) = 0.0; + i = ii/nl; l = ii % nl; + for (j=0; j < nj; j++) { + POLYBENCH_3MM_BODY6; + } + } +} + +void POLYBENCH_3MM::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ni = m_ni; + const Index_type nj = m_nj; + const Index_type nk = m_nk; + const Index_type nl = m_nl; + const Index_type nm = m_nm; + + + if ( vid == Base_CUDA ) { + + POLYBENCH_3MM_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nj, block_size); + polybench_3mm_cuda_1<<>>(A,B,C,D,E,F,G, + m_ni,m_nj,m_nk,m_nl,m_nm); + + grid_size = RAJA_DIVIDE_CEILING_INT(m_nj * m_nl, block_size); + polybench_3mm_cuda_2<<>>(A,B,C,D,E,F,G, + m_ni,m_nj,m_nk,m_nl,m_nm); + + grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nl, block_size); + polybench_3mm_cuda_3<<>>(A,B,C,D,E,F,G, + m_ni,m_nj,m_nk,m_nl,m_nm); + } + cudaDeviceSynchronize(); + stopTimer(); + + + POLYBENCH_3MM_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + POLYBENCH_3MM_DATA_SETUP_CUDA; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::CudaCollapse< + RAJA::nested::For<1, RAJA::cuda_block_y_exec>, + RAJA::nested::For<0, RAJA::cuda_thread_x_exec> > >; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj)), + [=] __device__ (Index_type i, Index_type j) { + + POLYBENCH_3MM_BODY1; + for (Index_type k=0;k + +namespace rajaperf +{ +namespace polybench +{ + +// +// Define thread block size for target 
execution +// +#define NUMTEAMS 128 + +#define POLYBENCH_3MM_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr A; \ + Real_ptr B; \ + Real_ptr C; \ + Real_ptr D; \ + Real_ptr E; \ + Real_ptr F; \ + Real_ptr G; \ +\ + allocAndInitOpenMPDeviceData(A, m_A, m_ni * m_nk, did, hid); \ + allocAndInitOpenMPDeviceData(B, m_B, m_nk * m_nj, did, hid); \ + allocAndInitOpenMPDeviceData(C, m_C, m_nj * m_nm, did, hid); \ + allocAndInitOpenMPDeviceData(D, m_D, m_nm * m_nl, did, hid); \ + allocAndInitOpenMPDeviceData(E, m_E, m_ni * m_nj, did, hid); \ + allocAndInitOpenMPDeviceData(F, m_F, m_nj * m_nl, did, hid); \ + allocAndInitOpenMPDeviceData(G, m_G, m_ni * m_nl, did, hid); + + +#define POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_G, G, m_ni * m_nl, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(B, did); \ + deallocOpenMPDeviceData(C, did); \ + deallocOpenMPDeviceData(D, did); \ + deallocOpenMPDeviceData(E, did); \ + deallocOpenMPDeviceData(F, did); \ + deallocOpenMPDeviceData(G, did); + +void POLYBENCH_3MM::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ni = m_ni; + const Index_type nj = m_nj; + const Index_type nk = m_nk; + const Index_type nl = m_nl; + const Index_type nm = m_nm; + + if ( vid == Base_OpenMPTarget ) { + + POLYBENCH_3MM_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(A,B,C,D,E,F,G) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + for (Index_type i = 0; i < ni; i++ ) { + for(Index_type j = 0; j < nj; j++) { + POLYBENCH_3MM_BODY1; + for(Index_type k = 0; k < nk; k++) { + POLYBENCH_3MM_BODY2; + } + } + } + + #pragma omp target is_device_ptr(A,B,C,D,E,F,G) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) 
schedule(static, 1) collapse(2) + for(Index_type j = 0; j < nj; j++) { + for(Index_type l = 0; l < nl; l++) { + POLYBENCH_3MM_BODY3; + for(Index_type m = 0; m < nm; m++) { + POLYBENCH_3MM_BODY4; + } + } + } + + #pragma omp target is_device_ptr(A,B,C,D,E,F,G) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + for(Index_type i = 0; i < ni; i++) { + for(Index_type l = 0; l < nl; l++) { + POLYBENCH_3MM_BODY5; + for(Index_type j = 0; j < nj; j++) { + POLYBENCH_3MM_BODY6; + } + } + } + + } // end run_reps + stopTimer(); + POLYBENCH_3MM_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + POLYBENCH_3MM_DATA_SETUP_OMP_TARGET; + + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(0,ni * nj), [=](Index_type ii) { + Index_type i,j,k; + *(E + ii) = 0.0; + i = ii/nj; j = ii % nj; + for(k=0;k>( + RAJA::RangeSegment(0,nj * nl), [=](Index_type ii) { + Index_type j,l,m; + *(F + ii) = 0.0; + j = ii/nl; l = ii % nl; + for(m=0;m>( + RAJA::RangeSegment(0,ni * nl), [=](Index_type ii) { + Index_type i,l,j; + *(G + ii) = 0.0; + i = ii/nl; l = ii % nl; + for(j=0;j - #include +#include namespace rajaperf { namespace polybench { -#define POLYBENCH_3MM_DATA \ +#define POLYBENCH_3MM_DATA_SETUP_CPU \ ResReal_ptr A = m_A; \ ResReal_ptr B = m_B; \ ResReal_ptr C = m_C; \ @@ -68,122 +68,10 @@ namespace polybench ResReal_ptr F = m_F; \ ResReal_ptr G = m_G; - - -#define POLYBENCH_3MM_BODY1 \ - *(E + i * nj + j) = 0.0; - -#define POLYBENCH_3MM_BODY2 \ - *(E + i * nj + j) += *(A + i * nk + k) * *(B + k * nj + j); - -#define POLYBENCH_3MM_BODY3 \ - *(F + j * nl + l) = 0.0; - -#define POLYBENCH_3MM_BODY4 \ - *(F + j * nl + l) += *(C + j * nm + m) * *(D + m * nl + l); - -#define POLYBENCH_3MM_BODY5 \ - *(G + i * nl + l) = 0.0; - -#define POLYBENCH_3MM_BODY6 \ - *(G + i * nl + l) += *(E + i * nj + j) * *(F + j * nl + l); - - -#if defined(RAJA_ENABLE_CUDA) - - // 
- // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define POLYBENCH_3MM_DATA_SETUP_CUDA \ - Real_ptr A = m_A; \ - Real_ptr B = m_B; \ - Real_ptr C = m_C; \ - Real_ptr D = m_D; \ - Real_ptr E = m_E; \ - Real_ptr F = m_F; \ - Real_ptr G = m_G; \ -\ - allocAndInitCudaDeviceData(A, m_A, m_ni * m_nk); \ - allocAndInitCudaDeviceData(B, m_B, m_nk * m_nj); \ - allocAndInitCudaDeviceData(C, m_C, m_nj * m_nm); \ - allocAndInitCudaDeviceData(D, m_D, m_nm * m_nl); \ - allocAndInitCudaDeviceData(E, m_E, m_ni * m_nj); \ - allocAndInitCudaDeviceData(F, m_F, m_nj * m_nl); \ - allocAndInitCudaDeviceData(G, m_G, m_ni * m_nl); - - -#define POLYBENCH_3MM_TEARDOWN_CUDA \ - getCudaDeviceData(m_G, G, m_ni * m_nl); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(B); \ - deallocCudaDeviceData(C); \ - deallocCudaDeviceData(D); \ - deallocCudaDeviceData(E); \ - deallocCudaDeviceData(F); \ - deallocCudaDeviceData(G); - -__global__ void polybench_3mm_cuda_1(Real_ptr A, - Real_ptr B, Real_ptr C, Real_ptr D, - Real_ptr E, Real_ptr F, Real_ptr G, - Index_type ni, Index_type nj, - Index_type nk, Index_type nl, Index_type nm) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,j,k; - if (ii < ni * nj) { - *(E + ii) = 0.0; - i = ii/nj; j = ii % nj; - for(k=0; k < nk; k++) { - POLYBENCH_3MM_BODY2; - } - } -} - -__global__ void polybench_3mm_cuda_2(Real_ptr A, - Real_ptr B, Real_ptr C, Real_ptr D, - Real_ptr E, Real_ptr F, Real_ptr G, - Index_type ni, Index_type nj, - Index_type nk, Index_type nl, Index_type nm) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type j,l,m; - if (ii < nj * nl) { - *(F + ii) = 0.0; - j = ii/nl; l = ii % nl; - for(m=0; m < nm; m++) { - POLYBENCH_3MM_BODY4; - } - } -} - - -__global__ void polybench_3mm_cuda_3(Real_ptr A, - Real_ptr B, Real_ptr C, Real_ptr D, - Real_ptr E, Real_ptr F, Real_ptr G, - Index_type ni, Index_type nj, - Index_type nk, Index_type nl, Index_type nm) -{ - 
Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,l,j; - if (ii < ni * nl) { - *(G + ii) = 0.0; - i = ii/nl; l = ii % nl; - for(j=0; j < nj; j++) { - POLYBENCH_3MM_BODY6; - } - } -} - - -#endif // if defined(RAJA_RAJA_ENABLE_CUDA) POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) : KernelBase(rajaperf::Polybench_3MM, params) { - setDefaultReps(1); SizeSpec_T lsizespec = KernelBase::getSizeSpec(); switch(lsizespec) { case Mini: @@ -211,30 +99,27 @@ POLYBENCH_3MM::POLYBENCH_3MM(const RunParams& params) m_run_reps = 100; break; } + + setDefaultSize(m_ni*m_nj*(1+m_nk) + m_nj*m_nl*(1+m_nm) + m_ni*m_nl*(1+m_nj)); setDefaultReps(m_run_reps); - allocAndInitData(m_A, m_ni * m_nk); - allocAndInitData(m_B, m_nk * m_nj); - allocAndInitData(m_C, m_nj * m_nm); - allocAndInitData(m_D, m_nm * m_nl); - allocAndInitData(m_E, m_ni * m_nj); - allocAndInitData(m_F, m_nj * m_nl); - allocAndInitData(m_G, m_ni * m_nl); + + } POLYBENCH_3MM::~POLYBENCH_3MM() { - deallocData(m_A); - deallocData(m_B); - deallocData(m_C); - deallocData(m_D); - deallocData(m_E); - deallocData(m_F); - deallocData(m_G); } void POLYBENCH_3MM::setUp(VariantID vid) { (void) vid; + allocAndInitData(m_A, m_ni * m_nk, vid); + allocAndInitData(m_B, m_nk * m_nj, vid); + allocAndInitData(m_C, m_nj * m_nm, vid); + allocAndInitData(m_D, m_nm * m_nl, vid); + allocAndInitData(m_E, m_ni * m_nj, vid); + allocAndInitData(m_F, m_nj * m_nl, vid); + allocAndInitDataConst(m_G, m_ni * m_nl, 0.0, vid); } void POLYBENCH_3MM::runKernel(VariantID vid) @@ -250,29 +135,38 @@ void POLYBENCH_3MM::runKernel(VariantID vid) case Base_Seq : { - POLYBENCH_3MM_DATA; + POLYBENCH_3MM_DATA_SETUP_CPU; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < ni; i++ ) - for(Index_type j = 0; j < nj; j++) { + + for (Index_type i = 0; i < ni; i++ ) { + for (Index_type j = 0; j < nj; j++) { POLYBENCH_3MM_BODY1; - for(Index_type k = 0; k < nk; k++) + for (Index_type k = 0; k < nk; k++) { 
POLYBENCH_3MM_BODY2; + } } + } - for(Index_type j = 0; j < nj; j++) - for(Index_type l = 0; l < nl; l++) { + for (Index_type j = 0; j < nj; j++) { + for (Index_type l = 0; l < nl; l++) { POLYBENCH_3MM_BODY3; - for(Index_type m = 0; m < nm; m++) + for (Index_type m = 0; m < nm; m++) { POLYBENCH_3MM_BODY4; + } } + } - for(Index_type i = 0; i < ni; i++) - for(Index_type l = 0; l < nl; l++) { + for (Index_type i = 0; i < ni; i++) { + for (Index_type l = 0; l < nl; l++) { POLYBENCH_3MM_BODY5; - for(Index_type j = 0; j < nj; j++) + for (Index_type j = 0; j < nj; j++) { POLYBENCH_3MM_BODY6; + } } + } + } stopTimer(); @@ -280,68 +174,95 @@ void POLYBENCH_3MM::runKernel(VariantID vid) } case RAJA_Seq : { - POLYBENCH_3MM_DATA; + + POLYBENCH_3MM_DATA_SETUP_CPU; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, [=] (int i, int j) { - POLYBENCH_3MM_BODY1; - RAJA::forall (RAJA::RangeSegment{0, nk}, [=] (int k) { - POLYBENCH_3MM_BODY2; - }); + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nj)), + [=](Index_type i, Index_type j) { + POLYBENCH_3MM_BODY1; + + RAJA::forall ( + RAJA::RangeSegment{0, nk}, [=] (int k) { + POLYBENCH_3MM_BODY2; + }); }); - RAJA::forallN>> (RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nl}, [=] (int j, int l) { - POLYBENCH_3MM_BODY3; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, nj), + RAJA::RangeSegment(0, nl)), + [=](Index_type j, Index_type l) { + POLYBENCH_3MM_BODY3; - RAJA::forall (RAJA::RangeSegment{0, nm}, [=] (int m) { - POLYBENCH_3MM_BODY4; - }); + RAJA::forall ( + RAJA::RangeSegment{0, nm}, [=] (int m) { + POLYBENCH_3MM_BODY4; + }); }); - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, [=] (int i, int l) { - 
POLYBENCH_3MM_BODY5; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, ni), + RAJA::RangeSegment(0, nl)), + [=](Index_type i, Index_type l) { + POLYBENCH_3MM_BODY5; - RAJA::forall (RAJA::RangeSegment{0, nj}, [=] (int j) { - POLYBENCH_3MM_BODY6; - }); + RAJA::forall ( + RAJA::RangeSegment{0, nj}, [=] (int j) { + POLYBENCH_3MM_BODY6; + }); }); - } + } // end run_reps stopTimer(); + break; } #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - POLYBENCH_3MM_DATA; + POLYBENCH_3MM_DATA_SETUP_CPU; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for - for (Index_type i = 0; i < ni; i++ ) - for(Index_type j = 0; j < nj; j++) { + for (Index_type i = 0; i < ni; i++ ) { + for (Index_type j = 0; j < nj; j++) { POLYBENCH_3MM_BODY1; - for(Index_type k = 0; k < nk; k++) { + for (Index_type k = 0; k < nk; k++) { POLYBENCH_3MM_BODY2; } } + } #pragma omp parallel for - for(Index_type j = 0; j < nj; j++) - for(Index_type l = 0; l < nl; l++) { + for (Index_type j = 0; j < nj; j++) { + for (Index_type l = 0; l < nl; l++) { POLYBENCH_3MM_BODY3; - for(Index_type m = 0; m < nm; m++) + for (Index_type m = 0; m < nm; m++) { POLYBENCH_3MM_BODY4; + } } + } #pragma omp parallel for - for(Index_type i = 0; i < ni; i++) - for(Index_type l = 0; l < nl; l++) { + for (Index_type i = 0; i < ni; i++) { + for (Index_type l = 0; l < nl; l++) { POLYBENCH_3MM_BODY5; - for(Index_type j = 0; j < nj; j++) + for (Index_type j = 0; j < nj; j++) { POLYBENCH_3MM_BODY6; - } + } + } + } } stopTimer(); @@ -349,117 +270,88 @@ void POLYBENCH_3MM::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - POLYBENCH_3MM_DATA; + POLYBENCH_3MM_DATA_SETUP_CPU; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::omp_parallel_for_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nj}, [=] (int i, int j) { + + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, + RAJA::RangeSegment{0, nj}), + [=] (int i, int j) { + POLYBENCH_3MM_BODY1; - RAJA::forall (RAJA::RangeSegment{0, nk}, [=] (int k) { + RAJA::forall ( + RAJA::RangeSegment{0, nk}, [=] (int k) { POLYBENCH_3MM_BODY2; }); + }); - RAJA::forallN>> (RAJA::RangeSegment{0, nj}, RAJA::RangeSegment{0, nl}, [=] (int j, int l) { + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple( RAJA::RangeSegment{0, nj}, + RAJA::RangeSegment{0, nl}), + [=] (int j, int l) { + POLYBENCH_3MM_BODY3; - RAJA::forall (RAJA::RangeSegment{0, nm}, [=] (int m) { + RAJA::forall ( + RAJA::RangeSegment{0, nm}, [=] (int m) { POLYBENCH_3MM_BODY4; }); - }); - - RAJA::forallN>> (RAJA::RangeSegment{0, ni}, RAJA::RangeSegment{0, nl}, [=] (int i, int l) { - POLYBENCH_3MM_BODY5; - RAJA::forall (RAJA::RangeSegment{0, nj}, [=] (int j) { - POLYBENCH_3MM_BODY6; - }); }); - } - stopTimer(); - - break; - } -#endif + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple( RAJA::RangeSegment{0, ni}, + RAJA::RangeSegment{0, nl}), + [=] (int i, int l) { -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - POLYBENCH_3MM_DATA_SETUP_CUDA; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nj, block_size); - polybench_3mm_cuda_1<<>>(A,B,C,D,E,F,G,m_ni,m_nj,m_nk,m_nl,m_nm); - - grid_size = RAJA_DIVIDE_CEILING_INT(m_nj * m_nl, block_size); - polybench_3mm_cuda_2<<>>(A,B,C,D,E,F,G,m_ni,m_nj,m_nk,m_nl,m_nm); - - grid_size = RAJA_DIVIDE_CEILING_INT(m_ni * m_nl, block_size); - 
polybench_3mm_cuda_3<<>>(A,B,C,D,E,F,G,m_ni,m_nj,m_nk,m_nl,m_nm); - } - cudaDeviceSynchronize(); - stopTimer(); - POLYBENCH_3MM_TEARDOWN_CUDA; - break; - } + POLYBENCH_3MM_BODY5; - case RAJA_CUDA : { - POLYBENCH_3MM_DATA_SETUP_CUDA; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall> (RAJA::RangeSegment{0, ni * nj}, [=] __device__ (int ii) { - Index_type i,j,k; - *(E + ii) = 0.0; - i = ii/nj; j = ii % nj; - for(k=0;k ( + RAJA::RangeSegment{0, nj}, [=] (int j) { + POLYBENCH_3MM_BODY6; + }); - RAJA::forall> (RAJA::RangeSegment{0, nj * nl}, [=] __device__ (int ii) { - *(F + ii) = 0.0; - Index_type j,l,m; - j = ii/nl; l = ii % nl; - for(m=0;m> (RAJA::RangeSegment{0, ni * nl}, [=] __device__ (int ii) { - *(G + ii) = 0.0; - Index_type i,l,j; - i = ii/nl; l = ii % nl; - for(j=0;j + +namespace rajaperf +{ +namespace polybench +{ + +// +// Define thread block size for CUDA execution +// +const size_t block_size = 256; + +#define POLYBENCH_GEMMVER_DATA_SETUP_CUDA \ + Index_type n = m_n; \ + Real_type alpha = m_alpha; \ + Real_type beta = m_beta; \ + Real_ptr A = m_A; \ + Real_ptr u1 = m_u1; \ + Real_ptr v1 = m_v1; \ + Real_ptr u2 = m_u2; \ + Real_ptr v2 = m_v2; \ + Real_ptr w = m_w; \ + Real_ptr x = m_x; \ + Real_ptr y = m_y; \ + Real_ptr z = m_z; \ +\ + allocAndInitCudaDeviceData(A, m_A, m_n * m_n); \ + allocAndInitCudaDeviceData(u1, m_u1, m_n); \ + allocAndInitCudaDeviceData(v1, m_v1, m_n); \ + allocAndInitCudaDeviceData(u2, m_u2, m_n); \ + allocAndInitCudaDeviceData(v2, m_v2, m_n); \ + allocAndInitCudaDeviceData(w, m_w, m_n); \ + allocAndInitCudaDeviceData(x, m_x, m_n); \ + allocAndInitCudaDeviceData(y, m_y, m_n); \ + allocAndInitCudaDeviceData(z, m_z, m_n); + + +#define POLYBENCH_GEMMVER_TEARDOWN_CUDA \ + getCudaDeviceData(m_w, w, m_n); \ + deallocCudaDeviceData(A); \ + deallocCudaDeviceData(u1); \ + deallocCudaDeviceData(v1); \ + deallocCudaDeviceData(u2); \ + deallocCudaDeviceData(v2); \ + deallocCudaDeviceData(w); \ + 
deallocCudaDeviceData(x); \ + deallocCudaDeviceData(y); \ + deallocCudaDeviceData(z); + +__global__ void polybench_gemmver_cuda_1(Real_ptr A, + Real_ptr u1, Real_ptr v1, Real_ptr u2, + Real_ptr v2, Index_type n) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,j; + if (ii < n * n) { + i = ii/n; j = ii % n; + POLYBENCH_GEMMVER_BODY1; + } +} + +__global__ void polybench_gemmver_cuda_2(Real_type beta, + Real_ptr A, Real_ptr x, Real_ptr y, + Index_type n) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,jj; + if (ii < n * n) { + i = ii/n; jj = ii % n; + if(jj == 0) { + for(Index_type j=0; j < n; ++j) { + POLYBENCH_GEMMVER_BODY2; + } + } + + } +} + + +__global__ void polybench_gemmver_cuda_3(Real_ptr x, + Real_ptr z, Real_ptr v1, Real_ptr u2, + Real_ptr v2, Index_type n) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + POLYBENCH_GEMMVER_BODY3; + } +} + +__global__ void polybench_gemmver_cuda_4(Real_type alpha, + Real_ptr A, Real_ptr x, Real_ptr w, + Index_type n) +{ + Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; + Index_type i,jj; + if (ii < n * n) { + i = ii/n; jj = ii % n; + if(jj == 0) { + for(Index_type j=0; j < n; ++j) { + POLYBENCH_GEMMVER_BODY4; + } + } + } +} + + + +void POLYBENCH_GEMMVER::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + + if ( vid == Base_CUDA ) { + + POLYBENCH_GEMMVER_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n * m_n, block_size); + polybench_gemmver_cuda_1<<>>(A,u1,v1,u2,v2,n); + + grid_size = RAJA_DIVIDE_CEILING_INT(m_n * m_n, block_size); + polybench_gemmver_cuda_2<<>>(beta,A,x,y,n); + + grid_size = RAJA_DIVIDE_CEILING_INT(m_n , block_size); + polybench_gemmver_cuda_3<<>>(x,z,v1,u2,v2,n); + + grid_size = RAJA_DIVIDE_CEILING_INT(m_n * m_n, block_size); + polybench_gemmver_cuda_4<<>>(alpha,A,x,w,n); + + } + 
cudaDeviceSynchronize(); + stopTimer(); + + POLYBENCH_GEMMVER_TEARDOWN_CUDA; + + } else if (vid == RAJA_CUDA) { + + POLYBENCH_GEMMVER_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall> ( + RAJA::RangeSegment{0, n * n}, [=] __device__ (int ii) { + Index_type i,j; + i = ii/n; j = ii % n; + POLYBENCH_GEMMVER_BODY1; + }); + + RAJA::forall> ( + RAJA::RangeSegment{0, n * n}, [=] __device__ (int ii) { + Index_type i,jj; + i = ii/n; jj = ii % n; + if(jj == 0) { + for(Index_type j=0; j < n; ++j) { + POLYBENCH_GEMMVER_BODY2; + } + } + }); + + RAJA::forall> ( + RAJA::RangeSegment{0, n}, [=] __device__ (int i) { + POLYBENCH_GEMMVER_BODY3; + }); + + RAJA::forall> ( + RAJA::RangeSegment{0, n * n}, [=] __device__ (int ii) { + Index_type i,jj; + i = ii/n; jj = ii % n; + if(jj == 0) { + for(Index_type j=0; j < n; ++j) { + POLYBENCH_GEMMVER_BODY4; + } + } + }); + + } + stopTimer(); + + POLYBENCH_GEMMVER_TEARDOWN_CUDA; + } else { + std::cout << "\n POLYBENCH_GEMMVER : Unknown Cuda variant id = " << vid << std::endl; + } + +} + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA + diff --git a/src/polybench/POLYBENCH_GEMMVER-OMPTarget.cpp b/src/polybench/POLYBENCH_GEMMVER-OMPTarget.cpp new file mode 100644 index 000000000..19a1a420e --- /dev/null +++ b/src/polybench/POLYBENCH_GEMMVER-OMPTarget.cpp @@ -0,0 +1,188 @@ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "POLYBENCH_GEMMVER.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace polybench +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define POLYBENCH_GEMMVER_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Index_type n = m_n; \ + Real_type alpha = m_alpha; \ + Real_type beta = m_beta; \ + Real_ptr A; \ + Real_ptr u1; \ + Real_ptr v1; \ + Real_ptr u2; \ + Real_ptr v2; \ + Real_ptr w; \ + Real_ptr x; \ + Real_ptr y; \ + Real_ptr z; \ +\ + allocAndInitOpenMPDeviceData(A, m_A, m_n * m_n, did, hid); \ + allocAndInitOpenMPDeviceData(u1, m_u1, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(v1, m_v1, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(u2, m_u2, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(v2, m_v2, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(w, m_w, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(x, m_x, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(y, m_y, m_n, did, hid); \ + allocAndInitOpenMPDeviceData(z, m_z, m_n, did, hid); + +#define POLYBENCH_GEMMVER_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_w, w, m_n, hid, did); \ + deallocOpenMPDeviceData(A, did); \ + deallocOpenMPDeviceData(u1, did); \ + deallocOpenMPDeviceData(v1, did); \ + deallocOpenMPDeviceData(u2, did); \ + deallocOpenMPDeviceData(v2, did); \ + deallocOpenMPDeviceData(w, did); \ + deallocOpenMPDeviceData(x, did); \ + deallocOpenMPDeviceData(y, did); \ + deallocOpenMPDeviceData(z, did); + + + +void POLYBENCH_GEMMVER::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type n = m_n; + + if ( vid == Base_OpenMPTarget ) { + + POLYBENCH_GEMMVER_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < 
run_reps; ++irep) { + + #pragma omp target is_device_ptr(A,u1,v1,u2,v2,w,x,y,z) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + + for (Index_type i = 0; i < n; i++ ) { + for(Index_type j = 0; j < n; j++) { + POLYBENCH_GEMMVER_BODY1; + } + } + + #pragma omp target is_device_ptr(A,u1,v1,u2,v2,w,x,y,z) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + for (Index_type i = 0; i < n; i++ ) { + for(Index_type jj = 0; jj < n; jj++) { + if(jj == 0) { + for(Index_type j=0; j < n; ++j) { + POLYBENCH_GEMMVER_BODY2; + } + } + } + } + + #pragma omp target is_device_ptr(A,u1,v1,u2,v2,w,x,y,z) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = 0; i < n; i++ ) { + POLYBENCH_GEMMVER_BODY3; + } + + #pragma omp target is_device_ptr(A,u1,v1,u2,v2,w,x,y,z) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) collapse(2) + for (Index_type i = 0; i < n; i++ ) { + for(Index_type jj = 0; jj < n; jj++) { + if(jj == 0) { + for(Index_type j=0; j < n; ++j) { + POLYBENCH_GEMMVER_BODY4; + } + } + } + } + + } // end run_reps + stopTimer(); + POLYBENCH_GEMMVER_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + POLYBENCH_GEMMVER_DATA_SETUP_OMP_TARGET; + + startTimer(); + + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(0,n * n), [=](Index_type ii) { + Index_type i,j; + i = ii/n; j = ii % n; + POLYBENCH_GEMMVER_BODY1; + }); + + RAJA::forall>( + RAJA::RangeSegment(0,n * n), [=](Index_type ii) { + Index_type i,jj; + i = ii/n; jj = ii % n; + if(jj == 0) { + for(Index_type j=0; j < n; j++) { + POLYBENCH_GEMMVER_BODY2; + } + } + }); + + RAJA::forall>( + RAJA::RangeSegment(0,n), [=](Index_type i) { + POLYBENCH_GEMMVER_BODY3; + }); + + RAJA::forall>( + RAJA::RangeSegment(0,n * n), [=](Index_type ii) 
{ + Index_type i,jj; + i = ii/n; jj = ii % n; + if(jj == 0) { + for(Index_type j=0; j < n; j++) { + POLYBENCH_GEMMVER_BODY4; + } + } + }); + + } // for run_reps + stopTimer(); + POLYBENCH_GEMMVER_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n POLYBENCH_GEMMVER : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace polybench +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/polybench/POLYBENCH_GEMMVER.cpp b/src/polybench/POLYBENCH_GEMMVER.cpp index 5d966d1df..c6e1a67ed 100644 --- a/src/polybench/POLYBENCH_GEMMVER.cpp +++ b/src/polybench/POLYBENCH_GEMMVER.cpp @@ -25,7 +25,7 @@ /// for (Index_type i = 0; i < _PB_N; i++) { /// for (Index_type j = 0; j < _PB_N; j++) { /// x[i] = x[i] + beta * A[j][i] * y[j]; -/// } +/// } /// } /// /// for (Index_type i = 0; i < _PB_N; i++) { @@ -42,18 +42,19 @@ #include "POLYBENCH_GEMMVER.hpp" +#include "RAJA/RAJA.hpp" +#include "RAJA/util/defines.hpp" #include "common/DataUtils.hpp" -#include - #include +#include namespace rajaperf { namespace polybench { -#define POLYBENCH_GEMMVER_DATA \ +#define POLYBENCH_GEMMVER_DATA_SETUP_CPU \ Real_type alpha = m_alpha; \ Real_type beta = m_beta; \ ResReal_ptr A = m_A; \ @@ -65,178 +66,65 @@ namespace polybench ResReal_ptr x = m_x; \ ResReal_ptr y = m_y; \ ResReal_ptr z = m_z; - - -#define POLYBENCH_GEMMVER_BODY1 \ - *(A + i * n + j) = *(A + i * n + j) + *(u1 + i) * *(v1 + j) + *(u2 + i) * *(v2 + j) - -#define POLYBENCH_GEMMVER_BODY2 \ - *(x + i) = *(x + i) + beta * *(A + j * n + i) * *(y + j); - -#define POLYBENCH_GEMMVER_BODY3 \ - *(x + i) = *(x + i) + *(z + i); - -#define POLYBENCH_GEMMVER_BODY4 \ - *(w + i) = *(w + i) + alpha * *(A + i * n + j) * *(x + j); - - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define POLYBENCH_GEMMVER_DATA_SETUP_CUDA \ - Real_type alpha = m_alpha; \ - Real_type beta = m_beta; \ - Real_ptr A = m_A; \ - 
Real_ptr u1 = m_u1; \ - Real_ptr v1 = m_v1; \ - Real_ptr u2 = m_u2; \ - Real_ptr v2 = m_v2; \ - Real_ptr w = m_w; \ - Real_ptr x = m_x; \ - Real_ptr y = m_y; \ - Real_ptr z = m_z; \ -\ - allocAndInitCudaDeviceData(A, m_A, m_n * m_n); \ - allocAndInitCudaDeviceData(u1, m_u1, m_n); \ - allocAndInitCudaDeviceData(v1, m_v1, m_n); \ - allocAndInitCudaDeviceData(u2, m_u2, m_n); \ - allocAndInitCudaDeviceData(v2, m_v2, m_n); \ - allocAndInitCudaDeviceData(w, m_w, m_n); \ - allocAndInitCudaDeviceData(x, m_x, m_n); \ - allocAndInitCudaDeviceData(y, m_y, m_n); \ - allocAndInitCudaDeviceData(z, m_z, m_n); - - -#define POLYBENCH_GEMMVER_TEARDOWN_CUDA \ - getCudaDeviceData(m_w, w, m_n); \ - deallocCudaDeviceData(A); \ - deallocCudaDeviceData(u1); \ - deallocCudaDeviceData(v1); \ - deallocCudaDeviceData(u2); \ - deallocCudaDeviceData(v2); \ - deallocCudaDeviceData(w); \ - deallocCudaDeviceData(x); \ - deallocCudaDeviceData(y); \ - deallocCudaDeviceData(z); - -__global__ void polybench_gemmver_cuda_1(Real_ptr A, - Real_ptr u1, Real_ptr v1, Real_ptr u2, - Real_ptr v2, Index_type n) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,j; - if (ii < n * n) { - i = ii/n; j = ii % n; - POLYBENCH_GEMMVER_BODY1; - } -} - -__global__ void polybench_gemmver_cuda_2(Real_type beta, - Real_ptr A, Real_ptr x, Real_ptr y, - Index_type n) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,j; - if (ii < n * n) { - i = ii/n; j = ii % n; - POLYBENCH_GEMMVER_BODY2; - } -} - - -__global__ void polybench_gemmver_cuda_3(Real_ptr x, - Real_ptr z, Real_ptr v1, Real_ptr u2, - Real_ptr v2, Index_type n) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { - POLYBENCH_GEMMVER_BODY3; - } -} - -__global__ void polybench_gemmver_cuda_4(Real_type alpha, - Real_ptr A, Real_ptr x, Real_ptr w, - Index_type n) -{ - Index_type ii = blockIdx.x * blockDim.x + threadIdx.x; - Index_type i,j; - if (ii < n * n) { - i = ii/n; j = ii % n; - 
POLYBENCH_GEMMVER_BODY4; - } -} - - -#endif // if defined(RAJA_ENABLE_CUDA) POLYBENCH_GEMMVER::POLYBENCH_GEMMVER(const RunParams& params) : KernelBase(rajaperf::Polybench_GEMMVER, params) { - //setDefaultReps(2000); - SizeSpec_T lsizespec = KernelBase::getSizeSpec(); + int run_reps = 0; switch(lsizespec) { case Mini: m_n=40; - m_run_reps = 200000; + run_reps = 200; break; case Small: m_n=120; - m_run_reps = 20000; + run_reps = 200; break; case Medium: m_n=400; - m_run_reps = 2000; + run_reps = 20; break; case Large: m_n=2000; - m_run_reps = 20; + run_reps = 20; break; case Extralarge: m_n=4000; - m_run_reps = 5; + run_reps = 5; break; default: m_n=400; - m_run_reps = 2000; + run_reps = 20; break; } - setDefaultReps(m_run_reps); - allocAndInitData(m_A, m_n * m_n); - allocAndInitData(m_u1, m_n); - allocAndInitData(m_v1, m_n); - allocAndInitData(m_u2, m_n); - allocAndInitData(m_v2, m_n); - allocAndInitData(m_w, m_n); - allocAndInitData(m_x, m_n); - allocAndInitData(m_y, m_n); - allocAndInitData(m_z, m_n); + setDefaultSize(m_n*m_n + m_n*m_n + m_n + m_n*m_n); + setDefaultReps(run_reps); + + m_alpha = 1.5; + m_beta = 1.2; } POLYBENCH_GEMMVER::~POLYBENCH_GEMMVER() { - deallocData(m_A); - deallocData(m_u1); - deallocData(m_v1); - deallocData(m_u2); - deallocData(m_v2); - deallocData(m_w); - deallocData(m_x); - deallocData(m_y); - deallocData(m_z); } void POLYBENCH_GEMMVER::setUp(VariantID vid) { (void) vid; + + allocAndInitData(m_A, m_n * m_n, vid); + allocAndInitData(m_u1, m_n, vid); + allocAndInitData(m_v1, m_n, vid); + allocAndInitData(m_u2, m_n, vid); + allocAndInitData(m_v2, m_n, vid); + allocAndInitDataConst(m_w, m_n, 0.0, vid); + allocAndInitData(m_x, m_n, vid); + allocAndInitData(m_y, m_n, vid); + allocAndInitData(m_z, m_n, vid); + } void POLYBENCH_GEMMVER::runKernel(VariantID vid) @@ -249,28 +137,33 @@ void POLYBENCH_GEMMVER::runKernel(VariantID vid) case Base_Seq : { - POLYBENCH_GEMMVER_DATA; + POLYBENCH_GEMMVER_DATA_SETUP_CPU; + startTimer(); for (RepIndex_type 
irep = 0; irep < run_reps; ++irep) { - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { + + for (Index_type i = 0; i < n; i++ ) { + for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMMVER_BODY1; } + } - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { + for (Index_type i = 0; i < n; i++ ) { + for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMMVER_BODY2; } + } - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { - POLYBENCH_GEMMVER_BODY3; - } + for (Index_type i = 0; i < n; i++ ) { + POLYBENCH_GEMMVER_BODY3; + } - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { + for (Index_type i = 0; i < n; i++ ) { + for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMMVER_BODY4; } + } + } stopTimer(); @@ -278,168 +171,154 @@ void POLYBENCH_GEMMVER::runKernel(VariantID vid) } case RAJA_Seq : { - POLYBENCH_GEMMVER_DATA; + + POLYBENCH_GEMMVER_DATA_SETUP_CPU; + + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::seq_exec> >; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forallN>> (RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}, [=] (int i, int j) { - POLYBENCH_GEMMVER_BODY1; - }); - RAJA::forallN>> (RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}, [=] (int i, int j) { - POLYBENCH_GEMMVER_BODY2; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), + [=](Index_type i, Index_type j) { + POLYBENCH_GEMMVER_BODY1; }); + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), + [=](Index_type i, Index_type j) { + POLYBENCH_GEMMVER_BODY2; + }); - RAJA::forall (RAJA::RangeSegment{0, n}, [=] (int i) { - POLYBENCH_GEMMVER_BODY3; + RAJA::forall ( + RAJA::RangeSegment{0, n}, [=] (int i) { + POLYBENCH_GEMMVER_BODY3; }); - RAJA::forallN>> (RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}, [=] (int i, 
int j) { - POLYBENCH_GEMMVER_BODY4; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), + [=](Index_type i, Index_type j) { + POLYBENCH_GEMMVER_BODY4; }); + } stopTimer(); + break; } #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - POLYBENCH_GEMMVER_DATA; + POLYBENCH_GEMMVER_DATA_SETUP_CPU; + startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + #pragma omp parallel for - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { + for (Index_type i = 0; i < n; i++ ) { + for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMMVER_BODY1; } + } + #pragma omp parallel for - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { + for (Index_type i = 0; i < n; i++ ) { + for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMMVER_BODY2; } + } #pragma omp parallel for - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { - POLYBENCH_GEMMVER_BODY3; - } + for (Index_type i = 0; i < n; i++ ) { + POLYBENCH_GEMMVER_BODY3; + } #pragma omp parallel for - for (Index_type i = 0; i < n; i++ ) - for(Index_type j = 0; j < n; j++) { + for (Index_type i = 0; i < n; i++ ) { + for (Index_type j = 0; j < n; j++) { POLYBENCH_GEMMVER_BODY4; } + } + } stopTimer(); break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - POLYBENCH_GEMMVER_DATA; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forallN>> (RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}, [=] (int i, int j) { - POLYBENCH_GEMMVER_BODY1; - }); - - RAJA::forallN>> (RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}, [=] (int i, int j) { - POLYBENCH_GEMMVER_BODY2; - }); - - - RAJA::forall (RAJA::RangeSegment{0, n}, [=] (int i) { - POLYBENCH_GEMMVER_BODY3; - }); - - RAJA::forallN>> (RAJA::RangeSegment{0, n}, RAJA::RangeSegment{0, n}, [=] (int i, int j) { - POLYBENCH_GEMMVER_BODY4; - }); - } - stopTimer(); + POLYBENCH_GEMMVER_DATA_SETUP_CPU; - break; - } -#endif + using EXEC_POL = RAJA::nested::Policy< + RAJA::nested::For<1, RAJA::seq_exec>, + RAJA::nested::For<0, RAJA::omp_parallel_for_exec> >; -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - POLYBENCH_GEMMVER_DATA_SETUP_CUDA; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - size_t grid_size = RAJA_DIVIDE_CEILING_INT(m_n * m_n, block_size); - polybench_gemmver_cuda_1<<>>(A,u1,v1,u2,v2,m_n); - - grid_size = RAJA_DIVIDE_CEILING_INT(m_n * m_n, block_size); - polybench_gemmver_cuda_2<<>>(beta,A,x,y,m_n); - grid_size = RAJA_DIVIDE_CEILING_INT(m_n , block_size); - polybench_gemmver_cuda_3<<>>(x,z,v1,u2,v2,m_n); - - grid_size = RAJA_DIVIDE_CEILING_INT(m_n * m_n, block_size); - polybench_gemmver_cuda_4<<>>(alpha,A,x,w,m_n); - } - cudaDeviceSynchronize(); - stopTimer(); - POLYBENCH_GEMMVER_TEARDOWN_CUDA; - break; - } - - case RAJA_CUDA : { - POLYBENCH_GEMMVER_DATA_SETUP_CUDA; - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall> (RAJA::RangeSegment{0, n * n}, [=] __device__ (int ii) { - Index_type i,j; - i = ii/n; j = ii % n; - POLYBENCH_GEMMVER_BODY1; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), + [=](Index_type i, Index_type j) { + POLYBENCH_GEMMVER_BODY1; }); - RAJA::forall> 
(RAJA::RangeSegment{0, n * n}, [=] __device__ (int ii) { - Index_type i,j; - i = ii/n; j = ii % n; - POLYBENCH_GEMMVER_BODY2; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), + [=](Index_type i, Index_type j) { + POLYBENCH_GEMMVER_BODY2; }); - RAJA::forall> (RAJA::RangeSegment{0, n}, [=] __device__ (int i) { - POLYBENCH_GEMMVER_BODY3; + RAJA::forall ( + RAJA::RangeSegment{0, n}, [=] (int i) { + POLYBENCH_GEMMVER_BODY3; }); - RAJA::forall> (RAJA::RangeSegment{0, n * n}, [=] __device__ (int ii) { - Index_type i,j; - i = ii/n; j = ii % n; - POLYBENCH_GEMMVER_BODY4; + RAJA::nested::forall(EXEC_POL{}, + RAJA::make_tuple(RAJA::RangeSegment(0, n), + RAJA::RangeSegment(0, n)), + [=](Index_type i, Index_type j) { + POLYBENCH_GEMMVER_BODY4; }); } stopTimer(); - POLYBENCH_GEMMVER_TEARDOWN_CUDA; + break; } +#endif //RAJA_ENABLE_OPENMP +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); + break; + } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n POLYBENCH_GEMMVER : Unknown variant id = " << vid << std::endl; } } @@ -454,7 +333,15 @@ void POLYBENCH_GEMMVER::updateChecksum(VariantID vid) void POLYBENCH_GEMMVER::tearDown(VariantID vid) { (void) vid; - + deallocData(m_A); + deallocData(m_u1); + deallocData(m_v1); + deallocData(m_u2); + deallocData(m_v2); + deallocData(m_w); + deallocData(m_x); + deallocData(m_y); + deallocData(m_z); } } // end namespace basic diff --git a/src/polybench/POLYBENCH_GEMMVER.hpp b/src/polybench/POLYBENCH_GEMMVER.hpp index e33baf4f3..591244050 100644 --- a/src/polybench/POLYBENCH_GEMMVER.hpp +++ b/src/polybench/POLYBENCH_GEMMVER.hpp @@ -12,11 +12,50 @@ // For details about use and distribution, please read raja-perfsuite/LICENSE. // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// POLYBENCH_GEMMVER kernel reference implementation: +/// +/// for (Index_type i = 0; i < _PB_N; i++) { +/// for (Index_type j = 0; j < _PB_N; j++) { +/// A[i][j] = A[i][j] + u1[i] * v1[j] + u2[i] * v2[j]; +/// } +/// } +/// +/// for (Index_type i = 0; i < _PB_N; i++) { +/// for (Index_type j = 0; j < _PB_N; j++) { +/// x[i] = x[i] + beta * A[j][i] * y[j]; +/// } +/// } +/// +/// for (Index_type i = 0; i < _PB_N; i++) { +/// x[i] = x[i] + z[i]; +/// } +/// +/// for (Index_type i = 0; i < _PB_N; i++) { +/// for (Index_type j = 0; j < _PB_N; j++) { +/// w[i] = w[i] + alpha * A[i][j] * x[j]; +/// } +/// } +/// + #ifndef RAJAPerf_POLYBENCH_GEMMVER_HXX #define RAJAPerf_POLYBENCH_GEMMVER_HXX +#define POLYBENCH_GEMMVER_BODY1 \ + *(A + i * n + j) = *(A + i * n +j) + *(u1 + i) * *(v1 + j) + *(u2 + i) * *(v2 + j) + +#define POLYBENCH_GEMMVER_BODY2 \ + *(x + i) = *(x+i) + beta * *(A + j * n + i) * *(y + j); + +#define POLYBENCH_GEMMVER_BODY3 \ + *(x + i) = *(x + i) + *(z + i); 
+ +#define POLYBENCH_GEMMVER_BODY4 \ + *(w + i) = *(w+i) + alpha * *(A + i * n + j) * *(x + j); + + #include "common/KernelBase.hpp" namespace rajaperf @@ -40,10 +79,11 @@ class POLYBENCH_GEMMVER : public KernelBase void runKernel(VariantID vid); void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); private: Index_type m_n; - Index_type m_run_reps; Real_type m_alpha; Real_type m_beta; Real_ptr m_A; diff --git a/src/stream/ADD-Cuda.cpp b/src/stream/ADD-Cuda.cpp new file mode 100644 index 000000000..ccfe09391 --- /dev/null +++ b/src/stream/ADD-Cuda.cpp @@ -0,0 +1,110 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define ADD_DATA_SETUP_CUDA \ + Real_ptr a; \ + Real_ptr b; \ + Real_ptr c; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ + allocAndInitCudaDeviceData(b, m_b, iend); \ + allocAndInitCudaDeviceData(c, m_c, iend); + +#define ADD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_c, c, iend); \ + deallocCudaDeviceData(a); \ + deallocCudaDeviceData(b); \ + deallocCudaDeviceData(c); + +__global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + ADD_BODY; + } +} + + +void ADD::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + add<<>>( c, a, b, + iend ); + + } + stopTimer(); + + ADD_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + ADD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + ADD_BODY; + }); + + } + stopTimer(); + + ADD_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n ADD : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA + diff --git a/src/stream/ADD-OMPTarget.cpp b/src/stream/ADD-OMPTarget.cpp new file mode 100644 index 000000000..b0f416221 --- /dev/null +++ 
b/src/stream/ADD-OMPTarget.cpp @@ -0,0 +1,104 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "ADD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define ADD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + Real_ptr b; \ + Real_ptr c; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ + allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ + allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); + +#define ADD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_c, c, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); \ + deallocOpenMPDeviceData(b, did); \ + deallocOpenMPDeviceData(c, did); + +void ADD::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + ADD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a, b, c) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + ADD_BODY; + } + + } + stopTimer(); + + ADD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + 
ADD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + ADD_BODY; + }); + + } + stopTimer(); + + ADD_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n ADD : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/stream/ADD.cpp b/src/stream/ADD.cpp index 69998d93c..3b88deb62 100644 --- a/src/stream/ADD.cpp +++ b/src/stream/ADD.cpp @@ -13,20 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// ADD kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// c[i] = a[i] + b[i]; -/// } -/// - #include "ADD.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -34,49 +26,12 @@ namespace rajaperf namespace stream { -#define ADD_DATA \ + +#define ADD_DATA_SETUP_CPU \ ResReal_ptr a = m_a; \ ResReal_ptr b = m_b; \ ResReal_ptr c = m_c; -#define ADD_BODY \ - c[i] = a[i] + b[i]; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define ADD_DATA_SETUP_CUDA \ - Real_ptr a; \ - Real_ptr b; \ - Real_ptr c; \ -\ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); - -#define ADD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_c, c, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c) - -__global__ void add(Real_ptr c, Real_ptr a, Real_ptr b, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - ADD_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - ADD::ADD(const RunParams& params) : KernelBase(rajaperf::Stream_ADD, params) 
@@ -93,7 +48,7 @@ void ADD::setUp(VariantID vid) { allocAndInitData(m_a, getRunSize(), vid); allocAndInitData(m_b, getRunSize(), vid); - allocAndInitData(m_c, getRunSize(), vid); + allocAndInitDataConst(m_c, getRunSize(), 0.0, vid); } void ADD::runKernel(VariantID vid) @@ -106,7 +61,7 @@ void ADD::runKernel(VariantID vid) case Base_Seq : { - ADD_DATA; + ADD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -123,12 +78,13 @@ void ADD::runKernel(VariantID vid) case RAJA_Seq : { - ADD_DATA; + ADD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { ADD_BODY; }); @@ -141,7 +97,7 @@ void ADD::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - ADD_DATA; + ADD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -157,20 +113,15 @@ void ADD::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - ADD_DATA; + ADD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { ADD_BODY; }); @@ -181,58 +132,26 @@ void ADD::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - ADD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - add<<>>( c, a, b, - iend ); - - } - stopTimer(); - - ADD_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - ADD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - ADD_BODY; - }); - - } - stopTimer(); - - ADD_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n ADD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/ADD.hpp b/src/stream/ADD.hpp index 677ac7785..622a1dba6 100644 --- a/src/stream/ADD.hpp +++ b/src/stream/ADD.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// ADD kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// c[i] = a[i] + b[i]; +/// } +/// #ifndef RAJAPerf_Stream_ADD_HPP #define RAJAPerf_Stream_ADD_HPP + +#define ADD_BODY \ + c[i] = a[i] + b[i]; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,10 +51,14 @@ class ADD : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_a; Real_ptr m_b; Real_ptr m_c; + }; } // end namespace stream diff --git a/src/stream/CMakeLists.txt b/src/stream/CMakeLists.txt index b809ef08f..5108eb0ad 100644 --- a/src/stream/CMakeLists.txt +++ b/src/stream/CMakeLists.txt @@ -15,10 +15,20 @@ blt_add_library( NAME stream - SOURCES COPY.cpp + SOURCES ADD.cpp + ADD-Cuda.cpp + ADD-OMPTarget.cpp + COPY.cpp + COPY-Cuda.cpp + COPY-OMPTarget.cpp + DOT.cpp + DOT-Cuda.cpp + DOT-OMPTarget.cpp MUL.cpp - ADD.cpp + MUL-Cuda.cpp + MUL-OMPTarget.cpp TRIAD.cpp - DOT.cpp + TRIAD-Cuda.cpp + TRIAD-OMPTarget.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/stream/COPY-Cuda.cpp b/src/stream/COPY-Cuda.cpp new file mode 100644 index 000000000..460dfa104 --- /dev/null +++ b/src/stream/COPY-Cuda.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. 
+// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "COPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define COPY_DATA_SETUP_CUDA \ + Real_ptr a; \ + Real_ptr c; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ + allocAndInitCudaDeviceData(c, m_c, iend); + +#define COPY_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_c, c, iend); \ + deallocCudaDeviceData(a); \ + deallocCudaDeviceData(c); + +__global__ void copy(Real_ptr c, Real_ptr a, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + COPY_BODY; + } +} + + +void COPY::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + COPY_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + copy<<>>( c, a, + iend ); + + } + stopTimer(); + + COPY_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + COPY_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + COPY_BODY; + }); + + } + stopTimer(); + + COPY_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n COPY : Unknown Cuda variant id = " << vid << std::endl; + } + +} + +} // end namespace stream +} // end namespace rajaperf + +#endif 
// RAJA_ENABLE_CUDA diff --git a/src/stream/COPY-OMPTarget.cpp b/src/stream/COPY-OMPTarget.cpp new file mode 100644 index 000000000..f8e009010 --- /dev/null +++ b/src/stream/COPY-OMPTarget.cpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "COPY.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define COPY_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + Real_ptr c; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ + allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); + +#define COPY_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_c, c, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); \ + deallocOpenMPDeviceData(c, did); + + +void COPY::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + COPY_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a, c) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + COPY_BODY; + } + + } + stopTimer(); + + COPY_DATA_TEARDOWN_OMP_TARGET; + 
+ } else if ( vid == RAJA_OpenMPTarget ) { + + COPY_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + COPY_BODY; + }); + + } + stopTimer(); + + COPY_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n COPY : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/stream/COPY.cpp b/src/stream/COPY.cpp index d83a759ed..fadec945d 100644 --- a/src/stream/COPY.cpp +++ b/src/stream/COPY.cpp @@ -13,20 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// COPY kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// c[i] = a[i] ; -/// } -/// - #include "COPY.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -34,45 +26,11 @@ namespace rajaperf namespace stream { -#define COPY_DATA \ + +#define COPY_DATA_SETUP_CPU \ ResReal_ptr a = m_a; \ ResReal_ptr c = m_c; -#define COPY_BODY \ - c[i] = a[i] ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define COPY_DATA_SETUP_CUDA \ - Real_ptr a; \ - Real_ptr c; \ -\ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); - -#define COPY_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_c, c, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(c) - -__global__ void copy(Real_ptr c, Real_ptr a, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - COPY_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - COPY::COPY(const RunParams& params) : KernelBase(rajaperf::Stream_COPY, params) @@ -88,7 +46,7 @@ COPY::~COPY() void COPY::setUp(VariantID vid) { 
allocAndInitData(m_a, getRunSize(), vid); - allocAndInitData(m_c, getRunSize(), vid); + allocAndInitDataConst(m_c, getRunSize(), 0.0, vid); } void COPY::runKernel(VariantID vid) @@ -101,7 +59,7 @@ void COPY::runKernel(VariantID vid) case Base_Seq : { - COPY_DATA; + COPY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -118,12 +76,13 @@ void COPY::runKernel(VariantID vid) case RAJA_Seq : { - COPY_DATA; + COPY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { COPY_BODY; }); @@ -136,7 +95,7 @@ void COPY::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - COPY_DATA; + COPY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -152,20 +111,15 @@ void COPY::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - COPY_DATA; + COPY_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { COPY_BODY; }); @@ -176,58 +130,26 @@ void COPY::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - COPY_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - copy<<>>( c, a, - iend ); - - } - stopTimer(); - - COPY_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - COPY_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - COPY_BODY; - }); - - } - stopTimer(); - - COPY_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n COPY : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/COPY.hpp b/src/stream/COPY.hpp index 28cb7a4b1..7a322195b 100644 --- a/src/stream/COPY.hpp +++ b/src/stream/COPY.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// COPY kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// c[i] = a[i] ; +/// } +/// #ifndef RAJAPerf_Stream_COPY_HPP #define RAJAPerf_Stream_COPY_HPP + +#define COPY_BODY \ + c[i] = a[i] ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +51,9 @@ class COPY : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_a; Real_ptr m_c; diff --git a/src/stream/DOT-Cuda.cpp b/src/stream/DOT-Cuda.cpp new file mode 100644 index 000000000..81de341d0 --- /dev/null +++ b/src/stream/DOT-Cuda.cpp @@ -0,0 +1,180 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DOT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +#define USE_THRUST +//#undef USE_THRUST + + +#if defined(RAJA_ENABLE_CUDA) && defined(USE_THRUST) +#include +#include +#endif + +namespace rajaperf +{ +namespace stream +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define DOT_DATA_SETUP_CUDA \ + Real_ptr a; \ + Real_ptr b; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ + allocAndInitCudaDeviceData(b, m_b, iend); + +#define DOT_DATA_TEARDOWN_CUDA \ + deallocCudaDeviceData(a); \ + deallocCudaDeviceData(b); + +#if defined(USE_THRUST) +// Nothing to do here... +#else +__global__ void dot(Real_ptr a, Real_ptr b, + Real_ptr dprod, Real_type dprod_init, + Index_type iend) +{ + extern __shared__ Real_type pdot[ ]; + + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + + pdot[ threadIdx.x ] = dprod_init; + for ( ; i < iend ; i += gridDim.x * blockDim.x ) { + pdot[ threadIdx.x ] += a[ i ] * b[i]; + } + __syncthreads(); + + for ( i = blockDim.x / 2; i > 0; i /= 2 ) { + if ( threadIdx.x < i ) { + pdot[ threadIdx.x ] += pdot[ threadIdx.x + i ]; + } + __syncthreads(); + } + +#if 1 // serialized access to shared data; + if ( threadIdx.x == 0 ) { + RAJA::_atomicAdd( dprod, pdot[ 0 ] ); + } +#else // this doesn't work due to data races + if ( threadIdx.x == 0 ) { + *dprod += pdot[ 0 ]; + } +#endif + +} +#endif + + +void DOT::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + +#if defined(USE_THRUST) + + thrust::device_vector va(m_a, m_a+iend); + thrust::device_vector vb(m_b, m_b+iend); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type dprod = thrust::inner_product(va.begin(), 
va.end(), + vb.begin(), m_dot_init); + + m_dot += dprod; + + } + stopTimer(); + +#else // don't use thrust + + DOT_DATA_SETUP_CUDA; + + Real_ptr dprod; + allocAndInitCudaDeviceData(dprod, &m_dot_init, 1); + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + initCudaDeviceData(dprod, &m_dot_init, 1); + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + dot<<>>( a, b, + dprod, m_dot_init, + iend ); + + Real_type lprod; + Real_ptr plprod = &lprod; + getCudaDeviceData(plprod, dprod, 1); + m_dot += lprod; + + } + stopTimer(); + + DOT_DATA_TEARDOWN_CUDA; + + deallocCudaDeviceData(dprod); + +#endif + + } else if ( vid == RAJA_CUDA ) { + + DOT_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum, Real_type> dot(m_dot_init); + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + DOT_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n DOT : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/DOT-OMPTarget.cpp b/src/stream/DOT-OMPTarget.cpp new file mode 100644 index 000000000..0fdf5eaa0 --- /dev/null +++ b/src/stream/DOT-OMPTarget.cpp @@ -0,0 +1,108 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "DOT.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define DOT_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + Real_ptr b; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ + allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); + +#define DOT_DATA_TEARDOWN_OMP_TARGET \ + deallocOpenMPDeviceData(a, did); \ + deallocOpenMPDeviceData(b, did); + +void DOT::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + DOT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + Real_type dot = m_dot_init; + + #pragma omp target is_device_ptr(a, b) device( did ) map(tofrom:dot) + #pragma omp teams distribute parallel for reduction(+:dot) \ + num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + DOT_BODY; + } + + m_dot += dot; + + } + stopTimer(); + + DOT_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + DOT_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::ReduceSum, Real_type> dot(m_dot_init); + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + DOT_BODY; + }); + + m_dot += static_cast(dot.get()); + + } + stopTimer(); + + DOT_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n DOT : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff 
--git a/src/stream/DOT.cpp b/src/stream/DOT.cpp index 2abddb615..d8f9d4fd7 100644 --- a/src/stream/DOT.cpp +++ b/src/stream/DOT.cpp @@ -13,107 +13,24 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// ADD kernel reference implementation: -/// -/// Real_type dot = m_dot_init; -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// dot += a[i] b b[i]; -/// } -/// -/// m_dot += dot; -/// - #include "DOT.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" -#include "RAJA/policy/cuda.hpp" - -#include - -#define USE_THRUST -//#undef USE_THRUST +#include "common/DataUtils.hpp" -#if defined(RAJA_ENABLE_CUDA) && defined(USE_THRUST) -#include -#include -#endif +#include namespace rajaperf { namespace stream { -#define DOT_DATA \ + +#define DOT_DATA_SETUP_CPU \ ResReal_ptr a = m_a; \ ResReal_ptr b = m_b; -#define DOT_BODY \ - dot += a[i] * b[i] ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define DOT_DATA_SETUP_CUDA \ - Real_ptr a; \ - Real_ptr b; \ -\ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); - -#define DOT_DATA_TEARDOWN_CUDA \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); - -#if defined(USE_THRUST) -// Nothing to do here... 
-#else -__global__ void dot(Real_ptr a, Real_ptr b, - Real_ptr dprod, Real_type dprod_init, - Index_type iend) -{ - extern __shared__ Real_type pdot[ ]; - - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - - pdot[ threadIdx.x ] = dprod_init; - for ( ; i < iend ; i += gridDim.x * blockDim.x ) { - pdot[ threadIdx.x ] += a[ i ] * b[i]; - } - __syncthreads(); - - for ( i = blockDim.x / 2; i > 0; i /= 2 ) { - if ( threadIdx.x < i ) { - pdot[ threadIdx.x ] += pdot[ threadIdx.x + i ]; - } - __syncthreads(); - } - -#if 1 // serialized access to shared data; - if ( threadIdx.x == 0 ) { - RAJA::_atomicAdd( dprod, pdot[ 0 ] ); - } -#else // this doesn't work due to data races - if ( threadIdx.x == 0 ) { - *dprod += pdot[ 0 ]; - } -#endif - -} -#endif - -#endif // if defined(RAJA_ENABLE_CUDA) - DOT::DOT(const RunParams& params) : KernelBase(rajaperf::Stream_DOT, params) @@ -145,7 +62,7 @@ void DOT::runKernel(VariantID vid) case Base_Seq : { - DOT_DATA; + DOT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -166,14 +83,15 @@ void DOT::runKernel(VariantID vid) case RAJA_Seq : { - DOT_DATA; + DOT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::ReduceSum dot(m_dot_init); - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { DOT_BODY; }); @@ -188,7 +106,7 @@ void DOT::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - DOT_DATA; + DOT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -208,26 +126,21 @@ void DOT::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - DOT_DATA; + DOT_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { RAJA::ReduceSum dot(m_dot_init); - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { DOT_BODY; }); - m_dot += static_cast(dot.get()); + m_dot += dot; } stopTimer(); @@ -236,92 +149,26 @@ void DOT::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - -#if defined(USE_THRUST) - - thrust::device_vector va(m_a, m_a+iend); - thrust::device_vector vb(m_b, m_b+iend); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - Real_type dprod = thrust::inner_product(va.begin(), va.end(), - vb.begin(), m_dot_init); - - m_dot += dprod; - - } - stopTimer(); - -#else - DOT_DATA_SETUP_CUDA; - Real_ptr dprod; - allocAndInitCudaDeviceData(dprod, &m_dot_init, 1); - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - initCudaDeviceData(dprod, &m_dot_init, 1); - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - dot<<>>( a, b, - dprod, m_dot_init, - iend ); - - Real_type lprod; - Real_ptr plprod = &lprod; - getCudaDeviceData(plprod, dprod, 1); - m_dot += lprod; - - } - stopTimer(); - - DOT_DATA_TEARDOWN_CUDA; - deallocCudaDeviceData(dprod); -#endif - - break; - } - - case RAJA_CUDA : { - - DOT_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::ReduceSum, Real_type> dot(m_dot_init); - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - DOT_BODY; - }); - - m_dot += static_cast(dot.get()); - - } - stopTimer(); - - DOT_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n DOT : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/DOT.hpp b/src/stream/DOT.hpp index ef43c4950..0e8a105c1 100644 --- a/src/stream/DOT.hpp +++ b/src/stream/DOT.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// DOT kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// dot += a[i] * b[i]; +/// } +/// #ifndef RAJAPerf_Stream_DOT_HPP #define RAJAPerf_Stream_DOT_HPP + +#define DOT_BODY \ + dot += a[i] * b[i] ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +51,9 @@ class DOT : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_a; Real_ptr m_b; diff --git a/src/stream/MUL-Cuda.cpp b/src/stream/MUL-Cuda.cpp new file mode 100644 index 000000000..19a4048e8 --- /dev/null +++ b/src/stream/MUL-Cuda.cpp @@ -0,0 +1,106 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MUL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define MUL_DATA_SETUP_CUDA \ + Real_ptr b; \ + Real_ptr c; \ + Real_type alpha = m_alpha; \ +\ + allocAndInitCudaDeviceData(b, m_b, iend); \ + allocAndInitCudaDeviceData(c, m_c, iend); + +#define MUL_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_b, b, iend); \ + deallocCudaDeviceData(b); \ + deallocCudaDeviceData(c) + +__global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + MUL_BODY; + } +} + +void MUL::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + MUL_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + mul<<>>( b, c, alpha, + iend ); + + } + stopTimer(); + + MUL_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + MUL_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + MUL_BODY; + }); + + } + stopTimer(); + + MUL_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n MUL : Unknown Cuda variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/MUL-OMPTarget.cpp b/src/stream/MUL-OMPTarget.cpp new file mode 100644 index 000000000..df2978887 --- /dev/null +++ b/src/stream/MUL-OMPTarget.cpp @@ -0,0 +1,101 @@ 
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "MUL.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define MUL_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr b; \ + Real_ptr c; \ + Real_type alpha = m_alpha; \ +\ + allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ + allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); + +#define MUL_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_b, b, iend, hid, did); \ + deallocOpenMPDeviceData(b, did); \ + deallocOpenMPDeviceData(c, did); + +void MUL::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + MUL_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(b, c) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < iend; ++i ) { + MUL_BODY; + } + + } + stopTimer(); + + MUL_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + MUL_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + 
RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + MUL_BODY; + }); + + } + stopTimer(); + + MUL_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n MUL : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP diff --git a/src/stream/MUL.cpp b/src/stream/MUL.cpp index 5c382b9bb..579b0df71 100644 --- a/src/stream/MUL.cpp +++ b/src/stream/MUL.cpp @@ -13,20 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// MUL kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// b[i] = alpha * c[i] ; -/// } -/// - #include "MUL.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -34,47 +26,12 @@ namespace rajaperf namespace stream { -#define MUL_DATA \ + +#define MUL_DATA_SETUP_CPU \ ResReal_ptr b = m_b; \ ResReal_ptr c = m_c; \ Real_type alpha = m_alpha; -#define MUL_BODY \ - b[i] = alpha * c[i] ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define MUL_DATA_SETUP_CUDA \ - Real_ptr b; \ - Real_ptr c; \ - Real_type alpha = m_alpha; \ -\ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); - -#define MUL_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_b, b, iend); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c) - -__global__ void mul(Real_ptr b, Real_ptr c, Real_type alpha, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - MUL_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - MUL::MUL(const RunParams& params) : KernelBase(rajaperf::Stream_MUL, params) @@ -85,11 +42,12 @@ MUL::MUL(const RunParams& params) MUL::~MUL() { + } void MUL::setUp(VariantID vid) { - allocAndInitData(m_b, getRunSize(), vid); + 
allocAndInitDataConst(m_b, getRunSize(), 0.0, vid); allocAndInitData(m_c, getRunSize(), vid); initData(m_alpha, vid); } @@ -104,7 +62,7 @@ void MUL::runKernel(VariantID vid) case Base_Seq : { - MUL_DATA; + MUL_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -121,12 +79,13 @@ void MUL::runKernel(VariantID vid) case RAJA_Seq : { - MUL_DATA; + MUL_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { MUL_BODY; }); @@ -139,7 +98,7 @@ void MUL::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - MUL_DATA; + MUL_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -155,20 +114,15 @@ void MUL::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... - break; - } - case RAJA_OpenMP : { - MUL_DATA; + MUL_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { MUL_BODY; }); @@ -179,58 +133,26 @@ void MUL::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - MUL_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - mul<<>>( b, c, alpha, - iend ); - - } - stopTimer(); - - MUL_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - MUL_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - MUL_BODY; - }); - - } - stopTimer(); - - MUL_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + 
runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... +#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n MUL : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/MUL.hpp b/src/stream/MUL.hpp index 052d3c0ed..c0b7d6a6b 100644 --- a/src/stream/MUL.hpp +++ b/src/stream/MUL.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// MUL kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// b[i] = alpha * c[i] ; +/// } +/// #ifndef RAJAPerf_Stream_MUL_HPP #define RAJAPerf_Stream_MUL_HPP + +#define MUL_BODY \ + b[i] = alpha * c[i] ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +51,9 @@ class MUL : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_b; Real_ptr m_c; diff --git a/src/stream/TRIAD-Cuda.cpp b/src/stream/TRIAD-Cuda.cpp new file mode 100644 index 000000000..91d428f1b --- /dev/null +++ b/src/stream/TRIAD-Cuda.cpp @@ -0,0 +1,111 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include "common/CudaDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + + // + // Define thread block size for CUDA execution + // + const size_t block_size = 256; + + +#define TRIAD_DATA_SETUP_CUDA \ + Real_ptr a; \ + Real_ptr b; \ + Real_ptr c; \ + Real_type alpha = m_alpha; \ +\ + allocAndInitCudaDeviceData(a, m_a, iend); \ + allocAndInitCudaDeviceData(b, m_b, iend); \ + allocAndInitCudaDeviceData(c, m_c, iend); + +#define TRIAD_DATA_TEARDOWN_CUDA \ + getCudaDeviceData(m_a, a, iend); \ + deallocCudaDeviceData(a); \ + deallocCudaDeviceData(b); \ + deallocCudaDeviceData(c); + +__global__ void triad(Real_ptr a, Real_ptr b, Real_ptr c, Real_type alpha, + Index_type iend) +{ + Index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < iend) { + TRIAD_BODY; + } +} + + +void TRIAD::runCudaVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_CUDA ) { + + TRIAD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); + triad<<>>( a, b, c, alpha, + iend ); + + } + stopTimer(); + + TRIAD_DATA_TEARDOWN_CUDA; + + } else if ( vid == RAJA_CUDA ) { + + TRIAD_DATA_SETUP_CUDA; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall< RAJA::cuda_exec >( + RAJA::RangeSegment(ibegin, iend), [=] __device__ (Index_type i) { + TRIAD_BODY; + }); + + } + stopTimer(); + + TRIAD_DATA_TEARDOWN_CUDA; + + } else { + std::cout << "\n TRIAD : Unknown Cuda variant id = " << vid << std::endl; + } + +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_CUDA diff --git a/src/stream/TRIAD-OMPTarget.cpp 
b/src/stream/TRIAD-OMPTarget.cpp new file mode 100644 index 000000000..9f1fa4f10 --- /dev/null +++ b/src/stream/TRIAD-OMPTarget.cpp @@ -0,0 +1,105 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-738930 +// +// All rights reserved. +// +// This file is part of the RAJA Performance Suite. +// +// For details about use and distribution, please read raja-perfsuite/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "TRIAD.hpp" + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + +#include "common/OpenMPTargetDataUtils.hpp" + +#include + +namespace rajaperf +{ +namespace stream +{ + +// +// Define thread block size for target execution +// +#define NUMTEAMS 128 + +#define TRIAD_DATA_SETUP_OMP_TARGET \ + int hid = omp_get_initial_device(); \ + int did = omp_get_default_device(); \ +\ + Real_ptr a; \ + Real_ptr b; \ + Real_ptr c; \ + Real_type alpha = m_alpha; \ +\ + allocAndInitOpenMPDeviceData(a, m_a, iend, did, hid); \ + allocAndInitOpenMPDeviceData(b, m_b, iend, did, hid); \ + allocAndInitOpenMPDeviceData(c, m_c, iend, did, hid); + +#define TRIAD_DATA_TEARDOWN_OMP_TARGET \ + getOpenMPDeviceData(m_a, a, iend, hid, did); \ + deallocOpenMPDeviceData(a, did); \ + deallocOpenMPDeviceData(b, did); \ + deallocOpenMPDeviceData(c, did); + +void TRIAD::runOpenMPTargetVariant(VariantID vid) +{ + const Index_type run_reps = getRunReps(); + const Index_type ibegin = 0; + const Index_type iend = getRunSize(); + + if ( vid == Base_OpenMPTarget ) { + + TRIAD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + #pragma omp target is_device_ptr(a, b, c) device( did ) + #pragma omp teams distribute parallel for num_teams(NUMTEAMS) schedule(static, 1) + for (Index_type i = ibegin; i < 
iend; ++i ) { + TRIAD_BODY; + } + + } + stopTimer(); + + TRIAD_DATA_TEARDOWN_OMP_TARGET; + + } else if ( vid == RAJA_OpenMPTarget ) { + + TRIAD_DATA_SETUP_OMP_TARGET; + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + RAJA::forall>( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { + TRIAD_BODY; + }); + + } + stopTimer(); + + TRIAD_DATA_TEARDOWN_OMP_TARGET; + + } else { + std::cout << "\n TRIAD : Unknown OMP Target variant id = " << vid << std::endl; + } +} + +} // end namespace stream +} // end namespace rajaperf + +#endif // RAJA_ENABLE_TARGET_OPENMP + diff --git a/src/stream/TRIAD.cpp b/src/stream/TRIAD.cpp index ed3347380..1afed85c2 100644 --- a/src/stream/TRIAD.cpp +++ b/src/stream/TRIAD.cpp @@ -13,20 +13,12 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -/// -/// TRIAD kernel reference implementation: -/// -/// for (Index_type i = ibegin; i < iend; ++i ) { -/// a[i] = b[i] + alpha * c[i] ; -/// } -/// - #include "TRIAD.hpp" -#include "common/DataUtils.hpp" - #include "RAJA/RAJA.hpp" +#include "common/DataUtils.hpp" + #include namespace rajaperf @@ -34,51 +26,13 @@ namespace rajaperf namespace stream { -#define TRIAD_DATA \ + +#define TRIAD_DATA_SETUP_CPU \ ResReal_ptr a = m_a; \ ResReal_ptr b = m_b; \ ResReal_ptr c = m_c; \ Real_type alpha = m_alpha; -#define TRIAD_BODY \ - a[i] = b[i] + alpha * c[i] ; - - -#if defined(RAJA_ENABLE_CUDA) - - // - // Define thread block size for CUDA execution - // - const size_t block_size = 256; - - -#define TRIAD_DATA_SETUP_CUDA \ - Real_ptr a; \ - Real_ptr b; \ - Real_ptr c; \ - Real_type alpha = m_alpha; \ -\ - allocAndInitCudaDeviceData(a, m_a, iend); \ - allocAndInitCudaDeviceData(b, m_b, iend); \ - allocAndInitCudaDeviceData(c, m_c, iend); - -#define TRIAD_DATA_TEARDOWN_CUDA \ - getCudaDeviceData(m_a, a, iend); \ - deallocCudaDeviceData(a); \ - deallocCudaDeviceData(b); \ - deallocCudaDeviceData(c) - -__global__ void triad(Real_ptr a, Real_ptr b, 
Real_ptr c, Real_type alpha, - Index_type iend) -{ - Index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < iend) { - TRIAD_BODY; - } -} - -#endif // if defined(RAJA_ENABLE_CUDA) - TRIAD::TRIAD(const RunParams& params) : KernelBase(rajaperf::Stream_TRIAD, params) @@ -93,7 +47,7 @@ TRIAD::~TRIAD() void TRIAD::setUp(VariantID vid) { - allocAndInitData(m_a, getRunSize(), vid); + allocAndInitDataConst(m_a, getRunSize(), 0.0, vid); allocAndInitData(m_b, getRunSize(), vid); allocAndInitData(m_c, getRunSize(), vid); initData(m_alpha, vid); @@ -109,7 +63,7 @@ void TRIAD::runKernel(VariantID vid) case Base_Seq : { - TRIAD_DATA; + TRIAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -126,12 +80,13 @@ void TRIAD::runKernel(VariantID vid) case RAJA_Seq : { - TRIAD_DATA; + TRIAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { TRIAD_BODY; }); @@ -144,7 +99,7 @@ void TRIAD::runKernel(VariantID vid) #if defined(RAJA_ENABLE_OPENMP) case Base_OpenMP : { - TRIAD_DATA; + TRIAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { @@ -160,20 +115,15 @@ void TRIAD::runKernel(VariantID vid) break; } - case RAJALike_OpenMP : { - // case is not defined... 
- break; - } - case RAJA_OpenMP : { - TRIAD_DATA; + TRIAD_DATA_SETUP_CPU; startTimer(); for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - RAJA::forall(ibegin, iend, - [=](Index_type i) { + RAJA::forall( + RAJA::RangeSegment(ibegin, iend), [=](Index_type i) { TRIAD_BODY; }); @@ -184,58 +134,26 @@ void TRIAD::runKernel(VariantID vid) } #endif -#if defined(RAJA_ENABLE_CUDA) - case Base_CUDA : { - - TRIAD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - const size_t grid_size = RAJA_DIVIDE_CEILING_INT(iend, block_size); - triad<<>>( a, b, c, alpha, - iend ); - - } - stopTimer(); - - TRIAD_DATA_TEARDOWN_CUDA; - - break; - } - - case RAJA_CUDA : { - - TRIAD_DATA_SETUP_CUDA; - - startTimer(); - for (RepIndex_type irep = 0; irep < run_reps; ++irep) { - - RAJA::forall< RAJA::cuda_exec >( - ibegin, iend, - [=] __device__ (Index_type i) { - TRIAD_BODY; - }); - - } - stopTimer(); - - TRIAD_DATA_TEARDOWN_CUDA; - +#if defined(RAJA_ENABLE_TARGET_OPENMP) + case Base_OpenMPTarget : + case RAJA_OpenMPTarget : + { + runOpenMPTargetVariant(vid); break; } #endif -#if 0 - case Base_OpenMPTarget : - case RAJA_OpenMPTarget : { - // Fill these in later...you get the idea... 
+#if defined(RAJA_ENABLE_CUDA) + case Base_CUDA : + case RAJA_CUDA : + { + runCudaVariant(vid); break; } #endif default : { - std::cout << "\n Unknown variant id = " << vid << std::endl; + std::cout << "\n TRIAD : Unknown variant id = " << vid << std::endl; } } diff --git a/src/stream/TRIAD.hpp b/src/stream/TRIAD.hpp index 00a2898c4..0e49e27cb 100644 --- a/src/stream/TRIAD.hpp +++ b/src/stream/TRIAD.hpp @@ -13,10 +13,22 @@ // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +/// +/// TRIAD kernel reference implementation: +/// +/// for (Index_type i = ibegin; i < iend; ++i ) { +/// a[i] = b[i] + alpha * c[i] ; +/// } +/// #ifndef RAJAPerf_Stream_TRIAD_HPP #define RAJAPerf_Stream_TRIAD_HPP + +#define TRIAD_BODY \ + a[i] = b[i] + alpha * c[i] ; + + #include "common/KernelBase.hpp" namespace rajaperf @@ -39,6 +51,9 @@ class TRIAD : public KernelBase void updateChecksum(VariantID vid); void tearDown(VariantID vid); + void runCudaVariant(VariantID vid); + void runOpenMPTargetVariant(VariantID vid); + private: Real_ptr m_a; Real_ptr m_b; diff --git a/tpl/RAJA b/tpl/RAJA index 7bce93d50..5067caee4 160000 --- a/tpl/RAJA +++ b/tpl/RAJA @@ -1 +1 @@ -Subproject commit 7bce93d50364c79503b7747e13626dc9e5081514 +Subproject commit 5067caee43620531fbecf22af206b8c21d76a9d6